950a640e
tangwang
embeddings
|
1
|
"""Image embedding client for the local embedding HTTP service."""
|
be52af70
tangwang
first commit
|
2
|
|
be52af70
tangwang
first commit
|
3
|
import os
|
950a640e
tangwang
embeddings
|
4
5
6
|
import logging
from typing import Any, List, Optional, Union
|
be52af70
tangwang
first commit
|
7
|
import numpy as np
|
950a640e
tangwang
embeddings
|
8
|
import requests
|
be52af70
tangwang
first commit
|
9
|
from PIL import Image
|
be52af70
tangwang
first commit
|
10
|
|
325eec03
tangwang
1. 日志、配置基础设施,使用优化
|
11
|
logger = logging.getLogger(__name__)
|
be52af70
tangwang
first commit
|
12
|
|
42e3aea6
tangwang
tidy
|
13
14
|
from config.services_config import get_embedding_base_url
|
be52af70
tangwang
first commit
|
15
16
17
|
class CLIPImageEncoder:
    """
    Image encoder that obtains image embeddings from the embedding HTTP service.

    This client is stateless and safe to instantiate per caller.
    """

    # Seconds to wait for the embedding service before a request is abandoned.
    REQUEST_TIMEOUT_SECONDS = 60

    def __init__(self, service_url: Optional[str] = None):
        """
        Resolve the service base URL and derive the image embedding endpoint.

        Args:
            service_url: Base URL of the embedding service. When falsy, the
                EMBEDDING_SERVICE_URL environment variable is used, and when
                that is unset/empty, get_embedding_base_url() is used.
        """
        resolved_url = service_url or os.getenv("EMBEDDING_SERVICE_URL") or get_embedding_base_url()
        # Drop trailing slashes so the endpoint never contains "//embed".
        self.service_url = str(resolved_url).rstrip("/")
        self.endpoint = f"{self.service_url}/embed/image"
        logger.info("Creating CLIPImageEncoder instance with service URL: %s", self.service_url)

    def _call_service(self, request_data: List[str], normalize_embeddings: bool = True) -> List[Any]:
        """
        Call the embedding service API.

        Args:
            request_data: List of image URLs / local file paths
            normalize_embeddings: Sent to the service as the ``normalize``
                query parameter ("true" / "false").

        Returns:
            The decoded JSON body of the response. Expected to be a list of
            embeddings (list[float]) or nulls (None), aligned to input order;
            callers validate the shape.

        Raises:
            requests.exceptions.RequestException: On transport failures or a
                non-success HTTP status; the error is logged and re-raised.
        """
        try:
            response = requests.post(
                self.endpoint,
                params={"normalize": "true" if normalize_embeddings else "false"},
                json=request_data,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            # logger.exception logs at ERROR level and attaches the traceback.
            logger.exception("CLIPImageEncoder service request failed: %s", e)
            raise

    @staticmethod
    def _to_vector(embedding: Any, url: str) -> np.ndarray:
        """
        Convert one embedding returned by the service into a float32 vector.

        Args:
            embedding: Raw embedding for a single image (or None).
            url: The URL the embedding belongs to; used in error messages.

        Returns:
            A 1-D, non-empty, finite float32 numpy array.

        Raises:
            RuntimeError: If the embedding is missing, cannot be converted to
                a numeric array, is not 1-D, is empty, or contains NaN/inf.
        """
        if embedding is None:
            raise RuntimeError(f"No image embedding returned for URL: {url}")
        try:
            vec = np.array(embedding, dtype=np.float32)
        except (TypeError, ValueError) as exc:
            # Non-numeric or ragged payloads are reported like any other
            # invalid embedding instead of leaking a numpy conversion error.
            raise RuntimeError(f"Invalid image embedding returned for URL: {url}") from exc
        if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all():
            raise RuntimeError(f"Invalid image embedding returned for URL: {url}")
        return vec

    def encode_image(self, image: Image.Image) -> np.ndarray:
        """
        Encode image to embedding vector using network service.

        Note: This method is kept for compatibility but the service only works with URLs.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("encode_image with PIL Image is not supported by embedding service")

    def encode_image_from_url(self, url: str, normalize_embeddings: bool = True) -> np.ndarray:
        """
        Generate image embedding via network service using URL.

        Args:
            url: Image URL to process
            normalize_embeddings: Forwarded to the service request.

        Returns:
            Embedding vector (1-D float32 array)

        Raises:
            RuntimeError: If the service does not return exactly one valid
                embedding for the URL.
        """
        response_data = self._call_service([url], normalize_embeddings=normalize_embeddings)
        # A non-list payload (e.g. an error object) is treated as "no embedding".
        if not isinstance(response_data, list) or len(response_data) != 1:
            raise RuntimeError(f"No image embedding returned for URL: {url}")
        return self._to_vector(response_data[0], url)

    def encode_batch(
        self,
        images: List[Union[str, Image.Image]],
        batch_size: int = 8,
        normalize_embeddings: bool = True,
    ) -> List[np.ndarray]:
        """
        Encode a batch of images efficiently via network service.

        Args:
            images: List of image URLs or PIL Images (PIL Images are rejected)
            batch_size: Batch size for processing (used for service requests);
                must be at least 1
            normalize_embeddings: Forwarded to each service request.

        Returns:
            List of embeddings, one per input, in input order

        Raises:
            NotImplementedError: If any input is a PIL Image.
            ValueError: If any input is not a non-blank string, or if
                batch_size is smaller than 1.
            RuntimeError: If the service response has the wrong length or
                contains a missing/invalid embedding.
        """
        # Validate every input up front so no request is sent for a bad batch.
        for i, img in enumerate(images):
            if isinstance(img, str) and img.strip():
                continue
            if isinstance(img, Image.Image):
                raise NotImplementedError(f"PIL Image at index {i} is not supported by service")
            raise ValueError(f"Invalid image URL/path at index {i}: {img!r}")

        # A negative step would make range() empty and silently return no
        # embeddings at all; zero would fail with an opaque range() error.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        urls = [img.strip() for img in images]
        results: List[np.ndarray] = []
        for start in range(0, len(urls), batch_size):
            batch_urls = urls[start:start + batch_size]
            response_data = self._call_service(batch_urls, normalize_embeddings=normalize_embeddings)
            received = len(response_data) if isinstance(response_data, list) else 0
            if received != len(batch_urls):
                raise RuntimeError(
                    f"Image embedding response length mismatch: expected {len(batch_urls)}, "
                    f"got {received}"
                )
            for url, embedding in zip(batch_urls, response_data):
                results.append(self._to_vector(embedding, url))
        return results

    def encode_image_urls(
        self,
        urls: List[str],
        batch_size: Optional[int] = None,
        normalize_embeddings: bool = True,
    ) -> List[np.ndarray]:
        """
        Interface consistent with ClipImageModel / ClipAsServiceImageEncoder,
        intended to be called by the indexer's document_transformer.

        Args:
            urls: List of image URLs
            batch_size: Batch size (defaults to 8 when None or 0)
            normalize_embeddings: Forwarded to each service request.

        Returns:
            List of vectors with the same length as urls
        """
        return self.encode_batch(
            urls,
            batch_size=batch_size or 8,
            normalize_embeddings=normalize_embeddings,
        )
|