Commit c10f90fea107e1ce226cd28caa66da1f07be6104

Authored by tangwang
1 parent 42e3aea6

cnclip

docs/reference/商品数据源入ES配置规范.md deleted
@@ -1,221 +0,0 @@ @@ -1,221 +0,0 @@
1 -根据您提供的内容,我将其整理为规范的Markdown格式:  
2 -  
3 -# ES索引配置文档  
4 -  
5 -## 1. 全局配置  
6 -  
7 -### 1.1 文本字段相关性设定  
8 -需要修改所有text字段相关性算法-BM25算法的默认参数:  
9 -```json  
10 -"similarity": {  
11 - "default": {  
12 - "type": "BM25",  
13 - "b": "0.0",  
14 - "k1": "0.0"  
15 - }  
16 -}  
17 -```  
18 -  
19 -### 1.2 索引分片设定  
20 -- `number_of_replicas`:0/1  
21 -- `number_of_shards`:设置建议 分片数 <= ES集群的总CPU核心个数/ (副本数 + 1)  
22 -  
23 -### 1.3 索引刷新时间设定  
24 -- `refresh_interval`:默认30S,根据客户需要进行调整  
25 -```json  
26 -"refresh_interval": "30s"  
27 -```  
28 -  
29 -## 2. 单个字段配置  
30 -  
31 -| 分析方式 | 字段预处理和ES输入格式要求 | 对应ES mapping配置 | 备注 |  
32 -|---------|--------------------------|-------------------|------|  
33 -| 电商通用分析-中文 | - | ```json { "type": "text", "analyzer": "index_ansj", "search_analyzer": "query_ansj" } ``` | - |  
34 -| 文本-多语言向量化 | 调用"文本向量化"模块得到1024维向量 | ```json { "type": "dense_vector", "dims": 1024, "index": true, "similarity": "dot_product" } ``` | 1. 依赖"文本向量化"模块<br>2. 如果定期全量,需要对向量化结果做缓存 |  
35 -| 图片-向量化 | 调用"图片向量化"模块得到1024维向量 | ```json { "type": "nested", "properties": { "vector": { "type": "dense_vector", "dims": 1024, "similarity": "dot_product" }, "url": { "type": "text" } } } ``` | 1. 依赖"图片向量化"模块<br>2. 如果定期全量,需要对向量化结果做缓存 |  
36 -| 关键词 | ES输入格式:list或者单个值 | ```json {"type": "keyword"} ``` | - |  
37 -| 电商通用分析-英文 | - | ```json {"type": "text", "analyzer": "english"} ``` | - |  
38 -| 电商通用分析-阿拉伯文 | - | ```json {"type": "text", "analyzer": "arabic"} ``` | - |  
39 -| 电商通用分析-西班牙文 | - | ```json {"type": "text", "analyzer": "spanish"} ``` | - |  
40 -| 电商通用分析-俄文 | - | ```json {"type": "text", "analyzer": "russian"} ``` | - |  
41 -| 电商通用分析-日文 | - | ```json {"type": "text", "analyzer": "japanese"} ``` | - |  
42 -| 数值-整数 | - | ```json {"type": "long"} ``` | - |  
43 -| 数值-浮点型 | - | ```json {"type": "float"} ``` | - |  
44 -| 分值 | 输入是float,配置处理方式:log, pow, sigmoid等 | TODO:给代码, log | - |  
45 -| 子串 | - | 暂时不支持 | - |  
46 -| ngram匹配或前缀匹配或边缘前缀匹配 | - | 暂时不支持 | 以后根据需要再添加 |  
47 -  
48 -这样整理后,文档结构更加清晰,表格格式规范,便于阅读和理解。  
49 -  
50 -  
51 -参考 opensearch:  
52 -  
53 -数据接口  
54 -文本相关性字段  
55 -向量相关性字段  
56 -3. 模块提取  
57 -文本向量化  
58 -import sys  
59 -import torch  
60 -from sentence_transformers import SentenceTransformer  
61 -import time  
62 -import threading  
63 -from modelscope import snapshot_download  
64 -from transformers import AutoModel  
65 -import os  
66 -from openai import OpenAI  
67 -from config.logging_config import get_app_logger  
68 -  
69 -# Get logger for this module  
70 -logger = get_app_logger(__name__)  
71 -  
72 -class BgeEncoder:  
73 - _instance = None  
74 - _lock = threading.Lock()  
75 -  
76 - def __new__(cls, model_dir='Xorbits/bge-m3'):  
77 - with cls._lock:  
78 - if cls._instance is None:  
79 - cls._instance = super(BgeEncoder, cls).__new__(cls)  
80 - logger.info("[BgeEncoder] Creating a new instance with model directory: %s", model_dir)  
81 - cls._instance.model = SentenceTransformer(snapshot_download(model_dir))  
82 - logger.info("[BgeEncoder] New instance has been created")  
83 - return cls._instance  
84 -  
85 - def encode(self, sentences, normalize_embeddings=True, device='cuda'):  
86 - # Move model to specified device  
87 - if device == 'gpu':  
88 - device = 'cuda'  
89 - self.model = self.model.to(device)  
90 - embeddings = self.model.encode(sentences, normalize_embeddings=normalize_embeddings, device=device, show_progress_bar=False)  
91 - return embeddings  
92 -图片向量化  
93 -import sys  
94 -import os  
95 -import io  
96 -import requests  
97 -import torch  
98 -import numpy as np  
99 -from PIL import Image  
100 -import logging  
101 -import threading  
102 -from typing import List, Optional, Union  
103 -from config.logging_config import get_app_logger  
104 -import cn_clip.clip as clip  
105 -from cn_clip.clip import load_from_name  
106 -  
107 -# Get logger for this module  
108 -logger = get_app_logger(__name__)  
109 -  
110 -# DEFAULT_MODEL_NAME = "ViT-L-14-336" # ["ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14", "RN50"]  
111 -DEFAULT_MODEL_NAME = "ViT-H-14"  
112 -MODEL_DOWNLOAD_DIR = "/data/tw/uat/EsSearcher"  
113 -  
114 -class CLIPImageEncoder:  
115 - """CLIP Image Encoder for generating image embeddings using cn_clip"""  
116 -  
117 - _instance = None  
118 - _lock = threading.Lock()  
119 -  
120 - def __new__(cls, model_name=DEFAULT_MODEL_NAME, device=None):  
121 - with cls._lock:  
122 - if cls._instance is None:  
123 - cls._instance = super(CLIPImageEncoder, cls).__new__(cls)  
124 - logger.info(f"[CLIPImageEncoder] Creating new instance with model: {model_name}")  
125 - cls._instance._initialize_model(model_name, device)  
126 - return cls._instance  
127 -  
128 - def _initialize_model(self, model_name, device):  
129 - """Initialize the CLIP model using cn_clip"""  
130 - try:  
131 - self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")  
132 - self.model, self.preprocess = load_from_name(model_name, device=self.device, download_root=MODEL_DOWNLOAD_DIR)  
133 - self.model.eval()  
134 - self.model_name = model_name  
135 - logger.info(f"[CLIPImageEncoder] Model {model_name} initialized successfully on device {self.device}")  
136 -  
137 - except Exception as e:  
138 - logger.error(f"[CLIPImageEncoder] Failed to initialize model: {str(e)}")  
139 - raise  
140 -  
141 - def validate_image(self, image_data: bytes) -> Image.Image:  
142 - """Validate image data and return PIL Image if valid"""  
143 - try:  
144 - image_stream = io.BytesIO(image_data)  
145 - image = Image.open(image_stream)  
146 - image.verify()  
147 - image_stream.seek(0)  
148 - image = Image.open(image_stream)  
149 - if image.mode != 'RGB':  
150 - image = image.convert('RGB')  
151 - return image  
152 - except Exception as e:  
153 - raise ValueError(f"Invalid image data: {str(e)}")  
154 -  
155 - def download_image(self, url: str, timeout: int = 10) -> bytes:  
156 - """Download image from URL"""  
157 - try:  
158 - if url.startswith(('http://', 'https://')):  
159 - response = requests.get(url, timeout=timeout)  
160 - if response.status_code != 200:  
161 - raise ValueError(f"HTTP {response.status_code}")  
162 - return response.content  
163 - else:  
164 - # Local file path  
165 - with open(url, 'rb') as f:  
166 - return f.read()  
167 - except Exception as e:  
168 - raise ValueError(f"Failed to download image from {url}: {str(e)}")  
169 -  
170 - def preprocess_image(self, image: Image.Image, max_size: int = 1024) -> Image.Image:  
171 - """Preprocess image for CLIP model"""  
172 - # Resize if too large  
173 - if max(image.size) > max_size:  
174 - ratio = max_size / max(image.size)  
175 - new_size = tuple(int(dim * ratio) for dim in image.size)  
176 - image = image.resize(new_size, Image.Resampling.LANCZOS)  
177 - return image  
178 -  
179 - def encode_text(self, text):  
180 - """Encode text to embedding vector using cn_clip"""  
181 - text_data = clip.tokenize([text] if type(text) == str else text).to(self.device)  
182 - with torch.no_grad():  
183 - text_features = self.model.encode_text(text_data)  
184 - text_features /= text_features.norm(dim=-1, keepdim=True)  
185 - return text_features  
186 -  
187 - def encode_image(self, image: Image.Image) -> Optional[np.ndarray]:  
188 - """Encode image to embedding vector using cn_clip"""  
189 - if not isinstance(image, Image.Image):  
190 - raise ValueError("CLIPImageEncoder.encode_image Input must be a PIL.Image")  
191 -  
192 - try:  
193 - infer_data = self.preprocess(image).unsqueeze(0).to(self.device)  
194 - with torch.no_grad():  
195 - image_features = self.model.encode_image(infer_data)  
196 - image_features /= image_features.norm(dim=-1, keepdim=True)  
197 - return image_features.cpu().numpy().astype('float32')[0]  
198 - except Exception as e:  
199 - logger.error(f"Failed to process image. Reason: {str(e)}")  
200 - return None  
201 -  
202 - def encode_image_from_url(self, url: str) -> Optional[np.ndarray]:  
203 - """Complete pipeline: download, validate, preprocess and encode image from URL"""  
204 - try:  
205 - # Download image  
206 - image_data = self.download_image(url)  
207 -  
208 - # Validate image  
209 - image = self.validate_image(image_data)  
210 -  
211 - # Preprocess image  
212 - image = self.preprocess_image(image)  
213 -  
214 - # Encode image  
215 - embedding = self.encode_image(image)  
216 -  
217 - return embedding  
218 -  
219 - except Exception as e:  
220 - logger.error(f"Error processing image from URL {url}: {str(e)}")  
221 - return None  
222 \ No newline at end of file 0 \ No newline at end of file
docs/reference/阿里opensearch电商行业.md deleted
@@ -1,47 +0,0 @@ @@ -1,47 +0,0 @@
1 -https://help.aliyun.com/zh/open-search/industry-algorithm-edition/e-commerce?spm=a2c4g.11186623.help-menu-29102.d_3_2_1.5a903cfbxOsaHt&scm=20140722.H_99739._.OR_help-T_cn~zh-V_1  
2 -  
3 -  
4 -## 定义应用结构  
5 -示例如下:  
6 -| 字段名称 | 主键 | 字段标签 | 类型 |  
7 -|----------------|------|------------|--------------|  
8 -| title | | 商品标题 | TEXT |  
9 -| text_embedding | | 文本向量 | EMBEDDING |  
10 -| image_embedding | | 图片向量 | EMBEDDING |  
11 -| category_name | | 类目名称 | TEXT |  
12 -| image_url | | | LITERAL_ARRAY|  
13 -| description | | 商品描述 | TEXT |  
14 -| brand_name | | 品牌名称 | TEXT |  
15 -| thumbnail_url | | | LITERAL_ARRAY|  
16 -| is_onsale | | | INT |  
17 -| url | | | LITERAL |  
18 -| brand_id | | | LITERAL |  
19 -| series_id | | | LITERAL |  
20 -| sold_num | | 商品销量 | INT |  
21 -| category_id | | | INT |  
22 -| onsale_time | | 上架时间 | INT |  
23 -| price | | | DOUBLE |  
24 -| series_name | | | TEXT |  
25 -| discount_price | | | DOUBLE |  
26 -| pid | ● | | INT |  
27 -| sale_price | | | DOUBLE |  
28 -| act_price | | | DOUBLE |  
29 -  
30 -  
31 -## 定义索引结构  
32 -  
33 -| 索引名称 | 索引标签 | 包含字段 | 分析方式 | 使用示例 |  
34 -| --- | --- | --- | --- | --- |  
35 -| default | 默认索引 | category_name, description, brand_name, title, create_by, update_by | 行业 - 电商通用分析 | query=default:“云搜索” |  
36 -| category_name | 类目名称索引 | category_name | 行业 - 电商通用分析 | query=category_name:“云搜索” |  
37 -| category_id | | category_id | 关键字 | query=category_id:“云搜索” |  
38 -| series_name | | series_name | 中文 - 通用分析 | query=series_name:“云搜索” |  
39 -| brand_name | | brand_name | 中文 - 通用分析 | query=brand_name:“云搜索” |  
40 -| id | | id | 关键字 | query=id:“云搜索” |  
41 -| title | 标题索引 | title | 行业 - 电商通用分析 | query=title:“云搜索” |  
42 -| seller_id | | seller_id | 关键字 | query=seller_id:“云搜索” |  
43 -| brand_id | | brand_id | 关键字 | query=brand_id:“云搜索” |  
44 -| series_id | | series_id | 关键字 | query=series_id:“云搜索” |  
45 -  
46 -上面的只是阿里云的opensearch的例子,我们也要有同样的一套配置,这里支持的“分析方式” 为ES预先支持的 多种分析器,我们要支持的分析方式参考 @商品数据源入ES配置规范.md  
47 -  
docs/temporary/sku_image_src问题诊断报告.md deleted
@@ -1,117 +0,0 @@ @@ -1,117 +0,0 @@
1 -# SKU image_src 字段为空问题诊断报告  
2 -  
3 -## 问题描述  
4 -  
5 -返回结果的每条结果中,多款式字段 `skus` 下面每个 SKU 的 `image_src` 为空。  
6 -  
7 -## 问题分析  
8 -  
9 -### 1. ES 数据检查  
10 -  
11 -通过查询 ES 数据,发现:  
12 -- ES 中确实有 `skus` 数据(不是空数组)  
13 -- 但是 `skus` 数组中的每个 SKU 对象**都没有 `image_src` 字段**  
14 -  
15 -示例 ES 文档:  
16 -```json  
17 -{  
18 - "spu_id": "68238",  
19 - "skus": [  
20 - {  
21 - "sku_id": "3568395",  
22 - "price": 329.61,  
23 - "compare_at_price": 485.65,  
24 - "sku_code": "3468269",  
25 - "stock": 57,  
26 - "weight": 0.26,  
27 - "weight_unit": "kg",  
28 - "option1_value": "",  
29 - "option2_value": "",  
30 - "option3_value": ""  
31 - // 注意:这里没有 image_src 字段  
32 - }  
33 - ]  
34 -}  
35 -```  
36 -  
37 -### 2. 代码逻辑检查  
38 -  
39 -在 `indexer/document_transformer.py` 的 `_transform_sku_row` 方法中(第558-560行),原有逻辑为:  
40 -  
41 -```python  
42 -# Image src  
43 -if pd.notna(sku_row.get('image_src')):  
44 - sku_data['image_src'] = str(sku_row['image_src'])  
45 -```  
46 -  
47 -**问题根源**:  
48 -- 只有当 MySQL 中的 `image_src` 字段**非空**时,才会将其添加到 `sku_data` 字典中  
49 -- 如果 MySQL 中的 `image_src` 是 `NULL` 或空字符串,这个字段就**不会出现在返回的字典中**  
50 -- 导致 ES 文档中缺少 `image_src` 字段  
51 -- API 返回时,`sku_entry.get('image_src')` 返回 `None`,前端看到的就是空值  
52 -  
53 -### 3. MySQL 数据情况  
54 -  
55 -根据代码逻辑推断:  
56 -- MySQL 的 `shoplazza_product_sku` 表中,`image_src` 字段可能为 `NULL` 或空字符串  
57 -- 这导致索引时该字段没有被写入 ES  
58 -  
59 -## 解决方案  
60 -  
61 -### 修复方案  
62 -  
63 -修改 `indexer/document_transformer.py` 中的 `_transform_sku_row` 方法,**始终包含 `image_src` 字段**,即使值为空也设置为 `None`:  
64 -  
65 -```python  
66 -# Image src - always include this field, even if empty  
67 -# This ensures the field is present in ES documents and API responses  
68 -image_src = sku_row.get('image_src')  
69 -if pd.notna(image_src) and str(image_src).strip():  
70 - sku_data['image_src'] = str(image_src).strip()  
71 -else:  
72 - # Set to None (will be serialized as null in JSON) instead of omitting the field  
73 - sku_data['image_src'] = None  
74 -```  
75 -  
76 -### 修复效果  
77 -  
78 -修复后:  
79 -1. **即使 MySQL 中 `image_src` 为 NULL 或空字符串**,ES 文档中也会包含该字段(值为 `null`)  
80 -2. API 返回时,前端可以明确知道该字段存在但值为空  
81 -3. 符合 API 模型定义:`image_src: Optional[str] = Field(None, ...)`  
82 -  
83 -## 问题分类  
84 -  
85 -**问题类型**:**本项目填充的问题**  
86 -  
87 -- ✅ **不是 MySQL 原始数据的问题**:MySQL 中 `image_src` 字段可能确实为 NULL,但这是正常的业务数据  
88 -- ✅ **不是 ES 数据的问题**:ES mapping 中 `image_src` 字段定义正确  
89 -- ❌ **是本项目填充的问题**:代码逻辑导致当 MySQL 中 `image_src` 为空时,该字段没有被写入 ES 文档  
90 -  
91 -## 后续操作  
92 -  
93 -1. **重新索引数据**:修复代码后,需要重新索引数据才能生效  
94 - ```bash  
95 - # 重新索引指定租户的数据  
96 - ./scripts/ingest.sh <tenant_id> true  
97 - ```  
98 -  
99 -2. **验证修复**:重新索引后,查询 ES 验证 `image_src` 字段是否已包含:  
100 - ```bash  
101 - curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' \  
102 - -H 'Content-Type: application/json' \  
103 - -d '{  
104 - "size": 1,  
105 - "query": {"nested": {"path": "skus", "query": {"exists": {"field": "skus"}}}},  
106 - "_source": ["spu_id", "skus"]  
107 - }'  
108 - ```  
109 -  
110 -3. **可选优化**:如果业务需要,可以考虑当 SKU 的 `image_src` 为空时,使用 SPU 的主图(`image_url`)作为默认值  
111 -  
112 -## 相关文件  
113 -  
114 -- `indexer/document_transformer.py` - 已修复  
115 -- `api/models.py` - `SkuResult.image_src: Optional[str]` - 模型定义正确  
116 -- `api/result_formatter.py` - `image_src=sku_entry.get('image_src')` - 读取逻辑正确  
117 -- `mappings/search_products.json` - `skus.image_src` mapping 定义正确  
embeddings/README.md
@@ -8,8 +8,10 @@ @@ -8,8 +8,10 @@
8 8
9 - **HTTP 客户端**:`text_encoder.py` / `image_encoder.py`(供搜索/索引模块调用) 9 - **HTTP 客户端**:`text_encoder.py` / `image_encoder.py`(供搜索/索引模块调用)
10 - **本地模型实现**:`bge_model.py` / `clip_model.py` 10 - **本地模型实现**:`bge_model.py` / `clip_model.py`
  11 +- **clip-as-service 客户端**:`clip_as_service_encoder.py`(图片向量,推荐)
11 - **向量化服务(FastAPI)**:`server.py` 12 - **向量化服务(FastAPI)**:`server.py`
12 - **统一配置**:`config.py` 13 - **统一配置**:`config.py`
  14 +- **接口契约**:`protocols.ImageEncoderProtocol`(图片编码统一为 `encode_image_urls(urls, batch_size)`,本地 CN-CLIP 与 clip-as-service 均实现该接口)
13 15
14 ### 服务接口 16 ### 服务接口
15 17
@@ -21,6 +23,24 @@ @@ -21,6 +23,24 @@
21 - 入参:`["url或本地路径1", ...]` 23 - 入参:`["url或本地路径1", ...]`
22 - 出参:`[[...], null, ...]`(与输入按 index 对齐,失败为 `null`) 24 - 出参:`[[...], null, ...]`(与输入按 index 对齐,失败为 `null`)
23 25
  26 +### 图片向量:clip-as-service(推荐)
  27 +
  28 +默认使用 `third-party/clip-as-service` 的 Jina CLIP 服务生成图片向量。
  29 +
  30 +1. **安装 clip-client**(首次使用):
  31 + ```bash
  32 + pip install -e third-party/clip-as-service/client
  33 + ```
  34 +
  35 +2. **启动 CN-CLIP 服务**(独立 gRPC 服务,默认端口 51000,详见 `docs/CNCLIP_SERVICE说明文档.md`):
  36 + ```bash
  37 + ./scripts/start_cnclip_service.sh
  38 + ```
  39 +
  40 +3. **配置**(`embeddings/config.py` 或环境变量):
  41 + - `USE_CLIP_AS_SERVICE=true`(默认)
  42 + - `CLIP_AS_SERVICE_SERVER=grpc://127.0.0.1:51000`
  43 +
24 ### 启动服务 44 ### 启动服务
25 45
26 使用仓库脚本启动(默认端口 6005): 46 使用仓库脚本启动(默认端口 6005):
@@ -35,5 +55,6 @@ @@ -35,5 +55,6 @@
35 55
36 - `PORT`: 服务端口(默认 6005) 56 - `PORT`: 服务端口(默认 6005)
37 - `TEXT_MODEL_DIR`, `TEXT_DEVICE`, `TEXT_BATCH_SIZE` 57 - `TEXT_MODEL_DIR`, `TEXT_DEVICE`, `TEXT_BATCH_SIZE`
38 -- `IMAGE_MODEL_NAME`, `IMAGE_DEVICE` 58 +- `USE_CLIP_AS_SERVICE`, `CLIP_AS_SERVICE_SERVER`:图片向量(clip-as-service)
  59 +- `IMAGE_MODEL_NAME`, `IMAGE_DEVICE`:本地 CN-CLIP(当 `USE_CLIP_AS_SERVICE=false` 时)
39 60
embeddings/clip_as_service_encoder.py 0 → 100644
@@ -0,0 +1,122 @@ @@ -0,0 +1,122 @@
  1 +"""
  2 +Image encoder using third-party clip-as-service (Jina CLIP server).
  3 +
  4 +Requires clip-as-service server to be running. The client is loaded from
  5 +third-party/clip-as-service/client so no separate pip install is needed
  6 +if that path is on sys.path or the package is installed in development mode.
  7 +"""
  8 +
  9 +import logging
  10 +import os
  11 +import sys
  12 +from typing import List, Optional
  13 +
  14 +import numpy as np
  15 +
  16 +logger = logging.getLogger(__name__)
  17 +
  18 +# Ensure third-party clip client is importable
  19 +def _ensure_clip_client_path():
  20 + repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
  21 + client_path = os.path.join(repo_root, "third-party", "clip-as-service", "client")
  22 + if os.path.isdir(client_path) and client_path not in sys.path:
  23 + sys.path.insert(0, client_path)
  24 +
  25 +
  26 +def _normalize_image_url(url: str) -> str:
  27 + """Normalize image URL for clip-as-service (e.g. //host/path -> https://host/path)."""
  28 + if not url or not isinstance(url, str):
  29 + return ""
  30 + url = url.strip()
  31 + if url.startswith("//"):
  32 + return "https:" + url
  33 + return url
  34 +
  35 +
  36 +class ClipAsServiceImageEncoder:
  37 + """
  38 + Image embedding encoder using clip-as-service Client.
  39 + Encodes image URLs in batch; returns 1024-dim vectors (server model must match).
  40 + """
  41 +
  42 + def __init__(
  43 + self,
  44 + server: str = "grpc://127.0.0.1:51000",
  45 + batch_size: int = 8,
  46 + show_progress: bool = False,
  47 + ):
  48 + """
  49 + Args:
  50 + server: clip-as-service server URI (e.g. grpc://127.0.0.1:51000 or http://127.0.0.1:51000).
  51 + batch_size: batch size for encode requests.
  52 + show_progress: whether to show progress bar when encoding.
  53 + """
  54 + _ensure_clip_client_path()
  55 + try:
  56 + from clip_client import Client
  57 + except ImportError as e:
  58 + raise ImportError(
  59 + "clip_client not found. Add third-party/clip-as-service/client to PYTHONPATH "
  60 + "or run: pip install -e third-party/clip-as-service/client"
  61 + ) from e
  62 +
  63 + self._server = server
  64 + self._batch_size = batch_size
  65 + self._show_progress = show_progress
  66 + self._client = Client(server)
  67 +
  68 + def encode_image_urls(
  69 + self,
  70 + urls: List[str],
  71 + batch_size: Optional[int] = None,
  72 + ) -> List[Optional[np.ndarray]]:
  73 + """
  74 + Encode a list of image URLs to vectors.
  75 +
  76 + Args:
  77 + urls: list of image URLs (http/https or //host/path).
  78 + batch_size: override instance batch_size for this call.
  79 +
  80 + Returns:
  81 + List of vectors (1024-dim float32) or None for failed items, same length as urls.
  82 + """
  83 + if not urls:
  84 + return []
  85 +
  86 + normalized = [_normalize_image_url(u) for u in urls]
  87 + valid_indices = [i for i, u in enumerate(normalized) if u]
  88 + if not valid_indices:
  89 + return [None] * len(urls)
  90 +
  91 + valid_urls = [normalized[i] for i in valid_indices]
  92 + bs = batch_size if batch_size is not None else self._batch_size
  93 + out: List[Optional[np.ndarray]] = [None] * len(urls)
  94 +
  95 + try:
  96 + # Client.encode(iterable of str) returns np.ndarray [N, D] for string input
  97 + arr = self._client.encode(
  98 + valid_urls,
  99 + batch_size=bs,
  100 + show_progress=self._show_progress,
  101 + )
  102 + if arr is not None and hasattr(arr, "shape") and len(arr) == len(valid_indices):
  103 + for j, idx in enumerate(valid_indices):
  104 + row = arr[j]
  105 + if row is not None and hasattr(row, "tolist"):
  106 + out[idx] = np.asarray(row, dtype=np.float32)
  107 + else:
  108 + out[idx] = np.array(row, dtype=np.float32)
  109 + else:
  110 + logger.warning(
  111 + "clip-as-service encode returned unexpected shape/length, "
  112 + "expected %d vectors", len(valid_indices)
  113 + )
  114 + except Exception as e:
  115 + logger.warning("clip-as-service encode failed: %s", e, exc_info=True)
  116 +
  117 + return out
  118 +
  119 + def encode_image_from_url(self, url: str) -> Optional[np.ndarray]:
  120 + """Encode a single image URL. Returns 1024-dim vector or None."""
  121 + results = self.encode_image_urls([url], batch_size=1)
  122 + return results[0] if results else None
embeddings/clip_model.py
@@ -17,7 +17,7 @@ import cn_clip.clip as clip @@ -17,7 +17,7 @@ import cn_clip.clip as clip
17 17
18 18
19 DEFAULT_MODEL_NAME = "ViT-H-14" 19 DEFAULT_MODEL_NAME = "ViT-H-14"
20 -MODEL_DOWNLOAD_DIR = "/data/tw/uat/EsSearcher" 20 +MODEL_DOWNLOAD_DIR = "/data/"
21 21
22 22
23 class ClipImageModel(object): 23 class ClipImageModel(object):
@@ -91,6 +91,23 @@ class ClipImageModel(object): @@ -91,6 +91,23 @@ class ClipImageModel(object):
91 image = self.preprocess_image(image) 91 image = self.preprocess_image(image)
92 return self.encode_image(image) 92 return self.encode_image(image)
93 93
  94 + def encode_image_urls(
  95 + self,
  96 + urls: List[str],
  97 + batch_size: Optional[int] = None,
  98 + ) -> List[Optional[np.ndarray]]:
  99 + """
  100 + Encode a list of image URLs to vectors. Same interface as ClipAsServiceImageEncoder.
  101 +
  102 + Args:
  103 + urls: list of image URLs or local paths.
  104 + batch_size: batch size for internal batching (default 8).
  105 +
  106 + Returns:
  107 + List of vectors (or None for failed items), same length as urls.
  108 + """
  109 + return self.encode_batch(urls, batch_size=batch_size or 8)
  110 +
94 def encode_batch(self, images: List[Union[str, Image.Image]], batch_size: int = 8) -> List[Optional[np.ndarray]]: 111 def encode_batch(self, images: List[Union[str, Image.Image]], batch_size: int = 8) -> List[Optional[np.ndarray]]:
95 results: List[Optional[np.ndarray]] = [] 112 results: List[Optional[np.ndarray]] = []
96 for i in range(0, len(images), batch_size): 113 for i in range(0, len(images), batch_size):
embeddings/cloud_text_encoder.py
@@ -35,7 +35,7 @@ class CloudTextEncoder: @@ -35,7 +35,7 @@ class CloudTextEncoder:
35 if not api_key: 35 if not api_key:
36 raise ValueError("DASHSCOPE_API_KEY must be set in environment or passed as parameter") 36 raise ValueError("DASHSCOPE_API_KEY must be set in environment or passed as parameter")
37 37
38 - # Use Beijing region by default 38 + # 以下是北京地域base-url,如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1
39 base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1" 39 base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
40 40
41 cls._instance.client = OpenAI( 41 cls._instance.client = OpenAI(
embeddings/config.py
@@ -21,7 +21,12 @@ class EmbeddingConfig(object): @@ -21,7 +21,12 @@ class EmbeddingConfig(object):
21 TEXT_DEVICE = "cuda" # "cuda" or "cpu" (model may fall back to CPU if needed) 21 TEXT_DEVICE = "cuda" # "cuda" or "cpu" (model may fall back to CPU if needed)
22 TEXT_BATCH_SIZE = 32 22 TEXT_BATCH_SIZE = 32
23 23
24 - # Image embeddings (CN-CLIP) 24 + # Image embeddings
  25 + # Option A: clip-as-service (Jina CLIP server, recommended)
  26 + USE_CLIP_AS_SERVICE = os.getenv("USE_CLIP_AS_SERVICE", "true").lower() in ("1", "true", "yes")
  27 + CLIP_AS_SERVICE_SERVER = os.getenv("CLIP_AS_SERVICE_SERVER", "grpc://127.0.0.1:51000")
  28 +
  29 + # Option B: local CN-CLIP (when USE_CLIP_AS_SERVICE=false)
25 IMAGE_MODEL_NAME = "ViT-H-14" 30 IMAGE_MODEL_NAME = "ViT-H-14"
26 IMAGE_DEVICE = None # type: Optional[str] # "cuda" / "cpu" / None(auto) 31 IMAGE_DEVICE = None # type: Optional[str] # "cuda" / "cpu" / None(auto)
27 32
embeddings/protocols.py 0 → 100644
@@ -0,0 +1,27 @@ @@ -0,0 +1,27 @@
  1 +"""
  2 +Protocols for embedding backends (structural typing, no inheritance required).
  3 +
  4 +Used by the embedding service so that any backend (ClipAsServiceImageEncoder,
  5 +ClipImageModel, etc.) can be used as long as it implements the same interface.
  6 +"""
  7 +
  8 +from typing import List, Optional, Protocol
  9 +
  10 +import numpy as np
  11 +
  12 +
  13 +class ImageEncoderProtocol(Protocol):
  14 + """Contract for image encoders used by the embedding service /embed/image endpoint."""
  15 +
  16 + def encode_image_urls(
  17 + self,
  18 + urls: List[str],
  19 + batch_size: Optional[int] = None,
  20 + ) -> List[Optional[np.ndarray]]:
  21 + """
  22 + Encode a list of image URLs to vectors.
  23 +
  24 + Returns:
  25 + List of vectors (or None for failed items), same length as urls.
  26 + """
  27 + ...
embeddings/server.py
@@ -16,6 +16,8 @@ from fastapi import FastAPI @@ -16,6 +16,8 @@ from fastapi import FastAPI
16 from embeddings.config import CONFIG 16 from embeddings.config import CONFIG
17 from embeddings.bge_model import BgeTextModel 17 from embeddings.bge_model import BgeTextModel
18 from embeddings.clip_model import ClipImageModel 18 from embeddings.clip_model import ClipImageModel
  19 +from embeddings.clip_as_service_encoder import ClipAsServiceImageEncoder
  20 +from embeddings.protocols import ImageEncoderProtocol
19 21
20 logger = logging.getLogger(__name__) 22 logger = logging.getLogger(__name__)
21 23
@@ -23,9 +25,9 @@ app = FastAPI(title="saas-search Embedding Service", version="1.0.0") @@ -23,9 +25,9 @@ app = FastAPI(title="saas-search Embedding Service", version="1.0.0")
23 25
24 # Models are loaded at startup, not lazily 26 # Models are loaded at startup, not lazily
25 _text_model: Optional[BgeTextModel] = None 27 _text_model: Optional[BgeTextModel] = None
26 -_image_model: Optional[ClipImageModel] = None 28 +_image_model: Optional[ImageEncoderProtocol] = None
27 open_text_model = True 29 open_text_model = True
28 -open_image_model = False 30 +open_image_model = True # Enable image embedding when using clip-as-service
29 31
30 _text_encode_lock = threading.Lock() 32 _text_encode_lock = threading.Lock()
31 _image_encode_lock = threading.Lock() 33 _image_encode_lock = threading.Lock()
@@ -49,15 +51,23 @@ def load_models(): @@ -49,15 +51,23 @@ def load_models():
49 raise 51 raise
50 52
51 53
52 - # Load image model 54 + # Load image model: clip-as-service (recommended) or local CN-CLIP
53 if open_image_model: 55 if open_image_model:
54 try: 56 try:
55 - logger.info(f"Loading image model: {CONFIG.IMAGE_MODEL_NAME} (device: {CONFIG.IMAGE_DEVICE})")  
56 - _image_model = ClipImageModel(  
57 - model_name=CONFIG.IMAGE_MODEL_NAME,  
58 - device=CONFIG.IMAGE_DEVICE,  
59 - )  
60 - logger.info("Image model loaded successfully") 57 + if CONFIG.USE_CLIP_AS_SERVICE:
  58 + logger.info(f"Loading image encoder via clip-as-service: {CONFIG.CLIP_AS_SERVICE_SERVER}")
  59 + _image_model = ClipAsServiceImageEncoder(
  60 + server=CONFIG.CLIP_AS_SERVICE_SERVER,
  61 + batch_size=CONFIG.IMAGE_BATCH_SIZE,
  62 + )
  63 + logger.info("Image model (clip-as-service) loaded successfully")
  64 + else:
  65 + logger.info(f"Loading local image model: {CONFIG.IMAGE_MODEL_NAME} (device: {CONFIG.IMAGE_DEVICE})")
  66 + _image_model = ClipImageModel(
  67 + model_name=CONFIG.IMAGE_MODEL_NAME,
  68 + device=CONFIG.IMAGE_DEVICE,
  69 + )
  70 + logger.info("Image model (local CN-CLIP) loaded successfully")
61 except Exception as e: 71 except Exception as e:
62 logger.error(f"Failed to load image model: {e}", exc_info=True) 72 logger.error(f"Failed to load image model: {e}", exc_info=True)
63 raise 73 raise
@@ -125,20 +135,31 @@ def embed_image(images: List[str]) -> List[Optional[List[float]]]: @@ -125,20 +135,31 @@ def embed_image(images: List[str]) -> List[Optional[List[float]]]:
125 raise RuntimeError("Image model not loaded") 135 raise RuntimeError("Image model not loaded")
126 out: List[Optional[List[float]]] = [None] * len(images) 136 out: List[Optional[List[float]]] = [None] * len(images)
127 137
  138 + # Normalize inputs
  139 + urls = []
  140 + indices = []
  141 + for i, url_or_path in enumerate(images):
  142 + if url_or_path is None:
  143 + continue
  144 + if not isinstance(url_or_path, str):
  145 + url_or_path = str(url_or_path)
  146 + url_or_path = url_or_path.strip()
  147 + if url_or_path:
  148 + urls.append(url_or_path)
  149 + indices.append(i)
  150 +
  151 + if not urls:
  152 + return out
  153 +
128 with _image_encode_lock: 154 with _image_encode_lock:
129 - for i, url_or_path in enumerate(images):  
130 - try:  
131 - if url_or_path is None:  
132 - continue  
133 - if not isinstance(url_or_path, str):  
134 - url_or_path = str(url_or_path)  
135 - url_or_path = url_or_path.strip()  
136 - if not url_or_path:  
137 - continue  
138 - emb = _image_model.encode_image_from_url(url_or_path)  
139 - out[i] = _as_list(emb)  
140 - except Exception:  
141 - out[i] = None 155 + try:
  156 + # Both ClipAsServiceImageEncoder and ClipImageModel implement encode_image_urls(urls, batch_size)
  157 + vectors = _image_model.encode_image_urls(urls, batch_size=CONFIG.IMAGE_BATCH_SIZE)
  158 + for j, idx in enumerate(indices):
  159 + out[idx] = _as_list(vectors[j] if j < len(vectors) else None)
  160 + except Exception:
  161 + for idx in indices:
  162 + out[idx] = None
142 return out 163 return out
143 164
144 165
@@ -40,3 +40,9 @@ click>=8.1.0 @@ -40,3 +40,9 @@ click>=8.1.0
40 pytest>=7.4.0 40 pytest>=7.4.0
41 pytest-asyncio>=0.21.0 41 pytest-asyncio>=0.21.0
42 httpx>=0.24.0 42 httpx>=0.24.0
  43 +
  44 +# clip-as-service client (for image embeddings via clip-as-service)
  45 +# Install with: pip install -e third-party/clip-as-service/client
  46 +# Or: pip install jina docarray
  47 +jina>=3.12.0
  48 +docarray[common]>=0.19.0,<0.30.0