Commit c10f90fea107e1ce226cd28caa66da1f07be6104

Authored by tangwang
1 parent 42e3aea6

cnclip

docs/reference/商品数据源入ES配置规范.md deleted
... ... @@ -1,221 +0,0 @@
1   -根据您提供的内容,我将其整理为规范的Markdown格式:
2   -
3   -# ES索引配置文档
4   -
5   -## 1. 全局配置
6   -
7   -### 1.1 文本字段相关性设定
8   -需要修改所有text字段相关性算法-BM25算法的默认参数:
9   -```json
10   -"similarity": {
11   - "default": {
12   - "type": "BM25",
13   - "b": "0.0",
14   - "k1": "0.0"
15   - }
16   -}
17   -```
18   -
19   -### 1.2 索引分片设定
20   -- `number_of_replicas`:0/1
21   -- `number_of_shards`:设置建议 分片数 <= ES集群的总CPU核心个数/ (副本数 + 1)
22   -
23   -### 1.3 索引刷新时间设定
24   -- `refresh_interval`:默认30S,根据客户需要进行调整
25   -```json
26   -"refresh_interval": "30s"
27   -```
28   -
29   -## 2. 单个字段配置
30   -
31   -| 分析方式 | 字段预处理和ES输入格式要求 | 对应ES mapping配置 | 备注 |
32   -|---------|--------------------------|-------------------|------|
33   -| 电商通用分析-中文 | - | ```json { "type": "text", "analyzer": "index_ansj", "search_analyzer": "query_ansj" } ``` | - |
34   -| 文本-多语言向量化 | 调用"文本向量化"模块得到1024维向量 | ```json { "type": "dense_vector", "dims": 1024, "index": true, "similarity": "dot_product" } ``` | 1. 依赖"文本向量化"模块<br>2. 如果定期全量,需要对向量化结果做缓存 |
35   -| 图片-向量化 | 调用"图片向量化"模块得到1024维向量 | ```json { "type": "nested", "properties": { "vector": { "type": "dense_vector", "dims": 1024, "similarity": "dot_product" }, "url": { "type": "text" } } } ``` | 1. 依赖"图片向量化"模块<br>2. 如果定期全量,需要对向量化结果做缓存 |
36   -| 关键词 | ES输入格式:list或者单个值 | ```json {"type": "keyword"} ``` | - |
37   -| 电商通用分析-英文 | - | ```json {"type": "text", "analyzer": "english"} ``` | - |
38   -| 电商通用分析-阿拉伯文 | - | ```json {"type": "text", "analyzer": "arabic"} ``` | - |
39   -| 电商通用分析-西班牙文 | - | ```json {"type": "text", "analyzer": "spanish"} ``` | - |
40   -| 电商通用分析-俄文 | - | ```json {"type": "text", "analyzer": "russian"} ``` | - |
41   -| 电商通用分析-日文 | - | ```json {"type": "text", "analyzer": "japanese"} ``` | - |
42   -| 数值-整数 | - | ```json {"type": "long"} ``` | - |
43   -| 数值-浮点型 | - | ```json {"type": "float"} ``` | - |
44   -| 分值 | 输入是float,配置处理方式:log, pow, sigmoid等 | TODO:给代码, log | - |
45   -| 子串 | - | 暂时不支持 | - |
46   -| ngram匹配或前缀匹配或边缘前缀匹配 | - | 暂时不支持 | 以后根据需要再添加 |
47   -
48   -这样整理后,文档结构更加清晰,表格格式规范,便于阅读和理解。
49   -
50   -
51   -参考 opensearch:
52   -
53   -数据接口
54   -文本相关性字段
55   -向量相关性字段
56   -3. 模块提取
57   -文本向量化
58   -import sys
59   -import torch
60   -from sentence_transformers import SentenceTransformer
61   -import time
62   -import threading
63   -from modelscope import snapshot_download
64   -from transformers import AutoModel
65   -import os
66   -from openai import OpenAI
67   -from config.logging_config import get_app_logger
68   -
69   -# Get logger for this module
70   -logger = get_app_logger(__name__)
71   -
72   -class BgeEncoder:
73   - _instance = None
74   - _lock = threading.Lock()
75   -
76   - def __new__(cls, model_dir='Xorbits/bge-m3'):
77   - with cls._lock:
78   - if cls._instance is None:
79   - cls._instance = super(BgeEncoder, cls).__new__(cls)
80   - logger.info("[BgeEncoder] Creating a new instance with model directory: %s", model_dir)
81   - cls._instance.model = SentenceTransformer(snapshot_download(model_dir))
82   - logger.info("[BgeEncoder] New instance has been created")
83   - return cls._instance
84   -
85   - def encode(self, sentences, normalize_embeddings=True, device='cuda'):
86   - # Move model to specified device
87   - if device == 'gpu':
88   - device = 'cuda'
89   - self.model = self.model.to(device)
90   - embeddings = self.model.encode(sentences, normalize_embeddings=normalize_embeddings, device=device, show_progress_bar=False)
91   - return embeddings
92   -图片向量化
93   -import sys
94   -import os
95   -import io
96   -import requests
97   -import torch
98   -import numpy as np
99   -from PIL import Image
100   -import logging
101   -import threading
102   -from typing import List, Optional, Union
103   -from config.logging_config import get_app_logger
104   -import cn_clip.clip as clip
105   -from cn_clip.clip import load_from_name
106   -
107   -# Get logger for this module
108   -logger = get_app_logger(__name__)
109   -
110   -# DEFAULT_MODEL_NAME = "ViT-L-14-336" # ["ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14", "RN50"]
111   -DEFAULT_MODEL_NAME = "ViT-H-14"
112   -MODEL_DOWNLOAD_DIR = "/data/tw/uat/EsSearcher"
113   -
114   -class CLIPImageEncoder:
115   - """CLIP Image Encoder for generating image embeddings using cn_clip"""
116   -
117   - _instance = None
118   - _lock = threading.Lock()
119   -
120   - def __new__(cls, model_name=DEFAULT_MODEL_NAME, device=None):
121   - with cls._lock:
122   - if cls._instance is None:
123   - cls._instance = super(CLIPImageEncoder, cls).__new__(cls)
124   - logger.info(f"[CLIPImageEncoder] Creating new instance with model: {model_name}")
125   - cls._instance._initialize_model(model_name, device)
126   - return cls._instance
127   -
128   - def _initialize_model(self, model_name, device):
129   - """Initialize the CLIP model using cn_clip"""
130   - try:
131   - self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
132   - self.model, self.preprocess = load_from_name(model_name, device=self.device, download_root=MODEL_DOWNLOAD_DIR)
133   - self.model.eval()
134   - self.model_name = model_name
135   - logger.info(f"[CLIPImageEncoder] Model {model_name} initialized successfully on device {self.device}")
136   -
137   - except Exception as e:
138   - logger.error(f"[CLIPImageEncoder] Failed to initialize model: {str(e)}")
139   - raise
140   -
141   - def validate_image(self, image_data: bytes) -> Image.Image:
142   - """Validate image data and return PIL Image if valid"""
143   - try:
144   - image_stream = io.BytesIO(image_data)
145   - image = Image.open(image_stream)
146   - image.verify()
147   - image_stream.seek(0)
148   - image = Image.open(image_stream)
149   - if image.mode != 'RGB':
150   - image = image.convert('RGB')
151   - return image
152   - except Exception as e:
153   - raise ValueError(f"Invalid image data: {str(e)}")
154   -
155   - def download_image(self, url: str, timeout: int = 10) -> bytes:
156   - """Download image from URL"""
157   - try:
158   - if url.startswith(('http://', 'https://')):
159   - response = requests.get(url, timeout=timeout)
160   - if response.status_code != 200:
161   - raise ValueError(f"HTTP {response.status_code}")
162   - return response.content
163   - else:
164   - # Local file path
165   - with open(url, 'rb') as f:
166   - return f.read()
167   - except Exception as e:
168   - raise ValueError(f"Failed to download image from {url}: {str(e)}")
169   -
170   - def preprocess_image(self, image: Image.Image, max_size: int = 1024) -> Image.Image:
171   - """Preprocess image for CLIP model"""
172   - # Resize if too large
173   - if max(image.size) > max_size:
174   - ratio = max_size / max(image.size)
175   - new_size = tuple(int(dim * ratio) for dim in image.size)
176   - image = image.resize(new_size, Image.Resampling.LANCZOS)
177   - return image
178   -
179   - def encode_text(self, text):
180   - """Encode text to embedding vector using cn_clip"""
181   - text_data = clip.tokenize([text] if type(text) == str else text).to(self.device)
182   - with torch.no_grad():
183   - text_features = self.model.encode_text(text_data)
184   - text_features /= text_features.norm(dim=-1, keepdim=True)
185   - return text_features
186   -
187   - def encode_image(self, image: Image.Image) -> Optional[np.ndarray]:
188   - """Encode image to embedding vector using cn_clip"""
189   - if not isinstance(image, Image.Image):
190   - raise ValueError("CLIPImageEncoder.encode_image Input must be a PIL.Image")
191   -
192   - try:
193   - infer_data = self.preprocess(image).unsqueeze(0).to(self.device)
194   - with torch.no_grad():
195   - image_features = self.model.encode_image(infer_data)
196   - image_features /= image_features.norm(dim=-1, keepdim=True)
197   - return image_features.cpu().numpy().astype('float32')[0]
198   - except Exception as e:
199   - logger.error(f"Failed to process image. Reason: {str(e)}")
200   - return None
201   -
202   - def encode_image_from_url(self, url: str) -> Optional[np.ndarray]:
203   - """Complete pipeline: download, validate, preprocess and encode image from URL"""
204   - try:
205   - # Download image
206   - image_data = self.download_image(url)
207   -
208   - # Validate image
209   - image = self.validate_image(image_data)
210   -
211   - # Preprocess image
212   - image = self.preprocess_image(image)
213   -
214   - # Encode image
215   - embedding = self.encode_image(image)
216   -
217   - return embedding
218   -
219   - except Exception as e:
220   - logger.error(f"Error processing image from URL {url}: {str(e)}")
221   - return None
222 0 \ No newline at end of file
docs/reference/阿里opensearch电商行业.md deleted
... ... @@ -1,47 +0,0 @@
1   -https://help.aliyun.com/zh/open-search/industry-algorithm-edition/e-commerce?spm=a2c4g.11186623.help-menu-29102.d_3_2_1.5a903cfbxOsaHt&scm=20140722.H_99739._.OR_help-T_cn~zh-V_1
2   -
3   -
4   -## 定义应用结构
5   -示例如下:
6   -| 字段名称 | 主键 | 字段标签 | 类型 |
7   -|----------------|------|------------|--------------|
8   -| title | | 商品标题 | TEXT |
9   -| text_embedding | | 文本向量 | EMBEDDING |
10   -| image_embedding | | 图片向量 | EMBEDDING |
11   -| category_name | | 类目名称 | TEXT |
12   -| image_url | | | LITERAL_ARRAY|
13   -| description | | 商品描述 | TEXT |
14   -| brand_name | | 品牌名称 | TEXT |
15   -| thumbnail_url | | | LITERAL_ARRAY|
16   -| is_onsale | | | INT |
17   -| url | | | LITERAL |
18   -| brand_id | | | LITERAL |
19   -| series_id | | | LITERAL |
20   -| sold_num | | 商品销量 | INT |
21   -| category_id | | | INT |
22   -| onsale_time | | 上架时间 | INT |
23   -| price | | | DOUBLE |
24   -| series_name | | | TEXT |
25   -| discount_price | | | DOUBLE |
26   -| pid | ● | | INT |
27   -| sale_price | | | DOUBLE |
28   -| act_price | | | DOUBLE |
29   -
30   -
31   -## 定义索引结构
32   -
33   -| 索引名称 | 索引标签 | 包含字段 | 分析方式 | 使用示例 |
34   -| --- | --- | --- | --- | --- |
35   -| default | 默认索引 | category_name, description, brand_name, title, create_by, update_by | 行业 - 电商通用分析 | query=default:“云搜索” |
36   -| category_name | 类目名称索引 | category_name | 行业 - 电商通用分析 | query=category_name:“云搜索” |
37   -| category_id | | category_id | 关键字 | query=category_id:“云搜索” |
38   -| series_name | | series_name | 中文 - 通用分析 | query=series_name:“云搜索” |
39   -| brand_name | | brand_name | 中文 - 通用分析 | query=brand_name:“云搜索” |
40   -| id | | id | 关键字 | query=id:“云搜索” |
41   -| title | 标题索引 | title | 行业 - 电商通用分析 | query=title:“云搜索” |
42   -| seller_id | | seller_id | 关键字 | query=seller_id:“云搜索” |
43   -| brand_id | | brand_id | 关键字 | query=brand_id:“云搜索” |
44   -| series_id | | series_id | 关键字 | query=series_id:“云搜索” |
45   -
46   -上面的只是阿里云的opensearch的例子,我们也要有同样的一套配置,这里支持的“字分析方式” 为ES预先支持的 多种分析器,我们要支持的分析方式参考 @商品数据源入ES配置规范.md
47   -
docs/temporary/sku_image_src问题诊断报告.md deleted
... ... @@ -1,117 +0,0 @@
1   -# SKU image_src 字段为空问题诊断报告
2   -
3   -## 问题描述
4   -
5   -返回结果的每条结果中,多款式字段 `skus` 下面每个 SKU 的 `image_src` 为空。
6   -
7   -## 问题分析
8   -
9   -### 1. ES 数据检查
10   -
11   -通过查询 ES 数据,发现:
12   -- ES 中确实有 `skus` 数据(不是空数组)
13   -- 但是 `skus` 数组中的每个 SKU 对象**都没有 `image_src` 字段**
14   -
15   -示例 ES 文档:
16   -```json
17   -{
18   - "spu_id": "68238",
19   - "skus": [
20   - {
21   - "sku_id": "3568395",
22   - "price": 329.61,
23   - "compare_at_price": 485.65,
24   - "sku_code": "3468269",
25   - "stock": 57,
26   - "weight": 0.26,
27   - "weight_unit": "kg",
28   - "option1_value": "",
29   - "option2_value": "",
30   - "option3_value": ""
31   - // 注意:这里没有 image_src 字段
32   - }
33   - ]
34   -}
35   -```
36   -
37   -### 2. 代码逻辑检查
38   -
39   -在 `indexer/document_transformer.py` 的 `_transform_sku_row` 方法中(第558-560行),原有逻辑为:
40   -
41   -```python
42   -# Image src
43   -if pd.notna(sku_row.get('image_src')):
44   - sku_data['image_src'] = str(sku_row['image_src'])
45   -```
46   -
47   -**问题根源**:
48   -- 只有当 MySQL 中的 `image_src` 字段**非空**时,才会将其添加到 `sku_data` 字典中
49   -- 如果 MySQL 中的 `image_src` 是 `NULL` 或空字符串,这个字段就**不会出现在返回的字典中**
50   -- 导致 ES 文档中缺少 `image_src` 字段
51   -- API 返回时,`sku_entry.get('image_src')` 返回 `None`,前端看到的就是空值
52   -
53   -### 3. MySQL 数据情况
54   -
55   -根据代码逻辑推断:
56   -- MySQL 的 `shoplazza_product_sku` 表中,`image_src` 字段可能为 `NULL` 或空字符串
57   -- 这导致索引时该字段没有被写入 ES
58   -
59   -## 解决方案
60   -
61   -### 修复方案
62   -
63   -修改 `indexer/document_transformer.py` 中的 `_transform_sku_row` 方法,**始终包含 `image_src` 字段**,即使值为空也设置为 `None`:
64   -
65   -```python
66   -# Image src - always include this field, even if empty
67   -# This ensures the field is present in ES documents and API responses
68   -image_src = sku_row.get('image_src')
69   -if pd.notna(image_src) and str(image_src).strip():
70   - sku_data['image_src'] = str(image_src).strip()
71   -else:
72   - # Set to None (will be serialized as null in JSON) instead of omitting the field
73   - sku_data['image_src'] = None
74   -```
75   -
76   -### 修复效果
77   -
78   -修复后:
79   -1. **即使 MySQL 中 `image_src` 为 NULL 或空字符串**,ES 文档中也会包含该字段(值为 `null`)
80   -2. API 返回时,前端可以明确知道该字段存在但值为空
81   -3. 符合 API 模型定义:`image_src: Optional[str] = Field(None, ...)`
82   -
83   -## 问题分类
84   -
85   -**问题类型**:**本项目填充的问题**
86   -
87   -- ✅ **不是 MySQL 原始数据的问题**:MySQL 中 `image_src` 字段可能确实为 NULL,但这是正常的业务数据
88   -- ✅ **不是 ES 数据的问题**:ES mapping 中 `image_src` 字段定义正确
89   -- ❌ **是本项目填充的问题**:代码逻辑导致当 MySQL 中 `image_src` 为空时,该字段没有被写入 ES 文档
90   -
91   -## 后续操作
92   -
93   -1. **重新索引数据**:修复代码后,需要重新索引数据才能生效
94   - ```bash
95   - # 重新索引指定租户的数据
96   - ./scripts/ingest.sh <tenant_id> true
97   - ```
98   -
99   -2. **验证修复**:重新索引后,查询 ES 验证 `image_src` 字段是否已包含:
100   - ```bash
101   - curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' \
102   - -H 'Content-Type: application/json' \
103   - -d '{
104   - "size": 1,
105   - "query": {"nested": {"path": "skus", "query": {"exists": {"field": "skus"}}}},
106   - "_source": ["spu_id", "skus"]
107   - }'
108   - ```
109   -
110   -3. **可选优化**:如果业务需要,可以考虑当 SKU 的 `image_src` 为空时,使用 SPU 的主图(`image_url`)作为默认值
111   -
112   -## 相关文件
113   -
114   -- `indexer/document_transformer.py` - 已修复
115   -- `api/models.py` - `SkuResult.image_src: Optional[str]` - 模型定义正确
116   -- `api/result_formatter.py` - `image_src=sku_entry.get('image_src')` - 读取逻辑正确
117   -- `mappings/search_products.json` - `skus.image_src` mapping 定义正确
embeddings/README.md
... ... @@ -8,8 +8,10 @@
8 8  
9 9 - **HTTP 客户端**:`text_encoder.py` / `image_encoder.py`(供搜索/索引模块调用)
10 10 - **本地模型实现**:`bge_model.py` / `clip_model.py`
  11 +- **clip-as-service 客户端**:`clip_as_service_encoder.py`(图片向量,推荐)
11 12 - **向量化服务(FastAPI)**:`server.py`
12 13 - **统一配置**:`config.py`
  14 +- **接口契约**:`protocols.ImageEncoderProtocol`(图片编码统一为 `encode_image_urls(urls, batch_size)`,本地 CN-CLIP 与 clip-as-service 均实现该接口)
13 15  
14 16 ### 服务接口
15 17  
... ... @@ -21,6 +23,24 @@
21 23 - 入参:`["url或本地路径1", ...]`
22 24 - 出参:`[[...], null, ...]`(与输入按 index 对齐,失败为 `null`)
23 25  
  26 +### 图片向量:clip-as-service(推荐)
  27 +
  28 +默认使用 `third-party/clip-as-service` 的 Jina CLIP 服务生成图片向量。
  29 +
  30 +1. **安装 clip-client**(首次使用):
  31 + ```bash
  32 + pip install -e third-party/clip-as-service/client
  33 + ```
  34 +
  35 +2. **启动 CN-CLIP 服务**(独立 gRPC 服务,默认端口 51000,详见 `docs/CNCLIP_SERVICE说明文档.md`):
  36 + ```bash
  37 + ./scripts/start_cnclip_service.sh
  38 + ```
  39 +
  40 +3. **配置**(`embeddings/config.py` 或环境变量):
  41 + - `USE_CLIP_AS_SERVICE=true`(默认)
  42 + - `CLIP_AS_SERVICE_SERVER=grpc://127.0.0.1:51000`
  43 +
24 44 ### 启动服务
25 45  
26 46 使用仓库脚本启动(默认端口 6005):
... ... @@ -35,5 +55,6 @@
35 55  
36 56 - `PORT`: 服务端口(默认 6005)
37 57 - `TEXT_MODEL_DIR`, `TEXT_DEVICE`, `TEXT_BATCH_SIZE`
38   -- `IMAGE_MODEL_NAME`, `IMAGE_DEVICE`
  58 +- `USE_CLIP_AS_SERVICE`, `CLIP_AS_SERVICE_SERVER`:图片向量(clip-as-service)
  59 +- `IMAGE_MODEL_NAME`, `IMAGE_DEVICE`:本地 CN-CLIP(当 `USE_CLIP_AS_SERVICE=false` 时)
39 60  
... ...
embeddings/clip_as_service_encoder.py 0 → 100644
... ... @@ -0,0 +1,122 @@
  1 +"""
  2 +Image encoder using third-party clip-as-service (Jina CLIP server).
  3 +
  4 +Requires clip-as-service server to be running. The client is loaded from
  5 +third-party/clip-as-service/client so no separate pip install is needed
  6 +if that path is on sys.path or the package is installed in development mode.
  7 +"""
  8 +
  9 +import logging
  10 +import os
  11 +import sys
  12 +from typing import List, Optional
  13 +
  14 +import numpy as np
  15 +
  16 +logger = logging.getLogger(__name__)
  17 +
  18 +# Ensure third-party clip client is importable
  19 +def _ensure_clip_client_path():
  20 + repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
  21 + client_path = os.path.join(repo_root, "third-party", "clip-as-service", "client")
  22 + if os.path.isdir(client_path) and client_path not in sys.path:
  23 + sys.path.insert(0, client_path)
  24 +
  25 +
  26 +def _normalize_image_url(url: str) -> str:
  27 + """Normalize image URL for clip-as-service (e.g. //host/path -> https://host/path)."""
  28 + if not url or not isinstance(url, str):
  29 + return ""
  30 + url = url.strip()
  31 + if url.startswith("//"):
  32 + return "https:" + url
  33 + return url
  34 +
  35 +
  36 +class ClipAsServiceImageEncoder:
  37 + """
  38 + Image embedding encoder using clip-as-service Client.
  39 + Encodes image URLs in batch; returns 1024-dim vectors (server model must match).
  40 + """
  41 +
  42 + def __init__(
  43 + self,
  44 + server: str = "grpc://127.0.0.1:51000",
  45 + batch_size: int = 8,
  46 + show_progress: bool = False,
  47 + ):
  48 + """
  49 + Args:
  50 + server: clip-as-service server URI (e.g. grpc://127.0.0.1:51000 or http://127.0.0.1:51000).
  51 + batch_size: batch size for encode requests.
  52 + show_progress: whether to show progress bar when encoding.
  53 + """
  54 + _ensure_clip_client_path()
  55 + try:
  56 + from clip_client import Client
  57 + except ImportError as e:
  58 + raise ImportError(
  59 + "clip_client not found. Add third-party/clip-as-service/client to PYTHONPATH "
  60 + "or run: pip install -e third-party/clip-as-service/client"
  61 + ) from e
  62 +
  63 + self._server = server
  64 + self._batch_size = batch_size
  65 + self._show_progress = show_progress
  66 + self._client = Client(server)
  67 +
  68 + def encode_image_urls(
  69 + self,
  70 + urls: List[str],
  71 + batch_size: Optional[int] = None,
  72 + ) -> List[Optional[np.ndarray]]:
  73 + """
  74 + Encode a list of image URLs to vectors.
  75 +
  76 + Args:
  77 + urls: list of image URLs (http/https or //host/path).
  78 + batch_size: override instance batch_size for this call.
  79 +
  80 + Returns:
  81 + List of vectors (1024-dim float32) or None for failed items, same length as urls.
  82 + """
  83 + if not urls:
  84 + return []
  85 +
  86 + normalized = [_normalize_image_url(u) for u in urls]
  87 + valid_indices = [i for i, u in enumerate(normalized) if u]
  88 + if not valid_indices:
  89 + return [None] * len(urls)
  90 +
  91 + valid_urls = [normalized[i] for i in valid_indices]
  92 + bs = batch_size if batch_size is not None else self._batch_size
  93 + out: List[Optional[np.ndarray]] = [None] * len(urls)
  94 +
  95 + try:
  96 + # Client.encode(iterable of str) returns np.ndarray [N, D] for string input
  97 + arr = self._client.encode(
  98 + valid_urls,
  99 + batch_size=bs,
  100 + show_progress=self._show_progress,
  101 + )
  102 + if arr is not None and hasattr(arr, "shape") and len(arr) == len(valid_indices):
  103 + for j, idx in enumerate(valid_indices):
  104 + row = arr[j]
  105 + if row is not None and hasattr(row, "tolist"):
  106 + out[idx] = np.asarray(row, dtype=np.float32)
  107 + else:
  108 + out[idx] = np.array(row, dtype=np.float32)
  109 + else:
  110 + logger.warning(
  111 + "clip-as-service encode returned unexpected shape/length, "
  112 + "expected %d vectors", len(valid_indices)
  113 + )
  114 + except Exception as e:
  115 + logger.warning("clip-as-service encode failed: %s", e, exc_info=True)
  116 +
  117 + return out
  118 +
  119 + def encode_image_from_url(self, url: str) -> Optional[np.ndarray]:
  120 + """Encode a single image URL. Returns 1024-dim vector or None."""
  121 + results = self.encode_image_urls([url], batch_size=1)
  122 + return results[0] if results else None
... ...
embeddings/clip_model.py
... ... @@ -17,7 +17,7 @@ import cn_clip.clip as clip
17 17  
18 18  
19 19 DEFAULT_MODEL_NAME = "ViT-H-14"
20   -MODEL_DOWNLOAD_DIR = "/data/tw/uat/EsSearcher"
  20 +MODEL_DOWNLOAD_DIR = "/data/"
21 21  
22 22  
23 23 class ClipImageModel(object):
... ... @@ -91,6 +91,23 @@ class ClipImageModel(object):
91 91 image = self.preprocess_image(image)
92 92 return self.encode_image(image)
93 93  
  94 + def encode_image_urls(
  95 + self,
  96 + urls: List[str],
  97 + batch_size: Optional[int] = None,
  98 + ) -> List[Optional[np.ndarray]]:
  99 + """
  100 + Encode a list of image URLs to vectors. Same interface as ClipAsServiceImageEncoder.
  101 +
  102 + Args:
  103 + urls: list of image URLs or local paths.
  104 + batch_size: batch size for internal batching (default 8).
  105 +
  106 + Returns:
  107 + List of vectors (or None for failed items), same length as urls.
  108 + """
  109 + return self.encode_batch(urls, batch_size=batch_size or 8)
  110 +
94 111 def encode_batch(self, images: List[Union[str, Image.Image]], batch_size: int = 8) -> List[Optional[np.ndarray]]:
95 112 results: List[Optional[np.ndarray]] = []
96 113 for i in range(0, len(images), batch_size):
... ...
embeddings/cloud_text_encoder.py
... ... @@ -35,7 +35,7 @@ class CloudTextEncoder:
35 35 if not api_key:
36 36 raise ValueError("DASHSCOPE_API_KEY must be set in environment or passed as parameter")
37 37  
38   - # Use Beijing region by default
  38 + # 以下是北京地域base-url,如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1
39 39 base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
40 40  
41 41 cls._instance.client = OpenAI(
... ...
embeddings/config.py
... ... @@ -21,7 +21,12 @@ class EmbeddingConfig(object):
21 21 TEXT_DEVICE = "cuda" # "cuda" or "cpu" (model may fall back to CPU if needed)
22 22 TEXT_BATCH_SIZE = 32
23 23  
24   - # Image embeddings (CN-CLIP)
  24 + # Image embeddings
  25 + # Option A: clip-as-service (Jina CLIP server, recommended)
  26 + USE_CLIP_AS_SERVICE = os.getenv("USE_CLIP_AS_SERVICE", "true").lower() in ("1", "true", "yes")
  27 + CLIP_AS_SERVICE_SERVER = os.getenv("CLIP_AS_SERVICE_SERVER", "grpc://127.0.0.1:51000")
  28 +
  29 + # Option B: local CN-CLIP (when USE_CLIP_AS_SERVICE=false)
25 30 IMAGE_MODEL_NAME = "ViT-H-14"
26 31 IMAGE_DEVICE = None # type: Optional[str] # "cuda" / "cpu" / None(auto)
27 32  
... ...
embeddings/protocols.py 0 → 100644
... ... @@ -0,0 +1,27 @@
  1 +"""
  2 +Protocols for embedding backends (structural typing, no inheritance required).
  3 +
  4 +Used by the embedding service so that any backend (ClipAsServiceImageEncoder,
  5 +ClipImageModel, etc.) can be used as long as it implements the same interface.
  6 +"""
  7 +
  8 +from typing import List, Optional, Protocol
  9 +
  10 +import numpy as np
  11 +
  12 +
  13 +class ImageEncoderProtocol(Protocol):
  14 + """Contract for image encoders used by the embedding service /embed/image endpoint."""
  15 +
  16 + def encode_image_urls(
  17 + self,
  18 + urls: List[str],
  19 + batch_size: Optional[int] = None,
  20 + ) -> List[Optional[np.ndarray]]:
  21 + """
  22 + Encode a list of image URLs to vectors.
  23 +
  24 + Returns:
  25 + List of vectors (or None for failed items), same length as urls.
  26 + """
  27 + ...
... ...
embeddings/server.py
... ... @@ -16,6 +16,8 @@ from fastapi import FastAPI
16 16 from embeddings.config import CONFIG
17 17 from embeddings.bge_model import BgeTextModel
18 18 from embeddings.clip_model import ClipImageModel
  19 +from embeddings.clip_as_service_encoder import ClipAsServiceImageEncoder
  20 +from embeddings.protocols import ImageEncoderProtocol
19 21  
20 22 logger = logging.getLogger(__name__)
21 23  
... ... @@ -23,9 +25,9 @@ app = FastAPI(title="saas-search Embedding Service", version="1.0.0")
23 25  
24 26 # Models are loaded at startup, not lazily
25 27 _text_model: Optional[BgeTextModel] = None
26   -_image_model: Optional[ClipImageModel] = None
  28 +_image_model: Optional[ImageEncoderProtocol] = None
27 29 open_text_model = True
28   -open_image_model = False
  30 +open_image_model = True # Enable image embedding when using clip-as-service
29 31  
30 32 _text_encode_lock = threading.Lock()
31 33 _image_encode_lock = threading.Lock()
... ... @@ -49,15 +51,23 @@ def load_models():
49 51 raise
50 52  
51 53  
52   - # Load image model
  54 + # Load image model: clip-as-service (recommended) or local CN-CLIP
53 55 if open_image_model:
54 56 try:
55   - logger.info(f"Loading image model: {CONFIG.IMAGE_MODEL_NAME} (device: {CONFIG.IMAGE_DEVICE})")
56   - _image_model = ClipImageModel(
57   - model_name=CONFIG.IMAGE_MODEL_NAME,
58   - device=CONFIG.IMAGE_DEVICE,
59   - )
60   - logger.info("Image model loaded successfully")
  57 + if CONFIG.USE_CLIP_AS_SERVICE:
  58 + logger.info(f"Loading image encoder via clip-as-service: {CONFIG.CLIP_AS_SERVICE_SERVER}")
  59 + _image_model = ClipAsServiceImageEncoder(
  60 + server=CONFIG.CLIP_AS_SERVICE_SERVER,
  61 + batch_size=CONFIG.IMAGE_BATCH_SIZE,
  62 + )
  63 + logger.info("Image model (clip-as-service) loaded successfully")
  64 + else:
  65 + logger.info(f"Loading local image model: {CONFIG.IMAGE_MODEL_NAME} (device: {CONFIG.IMAGE_DEVICE})")
  66 + _image_model = ClipImageModel(
  67 + model_name=CONFIG.IMAGE_MODEL_NAME,
  68 + device=CONFIG.IMAGE_DEVICE,
  69 + )
  70 + logger.info("Image model (local CN-CLIP) loaded successfully")
61 71 except Exception as e:
62 72 logger.error(f"Failed to load image model: {e}", exc_info=True)
63 73 raise
... ... @@ -125,20 +135,31 @@ def embed_image(images: List[str]) -> List[Optional[List[float]]]:
125 135 raise RuntimeError("Image model not loaded")
126 136 out: List[Optional[List[float]]] = [None] * len(images)
127 137  
  138 + # Normalize inputs
  139 + urls = []
  140 + indices = []
  141 + for i, url_or_path in enumerate(images):
  142 + if url_or_path is None:
  143 + continue
  144 + if not isinstance(url_or_path, str):
  145 + url_or_path = str(url_or_path)
  146 + url_or_path = url_or_path.strip()
  147 + if url_or_path:
  148 + urls.append(url_or_path)
  149 + indices.append(i)
  150 +
  151 + if not urls:
  152 + return out
  153 +
128 154 with _image_encode_lock:
129   - for i, url_or_path in enumerate(images):
130   - try:
131   - if url_or_path is None:
132   - continue
133   - if not isinstance(url_or_path, str):
134   - url_or_path = str(url_or_path)
135   - url_or_path = url_or_path.strip()
136   - if not url_or_path:
137   - continue
138   - emb = _image_model.encode_image_from_url(url_or_path)
139   - out[i] = _as_list(emb)
140   - except Exception:
141   - out[i] = None
  155 + try:
  156 + # Both ClipAsServiceImageEncoder and ClipImageModel implement encode_image_urls(urls, batch_size)
  157 + vectors = _image_model.encode_image_urls(urls, batch_size=CONFIG.IMAGE_BATCH_SIZE)
  158 + for j, idx in enumerate(indices):
  159 + out[idx] = _as_list(vectors[j] if j < len(vectors) else None)
  160 + except Exception:
  161 + for idx in indices:
  162 + out[idx] = None
142 163 return out
143 164  
144 165  
... ...
requirements.txt
... ... @@ -40,3 +40,9 @@ click>=8.1.0
40 40 pytest>=7.4.0
41 41 pytest-asyncio>=0.21.0
42 42 httpx>=0.24.0
  43 +
  44 +# clip-as-service client (for image embeddings via clip-as-service)
  45 +# Install with: pip install -e third-party/clip-as-service/client
  46 +# Or: pip install jina docarray
  47 +jina>=3.12.0
  48 +docarray[common]>=0.19.0,<0.30.0
... ...