Commit c10f90fea107e1ce226cd28caa66da1f07be6104
1 parent
42e3aea6
cnclip
Showing
11 changed files
with
245 additions
and
411 deletions
Show diff stats
docs/reference/商品数据源入ES配置规范.md deleted
| @@ -1,221 +0,0 @@ | @@ -1,221 +0,0 @@ | ||
| 1 | -根据您提供的内容,我将其整理为规范的Markdown格式: | ||
| 2 | - | ||
| 3 | -# ES索引配置文档 | ||
| 4 | - | ||
| 5 | -## 1. 全局配置 | ||
| 6 | - | ||
| 7 | -### 1.1 文本字段相关性设定 | ||
| 8 | -需要修改所有text字段相关性算法-BM25算法的默认参数: | ||
| 9 | -```json | ||
| 10 | -"similarity": { | ||
| 11 | - "default": { | ||
| 12 | - "type": "BM25", | ||
| 13 | - "b": "0.0", | ||
| 14 | - "k1": "0.0" | ||
| 15 | - } | ||
| 16 | -} | ||
| 17 | -``` | ||
| 18 | - | ||
| 19 | -### 1.2 索引分片设定 | ||
| 20 | -- `number_of_replicas`:0/1 | ||
| 21 | -- `number_of_shards`:设置建议 分片数 <= ES集群的总CPU核心个数/ (副本数 + 1) | ||
| 22 | - | ||
| 23 | -### 1.3 索引刷新时间设定 | ||
| 24 | -- `refresh_interval`:默认30S,根据客户需要进行调整 | ||
| 25 | -```json | ||
| 26 | -"refresh_interval": "30s" | ||
| 27 | -``` | ||
| 28 | - | ||
| 29 | -## 2. 单个字段配置 | ||
| 30 | - | ||
| 31 | -| 分析方式 | 字段预处理和ES输入格式要求 | 对应ES mapping配置 | 备注 | | ||
| 32 | -|---------|--------------------------|-------------------|------| | ||
| 33 | -| 电商通用分析-中文 | - | ```json { "type": "text", "analyzer": "index_ansj", "search_analyzer": "query_ansj" } ``` | - | | ||
| 34 | -| 文本-多语言向量化 | 调用"文本向量化"模块得到1024维向量 | ```json { "type": "dense_vector", "dims": 1024, "index": true, "similarity": "dot_product" } ``` | 1. 依赖"文本向量化"模块<br>2. 如果定期全量,需要对向量化结果做缓存 | | ||
| 35 | -| 图片-向量化 | 调用"图片向量化"模块得到1024维向量 | ```json { "type": "nested", "properties": { "vector": { "type": "dense_vector", "dims": 1024, "similarity": "dot_product" }, "url": { "type": "text" } } } ``` | 1. 依赖"图片向量化"模块<br>2. 如果定期全量,需要对向量化结果做缓存 | | ||
| 36 | -| 关键词 | ES输入格式:list或者单个值 | ```json {"type": "keyword"} ``` | - | | ||
| 37 | -| 电商通用分析-英文 | - | ```json {"type": "text", "analyzer": "english"} ``` | - | | ||
| 38 | -| 电商通用分析-阿拉伯文 | - | ```json {"type": "text", "analyzer": "arabic"} ``` | - | | ||
| 39 | -| 电商通用分析-西班牙文 | - | ```json {"type": "text", "analyzer": "spanish"} ``` | - | | ||
| 40 | -| 电商通用分析-俄文 | - | ```json {"type": "text", "analyzer": "russian"} ``` | - | | ||
| 41 | -| 电商通用分析-日文 | - | ```json {"type": "text", "analyzer": "japanese"} ``` | - | | ||
| 42 | -| 数值-整数 | - | ```json {"type": "long"} ``` | - | | ||
| 43 | -| 数值-浮点型 | - | ```json {"type": "float"} ``` | - | | ||
| 44 | -| 分值 | 输入是float,配置处理方式:log, pow, sigmoid等 | TODO:给代码, log | - | | ||
| 45 | -| 子串 | - | 暂时不支持 | - | | ||
| 46 | -| ngram匹配或前缀匹配或边缘前缀匹配 | - | 暂时不支持 | 以后根据需要再添加 | | ||
| 47 | - | ||
| 48 | -这样整理后,文档结构更加清晰,表格格式规范,便于阅读和理解。 | ||
| 49 | - | ||
| 50 | - | ||
| 51 | -参考 opensearch: | ||
| 52 | - | ||
| 53 | -数据接口 | ||
| 54 | -文本相关性字段 | ||
| 55 | -向量相关性字段 | ||
| 56 | -3. 模块提取 | ||
| 57 | -文本向量化 | ||
| 58 | -import sys | ||
| 59 | -import torch | ||
| 60 | -from sentence_transformers import SentenceTransformer | ||
| 61 | -import time | ||
| 62 | -import threading | ||
| 63 | -from modelscope import snapshot_download | ||
| 64 | -from transformers import AutoModel | ||
| 65 | -import os | ||
| 66 | -from openai import OpenAI | ||
| 67 | -from config.logging_config import get_app_logger | ||
| 68 | - | ||
| 69 | -# Get logger for this module | ||
| 70 | -logger = get_app_logger(__name__) | ||
| 71 | - | ||
| 72 | -class BgeEncoder: | ||
| 73 | - _instance = None | ||
| 74 | - _lock = threading.Lock() | ||
| 75 | - | ||
| 76 | - def __new__(cls, model_dir='Xorbits/bge-m3'): | ||
| 77 | - with cls._lock: | ||
| 78 | - if cls._instance is None: | ||
| 79 | - cls._instance = super(BgeEncoder, cls).__new__(cls) | ||
| 80 | - logger.info("[BgeEncoder] Creating a new instance with model directory: %s", model_dir) | ||
| 81 | - cls._instance.model = SentenceTransformer(snapshot_download(model_dir)) | ||
| 82 | - logger.info("[BgeEncoder] New instance has been created") | ||
| 83 | - return cls._instance | ||
| 84 | - | ||
| 85 | - def encode(self, sentences, normalize_embeddings=True, device='cuda'): | ||
| 86 | - # Move model to specified device | ||
| 87 | - if device == 'gpu': | ||
| 88 | - device = 'cuda' | ||
| 89 | - self.model = self.model.to(device) | ||
| 90 | - embeddings = self.model.encode(sentences, normalize_embeddings=normalize_embeddings, device=device, show_progress_bar=False) | ||
| 91 | - return embeddings | ||
| 92 | -图片向量化 | ||
| 93 | -import sys | ||
| 94 | -import os | ||
| 95 | -import io | ||
| 96 | -import requests | ||
| 97 | -import torch | ||
| 98 | -import numpy as np | ||
| 99 | -from PIL import Image | ||
| 100 | -import logging | ||
| 101 | -import threading | ||
| 102 | -from typing import List, Optional, Union | ||
| 103 | -from config.logging_config import get_app_logger | ||
| 104 | -import cn_clip.clip as clip | ||
| 105 | -from cn_clip.clip import load_from_name | ||
| 106 | - | ||
| 107 | -# Get logger for this module | ||
| 108 | -logger = get_app_logger(__name__) | ||
| 109 | - | ||
| 110 | -# DEFAULT_MODEL_NAME = "ViT-L-14-336" # ["ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14", "RN50"] | ||
| 111 | -DEFAULT_MODEL_NAME = "ViT-H-14" | ||
| 112 | -MODEL_DOWNLOAD_DIR = "/data/tw/uat/EsSearcher" | ||
| 113 | - | ||
| 114 | -class CLIPImageEncoder: | ||
| 115 | - """CLIP Image Encoder for generating image embeddings using cn_clip""" | ||
| 116 | - | ||
| 117 | - _instance = None | ||
| 118 | - _lock = threading.Lock() | ||
| 119 | - | ||
| 120 | - def __new__(cls, model_name=DEFAULT_MODEL_NAME, device=None): | ||
| 121 | - with cls._lock: | ||
| 122 | - if cls._instance is None: | ||
| 123 | - cls._instance = super(CLIPImageEncoder, cls).__new__(cls) | ||
| 124 | - logger.info(f"[CLIPImageEncoder] Creating new instance with model: {model_name}") | ||
| 125 | - cls._instance._initialize_model(model_name, device) | ||
| 126 | - return cls._instance | ||
| 127 | - | ||
| 128 | - def _initialize_model(self, model_name, device): | ||
| 129 | - """Initialize the CLIP model using cn_clip""" | ||
| 130 | - try: | ||
| 131 | - self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu") | ||
| 132 | - self.model, self.preprocess = load_from_name(model_name, device=self.device, download_root=MODEL_DOWNLOAD_DIR) | ||
| 133 | - self.model.eval() | ||
| 134 | - self.model_name = model_name | ||
| 135 | - logger.info(f"[CLIPImageEncoder] Model {model_name} initialized successfully on device {self.device}") | ||
| 136 | - | ||
| 137 | - except Exception as e: | ||
| 138 | - logger.error(f"[CLIPImageEncoder] Failed to initialize model: {str(e)}") | ||
| 139 | - raise | ||
| 140 | - | ||
| 141 | - def validate_image(self, image_data: bytes) -> Image.Image: | ||
| 142 | - """Validate image data and return PIL Image if valid""" | ||
| 143 | - try: | ||
| 144 | - image_stream = io.BytesIO(image_data) | ||
| 145 | - image = Image.open(image_stream) | ||
| 146 | - image.verify() | ||
| 147 | - image_stream.seek(0) | ||
| 148 | - image = Image.open(image_stream) | ||
| 149 | - if image.mode != 'RGB': | ||
| 150 | - image = image.convert('RGB') | ||
| 151 | - return image | ||
| 152 | - except Exception as e: | ||
| 153 | - raise ValueError(f"Invalid image data: {str(e)}") | ||
| 154 | - | ||
| 155 | - def download_image(self, url: str, timeout: int = 10) -> bytes: | ||
| 156 | - """Download image from URL""" | ||
| 157 | - try: | ||
| 158 | - if url.startswith(('http://', 'https://')): | ||
| 159 | - response = requests.get(url, timeout=timeout) | ||
| 160 | - if response.status_code != 200: | ||
| 161 | - raise ValueError(f"HTTP {response.status_code}") | ||
| 162 | - return response.content | ||
| 163 | - else: | ||
| 164 | - # Local file path | ||
| 165 | - with open(url, 'rb') as f: | ||
| 166 | - return f.read() | ||
| 167 | - except Exception as e: | ||
| 168 | - raise ValueError(f"Failed to download image from {url}: {str(e)}") | ||
| 169 | - | ||
| 170 | - def preprocess_image(self, image: Image.Image, max_size: int = 1024) -> Image.Image: | ||
| 171 | - """Preprocess image for CLIP model""" | ||
| 172 | - # Resize if too large | ||
| 173 | - if max(image.size) > max_size: | ||
| 174 | - ratio = max_size / max(image.size) | ||
| 175 | - new_size = tuple(int(dim * ratio) for dim in image.size) | ||
| 176 | - image = image.resize(new_size, Image.Resampling.LANCZOS) | ||
| 177 | - return image | ||
| 178 | - | ||
| 179 | - def encode_text(self, text): | ||
| 180 | - """Encode text to embedding vector using cn_clip""" | ||
| 181 | - text_data = clip.tokenize([text] if type(text) == str else text).to(self.device) | ||
| 182 | - with torch.no_grad(): | ||
| 183 | - text_features = self.model.encode_text(text_data) | ||
| 184 | - text_features /= text_features.norm(dim=-1, keepdim=True) | ||
| 185 | - return text_features | ||
| 186 | - | ||
| 187 | - def encode_image(self, image: Image.Image) -> Optional[np.ndarray]: | ||
| 188 | - """Encode image to embedding vector using cn_clip""" | ||
| 189 | - if not isinstance(image, Image.Image): | ||
| 190 | - raise ValueError("CLIPImageEncoder.encode_image Input must be a PIL.Image") | ||
| 191 | - | ||
| 192 | - try: | ||
| 193 | - infer_data = self.preprocess(image).unsqueeze(0).to(self.device) | ||
| 194 | - with torch.no_grad(): | ||
| 195 | - image_features = self.model.encode_image(infer_data) | ||
| 196 | - image_features /= image_features.norm(dim=-1, keepdim=True) | ||
| 197 | - return image_features.cpu().numpy().astype('float32')[0] | ||
| 198 | - except Exception as e: | ||
| 199 | - logger.error(f"Failed to process image. Reason: {str(e)}") | ||
| 200 | - return None | ||
| 201 | - | ||
| 202 | - def encode_image_from_url(self, url: str) -> Optional[np.ndarray]: | ||
| 203 | - """Complete pipeline: download, validate, preprocess and encode image from URL""" | ||
| 204 | - try: | ||
| 205 | - # Download image | ||
| 206 | - image_data = self.download_image(url) | ||
| 207 | - | ||
| 208 | - # Validate image | ||
| 209 | - image = self.validate_image(image_data) | ||
| 210 | - | ||
| 211 | - # Preprocess image | ||
| 212 | - image = self.preprocess_image(image) | ||
| 213 | - | ||
| 214 | - # Encode image | ||
| 215 | - embedding = self.encode_image(image) | ||
| 216 | - | ||
| 217 | - return embedding | ||
| 218 | - | ||
| 219 | - except Exception as e: | ||
| 220 | - logger.error(f"Error processing image from URL {url}: {str(e)}") | ||
| 221 | - return None | ||
| 222 | \ No newline at end of file | 0 | \ No newline at end of file |
docs/reference/阿里opensearch电商行业.md deleted
| @@ -1,47 +0,0 @@ | @@ -1,47 +0,0 @@ | ||
| 1 | -https://help.aliyun.com/zh/open-search/industry-algorithm-edition/e-commerce?spm=a2c4g.11186623.help-menu-29102.d_3_2_1.5a903cfbxOsaHt&scm=20140722.H_99739._.OR_help-T_cn~zh-V_1 | ||
| 2 | - | ||
| 3 | - | ||
| 4 | -## 定义应用结构 | ||
| 5 | -示例如下: | ||
| 6 | -| 字段名称 | 主键 | 字段标签 | 类型 | | ||
| 7 | -|----------------|------|------------|--------------| | ||
| 8 | -| title | | 商品标题 | TEXT | | ||
| 9 | -| text_embedding | | 文本向量 | EMBEDDING | | ||
| 10 | -| image_embedding | | 图片向量 | EMBEDDING | | ||
| 11 | -| category_name | | 类目名称 | TEXT | | ||
| 12 | -| image_url | | | LITERAL_ARRAY| | ||
| 13 | -| description | | 商品描述 | TEXT | | ||
| 14 | -| brand_name | | 品牌名称 | TEXT | | ||
| 15 | -| thumbnail_url | | | LITERAL_ARRAY| | ||
| 16 | -| is_onsale | | | INT | | ||
| 17 | -| url | | | LITERAL | | ||
| 18 | -| brand_id | | | LITERAL | | ||
| 19 | -| series_id | | | LITERAL | | ||
| 20 | -| sold_num | | 商品销量 | INT | | ||
| 21 | -| category_id | | | INT | | ||
| 22 | -| onsale_time | | 上架时间 | INT | | ||
| 23 | -| price | | | DOUBLE | | ||
| 24 | -| series_name | | | TEXT | | ||
| 25 | -| discount_price | | | DOUBLE | | ||
| 26 | -| pid | ● | | INT | | ||
| 27 | -| sale_price | | | DOUBLE | | ||
| 28 | -| act_price | | | DOUBLE | | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -## 定义索引结构 | ||
| 32 | - | ||
| 33 | -| 索引名称 | 索引标签 | 包含字段 | 分析方式 | 使用示例 | | ||
| 34 | -| --- | --- | --- | --- | --- | | ||
| 35 | -| default | 默认索引 | category_name, description, brand_name, title, create_by, update_by | 行业 - 电商通用分析 | query=default:“云搜索” | | ||
| 36 | -| category_name | 类目名称索引 | category_name | 行业 - 电商通用分析 | query=category_name:“云搜索” | | ||
| 37 | -| category_id | | category_id | 关键字 | query=category_id:“云搜索” | | ||
| 38 | -| series_name | | series_name | 中文 - 通用分析 | query=series_name:“云搜索” | | ||
| 39 | -| brand_name | | brand_name | 中文 - 通用分析 | query=brand_name:“云搜索” | | ||
| 40 | -| id | | id | 关键字 | query=id:“云搜索” | | ||
| 41 | -| title | 标题索引 | title | 行业 - 电商通用分析 | query=title:“云搜索” | | ||
| 42 | -| seller_id | | seller_id | 关键字 | query=seller_id:“云搜索” | | ||
| 43 | -| brand_id | | brand_id | 关键字 | query=brand_id:“云搜索” | | ||
| 44 | -| series_id | | series_id | 关键字 | query=series_id:“云搜索” | | ||
| 45 | - | ||
| 46 | -上面的只是阿里云的opensearch的例子,我们也要有同样的一套配置,这里支持的“分析方式”为ES预先支持的多种分析器,我们要支持的分析方式参考 @商品数据源入ES配置规范.md | ||
| 47 | - |
docs/temporary/sku_image_src问题诊断报告.md deleted
| @@ -1,117 +0,0 @@ | @@ -1,117 +0,0 @@ | ||
| 1 | -# SKU image_src 字段为空问题诊断报告 | ||
| 2 | - | ||
| 3 | -## 问题描述 | ||
| 4 | - | ||
| 5 | -返回结果的每条结果中,多款式字段 `skus` 下面每个 SKU 的 `image_src` 为空。 | ||
| 6 | - | ||
| 7 | -## 问题分析 | ||
| 8 | - | ||
| 9 | -### 1. ES 数据检查 | ||
| 10 | - | ||
| 11 | -通过查询 ES 数据,发现: | ||
| 12 | -- ES 中确实有 `skus` 数据(不是空数组) | ||
| 13 | -- 但是 `skus` 数组中的每个 SKU 对象**都没有 `image_src` 字段** | ||
| 14 | - | ||
| 15 | -示例 ES 文档: | ||
| 16 | -```json | ||
| 17 | -{ | ||
| 18 | - "spu_id": "68238", | ||
| 19 | - "skus": [ | ||
| 20 | - { | ||
| 21 | - "sku_id": "3568395", | ||
| 22 | - "price": 329.61, | ||
| 23 | - "compare_at_price": 485.65, | ||
| 24 | - "sku_code": "3468269", | ||
| 25 | - "stock": 57, | ||
| 26 | - "weight": 0.26, | ||
| 27 | - "weight_unit": "kg", | ||
| 28 | - "option1_value": "", | ||
| 29 | - "option2_value": "", | ||
| 30 | - "option3_value": "" | ||
| 31 | - // 注意:这里没有 image_src 字段 | ||
| 32 | - } | ||
| 33 | - ] | ||
| 34 | -} | ||
| 35 | -``` | ||
| 36 | - | ||
| 37 | -### 2. 代码逻辑检查 | ||
| 38 | - | ||
| 39 | -在 `indexer/document_transformer.py` 的 `_transform_sku_row` 方法中(第558-560行),原有逻辑为: | ||
| 40 | - | ||
| 41 | -```python | ||
| 42 | -# Image src | ||
| 43 | -if pd.notna(sku_row.get('image_src')): | ||
| 44 | - sku_data['image_src'] = str(sku_row['image_src']) | ||
| 45 | -``` | ||
| 46 | - | ||
| 47 | -**问题根源**: | ||
| 48 | -- 只有当 MySQL 中的 `image_src` 字段**非空**时,才会将其添加到 `sku_data` 字典中 | ||
| 49 | -- 如果 MySQL 中的 `image_src` 是 `NULL` 或空字符串,这个字段就**不会出现在返回的字典中** | ||
| 50 | -- 导致 ES 文档中缺少 `image_src` 字段 | ||
| 51 | -- API 返回时,`sku_entry.get('image_src')` 返回 `None`,前端看到的就是空值 | ||
| 52 | - | ||
| 53 | -### 3. MySQL 数据情况 | ||
| 54 | - | ||
| 55 | -根据代码逻辑推断: | ||
| 56 | -- MySQL 的 `shoplazza_product_sku` 表中,`image_src` 字段可能为 `NULL` 或空字符串 | ||
| 57 | -- 这导致索引时该字段没有被写入 ES | ||
| 58 | - | ||
| 59 | -## 解决方案 | ||
| 60 | - | ||
| 61 | -### 修复方案 | ||
| 62 | - | ||
| 63 | -修改 `indexer/document_transformer.py` 中的 `_transform_sku_row` 方法,**始终包含 `image_src` 字段**,即使值为空也设置为 `None`: | ||
| 64 | - | ||
| 65 | -```python | ||
| 66 | -# Image src - always include this field, even if empty | ||
| 67 | -# This ensures the field is present in ES documents and API responses | ||
| 68 | -image_src = sku_row.get('image_src') | ||
| 69 | -if pd.notna(image_src) and str(image_src).strip(): | ||
| 70 | - sku_data['image_src'] = str(image_src).strip() | ||
| 71 | -else: | ||
| 72 | - # Set to None (will be serialized as null in JSON) instead of omitting the field | ||
| 73 | - sku_data['image_src'] = None | ||
| 74 | -``` | ||
| 75 | - | ||
| 76 | -### 修复效果 | ||
| 77 | - | ||
| 78 | -修复后: | ||
| 79 | -1. **即使 MySQL 中 `image_src` 为 NULL 或空字符串**,ES 文档中也会包含该字段(值为 `null`) | ||
| 80 | -2. API 返回时,前端可以明确知道该字段存在但值为空 | ||
| 81 | -3. 符合 API 模型定义:`image_src: Optional[str] = Field(None, ...)` | ||
| 82 | - | ||
| 83 | -## 问题分类 | ||
| 84 | - | ||
| 85 | -**问题类型**:**本项目填充的问题** | ||
| 86 | - | ||
| 87 | -- ✅ **不是 MySQL 原始数据的问题**:MySQL 中 `image_src` 字段可能确实为 NULL,但这是正常的业务数据 | ||
| 88 | -- ✅ **不是 ES 数据的问题**:ES mapping 中 `image_src` 字段定义正确 | ||
| 89 | -- ❌ **是本项目填充的问题**:代码逻辑导致当 MySQL 中 `image_src` 为空时,该字段没有被写入 ES 文档 | ||
| 90 | - | ||
| 91 | -## 后续操作 | ||
| 92 | - | ||
| 93 | -1. **重新索引数据**:修复代码后,需要重新索引数据才能生效 | ||
| 94 | - ```bash | ||
| 95 | - # 重新索引指定租户的数据 | ||
| 96 | - ./scripts/ingest.sh <tenant_id> true | ||
| 97 | - ``` | ||
| 98 | - | ||
| 99 | -2. **验证修复**:重新索引后,查询 ES 验证 `image_src` 字段是否已包含: | ||
| 100 | - ```bash | ||
| 101 | - curl -u '<es_user>:<es_password>' -X GET 'http://localhost:9200/search_products/_search?pretty' \ ||
| 102 | - -H 'Content-Type: application/json' \ | ||
| 103 | - -d '{ | ||
| 104 | - "size": 1, | ||
| 105 | - "query": {"nested": {"path": "skus", "query": {"exists": {"field": "skus"}}}}, | ||
| 106 | - "_source": ["spu_id", "skus"] | ||
| 107 | - }' | ||
| 108 | - ``` | ||
| 109 | - | ||
| 110 | -3. **可选优化**:如果业务需要,可以考虑当 SKU 的 `image_src` 为空时,使用 SPU 的主图(`image_url`)作为默认值 | ||
| 111 | - | ||
| 112 | -## 相关文件 | ||
| 113 | - | ||
| 114 | -- `indexer/document_transformer.py` - 已修复 | ||
| 115 | -- `api/models.py` - `SkuResult.image_src: Optional[str]` - 模型定义正确 | ||
| 116 | -- `api/result_formatter.py` - `image_src=sku_entry.get('image_src')` - 读取逻辑正确 | ||
| 117 | -- `mappings/search_products.json` - `skus.image_src` mapping 定义正确 |
embeddings/README.md
| @@ -8,8 +8,10 @@ | @@ -8,8 +8,10 @@ | ||
| 8 | 8 | ||
| 9 | - **HTTP 客户端**:`text_encoder.py` / `image_encoder.py`(供搜索/索引模块调用) | 9 | - **HTTP 客户端**:`text_encoder.py` / `image_encoder.py`(供搜索/索引模块调用) |
| 10 | - **本地模型实现**:`bge_model.py` / `clip_model.py` | 10 | - **本地模型实现**:`bge_model.py` / `clip_model.py` |
| 11 | +- **clip-as-service 客户端**:`clip_as_service_encoder.py`(图片向量,推荐) | ||
| 11 | - **向量化服务(FastAPI)**:`server.py` | 12 | - **向量化服务(FastAPI)**:`server.py` |
| 12 | - **统一配置**:`config.py` | 13 | - **统一配置**:`config.py` |
| 14 | +- **接口契约**:`protocols.ImageEncoderProtocol`(图片编码统一为 `encode_image_urls(urls, batch_size)`,本地 CN-CLIP 与 clip-as-service 均实现该接口) | ||
| 13 | 15 | ||
| 14 | ### 服务接口 | 16 | ### 服务接口 |
| 15 | 17 | ||
| @@ -21,6 +23,24 @@ | @@ -21,6 +23,24 @@ | ||
| 21 | - 入参:`["url或本地路径1", ...]` | 23 | - 入参:`["url或本地路径1", ...]` |
| 22 | - 出参:`[[...], null, ...]`(与输入按 index 对齐,失败为 `null`) | 24 | - 出参:`[[...], null, ...]`(与输入按 index 对齐,失败为 `null`) |
| 23 | 25 | ||
| 26 | +### 图片向量:clip-as-service(推荐) | ||
| 27 | + | ||
| 28 | +默认使用 `third-party/clip-as-service` 的 Jina CLIP 服务生成图片向量。 | ||
| 29 | + | ||
| 30 | +1. **安装 clip-client**(首次使用): | ||
| 31 | + ```bash | ||
| 32 | + pip install -e third-party/clip-as-service/client | ||
| 33 | + ``` | ||
| 34 | + | ||
| 35 | +2. **启动 CN-CLIP 服务**(独立 gRPC 服务,默认端口 51000,详见 `docs/CNCLIP_SERVICE说明文档.md`): | ||
| 36 | + ```bash | ||
| 37 | + ./scripts/start_cnclip_service.sh | ||
| 38 | + ``` | ||
| 39 | + | ||
| 40 | +3. **配置**(`embeddings/config.py` 或环境变量): | ||
| 41 | + - `USE_CLIP_AS_SERVICE=true`(默认) | ||
| 42 | + - `CLIP_AS_SERVICE_SERVER=grpc://127.0.0.1:51000` | ||
| 43 | + | ||
| 24 | ### 启动服务 | 44 | ### 启动服务 |
| 25 | 45 | ||
| 26 | 使用仓库脚本启动(默认端口 6005): | 46 | 使用仓库脚本启动(默认端口 6005): |
| @@ -35,5 +55,6 @@ | @@ -35,5 +55,6 @@ | ||
| 35 | 55 | ||
| 36 | - `PORT`: 服务端口(默认 6005) | 56 | - `PORT`: 服务端口(默认 6005) |
| 37 | - `TEXT_MODEL_DIR`, `TEXT_DEVICE`, `TEXT_BATCH_SIZE` | 57 | - `TEXT_MODEL_DIR`, `TEXT_DEVICE`, `TEXT_BATCH_SIZE` |
| 38 | -- `IMAGE_MODEL_NAME`, `IMAGE_DEVICE` | 58 | +- `USE_CLIP_AS_SERVICE`, `CLIP_AS_SERVICE_SERVER`:图片向量(clip-as-service) |
| 59 | +- `IMAGE_MODEL_NAME`, `IMAGE_DEVICE`:本地 CN-CLIP(当 `USE_CLIP_AS_SERVICE=false` 时) | ||
| 39 | 60 |
| @@ -0,0 +1,122 @@ | @@ -0,0 +1,122 @@ | ||
| 1 | +""" | ||
| 2 | +Image encoder using third-party clip-as-service (Jina CLIP server). | ||
| 3 | + | ||
| 4 | +Requires clip-as-service server to be running. The client is loaded from | ||
| 5 | +third-party/clip-as-service/client so no separate pip install is needed | ||
| 6 | +if that path is on sys.path or the package is installed in development mode. | ||
| 7 | +""" | ||
| 8 | + | ||
| 9 | +import logging | ||
| 10 | +import os | ||
| 11 | +import sys | ||
| 12 | +from typing import List, Optional | ||
| 13 | + | ||
| 14 | +import numpy as np | ||
| 15 | + | ||
| 16 | +logger = logging.getLogger(__name__) | ||
| 17 | + | ||
| 18 | +# Ensure third-party clip client is importable | ||
| 19 | +def _ensure_clip_client_path(): | ||
| 20 | + repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | ||
| 21 | + client_path = os.path.join(repo_root, "third-party", "clip-as-service", "client") | ||
| 22 | + if os.path.isdir(client_path) and client_path not in sys.path: | ||
| 23 | + sys.path.insert(0, client_path) | ||
| 24 | + | ||
| 25 | + | ||
| 26 | +def _normalize_image_url(url: str) -> str: | ||
| 27 | + """Normalize image URL for clip-as-service (e.g. //host/path -> https://host/path).""" | ||
| 28 | + if not url or not isinstance(url, str): | ||
| 29 | + return "" | ||
| 30 | + url = url.strip() | ||
| 31 | + if url.startswith("//"): | ||
| 32 | + return "https:" + url | ||
| 33 | + return url | ||
| 34 | + | ||
| 35 | + | ||
| 36 | +class ClipAsServiceImageEncoder: | ||
| 37 | + """ | ||
| 38 | + Image embedding encoder using clip-as-service Client. | ||
| 39 | + Encodes image URLs in batch; returns 1024-dim vectors (server model must match). | ||
| 40 | + """ | ||
| 41 | + | ||
| 42 | + def __init__( | ||
| 43 | + self, | ||
| 44 | + server: str = "grpc://127.0.0.1:51000", | ||
| 45 | + batch_size: int = 8, | ||
| 46 | + show_progress: bool = False, | ||
| 47 | + ): | ||
| 48 | + """ | ||
| 49 | + Args: | ||
| 50 | + server: clip-as-service server URI (e.g. grpc://127.0.0.1:51000 or http://127.0.0.1:51000). | ||
| 51 | + batch_size: batch size for encode requests. | ||
| 52 | + show_progress: whether to show progress bar when encoding. | ||
| 53 | + """ | ||
| 54 | + _ensure_clip_client_path() | ||
| 55 | + try: | ||
| 56 | + from clip_client import Client | ||
| 57 | + except ImportError as e: | ||
| 58 | + raise ImportError( | ||
| 59 | + "clip_client not found. Add third-party/clip-as-service/client to PYTHONPATH " | ||
| 60 | + "or run: pip install -e third-party/clip-as-service/client" | ||
| 61 | + ) from e | ||
| 62 | + | ||
| 63 | + self._server = server | ||
| 64 | + self._batch_size = batch_size | ||
| 65 | + self._show_progress = show_progress | ||
| 66 | + self._client = Client(server) | ||
| 67 | + | ||
| 68 | + def encode_image_urls( | ||
| 69 | + self, | ||
| 70 | + urls: List[str], | ||
| 71 | + batch_size: Optional[int] = None, | ||
| 72 | + ) -> List[Optional[np.ndarray]]: | ||
| 73 | + """ | ||
| 74 | + Encode a list of image URLs to vectors. | ||
| 75 | + | ||
| 76 | + Args: | ||
| 77 | + urls: list of image URLs (http/https or //host/path). | ||
| 78 | + batch_size: override instance batch_size for this call. | ||
| 79 | + | ||
| 80 | + Returns: | ||
| 81 | + List of vectors (1024-dim float32) or None for failed items, same length as urls. | ||
| 82 | + """ | ||
| 83 | + if not urls: | ||
| 84 | + return [] | ||
| 85 | + | ||
| 86 | + normalized = [_normalize_image_url(u) for u in urls] | ||
| 87 | + valid_indices = [i for i, u in enumerate(normalized) if u] | ||
| 88 | + if not valid_indices: | ||
| 89 | + return [None] * len(urls) | ||
| 90 | + | ||
| 91 | + valid_urls = [normalized[i] for i in valid_indices] | ||
| 92 | + bs = batch_size if batch_size is not None else self._batch_size | ||
| 93 | + out: List[Optional[np.ndarray]] = [None] * len(urls) | ||
| 94 | + | ||
| 95 | + try: | ||
| 96 | + # Client.encode(iterable of str) returns np.ndarray [N, D] for string input | ||
| 97 | + arr = self._client.encode( | ||
| 98 | + valid_urls, | ||
| 99 | + batch_size=bs, | ||
| 100 | + show_progress=self._show_progress, | ||
| 101 | + ) | ||
| 102 | + if arr is not None and hasattr(arr, "shape") and len(arr) == len(valid_indices): | ||
| 103 | + for j, idx in enumerate(valid_indices): | ||
| 104 | + row = arr[j] | ||
| 105 | + if row is not None and hasattr(row, "tolist"): | ||
| 106 | + out[idx] = np.asarray(row, dtype=np.float32) | ||
| 107 | + else: | ||
| 108 | + out[idx] = np.array(row, dtype=np.float32) | ||
| 109 | + else: | ||
| 110 | + logger.warning( | ||
| 111 | + "clip-as-service encode returned unexpected shape/length, " | ||
| 112 | + "expected %d vectors", len(valid_indices) | ||
| 113 | + ) | ||
| 114 | + except Exception as e: | ||
| 115 | + logger.warning("clip-as-service encode failed: %s", e, exc_info=True) | ||
| 116 | + | ||
| 117 | + return out | ||
| 118 | + | ||
| 119 | + def encode_image_from_url(self, url: str) -> Optional[np.ndarray]: | ||
| 120 | + """Encode a single image URL. Returns 1024-dim vector or None.""" | ||
| 121 | + results = self.encode_image_urls([url], batch_size=1) | ||
| 122 | + return results[0] if results else None |
embeddings/clip_model.py
| @@ -17,7 +17,7 @@ import cn_clip.clip as clip | @@ -17,7 +17,7 @@ import cn_clip.clip as clip | ||
| 17 | 17 | ||
| 18 | 18 | ||
| 19 | DEFAULT_MODEL_NAME = "ViT-H-14" | 19 | DEFAULT_MODEL_NAME = "ViT-H-14" |
| 20 | -MODEL_DOWNLOAD_DIR = "/data/tw/uat/EsSearcher" | 20 | +MODEL_DOWNLOAD_DIR = "/data/" |
| 21 | 21 | ||
| 22 | 22 | ||
| 23 | class ClipImageModel(object): | 23 | class ClipImageModel(object): |
| @@ -91,6 +91,23 @@ class ClipImageModel(object): | @@ -91,6 +91,23 @@ class ClipImageModel(object): | ||
| 91 | image = self.preprocess_image(image) | 91 | image = self.preprocess_image(image) |
| 92 | return self.encode_image(image) | 92 | return self.encode_image(image) |
| 93 | 93 | ||
| 94 | + def encode_image_urls( | ||
| 95 | + self, | ||
| 96 | + urls: List[str], | ||
| 97 | + batch_size: Optional[int] = None, | ||
| 98 | + ) -> List[Optional[np.ndarray]]: | ||
| 99 | + """ | ||
| 100 | + Encode a list of image URLs to vectors. Same interface as ClipAsServiceImageEncoder. | ||
| 101 | + | ||
| 102 | + Args: | ||
| 103 | + urls: list of image URLs or local paths. | ||
| 104 | + batch_size: batch size for internal batching (default 8). | ||
| 105 | + | ||
| 106 | + Returns: | ||
| 107 | + List of vectors (or None for failed items), same length as urls. | ||
| 108 | + """ | ||
| 109 | + return self.encode_batch(urls, batch_size=batch_size or 8) | ||
| 110 | + | ||
| 94 | def encode_batch(self, images: List[Union[str, Image.Image]], batch_size: int = 8) -> List[Optional[np.ndarray]]: | 111 | def encode_batch(self, images: List[Union[str, Image.Image]], batch_size: int = 8) -> List[Optional[np.ndarray]]: |
| 95 | results: List[Optional[np.ndarray]] = [] | 112 | results: List[Optional[np.ndarray]] = [] |
| 96 | for i in range(0, len(images), batch_size): | 113 | for i in range(0, len(images), batch_size): |
embeddings/cloud_text_encoder.py
| @@ -35,7 +35,7 @@ class CloudTextEncoder: | @@ -35,7 +35,7 @@ class CloudTextEncoder: | ||
| 35 | if not api_key: | 35 | if not api_key: |
| 36 | raise ValueError("DASHSCOPE_API_KEY must be set in environment or passed as parameter") | 36 | raise ValueError("DASHSCOPE_API_KEY must be set in environment or passed as parameter") |
| 37 | 37 | ||
| 38 | - # Use Beijing region by default | 38 | + # 以下是北京地域base-url,如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 |
| 39 | base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1" | 39 | base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1" |
| 40 | 40 | ||
| 41 | cls._instance.client = OpenAI( | 41 | cls._instance.client = OpenAI( |
embeddings/config.py
| @@ -21,7 +21,12 @@ class EmbeddingConfig(object): | @@ -21,7 +21,12 @@ class EmbeddingConfig(object): | ||
| 21 | TEXT_DEVICE = "cuda" # "cuda" or "cpu" (model may fall back to CPU if needed) | 21 | TEXT_DEVICE = "cuda" # "cuda" or "cpu" (model may fall back to CPU if needed) |
| 22 | TEXT_BATCH_SIZE = 32 | 22 | TEXT_BATCH_SIZE = 32 |
| 23 | 23 | ||
| 24 | - # Image embeddings (CN-CLIP) | 24 | + # Image embeddings |
| 25 | + # Option A: clip-as-service (Jina CLIP server, recommended) | ||
| 26 | + USE_CLIP_AS_SERVICE = os.getenv("USE_CLIP_AS_SERVICE", "true").lower() in ("1", "true", "yes") | ||
| 27 | + CLIP_AS_SERVICE_SERVER = os.getenv("CLIP_AS_SERVICE_SERVER", "grpc://127.0.0.1:51000") | ||
| 28 | + | ||
| 29 | + # Option B: local CN-CLIP (when USE_CLIP_AS_SERVICE=false) | ||
| 25 | IMAGE_MODEL_NAME = "ViT-H-14" | 30 | IMAGE_MODEL_NAME = "ViT-H-14" |
| 26 | IMAGE_DEVICE = None # type: Optional[str] # "cuda" / "cpu" / None(auto) | 31 | IMAGE_DEVICE = None # type: Optional[str] # "cuda" / "cpu" / None(auto) |
| 27 | 32 |
| @@ -0,0 +1,27 @@ | @@ -0,0 +1,27 @@ | ||
| 1 | +""" | ||
| 2 | +Protocols for embedding backends (structural typing, no inheritance required). | ||
| 3 | + | ||
| 4 | +Used by the embedding service so that any backend (ClipAsServiceImageEncoder, | ||
| 5 | +ClipImageModel, etc.) can be used as long as it implements the same interface. | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +from typing import List, Optional, Protocol | ||
| 9 | + | ||
| 10 | +import numpy as np | ||
| 11 | + | ||
| 12 | + | ||
| 13 | +class ImageEncoderProtocol(Protocol): | ||
| 14 | + """Contract for image encoders used by the embedding service /embed/image endpoint.""" | ||
| 15 | + | ||
| 16 | + def encode_image_urls( | ||
| 17 | + self, | ||
| 18 | + urls: List[str], | ||
| 19 | + batch_size: Optional[int] = None, | ||
| 20 | + ) -> List[Optional[np.ndarray]]: | ||
| 21 | + """ | ||
| 22 | + Encode a list of image URLs to vectors. | ||
| 23 | + | ||
| 24 | + Returns: | ||
| 25 | + List of vectors (or None for failed items), same length as urls. | ||
| 26 | + """ | ||
| 27 | + ... |
embeddings/server.py
| @@ -16,6 +16,8 @@ from fastapi import FastAPI | @@ -16,6 +16,8 @@ from fastapi import FastAPI | ||
| 16 | from embeddings.config import CONFIG | 16 | from embeddings.config import CONFIG |
| 17 | from embeddings.bge_model import BgeTextModel | 17 | from embeddings.bge_model import BgeTextModel |
| 18 | from embeddings.clip_model import ClipImageModel | 18 | from embeddings.clip_model import ClipImageModel |
| 19 | +from embeddings.clip_as_service_encoder import ClipAsServiceImageEncoder | ||
| 20 | +from embeddings.protocols import ImageEncoderProtocol | ||
| 19 | 21 | ||
| 20 | logger = logging.getLogger(__name__) | 22 | logger = logging.getLogger(__name__) |
| 21 | 23 | ||
| @@ -23,9 +25,9 @@ app = FastAPI(title="saas-search Embedding Service", version="1.0.0") | @@ -23,9 +25,9 @@ app = FastAPI(title="saas-search Embedding Service", version="1.0.0") | ||
| 23 | 25 | ||
| 24 | # Models are loaded at startup, not lazily | 26 | # Models are loaded at startup, not lazily |
| 25 | _text_model: Optional[BgeTextModel] = None | 27 | _text_model: Optional[BgeTextModel] = None |
| 26 | -_image_model: Optional[ClipImageModel] = None | 28 | +_image_model: Optional[ImageEncoderProtocol] = None |
| 27 | open_text_model = True | 29 | open_text_model = True |
| 28 | -open_image_model = False | 30 | +open_image_model = True # Enable image embedding when using clip-as-service |
| 29 | 31 | ||
| 30 | _text_encode_lock = threading.Lock() | 32 | _text_encode_lock = threading.Lock() |
| 31 | _image_encode_lock = threading.Lock() | 33 | _image_encode_lock = threading.Lock() |
| @@ -49,15 +51,23 @@ def load_models(): | @@ -49,15 +51,23 @@ def load_models(): | ||
| 49 | raise | 51 | raise |
| 50 | 52 | ||
| 51 | 53 | ||
| 52 | - # Load image model | 54 | + # Load image model: clip-as-service (recommended) or local CN-CLIP |
| 53 | if open_image_model: | 55 | if open_image_model: |
| 54 | try: | 56 | try: |
| 55 | - logger.info(f"Loading image model: {CONFIG.IMAGE_MODEL_NAME} (device: {CONFIG.IMAGE_DEVICE})") | ||
| 56 | - _image_model = ClipImageModel( | ||
| 57 | - model_name=CONFIG.IMAGE_MODEL_NAME, | ||
| 58 | - device=CONFIG.IMAGE_DEVICE, | ||
| 59 | - ) | ||
| 60 | - logger.info("Image model loaded successfully") | 57 | + if CONFIG.USE_CLIP_AS_SERVICE: |
| 58 | + logger.info(f"Loading image encoder via clip-as-service: {CONFIG.CLIP_AS_SERVICE_SERVER}") | ||
| 59 | + _image_model = ClipAsServiceImageEncoder( | ||
| 60 | + server=CONFIG.CLIP_AS_SERVICE_SERVER, | ||
| 61 | + batch_size=CONFIG.IMAGE_BATCH_SIZE, | ||
| 62 | + ) | ||
| 63 | + logger.info("Image model (clip-as-service) loaded successfully") | ||
| 64 | + else: | ||
| 65 | + logger.info(f"Loading local image model: {CONFIG.IMAGE_MODEL_NAME} (device: {CONFIG.IMAGE_DEVICE})") | ||
| 66 | + _image_model = ClipImageModel( | ||
| 67 | + model_name=CONFIG.IMAGE_MODEL_NAME, | ||
| 68 | + device=CONFIG.IMAGE_DEVICE, | ||
| 69 | + ) | ||
| 70 | + logger.info("Image model (local CN-CLIP) loaded successfully") | ||
| 61 | except Exception as e: | 71 | except Exception as e: |
| 62 | logger.error(f"Failed to load image model: {e}", exc_info=True) | 72 | logger.error(f"Failed to load image model: {e}", exc_info=True) |
| 63 | raise | 73 | raise |
| @@ -125,20 +135,31 @@ def embed_image(images: List[str]) -> List[Optional[List[float]]]: | @@ -125,20 +135,31 @@ def embed_image(images: List[str]) -> List[Optional[List[float]]]: | ||
| 125 | raise RuntimeError("Image model not loaded") | 135 | raise RuntimeError("Image model not loaded") |
| 126 | out: List[Optional[List[float]]] = [None] * len(images) | 136 | out: List[Optional[List[float]]] = [None] * len(images) |
| 127 | 137 | ||
| 138 | + # Normalize inputs | ||
| 139 | + urls = [] | ||
| 140 | + indices = [] | ||
| 141 | + for i, url_or_path in enumerate(images): | ||
| 142 | + if url_or_path is None: | ||
| 143 | + continue | ||
| 144 | + if not isinstance(url_or_path, str): | ||
| 145 | + url_or_path = str(url_or_path) | ||
| 146 | + url_or_path = url_or_path.strip() | ||
| 147 | + if url_or_path: | ||
| 148 | + urls.append(url_or_path) | ||
| 149 | + indices.append(i) | ||
| 150 | + | ||
| 151 | + if not urls: | ||
| 152 | + return out | ||
| 153 | + | ||
| 128 | with _image_encode_lock: | 154 | with _image_encode_lock: |
| 129 | - for i, url_or_path in enumerate(images): | ||
| 130 | - try: | ||
| 131 | - if url_or_path is None: | ||
| 132 | - continue | ||
| 133 | - if not isinstance(url_or_path, str): | ||
| 134 | - url_or_path = str(url_or_path) | ||
| 135 | - url_or_path = url_or_path.strip() | ||
| 136 | - if not url_or_path: | ||
| 137 | - continue | ||
| 138 | - emb = _image_model.encode_image_from_url(url_or_path) | ||
| 139 | - out[i] = _as_list(emb) | ||
| 140 | - except Exception: | ||
| 141 | - out[i] = None | 155 | + try: |
| 156 | + # Both ClipAsServiceImageEncoder and ClipImageModel implement encode_image_urls(urls, batch_size) | ||
| 157 | + vectors = _image_model.encode_image_urls(urls, batch_size=CONFIG.IMAGE_BATCH_SIZE) | ||
| 158 | + for j, idx in enumerate(indices): | ||
| 159 | + out[idx] = _as_list(vectors[j] if j < len(vectors) else None) | ||
| 160 | + except Exception: | ||
| 161 | + for idx in indices: | ||
| 162 | + out[idx] = None | ||
| 142 | return out | 163 | return out |
| 143 | 164 | ||
| 144 | 165 |
requirements.txt
| @@ -40,3 +40,9 @@ click>=8.1.0 | @@ -40,3 +40,9 @@ click>=8.1.0 | ||
| 40 | pytest>=7.4.0 | 40 | pytest>=7.4.0 |
| 41 | pytest-asyncio>=0.21.0 | 41 | pytest-asyncio>=0.21.0 |
| 42 | httpx>=0.24.0 | 42 | httpx>=0.24.0 |
| 43 | + | ||
| 44 | +# clip-as-service client (for image embeddings via clip-as-service) | ||
| 45 | +# Install with: pip install -e third-party/clip-as-service/client | ||
| 46 | +# Or: pip install jina docarray | ||
| 47 | +jina>=3.12.0 | ||
| 48 | +docarray[common]>=0.19.0,<0.30.0 |