Commit e7a2c0b73eb8429700d0fa9d30505605bcb100a4
1 parent
0e66a315
img encode
Showing
7 changed files
with
312 additions
and
6 deletions
Show diff stats
embeddings/image_encoder.py
| ... | ... | @@ -149,3 +149,20 @@ class CLIPImageEncoder: |
| 149 | 149 | results[batch_indices[j]] = None |
| 150 | 150 | |
| 151 | 151 | return results |
| 152 | + | |
def encode_image_urls(
    self,
    urls: List[str],
    batch_size: Optional[int] = None,
) -> List[Optional[np.ndarray]]:
    """Encode a list of image URLs into embedding vectors.

    Mirrors the interface of ClipImageModel / ClipAsServiceImageEncoder so
    the indexer's document_transformer can use any encoder interchangeably.

    Args:
        urls: Image URLs to encode.
        batch_size: Batch size; falls back to 8 when not given.

    Returns:
        A list the same length as ``urls``; failed entries are None.
    """
    effective_batch = batch_size or 8
    return self.encode_batch(urls, batch_size=effective_batch)
indexer/document_transformer.py
| ... | ... | @@ -5,7 +5,7 @@ SPU文档转换器 - 公共转换逻辑。 |
| 5 | 5 | 输出文档结构与 mappings/search_products.json 及 索引字段说明v2 一致, |
| 6 | 6 | 供 search/searcher 与 search/es_query_builder 使用。 |
| 7 | 7 | - 多语言字段:title, brief, description, vendor, category_path, category_name_text |
| 8 | -- 嵌套:specifications, skus;向量:title_embedding(image_embedding 由 Java 索引或外部填充) | |
| 8 | +- 嵌套:specifications, skus;向量:title_embedding、image_embedding(可选,需提供 image_encoder) | |
| 9 | 9 | """ |
| 10 | 10 | |
| 11 | 11 | import pandas as pd |
| ... | ... | @@ -38,7 +38,9 @@ class SPUDocumentTransformer: |
| 38 | 38 | translator: Optional[Any] = None, |
| 39 | 39 | translation_prompts: Optional[Dict[str, str]] = None, |
| 40 | 40 | encoder: Optional[Any] = None, |
| 41 | - enable_title_embedding: bool = True | |
| 41 | + enable_title_embedding: bool = True, | |
| 42 | + image_encoder: Optional[Any] = None, | |
| 43 | + enable_image_embedding: bool = False, | |
| 42 | 44 | ): |
| 43 | 45 | """ |
| 44 | 46 | 初始化文档转换器。 |
| ... | ... | @@ -51,6 +53,8 @@ class SPUDocumentTransformer: |
| 51 | 53 | translation_prompts: 翻译提示词配置(可选) |
| 52 | 54 | encoder: 文本编码器实例(可选,用于生成title_embedding) |
| 53 | 55 | enable_title_embedding: 是否启用标题向量化(默认True) |
| 56 | + image_encoder: 图片编码器实例(可选,需实现 encode_image_urls(urls) -> List[Optional[np.ndarray]]) | |
| 57 | + enable_image_embedding: 是否启用图片向量化(默认False) | |
| 54 | 58 | """ |
| 55 | 59 | self.category_id_to_name = category_id_to_name |
| 56 | 60 | self.searchable_option_dimensions = searchable_option_dimensions |
| ... | ... | @@ -59,6 +63,8 @@ class SPUDocumentTransformer: |
| 59 | 63 | self.translation_prompts = translation_prompts or {} |
| 60 | 64 | self.encoder = encoder |
| 61 | 65 | self.enable_title_embedding = enable_title_embedding |
| 66 | + self.image_encoder = image_encoder | |
| 67 | + self.enable_image_embedding = bool(enable_image_embedding and image_encoder is not None) | |
| 62 | 68 | |
| 63 | 69 | def transform_spu_to_doc( |
| 64 | 70 | self, |
| ... | ... | @@ -116,6 +122,10 @@ class SPUDocumentTransformer: |
| 116 | 122 | # Image URL |
| 117 | 123 | self._fill_image_url(doc, spu_row) |
| 118 | 124 | |
| 125 | + # Image embedding(与 mappings/search_products.json 中 image_embedding 嵌套结构一致) | |
| 126 | + if self.enable_image_embedding: | |
| 127 | + self._fill_image_embedding(doc, spu_row, skus) | |
| 128 | + | |
| 119 | 129 | # Sales (fake_sales) |
| 120 | 130 | if pd.notna(spu_row.get('fake_sales')): |
| 121 | 131 | try: |
| ... | ... | @@ -364,6 +374,56 @@ class SPUDocumentTransformer: |
| 364 | 374 | image_src = f"//{image_src}" if not image_src.startswith('//') else image_src |
| 365 | 375 | doc['image_url'] = image_src |
| 366 | 376 | |
def _fill_image_embedding(
    self, doc: Dict[str, Any], spu_row: pd.Series, skus: pd.DataFrame
) -> None:
    """Populate the nested ``image_embedding`` field on *doc*.

    Output shape matches mappings/search_products.json:
        [{"vector": [float, ...], "url": "..."}, ...]

    Gathers the SPU main image plus all SKU image URLs, de-duplicates them,
    then asks ``self.image_encoder`` for one vector per URL. Failures are
    logged and leave *doc* untouched.
    """
    seen: set = set()
    candidates: List[str] = []

    def _collect(raw) -> None:
        # Skip blanks, normalise protocol-relative URLs, drop duplicates.
        if not raw or not str(raw).strip():
            return
        cleaned = str(raw).strip()
        if cleaned.startswith("//"):
            cleaned = "https:" + cleaned
        if cleaned not in seen:
            seen.add(cleaned)
            candidates.append(cleaned)

    if doc.get("image_url"):
        _collect(doc["image_url"])
    if pd.notna(spu_row.get("image_src")):
        _collect(str(spu_row["image_src"]))
    if not skus.empty and "image_src" in skus.columns:
        for _, sku_row in skus.iterrows():
            if pd.notna(sku_row.get("image_src")):
                _collect(str(sku_row["image_src"]))

    if not candidates:
        return

    try:
        vectors = self.image_encoder.encode_image_urls(candidates, batch_size=8)
        # Defensive: a length mismatch means we cannot pair URLs to vectors.
        if not vectors or len(vectors) != len(candidates):
            return
        entries = []
        for url, vec in zip(candidates, vectors):
            if vec is None:
                continue
            if isinstance(vec, np.ndarray):
                # float32 keeps the payload compact for the ES dense vector.
                entries.append({"vector": vec.astype(np.float32).tolist(), "url": url})
            elif hasattr(vec, "tolist"):
                entries.append({"vector": vec.tolist(), "url": url})
        if entries:
            doc["image_embedding"] = entries
    except Exception as e:
        logger.warning("Failed to generate image_embedding for SPU %s: %s", doc.get("spu_id"), e)
| 426 | + | |
| 367 | 427 | def _process_skus( |
| 368 | 428 | self, |
| 369 | 429 | skus: pd.DataFrame, | ... | ... |
indexer/incremental_service.py
| ... | ... | @@ -74,11 +74,20 @@ class IncrementalIndexerService: |
| 74 | 74 | encoder = None |
| 75 | 75 | enable_embedding = False |
| 76 | 76 | |
| 77 | + image_encoder: Optional[Any] = None | |
| 78 | + try: | |
| 79 | + from embeddings.image_encoder import CLIPImageEncoder | |
| 80 | + image_encoder = CLIPImageEncoder() | |
| 81 | + except Exception as e: | |
| 82 | + logger.debug("Image encoder not available for indexer: %s", e) | |
| 83 | + | |
| 77 | 84 | transformer = create_document_transformer( |
| 78 | 85 | category_id_to_name=self.category_id_to_name, |
| 79 | 86 | tenant_id=tenant_id, |
| 80 | 87 | encoder=encoder, |
| 81 | 88 | enable_title_embedding=False, # batch fill later |
| 89 | + image_encoder=image_encoder, | |
| 90 | + enable_image_embedding=(image_encoder is not None), | |
| 82 | 91 | config=config, |
| 83 | 92 | ) |
| 84 | 93 | ... | ... |
indexer/indexing_utils.py
| ... | ... | @@ -59,11 +59,13 @@ def create_document_transformer( |
| 59 | 59 | translation_prompts: Optional[Dict[str, str]] = None, |
| 60 | 60 | encoder: Optional[Any] = None, |
| 61 | 61 | enable_title_embedding: bool = True, |
| 62 | + image_encoder: Optional[Any] = None, | |
| 63 | + enable_image_embedding: bool = False, | |
| 62 | 64 | config: Optional[Any] = None, |
| 63 | 65 | ) -> SPUDocumentTransformer: |
| 64 | 66 | """ |
| 65 | 67 | 创建文档转换器(统一初始化逻辑)。 |
| 66 | - | |
| 68 | + | |
| 67 | 69 | Args: |
| 68 | 70 | category_id_to_name: 分类ID到名称的映射 |
| 69 | 71 | tenant_id: 租户ID |
| ... | ... | @@ -72,7 +74,9 @@ def create_document_transformer( |
| 72 | 74 | translation_prompts: 翻译提示词配置(如果为None则从配置加载) |
| 73 | 75 | encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化) |
| 74 | 76 | enable_title_embedding: 是否启用标题向量化(默认True) |
| 75 | - | |
| 77 | + image_encoder: 图片编码器(可选,需实现 encode_image_urls(urls)) | |
| 78 | + enable_image_embedding: 是否启用 image_embedding 填充(默认False) | |
| 79 | + | |
| 76 | 80 | Returns: |
| 77 | 81 | SPUDocumentTransformer实例 |
| 78 | 82 | """ |
| ... | ... | @@ -128,6 +132,8 @@ def create_document_transformer( |
| 128 | 132 | translator=translator, |
| 129 | 133 | translation_prompts=translation_prompts, |
| 130 | 134 | encoder=encoder, |
| 131 | - enable_title_embedding=enable_title_embedding | |
| 135 | + enable_title_embedding=enable_title_embedding, | |
| 136 | + image_encoder=image_encoder, | |
| 137 | + enable_image_embedding=enable_image_embedding, | |
| 132 | 138 | ) |
| 133 | 139 | ... | ... |
scripts/create_tenant_index.sh
| ... | ... | @@ -0,0 +1,155 @@ |
| 1 | +#!/usr/bin/env python3 | |
| 2 | +""" | |
| 3 | +测试 POST /indexer/build-docs 接口:构造请求数据、调用接口、打印完整响应。 | |
| 4 | + | |
| 5 | +用法: | |
| 6 | + 1. 先启动 Indexer 服务: ./scripts/start_indexer.sh (或 uvicorn api.indexer_app:app --port 6004) | |
| 7 | + 2. 执行: python scripts/test_build_docs_api.py | |
| 8 | + | |
| 9 | + 也可指定地址: INDEXER_URL=http://localhost:6004 python scripts/test_build_docs_api.py | |
| 10 | +""" | |
| 11 | + | |
| 12 | +import json | |
| 13 | +import os | |
| 14 | +import sys | |
| 15 | +from datetime import datetime, timezone | |
| 16 | + | |
| 17 | +# 项目根目录 | |
| 18 | +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| 19 | +sys.path.insert(0, ROOT) | |
| 20 | + | |
| 21 | +# 默认使用 requests 调真实服务;若未安装则回退到 TestClient | |
| 22 | +try: | |
| 23 | + import requests | |
| 24 | + HAS_REQUESTS = True | |
| 25 | +except ImportError: | |
| 26 | + HAS_REQUESTS = False | |
| 27 | + | |
| 28 | + | |
def build_sample_request():
    """Build one complete build-docs request body.

    The payload mirrors the shoplazza_product_spu / sku / option table
    structure expected by POST /indexer/build-docs.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    spu = {
        "id": 10001,
        "tenant_id": "162",
        "title": "测试T恤 纯棉短袖",
        "brief": "舒适纯棉,多色可选",
        "description": "这是一款适合日常穿着的纯棉T恤,透气吸汗。",
        "vendor": "测试品牌",
        "category": "服装/上衣/T恤",
        "category_id": 100,
        "category_level": 2,
        "category_path": "服装/上衣/T恤",
        "fake_sales": 1280,
        "image_src": "https://example.com/img/tshirt.jpg",
        "tags": "T恤,纯棉,短袖,夏季",
        "create_time": now,
        "update_time": now,
    }

    # (id, sku code, stock, color, size) — the two sample variants.
    variants = [
        (20001, "SKU-TSHIRT-001", 50, "黑色", "M"),
        (20002, "SKU-TSHIRT-002", 30, "白色", "L"),
    ]
    skus = [
        {
            "id": sku_id,
            "spu_id": 10001,
            "price": 99.0,
            "compare_at_price": 129.0,
            "sku": code,
            "inventory_quantity": stock,
            "option1": color,
            "option2": size,
            "option3": None,
        }
        for sku_id, code, stock, color, size in variants
    ]

    options = [
        {"id": 1, "position": 1, "name": "颜色"},
        {"id": 2, "position": 2, "name": "尺码"},
    ]

    return {
        "tenant_id": "162",
        "items": [{"spu": spu, "skus": skus, "options": options}],
    }
| 84 | + | |
| 85 | + | |
def call_via_http(base_url: str, body: dict):
    """POST *body* to the running Indexer's build-docs endpoint.

    Returns (status_code, raw_text, parsed_json_or_None).
    """
    endpoint = "{}/indexer/build-docs".format(base_url.rstrip("/"))
    resp = requests.post(endpoint, json=body, timeout=30)
    content_type = resp.headers.get("content-type", "")
    parsed = resp.json() if content_type.startswith("application/json") else None
    return resp.status_code, resp.text, parsed
| 91 | + | |
| 92 | + | |
def call_via_test_client(body: dict):
    """Call build-docs in-process via FastAPI TestClient.

    Does not require a running service, but DB/ES must be configured.
    Returns (status_code, raw_text, parsed_json_or_None).
    """
    from fastapi.testclient import TestClient
    import api.indexer_app as indexer_app

    with TestClient(indexer_app.app) as client:
        resp = client.post("/indexer/build-docs", json=body)
        content_type = resp.headers.get("content-type", "")
        parsed = resp.json() if content_type.startswith("application/json") else None
        return resp.status_code, resp.text, parsed
| 101 | + | |
| 102 | + | |
def main() -> None:
    """Build a sample request, send it to build-docs, and print request/response.

    Transport selection: HTTP via ``requests`` when available, unless
    USE_TEST_CLIENT is set, in which case the in-process TestClient is used.
    Exits non-zero on connection failure or a non-200 response.
    """
    body = build_sample_request()

    print("=" * 60)
    print("【请求】POST /indexer/build-docs")
    print("=" * 60)
    print(json.dumps(body, ensure_ascii=False, indent=2))

    base_url = os.getenv("INDEXER_URL", "http://localhost:6004")
    use_http = HAS_REQUESTS and (os.getenv("USE_TEST_CLIENT", "").lower() not in ("1", "true", "yes"))

    if use_http:
        try:
            status, raw, data = call_via_http(base_url, body)
        except requests.RequestException as e:
            print("\n[错误] 无法连接 Indexer 服务:", e)
            print("请先启动: ./scripts/start_indexer.sh 或 uvicorn api.indexer_app:app --port 6004")
            # use_http already implies HAS_REQUESTS, so the fallback hint always
            # applies here (the original re-checked HAS_REQUESTS redundantly).
            print("或使用进程内测试: USE_TEST_CLIENT=1 python scripts/test_build_docs_api.py")
            sys.exit(1)
    else:
        # In this branch use_http is False, so only HAS_REQUESTS distinguishes
        # "requests not installed" from "TestClient explicitly requested"
        # (the original also tested `not use_http`, which is always True here).
        if not HAS_REQUESTS:
            print("\n[提示] 未安装 requests,使用 TestClient 调用(需配置 DB/ES)。")
        else:
            print("\n[提示] 使用 TestClient 调用(USE_TEST_CLIENT=1)。")
        try:
            status, raw, data = call_via_test_client(body)
        except Exception as e:
            print("\n[错误] TestClient 调用失败:", e)
            print("请确保已 source activate.sh 且 DB/ES 环境变量正确,或先启动 Indexer 再用 HTTP 调用。")
            sys.exit(1)

    print("\n" + "=" * 60)
    print("【响应】HTTP status =", status)
    print("=" * 60)
    if data is not None:
        print(json.dumps(data, ensure_ascii=False, indent=2, default=str))
        if data.get("docs"):
            doc = data["docs"][0]
            print("\n" + "=" * 60)
            print("【返回 doc 顶层字段】共 {} 个".format(len(doc)))
            print("=" * 60)
            for k in sorted(doc.keys()):
                print("  ", k)
    else:
        print(raw)

    if status != 200:
        sys.exit(1)


if __name__ == "__main__":
    main()
tests/ci/test_service_api_contracts.py
| 1 | 1 | from __future__ import annotations |
| 2 | 2 | |
| 3 | +import json | |
| 4 | +from pathlib import Path | |
| 3 | 5 | from types import SimpleNamespace |
| 4 | 6 | from typing import Any, Dict, List |
| 5 | 7 | |
| ... | ... | @@ -219,6 +221,64 @@ def test_indexer_build_docs_contract(indexer_client: TestClient): |
| 219 | 221 | assert data["docs"][0]["spu_id"] == "1" |
| 220 | 222 | |
| 221 | 223 | |
def test_indexer_build_docs_show_request_response(indexer_client: TestClient):
    """
    Exercise the build-docs endpoint (SPU details in, ES doc out), printing the
    full request and response so the returned ES document fields can be audited.
    Run: pytest tests/ci/test_service_api_contracts.py::test_indexer_build_docs_show_request_response -v -s
    """
    def banner(*parts):
        # Separator-framed heading, matching the script's console style.
        print("\n" + "=" * 60)
        print(*parts)
        print("=" * 60)

    request_body = {
        "tenant_id": "162",
        "items": [
            {
                "spu": {"id": 1, "title": "T-shirt", "brief": "A simple T-shirt"},
                "skus": [],
                "options": [],
            }
        ],
    }
    banner("【请求】POST /indexer/build-docs")
    print(json.dumps(request_body, ensure_ascii=False, indent=2))

    response = indexer_client.post("/indexer/build-docs", json=request_body)
    payload = response.json()

    banner("【响应】status_code =", response.status_code)
    print(json.dumps(payload, ensure_ascii=False, indent=2, default=str))

    if payload.get("docs"):
        doc = payload["docs"][0]
        doc_keys = sorted(doc.keys())
        banner("【返回 doc 顶层字段】共 {} 个".format(len(doc_keys)))
        for key in doc_keys:
            print("  ", key)

        # Diff the returned top-level fields against the ES mapping, if present.
        mapping_path = Path(__file__).resolve().parents[2] / "mappings" / "search_products.json"
        if mapping_path.exists():
            with open(mapping_path) as f:
                mapping = json.load(f)
            expected_top_level = set(mapping.get("mappings", {}).get("properties", {}).keys())
            returned = set(doc_keys)
            missing = expected_top_level - returned
            extra = returned - expected_top_level
            banner("【与 mappings/search_products.json 对比】")
            print("  mapping 中应有、当前 doc 未返回:", sorted(missing) if missing else "(无)")
            print("  当前 doc 多出:", sorted(extra) if extra else "(无)")

    assert response.status_code == 200
    assert payload["success_count"] == 1
    assert payload["docs"][0]["spu_id"] == "1"
| 281 | + | |
| 222 | 282 | def test_indexer_build_docs_from_db_contract(indexer_client: TestClient): |
| 223 | 283 | """POST /indexer/build-docs-from-db: tenant_id + spu_ids, returns same shape as build-docs.""" |
| 224 | 284 | response = indexer_client.post( | ... | ... |