Commit e7a2c0b73eb8429700d0fa9d30505605bcb100a4

Authored by tangwang
1 parent 0e66a315

img encode

embeddings/image_encoder.py
@@ -149,3 +149,20 @@ class CLIPImageEncoder: @@ -149,3 +149,20 @@ class CLIPImageEncoder:
149 results[batch_indices[j]] = None 149 results[batch_indices[j]] = None
150 150
151 return results 151 return results
  152 +
  153 + def encode_image_urls(
  154 + self,
  155 + urls: List[str],
  156 + batch_size: Optional[int] = None,
  157 + ) -> List[Optional[np.ndarray]]:
  158 + """
  159 + 与 ClipImageModel / ClipAsServiceImageEncoder 一致的接口,供索引器 document_transformer 调用。
  160 +
  161 + Args:
  162 + urls: 图片 URL 列表
  163 + batch_size: 批大小(默认 8)
  164 +
  165 + Returns:
  166 + 与 urls 等长的向量列表,失败为 None
  167 + """
  168 + return self.encode_batch(urls, batch_size=batch_size or 8)
indexer/document_transformer.py
@@ -5,7 +5,7 @@ SPU文档转换器 - 公共转换逻辑。 @@ -5,7 +5,7 @@ SPU文档转换器 - 公共转换逻辑。
5 输出文档结构与 mappings/search_products.json 及 索引字段说明v2 一致, 5 输出文档结构与 mappings/search_products.json 及 索引字段说明v2 一致,
6 供 search/searcher 与 search/es_query_builder 使用。 6 供 search/searcher 与 search/es_query_builder 使用。
7 - 多语言字段:title, brief, description, vendor, category_path, category_name_text 7 - 多语言字段:title, brief, description, vendor, category_path, category_name_text
8 -- 嵌套:specifications, skus;向量:title_embedding(image_embedding 由 Java 索引或外部填充 8 +- 嵌套:specifications, skus;向量:title_embedding、image_embedding(可选,需提供 image_encoder
9 """ 9 """
10 10
11 import pandas as pd 11 import pandas as pd
@@ -38,7 +38,9 @@ class SPUDocumentTransformer: @@ -38,7 +38,9 @@ class SPUDocumentTransformer:
38 translator: Optional[Any] = None, 38 translator: Optional[Any] = None,
39 translation_prompts: Optional[Dict[str, str]] = None, 39 translation_prompts: Optional[Dict[str, str]] = None,
40 encoder: Optional[Any] = None, 40 encoder: Optional[Any] = None,
41 - enable_title_embedding: bool = True 41 + enable_title_embedding: bool = True,
  42 + image_encoder: Optional[Any] = None,
  43 + enable_image_embedding: bool = False,
42 ): 44 ):
43 """ 45 """
44 初始化文档转换器。 46 初始化文档转换器。
@@ -51,6 +53,8 @@ class SPUDocumentTransformer: @@ -51,6 +53,8 @@ class SPUDocumentTransformer:
51 translation_prompts: 翻译提示词配置(可选) 53 translation_prompts: 翻译提示词配置(可选)
52 encoder: 文本编码器实例(可选,用于生成title_embedding) 54 encoder: 文本编码器实例(可选,用于生成title_embedding)
53 enable_title_embedding: 是否启用标题向量化(默认True) 55 enable_title_embedding: 是否启用标题向量化(默认True)
  56 + image_encoder: 图片编码器实例(可选,需实现 encode_image_urls(urls) -> List[Optional[np.ndarray]])
  57 + enable_image_embedding: 是否启用图片向量化(默认False)
54 """ 58 """
55 self.category_id_to_name = category_id_to_name 59 self.category_id_to_name = category_id_to_name
56 self.searchable_option_dimensions = searchable_option_dimensions 60 self.searchable_option_dimensions = searchable_option_dimensions
@@ -59,6 +63,8 @@ class SPUDocumentTransformer: @@ -59,6 +63,8 @@ class SPUDocumentTransformer:
59 self.translation_prompts = translation_prompts or {} 63 self.translation_prompts = translation_prompts or {}
60 self.encoder = encoder 64 self.encoder = encoder
61 self.enable_title_embedding = enable_title_embedding 65 self.enable_title_embedding = enable_title_embedding
  66 + self.image_encoder = image_encoder
  67 + self.enable_image_embedding = bool(enable_image_embedding and image_encoder is not None)
62 68
63 def transform_spu_to_doc( 69 def transform_spu_to_doc(
64 self, 70 self,
@@ -116,6 +122,10 @@ class SPUDocumentTransformer: @@ -116,6 +122,10 @@ class SPUDocumentTransformer:
116 # Image URL 122 # Image URL
117 self._fill_image_url(doc, spu_row) 123 self._fill_image_url(doc, spu_row)
118 124
  125 + # Image embedding(与 mappings/search_products.json 中 image_embedding 嵌套结构一致)
  126 + if self.enable_image_embedding:
  127 + self._fill_image_embedding(doc, spu_row, skus)
  128 +
119 # Sales (fake_sales) 129 # Sales (fake_sales)
120 if pd.notna(spu_row.get('fake_sales')): 130 if pd.notna(spu_row.get('fake_sales')):
121 try: 131 try:
@@ -364,6 +374,56 @@ class SPUDocumentTransformer: @@ -364,6 +374,56 @@ class SPUDocumentTransformer:
364 image_src = f"//{image_src}" if not image_src.startswith('//') else image_src 374 image_src = f"//{image_src}" if not image_src.startswith('//') else image_src
365 doc['image_url'] = image_src 375 doc['image_url'] = image_src
366 376
  377 + def _fill_image_embedding(
  378 + self, doc: Dict[str, Any], spu_row: pd.Series, skus: pd.DataFrame
  379 + ) -> None:
  380 + """
  381 + 填充 image_embedding 嵌套字段,与 mappings/search_products.json 一致:
  382 + [{ "vector": [float, ...], "url": "..." }, ...]
  383 + 收集 SPU 主图 + SKU 图片 URL,去重后调用 image_encoder 生成向量。
  384 + """
  385 + urls: List[str] = []
  386 + seen: set = set()
  387 +
  388 + def _add(url: str) -> None:
  389 + if not url or not str(url).strip():
  390 + return
  391 + u = str(url).strip()
  392 + if u.startswith("//"):
  393 + u = "https:" + u
  394 + if u not in seen:
  395 + seen.add(u)
  396 + urls.append(u)
  397 +
  398 + if doc.get("image_url"):
  399 + _add(doc["image_url"])
  400 + if pd.notna(spu_row.get("image_src")):
  401 + _add(str(spu_row["image_src"]))
  402 + if not skus.empty and "image_src" in skus.columns:
  403 + for _, row in skus.iterrows():
  404 + if pd.notna(row.get("image_src")):
  405 + _add(str(row["image_src"]))
  406 +
  407 + if not urls:
  408 + return
  409 + try:
  410 + vectors = self.image_encoder.encode_image_urls(urls, batch_size=8)
  411 + if not vectors or len(vectors) != len(urls):
  412 + return
  413 + out = []
  414 + for url, vec in zip(urls, vectors):
  415 + if vec is None:
  416 + continue
  417 + if isinstance(vec, np.ndarray):
  418 + vec = vec.astype(np.float32)
  419 + out.append({"vector": vec.tolist(), "url": url})
  420 + elif hasattr(vec, "tolist"):
  421 + out.append({"vector": vec.tolist(), "url": url})
  422 + if out:
  423 + doc["image_embedding"] = out
  424 + except Exception as e:
  425 + logger.warning("Failed to generate image_embedding for SPU %s: %s", doc.get("spu_id"), e)
  426 +
367 def _process_skus( 427 def _process_skus(
368 self, 428 self,
369 skus: pd.DataFrame, 429 skus: pd.DataFrame,
indexer/incremental_service.py
@@ -74,11 +74,20 @@ class IncrementalIndexerService: @@ -74,11 +74,20 @@ class IncrementalIndexerService:
74 encoder = None 74 encoder = None
75 enable_embedding = False 75 enable_embedding = False
76 76
  77 + image_encoder: Optional[Any] = None
  78 + try:
  79 + from embeddings.image_encoder import CLIPImageEncoder
  80 + image_encoder = CLIPImageEncoder()
  81 + except Exception as e:
  82 + logger.debug("Image encoder not available for indexer: %s", e)
  83 +
77 transformer = create_document_transformer( 84 transformer = create_document_transformer(
78 category_id_to_name=self.category_id_to_name, 85 category_id_to_name=self.category_id_to_name,
79 tenant_id=tenant_id, 86 tenant_id=tenant_id,
80 encoder=encoder, 87 encoder=encoder,
81 enable_title_embedding=False, # batch fill later 88 enable_title_embedding=False, # batch fill later
  89 + image_encoder=image_encoder,
  90 + enable_image_embedding=(image_encoder is not None),
82 config=config, 91 config=config,
83 ) 92 )
84 93
indexer/indexing_utils.py
@@ -59,11 +59,13 @@ def create_document_transformer( @@ -59,11 +59,13 @@ def create_document_transformer(
59 translation_prompts: Optional[Dict[str, str]] = None, 59 translation_prompts: Optional[Dict[str, str]] = None,
60 encoder: Optional[Any] = None, 60 encoder: Optional[Any] = None,
61 enable_title_embedding: bool = True, 61 enable_title_embedding: bool = True,
  62 + image_encoder: Optional[Any] = None,
  63 + enable_image_embedding: bool = False,
62 config: Optional[Any] = None, 64 config: Optional[Any] = None,
63 ) -> SPUDocumentTransformer: 65 ) -> SPUDocumentTransformer:
64 """ 66 """
65 创建文档转换器(统一初始化逻辑)。 67 创建文档转换器(统一初始化逻辑)。
66 - 68 +
67 Args: 69 Args:
68 category_id_to_name: 分类ID到名称的映射 70 category_id_to_name: 分类ID到名称的映射
69 tenant_id: 租户ID 71 tenant_id: 租户ID
@@ -72,7 +74,9 @@ def create_document_transformer( @@ -72,7 +74,9 @@ def create_document_transformer(
72 translation_prompts: 翻译提示词配置(如果为None则从配置加载) 74 translation_prompts: 翻译提示词配置(如果为None则从配置加载)
73 encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化) 75 encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化)
74 enable_title_embedding: 是否启用标题向量化(默认True) 76 enable_title_embedding: 是否启用标题向量化(默认True)
75 - 77 + image_encoder: 图片编码器(可选,需实现 encode_image_urls(urls))
  78 + enable_image_embedding: 是否启用 image_embedding 填充(默认False)
  79 +
76 Returns: 80 Returns:
77 SPUDocumentTransformer实例 81 SPUDocumentTransformer实例
78 """ 82 """
@@ -128,6 +132,8 @@ def create_document_transformer( @@ -128,6 +132,8 @@ def create_document_transformer(
128 translator=translator, 132 translator=translator,
129 translation_prompts=translation_prompts, 133 translation_prompts=translation_prompts,
130 encoder=encoder, 134 encoder=encoder,
131 - enable_title_embedding=enable_title_embedding 135 + enable_title_embedding=enable_title_embedding,
  136 + image_encoder=image_encoder,
  137 + enable_image_embedding=enable_image_embedding,
132 ) 138 )
133 139
scripts/create_tenant_index.sh
@@ -61,7 +61,6 @@ echo @@ -61,7 +61,6 @@ echo
61 echo "删除索引: $ES_INDEX" 61 echo "删除索引: $ES_INDEX"
62 echo 62 echo
63 curl -X DELETE "${ES_HOST}/${ES_INDEX}" $AUTH_PARAM -s -o /dev/null -w "HTTP状态码: %{http_code}\n" 63 curl -X DELETE "${ES_HOST}/${ES_INDEX}" $AUTH_PARAM -s -o /dev/null -w "HTTP状态码: %{http_code}\n"
64 -  
65 echo 64 echo
66 echo "创建索引: $ES_INDEX" 65 echo "创建索引: $ES_INDEX"
67 echo 66 echo
scripts/test_build_docs_api.py 0 → 100644
@@ -0,0 +1,155 @@ @@ -0,0 +1,155 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +测试 POST /indexer/build-docs 接口:构造请求数据、调用接口、打印完整响应。
  4 +
  5 +用法:
  6 + 1. 先启动 Indexer 服务: ./scripts/start_indexer.sh (或 uvicorn api.indexer_app:app --port 6004)
  7 + 2. 执行: python scripts/test_build_docs_api.py
  8 +
  9 + 也可指定地址: INDEXER_URL=http://localhost:6004 python scripts/test_build_docs_api.py
  10 +"""
  11 +
  12 +import json
  13 +import os
  14 +import sys
  15 +from datetime import datetime, timezone
  16 +
  17 +# 项目根目录
  18 +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
  19 +sys.path.insert(0, ROOT)
  20 +
  21 +# 默认使用 requests 调真实服务;若未安装则回退到 TestClient
  22 +try:
  23 + import requests
  24 + HAS_REQUESTS = True
  25 +except ImportError:
  26 + HAS_REQUESTS = False
  27 +
  28 +
  29 +def build_sample_request():
  30 + """构造一条完整的 build-docs 请求体(对应 shoplazza_product_spu / sku / option 表结构)。"""
  31 + now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
  32 + return {
  33 + "tenant_id": "162",
  34 + "items": [
  35 + {
  36 + "spu": {
  37 + "id": 10001,
  38 + "tenant_id": "162",
  39 + "title": "测试T恤 纯棉短袖",
  40 + "brief": "舒适纯棉,多色可选",
  41 + "description": "这是一款适合日常穿着的纯棉T恤,透气吸汗。",
  42 + "vendor": "测试品牌",
  43 + "category": "服装/上衣/T恤",
  44 + "category_id": 100,
  45 + "category_level": 2,
  46 + "category_path": "服装/上衣/T恤",
  47 + "fake_sales": 1280,
  48 + "image_src": "https://example.com/img/tshirt.jpg",
  49 + "tags": "T恤,纯棉,短袖,夏季",
  50 + "create_time": now,
  51 + "update_time": now,
  52 + },
  53 + "skus": [
  54 + {
  55 + "id": 20001,
  56 + "spu_id": 10001,
  57 + "price": 99.0,
  58 + "compare_at_price": 129.0,
  59 + "sku": "SKU-TSHIRT-001",
  60 + "inventory_quantity": 50,
  61 + "option1": "黑色",
  62 + "option2": "M",
  63 + "option3": None,
  64 + },
  65 + {
  66 + "id": 20002,
  67 + "spu_id": 10001,
  68 + "price": 99.0,
  69 + "compare_at_price": 129.0,
  70 + "sku": "SKU-TSHIRT-002",
  71 + "inventory_quantity": 30,
  72 + "option1": "白色",
  73 + "option2": "L",
  74 + "option3": None,
  75 + },
  76 + ],
  77 + "options": [
  78 + {"id": 1, "position": 1, "name": "颜色"},
  79 + {"id": 2, "position": 2, "name": "尺码"},
  80 + ],
  81 + }
  82 + ],
  83 + }
  84 +
  85 +
  86 +def call_via_http(base_url: str, body: dict):
  87 + """通过 HTTP 调用 build-docs。"""
  88 + url = f"{base_url.rstrip('/')}/indexer/build-docs"
  89 + r = requests.post(url, json=body, timeout=30)
  90 + return r.status_code, r.text, r.json() if r.headers.get("content-type", "").startswith("application/json") else None
  91 +
  92 +
  93 +def call_via_test_client(body: dict):
  94 + """通过 FastAPI TestClient 调用(不依赖已启动服务,但需 DB/ES 已配置)。"""
  95 + from fastapi.testclient import TestClient
  96 + import api.indexer_app as indexer_app
  97 +
  98 + with TestClient(indexer_app.app) as client:
  99 + r = client.post("/indexer/build-docs", json=body)
  100 + return r.status_code, r.text, r.json() if r.headers.get("content-type", "").startswith("application/json") else None
  101 +
  102 +
  103 +def main():
  104 + body = build_sample_request()
  105 +
  106 + print("=" * 60)
  107 + print("【请求】POST /indexer/build-docs")
  108 + print("=" * 60)
  109 + print(json.dumps(body, ensure_ascii=False, indent=2))
  110 +
  111 + base_url = os.getenv("INDEXER_URL", "http://localhost:6004")
  112 + use_http = HAS_REQUESTS and (os.getenv("USE_TEST_CLIENT", "").lower() not in ("1", "true", "yes"))
  113 +
  114 + if use_http:
  115 + try:
  116 + status, raw, data = call_via_http(base_url, body)
  117 + except requests.RequestException as e:
  118 + print("\n[错误] 无法连接 Indexer 服务:", e)
  119 + print("请先启动: ./scripts/start_indexer.sh 或 uvicorn api.indexer_app:app --port 6004")
  120 + if HAS_REQUESTS:
  121 + print("或使用进程内测试: USE_TEST_CLIENT=1 python scripts/test_build_docs_api.py")
  122 + sys.exit(1)
  123 + else:
  124 + if not use_http and not HAS_REQUESTS:
  125 + print("\n[提示] 未安装 requests,使用 TestClient 调用(需配置 DB/ES)。")
  126 + else:
  127 + print("\n[提示] 使用 TestClient 调用(USE_TEST_CLIENT=1)。")
  128 + try:
  129 + status, raw, data = call_via_test_client(body)
  130 + except Exception as e:
  131 + print("\n[错误] TestClient 调用失败:", e)
  132 + print("请确保已 source activate.sh 且 DB/ES 环境变量正确,或先启动 Indexer 再用 HTTP 调用。")
  133 + sys.exit(1)
  134 +
  135 + print("\n" + "=" * 60)
  136 + print("【响应】HTTP status =", status)
  137 + print("=" * 60)
  138 + if data is not None:
  139 + print(json.dumps(data, ensure_ascii=False, indent=2, default=str))
  140 + if data.get("docs"):
  141 + doc = data["docs"][0]
  142 + print("\n" + "=" * 60)
  143 + print("【返回 doc 顶层字段】共 {} 个".format(len(doc)))
  144 + print("=" * 60)
  145 + for k in sorted(doc.keys()):
  146 + print(" ", k)
  147 + else:
  148 + print(raw)
  149 +
  150 + if status != 200:
  151 + sys.exit(1)
  152 +
  153 +
  154 +if __name__ == "__main__":
  155 + main()
tests/ci/test_service_api_contracts.py
1 from __future__ import annotations 1 from __future__ import annotations
2 2
  3 +import json
  4 +from pathlib import Path
3 from types import SimpleNamespace 5 from types import SimpleNamespace
4 from typing import Any, Dict, List 6 from typing import Any, Dict, List
5 7
@@ -219,6 +221,64 @@ def test_indexer_build_docs_contract(indexer_client: TestClient): @@ -219,6 +221,64 @@ def test_indexer_build_docs_contract(indexer_client: TestClient):
219 assert data["docs"][0]["spu_id"] == "1" 221 assert data["docs"][0]["spu_id"] == "1"
220 222
221 223
  224 +def test_indexer_build_docs_show_request_response(indexer_client: TestClient):
  225 + """
  226 + 调用「输入 SPU 详情、输出 ES doc」的 build-docs 接口,并打印完整请求与响应,
  227 + 便于核对返回的 ES 文档字段是否齐全。
  228 + 运行: pytest tests/ci/test_service_api_contracts.py::test_indexer_build_docs_show_request_response -v -s
  229 + """
  230 + request_body = {
  231 + "tenant_id": "162",
  232 + "items": [
  233 + {
  234 + "spu": {"id": 1, "title": "T-shirt", "brief": "A simple T-shirt"},
  235 + "skus": [],
  236 + "options": [],
  237 + }
  238 + ],
  239 + }
  240 + print("\n" + "=" * 60)
  241 + print("【请求】POST /indexer/build-docs")
  242 + print("=" * 60)
  243 + print(json.dumps(request_body, ensure_ascii=False, indent=2))
  244 +
  245 + response = indexer_client.post("/indexer/build-docs", json=request_body)
  246 + data = response.json()
  247 +
  248 + print("\n" + "=" * 60)
  249 + print("【响应】status_code =", response.status_code)
  250 + print("=" * 60)
  251 + print(json.dumps(data, ensure_ascii=False, indent=2, default=str))
  252 +
  253 + if data.get("docs"):
  254 + doc = data["docs"][0]
  255 + doc_keys = sorted(doc.keys())
  256 + print("\n" + "=" * 60)
  257 + print("【返回 doc 顶层字段】共 {} 个".format(len(doc_keys)))
  258 + print("=" * 60)
  259 + for k in doc_keys:
  260 + print(" ", k)
  261 +
  262 + # 与 ES mapping 顶层字段对比
  263 + mapping_path = Path(__file__).resolve().parents[2] / "mappings" / "search_products.json"
  264 + if mapping_path.exists():
  265 + with open(mapping_path) as f:
  266 + mapping = json.load(f)
  267 + expected_top_level = set(mapping.get("mappings", {}).get("properties", {}).keys())
  268 + returned = set(doc_keys)
  269 + missing = expected_top_level - returned
  270 + extra = returned - expected_top_level
  271 + print("\n" + "=" * 60)
  272 + print("【与 mappings/search_products.json 对比】")
  273 + print("=" * 60)
  274 + print(" mapping 中应有、当前 doc 未返回:", sorted(missing) if missing else "(无)")
  275 + print(" 当前 doc 多出:", sorted(extra) if extra else "(无)")
  276 +
  277 + assert response.status_code == 200
  278 + assert data["success_count"] == 1
  279 + assert data["docs"][0]["spu_id"] == "1"
  280 +
  281 +
222 def test_indexer_build_docs_from_db_contract(indexer_client: TestClient): 282 def test_indexer_build_docs_from_db_contract(indexer_client: TestClient):
223 """POST /indexer/build-docs-from-db: tenant_id + spu_ids, returns same shape as build-docs.""" 283 """POST /indexer/build-docs-from-db: tenant_id + spu_ids, returns same shape as build-docs."""
224 response = indexer_client.post( 284 response = indexer_client.post(