Commit d350861ffff80413a1e1be71dfbe478d514ee925
1 parent
fca871fb
索引结构修改
Showing
11 changed files
with
408 additions
and
344 deletions
Show diff stats
api/routes/indexer.py
| @@ -80,7 +80,7 @@ class BuildDocsFromDbRequest(BaseModel): | @@ -80,7 +80,7 @@ class BuildDocsFromDbRequest(BaseModel): | ||
| 80 | class EnrichContentItem(BaseModel): | 80 | class EnrichContentItem(BaseModel): |
| 81 | """单条待生成内容理解字段的商品。""" | 81 | """单条待生成内容理解字段的商品。""" |
| 82 | spu_id: str = Field(..., description="SPU ID") | 82 | spu_id: str = Field(..., description="SPU ID") |
| 83 | - title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / tags 等") | 83 | + title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / enriched_tags 等") |
| 84 | image_url: Optional[str] = Field(None, description="商品主图 URL(预留给多模态/内容理解扩展)") | 84 | image_url: Optional[str] = Field(None, description="商品主图 URL(预留给多模态/内容理解扩展)") |
| 85 | brief: Optional[str] = Field(None, description="商品简介/短描述") | 85 | brief: Optional[str] = Field(None, description="商品简介/短描述") |
| 86 | description: Optional[str] = Field(None, description="商品详情/长描述") | 86 | description: Optional[str] = Field(None, description="商品详情/长描述") |
| @@ -93,10 +93,6 @@ class EnrichContentRequest(BaseModel): | @@ -93,10 +93,6 @@ class EnrichContentRequest(BaseModel): | ||
| 93 | """ | 93 | """ |
| 94 | tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") | 94 | tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") |
| 95 | items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)") | 95 | items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)") |
| 96 | - languages: List[str] = Field( | ||
| 97 | - default_factory=lambda: ["zh", "en"], | ||
| 98 | - description="目标语言列表,需在支持范围内(zh/en/de/ru/fr),默认 zh, en", | ||
| 99 | - ) | ||
| 100 | 96 | ||
| 101 | 97 | ||
| 102 | @router.post("/reindex") | 98 | @router.post("/reindex") |
| @@ -444,92 +440,25 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): | @@ -444,92 +440,25 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): | ||
| 444 | raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") | 440 | raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") |
| 445 | 441 | ||
| 446 | 442 | ||
| 447 | -def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]: | 443 | +def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]]) -> List[Dict[str, Any]]: |
| 448 | """ | 444 | """ |
| 449 | - 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM, | ||
| 450 | - 再聚合成每 SPU 的 qanchors、enriched_attributes、tags。供 run_in_executor 调用。 | 445 | + 同步执行内容理解,返回与 ES mapping 对齐的字段结构。 |
| 446 | + 语言策略由 product_enrich 内部统一决定,路由层不参与。 | ||
| 451 | """ | 447 | """ |
| 452 | - from indexer.product_enrich import analyze_products, split_multi_value_field | 448 | + from indexer.product_enrich import build_index_content_fields |
| 453 | 449 | ||
| 454 | - llm_langs = list(dict.fromkeys(languages)) or ["en"] | ||
| 455 | - | ||
| 456 | - products = [ | 450 | + results = build_index_content_fields(items=items, tenant_id=tenant_id) |
| 451 | + return [ | ||
| 457 | { | 452 | { |
| 458 | - "id": it["spu_id"], | ||
| 459 | - "title": (it.get("title") or "").strip(), | ||
| 460 | - "brief": (it.get("brief") or "").strip(), | ||
| 461 | - "description": (it.get("description") or "").strip(), | ||
| 462 | - "image_url": (it.get("image_url") or "").strip(), | 453 | + "spu_id": item["id"], |
| 454 | + "qanchors": item["qanchors"], | ||
| 455 | + "enriched_attributes": item["enriched_attributes"], | ||
| 456 | + "enriched_tags": item["enriched_tags"], | ||
| 457 | + **({"error": item["error"]} if item.get("error") else {}), | ||
| 463 | } | 458 | } |
| 464 | - for it in items | ||
| 465 | - ] | ||
| 466 | - dim_keys = [ | ||
| 467 | - "tags", | ||
| 468 | - "target_audience", | ||
| 469 | - "usage_scene", | ||
| 470 | - "season", | ||
| 471 | - "key_attributes", | ||
| 472 | - "material", | ||
| 473 | - "features", | 459 | + for item in results |
| 474 | ] | 460 | ] |
| 475 | 461 | ||
| 476 | - # 按 spu_id 聚合:qanchors[lang], enriched_attributes[], tags[] | ||
| 477 | - by_spu: Dict[str, Dict[str, Any]] = {} | ||
| 478 | - for it in items: | ||
| 479 | - sid = str(it["spu_id"]) | ||
| 480 | - by_spu[sid] = {"qanchors": {}, "enriched_attributes": [], "tags": []} | ||
| 481 | - | ||
| 482 | - for lang in llm_langs: | ||
| 483 | - try: | ||
| 484 | - rows = analyze_products( | ||
| 485 | - products=products, | ||
| 486 | - target_lang=lang, | ||
| 487 | - batch_size=20, | ||
| 488 | - tenant_id=tenant_id, | ||
| 489 | - ) | ||
| 490 | - except Exception as e: | ||
| 491 | - logger.warning("enrich-content analyze_products failed for lang=%s: %s", lang, e) | ||
| 492 | - for it in items: | ||
| 493 | - sid = str(it["spu_id"]) | ||
| 494 | - if "error" not in by_spu[sid]: | ||
| 495 | - by_spu[sid]["error"] = str(e) | ||
| 496 | - continue | ||
| 497 | - | ||
| 498 | - for row in rows: | ||
| 499 | - spu_id = str(row.get("id") or "") | ||
| 500 | - if spu_id not in by_spu: | ||
| 501 | - continue | ||
| 502 | - rec = by_spu[spu_id] | ||
| 503 | - if row.get("error"): | ||
| 504 | - rec["error"] = row["error"] | ||
| 505 | - continue | ||
| 506 | - anchor_text = str(row.get("anchor_text") or "").strip() | ||
| 507 | - if anchor_text: | ||
| 508 | - rec["qanchors"][lang] = anchor_text | ||
| 509 | - for name in dim_keys: | ||
| 510 | - raw = row.get(name) | ||
| 511 | - if not raw: | ||
| 512 | - continue | ||
| 513 | - for value in split_multi_value_field(str(raw)): | ||
| 514 | - rec["enriched_attributes"].append({"lang": lang, "name": name, "value": value}) | ||
| 515 | - if name == "tags": | ||
| 516 | - rec["tags"].append(value) | ||
| 517 | - | ||
| 518 | - # 去重 tags(保持顺序) | ||
| 519 | - out = [] | ||
| 520 | - for it in items: | ||
| 521 | - sid = str(it["spu_id"]) | ||
| 522 | - rec = by_spu[sid] | ||
| 523 | - tags = list(dict.fromkeys(rec["tags"])) | ||
| 524 | - out.append({ | ||
| 525 | - "spu_id": sid, | ||
| 526 | - "qanchors": rec["qanchors"], | ||
| 527 | - "enriched_attributes": rec["enriched_attributes"], | ||
| 528 | - "tags": tags, | ||
| 529 | - **({"error": rec["error"]} if rec.get("error") else {}), | ||
| 530 | - }) | ||
| 531 | - return out | ||
| 532 | - | ||
| 533 | 462 | ||
| 534 | @router.post("/enrich-content") | 463 | @router.post("/enrich-content") |
| 535 | async def enrich_content(request: EnrichContentRequest): | 464 | async def enrich_content(request: EnrichContentRequest): |
| @@ -540,7 +469,7 @@ async def enrich_content(request: EnrichContentRequest): | @@ -540,7 +469,7 @@ async def enrich_content(request: EnrichContentRequest): | ||
| 540 | - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 | 469 | - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 |
| 541 | 锚文本与语义属性,再与翻译、向量化结果合并写入 ES。 | 470 | 锚文本与语义属性,再与翻译、向量化结果合并写入 ES。 |
| 542 | - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可 | 471 | - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可 |
| 543 | - 先拿不含 qanchors/tags 的 doc,再异步或离线补齐本接口结果后更新 ES。 | 472 | + 先拿不含 qanchors/enriched_tags 的 doc,再异步或离线补齐本接口结果后更新 ES。 |
| 544 | 473 | ||
| 545 | 实现逻辑与 indexer.product_enrich.analyze_products 一致,支持多语言与 Redis 缓存。 | 474 | 实现逻辑与 indexer.product_enrich.build_index_content_fields 一致,支持多语言与 Redis 缓存。 |
| 546 | """ | 475 | """ |
| @@ -568,8 +497,7 @@ async def enrich_content(request: EnrichContentRequest): | @@ -568,8 +497,7 @@ async def enrich_content(request: EnrichContentRequest): | ||
| 568 | None, | 497 | None, |
| 569 | lambda: _run_enrich_content( | 498 | lambda: _run_enrich_content( |
| 570 | tenant_id=request.tenant_id, | 499 | tenant_id=request.tenant_id, |
| 571 | - items=items_payload, | ||
| 572 | - languages=request.languages or ["zh", "en"], | 500 | + items=items_payload |
| 573 | ), | 501 | ), |
| 574 | ) | 502 | ) |
| 575 | return { | 503 | return { |
docs/suggestion索引构建.md
| @@ -169,7 +169,7 @@ | @@ -169,7 +169,7 @@ | ||
| 169 | 169 | ||
| 170 | ##### 4.1 从商品索引收集 title / qanchors / tags(Step 1) | 170 | ##### 4.1 从商品索引收集 title / qanchors / enriched_tags(Step 1) |
| 171 | 171 | ||
| 172 | - - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"`, `"tags"`(按 `spu_id`、`id.keyword` 升序,便于 `search_after` 稳定分页) | 172 | + - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"`, `"enriched_tags"`(按 `spu_id`、`id.keyword` 升序,便于 `search_after` 稳定分页) |
| 173 | 173 | ||
| 174 | - 对每个商品文档: | 174 | - 对每个商品文档: |
| 175 | 175 | ||
| @@ -207,7 +207,7 @@ | @@ -207,7 +207,7 @@ | ||
| 207 | - **qanchors 处理**: | 207 | - **qanchors 处理**: |
| 208 | - `qanchors` 字段同样为多语言对象: | 208 | - `qanchors` 字段同样为多语言对象: |
| 209 | ```json | 209 | ```json |
| 210 | - "qanchors": { "en": "...", "zh": "..." } | 210 | + "qanchors": { "en": ["slim fit", "sporty casual"], "zh": ["修身", "显瘦"] } |
| 211 | ``` | 211 | ``` |
| 212 | - 取 `q_raw = qanchors[lang]` | 212 | - 取 `q_raw = qanchors[lang]` |
| 213 | - 通过 `_split_qanchors(q_raw)` 拆分为若干字符串: | 213 | - 通过 `_split_qanchors(q_raw)` 拆分为若干字符串: |
| @@ -217,10 +217,14 @@ | @@ -217,10 +217,14 @@ | ||
| 217 | - `text_norm = _normalize_text(q_text)`,再用 `_looks_noise` 过滤 | 217 | - `text_norm = _normalize_text(q_text)`,再用 `_looks_noise` 过滤 |
| 218 | - 同样按 `(lang, text_norm)` 合并为 `SuggestionCandidate`,调用 `add_product("qanchor", spu_id=product_id)`。 | 218 | - 同样按 `(lang, text_norm)` 合并为 `SuggestionCandidate`,调用 `add_product("qanchor", spu_id=product_id)`。 |
| 219 | 219 | ||
| 220 | - 4. **tags 处理**(与 `index_languages` 循环并列,每个商品只做一次): | ||
| 221 | - - `tags` 可为字符串数组,或逗号等分隔的单个字符串;经 `_iter_product_tags` 展开为若干条。 | ||
| 222 | - - 每条 tag **无语言字段**:使用 `query.query_parser.detect_text_language_for_suggestions`(与 `QueryParser` 相同的 `LanguageDetector`)判定语言,并约束在租户的 `index_languages` 内。 | ||
| 223 | - - 通过 `_looks_noise` 后按 `(detected_lang, text_norm)` 合并,调用 `add_product("tag", spu_id=product_id)`。 | 220 | + 4. **enriched_tags 处理**(与 `index_languages` 循环并列): |
| 221 | + - `enriched_tags` 现为多语言对象,例如: | ||
| 222 | + ```json | ||
| 223 | + "enriched_tags": { "en": ["Classic", "ribbed neckline"], "zh": ["辣妹风"] } | ||
| 224 | + ``` | ||
| 225 | + - 优先读取 `enriched_tags[lang]`,每个值可为字符串数组,或逗号等分隔的单个字符串;经 `_iter_product_tags` 展开为若干条。 | ||
| 226 | + - 对历史旧数据,若 `enriched_tags` 仍是单层字符串 / 数组,则继续走语言检测兜底,并约束在租户的 `index_languages` 内。 | ||
| 227 | + - 通过 `_looks_noise` 后按 `(lang, text_norm)` 合并,调用 `add_product("tag", spu_id=product_id)`。 | ||
| 224 | 228 | ||
| 225 | ##### 4.2 从查询日志收集用户 query(Step 2) | 229 | ##### 4.2 从查询日志收集用户 query(Step 2) |
| 226 | 230 |
docs/搜索API对接指南-05-索引接口(Indexer).md
| @@ -13,7 +13,7 @@ | @@ -13,7 +13,7 @@ | ||
| 13 | | 查询文档 | POST | `/indexer/documents` | 按 SPU ID 列表查询 ES 文档,不写入 ES | | 13 | | 查询文档 | POST | `/indexer/documents` | 按 SPU ID 列表查询 ES 文档,不写入 ES | |
| 14 | | 构建 ES 文档(正式) | POST | `/indexer/build-docs` | 由上游提供 MySQL 行数据,返回 ES-ready 文档,不写 ES | | 14 | | 构建 ES 文档(正式) | POST | `/indexer/build-docs` | 由上游提供 MySQL 行数据,返回 ES-ready 文档,不写 ES | |
| 15 | | 构建 ES 文档(测试) | POST | `/indexer/build-docs-from-db` | 由本服务查库并构建文档,仅测试/调试用 | | 15 | | 构建 ES 文档(测试) | POST | `/indexer/build-docs-from-db` | 由本服务查库并构建文档,仅测试/调试用 | |
| 16 | -| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、semantic_attributes、tags(供微服务组合方式使用) | | 16 | +| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、enriched_attributes、enriched_tags(供微服务组合方式使用) |
| 17 | | 索引健康检查 | GET | `/indexer/health` | 检查索引服务与数据库连接状态 | | 17 | | 索引健康检查 | GET | `/indexer/health` | 检查索引服务与数据库连接状态 | |
| 18 | 18 | ||
| 19 | #### 5.0 支撑外部 indexer 的三种方式 | 19 | #### 5.0 支撑外部 indexer 的三种方式 |
| @@ -510,7 +510,6 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ | @@ -510,7 +510,6 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ | ||
| 510 | { | 510 | { |
| 511 | "spu": { | 511 | "spu": { |
| 512 | "id": 10001, | 512 | "id": 10001, |
| 513 | - "tenant_id": "162", | ||
| 514 | "title": "测试T恤 纯棉短袖", | 513 | "title": "测试T恤 纯棉短袖", |
| 515 | "brief": "舒适纯棉,多色可选", | 514 | "brief": "舒适纯棉,多色可选", |
| 516 | "description": "这是一款适合日常穿着的纯棉T恤,透气吸汗。", | 515 | "description": "这是一款适合日常穿着的纯棉T恤,透气吸汗。", |
| @@ -521,7 +520,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ | @@ -521,7 +520,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ | ||
| 521 | "category_path": "服装/上衣/T恤", | 520 | "category_path": "服装/上衣/T恤", |
| 522 | "fake_sales": 1280, | 521 | "fake_sales": 1280, |
| 523 | "image_src": "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg", | 522 | "image_src": "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg", |
| 524 | - "tags": "T恤,纯棉,短袖,夏季", | 523 | + "enriched_tags": ["T恤", "纯棉"], |
| 525 | "create_time": "2024-01-01T00:00:00Z", | 524 | "create_time": "2024-01-01T00:00:00Z", |
| 526 | "update_time": "2024-01-01T00:00:00Z" | 525 | "update_time": "2024-01-01T00:00:00Z" |
| 527 | }, | 526 | }, |
| @@ -570,7 +569,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ | @@ -570,7 +569,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ | ||
| 570 | "tenant_id": "170", | 569 | "tenant_id": "170", |
| 571 | "spu_id": "223167", | 570 | "spu_id": "223167", |
| 572 | "title": { "en": "...", "zh": "..." }, | 571 | "title": { "en": "...", "zh": "..." }, |
| 573 | - "tags": ["Floerns", "Clothing", "Shoes & Jewelry"], | 572 | + "enriched_tags": ["Floerns", "Clothing", "Shoes & Jewelry"], |
| 574 | "skus": [ | 573 | "skus": [ |
| 575 | { | 574 | { |
| 576 | "sku_id": "3988393", | 575 | "sku_id": "3988393", |
| @@ -649,7 +648,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | @@ -649,7 +648,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | ||
| 649 | ### 5.8 内容理解字段生成接口 | 648 | ### 5.8 内容理解字段生成接口 |
| 650 | 649 | ||
| 651 | - **端点**: `POST /indexer/enrich-content` | 650 | - **端点**: `POST /indexer/enrich-content` |
| 652 | -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_enrich` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 | 651 | +- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**enriched_tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。接口只暴露商品内容输入,语言选择、分析维度与最终字段结构统一由 `indexer.product_enrich` 内部决定;当前返回结果与 `search_products` mapping 保持一致。单次请求在线程池中执行,避免阻塞其他接口。 |
| 653 | 652 | ||
| 654 | #### 请求参数 | 653 | #### 请求参数 |
| 655 | 654 | ||
| @@ -669,8 +668,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | @@ -669,8 +668,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | ||
| 669 | "title": "12PCS Dolls with Bottles", | 668 | "title": "12PCS Dolls with Bottles", |
| 670 | "image_url": "https://example.com/images/223168.jpg" | 669 | "image_url": "https://example.com/images/223168.jpg" |
| 671 | } | 670 | } |
| 672 | - ], | ||
| 673 | - "languages": ["zh", "en"] | 671 | + ] |
| 674 | } | 672 | } |
| 675 | ``` | 673 | ``` |
| 676 | 674 | ||
| @@ -678,7 +676,6 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | @@ -678,7 +676,6 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | ||
| 678 | |------|------|------|--------|------| | 676 | |------|------|------|--------|------| |
| 679 | | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| | 677 | | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| |
| 680 | | `items` | array | Y | - | 待分析列表;**单次最多 50 条** | | 678 | | `items` | array | Y | - | 待分析列表;**单次最多 50 条** | |
| 681 | -| `languages` | array[string] | N | `["zh", "en"]` | 目标语言,需在支持范围内:`zh`、`en`、`de`、`ru`、`fr` | | ||
| 682 | 679 | ||
| 683 | `items[]` 字段说明: | 680 | `items[]` 字段说明: |
| 684 | 681 | ||
| @@ -696,6 +693,12 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | @@ -696,6 +693,12 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | ||
| 696 | - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。 | 693 | - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。 |
| 697 | - 因此,输入内容不变时可跨请求直接命中缓存;任一输入字段变化时,会自然落到新的缓存 key。 | 694 | - 因此,输入内容不变时可跨请求直接命中缓存;任一输入字段变化时,会自然落到新的缓存 key。 |
| 698 | 695 | ||
| 696 | +语言说明: | ||
| 697 | + | ||
| 698 | +- 接口不接受语言控制参数。 | ||
| 699 | +- 返回哪些语言、返回哪些语义维度,统一由 `indexer.product_enrich` 内部逻辑决定。 | ||
| 700 | +- 当前为了与 `search_products` mapping 对齐,返回结果只包含核心索引语言 `zh`、`en`。 | ||
| 701 | + | ||
| 699 | 批量请求建议: | 702 | 批量请求建议: |
| 700 | - **全量**:强烈建议 尽可能 **20 个 SPU/doc** 攒成一个批次后再请求一次。 | 703 | - **全量**:强烈建议 尽可能 **20 个 SPU/doc** 攒成一个批次后再请求一次。 |
| 701 | - **增量**:可按时效要求设置时间窗口(例如 **5 分钟**),在窗口内尽可能攒到 **20 个**;达到 20 或窗口到期就发送一次请求。 | 704 | - **增量**:可按时效要求设置时间窗口(例如 **5 分钟**),在窗口内尽可能攒到 **20 个**;达到 20 或窗口到期就发送一次请求。 |
| @@ -711,21 +714,28 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | @@ -711,21 +714,28 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | ||
| 711 | { | 714 | { |
| 712 | "spu_id": "223167", | 715 | "spu_id": "223167", |
| 713 | "qanchors": { | 716 | "qanchors": { |
| 714 | - "zh": "短袖T恤,纯棉,男装,夏季", | ||
| 715 | - "en": "cotton t-shirt, short sleeve, men, summer" | 717 | + "zh": ["短袖T恤", "纯棉", "男装", "夏季"], |
| 718 | + "en": ["cotton t-shirt", "short sleeve", "men", "summer"] | ||
| 719 | + }, | ||
| 720 | + "enriched_tags": { | ||
| 721 | + "zh": ["纯棉", "短袖", "男装"], | ||
| 722 | + "en": ["cotton", "short sleeve", "men"] | ||
| 716 | }, | 723 | }, |
| 717 | "enriched_attributes": [ | 724 | "enriched_attributes": [ |
| 718 | - { "lang": "zh", "name": "tags", "value": "纯棉" }, | ||
| 719 | - { "lang": "zh", "name": "usage_scene", "value": "日常" }, | ||
| 720 | - { "lang": "en", "name": "tags", "value": "cotton" } | ||
| 721 | - ], | ||
| 722 | - "tags": ["纯棉", "短袖", "男装", "cotton", "short sleeve"] | 725 | + { "name": "enriched_tags", "value": { "zh": "纯棉" } }, |
| 726 | + { "name": "usage_scene", "value": { "zh": "日常" } }, | ||
| 727 | + { "name": "enriched_tags", "value": { "en": "cotton" } } | ||
| 728 | + ] | ||
| 723 | }, | 729 | }, |
| 724 | { | 730 | { |
| 725 | "spu_id": "223168", | 731 | "spu_id": "223168", |
| 726 | - "qanchors": { "en": "dolls, toys, 12pcs" }, | ||
| 727 | - "enriched_attributes": [], | ||
| 728 | - "tags": ["dolls", "toys"] | 732 | + "qanchors": { |
| 733 | + "en": ["dolls", "toys", "12pcs"] | ||
| 734 | + }, | ||
| 735 | + "enriched_tags": { | ||
| 736 | + "en": ["dolls", "toys"] | ||
| 737 | + }, | ||
| 738 | + "enriched_attributes": [] | ||
| 729 | } | 739 | } |
| 730 | ] | 740 | ] |
| 731 | } | 741 | } |
| @@ -733,10 +743,10 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | @@ -733,10 +743,10 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | ||
| 733 | 743 | ||
| 734 | | 字段 | 类型 | 说明 | | 744 | | 字段 | 类型 | 说明 | |
| 735 | |------|------|------| | 745 | |------|------|------| |
| 736 | -| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`tags` | | ||
| 737 | -| `results[].qanchors` | object | 按语言键的锚文本(逗号分隔短语),可写入 ES 文档的 `qanchors.{lang}` | | ||
| 738 | -| `results[].enriched_attributes` | array | 语义属性列表,每项为 `{ "lang", "name", "value" }`,可写入 ES 的 `enriched_attributes` nested 字段 | | ||
| 739 | -| `results[].tags` | array | 从语义属性中抽取的 `name=tags` 的 value 集合,可与业务原有 `tags` 合并后写入 ES 的 `tags` 字段 | | 746 | +| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags` | |
| 747 | +| `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 | | ||
| 748 | +| `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 | | ||
| 749 | +| `results[].enriched_attributes` | array | 与 ES `enriched_attributes` nested 字段同结构,每项为 `{ "name", "value": { "zh"?: "...", "en"?: "..." } }` | | ||
| 740 | | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 | | 750 | | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 | |
| 741 | 751 | ||
| 742 | **错误响应**: | 752 | **错误响应**: |
| @@ -758,8 +768,7 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ | @@ -758,8 +768,7 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ | ||
| 758 | "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", | 768 | "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", |
| 759 | "image_url": "https://example.com/images/223167.jpg" | 769 | "image_url": "https://example.com/images/223167.jpg" |
| 760 | } | 770 | } |
| 761 | - ], | ||
| 762 | - "languages": ["zh", "en"] | 771 | + ] |
| 763 | }' | 772 | }' |
| 764 | ``` | 773 | ``` |
| 765 | 774 |
indexer/document_transformer.py
| @@ -11,9 +11,8 @@ SPU文档转换器 - 公共转换逻辑。 | @@ -11,9 +11,8 @@ SPU文档转换器 - 公共转换逻辑。 | ||
| 11 | import pandas as pd | 11 | import pandas as pd |
| 12 | import numpy as np | 12 | import numpy as np |
| 13 | import logging | 13 | import logging |
| 14 | -import re | ||
| 15 | from typing import Dict, Any, Optional, List | 14 | from typing import Dict, Any, Optional, List |
| 16 | -from indexer.product_enrich import analyze_products, split_multi_value_field | 15 | +from indexer.product_enrich import build_index_content_fields |
| 17 | 16 | ||
| 18 | logger = logging.getLogger(__name__) | 17 | logger = logging.getLogger(__name__) |
| 19 | 18 | ||
| @@ -75,6 +74,39 @@ class SPUDocumentTransformer: | @@ -75,6 +74,39 @@ class SPUDocumentTransformer: | ||
| 75 | ) | 74 | ) |
| 76 | return translations | 75 | return translations |
| 77 | 76 | ||
| 77 | + def _build_core_language_text_object( | ||
| 78 | + self, | ||
| 79 | + text: Optional[str], | ||
| 80 | + source_lang: str, | ||
| 81 | + scene: str = "general", | ||
| 82 | + ) -> Dict[str, str]: | ||
| 83 | + """ | ||
| 84 | + 构建与 mapping 中 core_language_text(_with_keyword) 对齐的对象。 | ||
| 85 | + 当前核心语言固定为 zh/en。 | ||
| 86 | + """ | ||
| 87 | + if not text or not str(text).strip(): | ||
| 88 | + return {} | ||
| 89 | + | ||
| 90 | + source_text = str(text).strip() | ||
| 91 | + obj: Dict[str, str] = {} | ||
| 92 | + | ||
| 93 | + if source_lang in CORE_INDEX_LANGUAGES: | ||
| 94 | + obj[source_lang] = source_text | ||
| 95 | + | ||
| 96 | + if self.translator: | ||
| 97 | + translations = self._translate_index_languages( | ||
| 98 | + text=source_text, | ||
| 99 | + source_lang=source_lang, | ||
| 100 | + index_languages=CORE_INDEX_LANGUAGES, | ||
| 101 | + scene=scene, | ||
| 102 | + ) | ||
| 103 | + for lang in CORE_INDEX_LANGUAGES: | ||
| 104 | + val = translations.get(lang) | ||
| 105 | + if val and str(val).strip(): | ||
| 106 | + obj[lang] = str(val).strip() | ||
| 107 | + | ||
| 108 | + return obj | ||
| 109 | + | ||
| 78 | def transform_spu_to_doc( | 110 | def transform_spu_to_doc( |
| 79 | self, | 111 | self, |
| 80 | tenant_id: str, | 112 | tenant_id: str, |
| @@ -118,10 +150,16 @@ class SPUDocumentTransformer: | @@ -118,10 +150,16 @@ class SPUDocumentTransformer: | ||
| 118 | if self.enable_title_embedding and self.encoder: | 150 | if self.enable_title_embedding and self.encoder: |
| 119 | self._fill_title_embedding(doc) | 151 | self._fill_title_embedding(doc) |
| 120 | 152 | ||
| 121 | - # Tags | 153 | + # Tags:统一转成与 mapping 一致的 core-language object |
| 122 | if pd.notna(spu_row.get('tags')): | 154 | if pd.notna(spu_row.get('tags')): |
| 123 | tags_str = str(spu_row['tags']) | 155 | tags_str = str(spu_row['tags']) |
| 124 | - doc['tags'] = split_multi_value_field(tags_str) | 156 | + tags_obj = self._build_core_language_text_object( |
| 157 | + tags_str, | ||
| 158 | + source_lang=primary_lang, | ||
| 159 | + scene="general", | ||
| 160 | + ) | ||
| 161 | + if tags_obj: | ||
| 162 | + doc['tags'] = tags_obj | ||
| 125 | 163 | ||
| 126 | # Category相关字段 | 164 | # Category相关字段 |
| 127 | self._fill_category_fields(doc, spu_row) | 165 | self._fill_category_fields(doc, spu_row) |
| @@ -202,7 +240,8 @@ class SPUDocumentTransformer: | @@ -202,7 +240,8 @@ class SPUDocumentTransformer: | ||
| 202 | """ | 240 | """ |
| 203 | 批量调用 LLM,为一批 doc 填充: | 241 | 批量调用 LLM,为一批 doc 填充: |
| 204 | - qanchors.{lang} | 242 | - qanchors.{lang} |
| 205 | - - enriched_attributes (lang/name/value) | 243 | + - tags.{lang} |
| 244 | + - enriched_attributes[].value.{lang} | ||
| 206 | 245 | ||
| 207 | 设计目标: | 246 | 设计目标: |
| 208 | - 尽可能攒批调用 LLM; | 247 | - 尽可能攒批调用 LLM; |
| @@ -211,16 +250,8 @@ class SPUDocumentTransformer: | @@ -211,16 +250,8 @@ class SPUDocumentTransformer: | ||
| 211 | if not docs or not spu_rows or len(docs) != len(spu_rows): | 250 | if not docs or not spu_rows or len(docs) != len(spu_rows): |
| 212 | return | 251 | return |
| 213 | 252 | ||
| 214 | - try: | ||
| 215 | - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] | ||
| 216 | - except Exception: | ||
| 217 | - index_langs = ["en", "zh"] | ||
| 218 | - # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用 | ||
| 219 | - llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序 | ||
| 220 | - | ||
| 221 | - # 只对有 title 的 SPU 参与 LLM;其余跳过 | ||
| 222 | id_to_idx: Dict[str, int] = {} | 253 | id_to_idx: Dict[str, int] = {} |
| 223 | - products: List[Dict[str, str]] = [] | 254 | + items: List[Dict[str, str]] = [] |
| 224 | for i, row in enumerate(spu_rows): | 255 | for i, row in enumerate(spu_rows): |
| 225 | raw_id = row.get("id") | 256 | raw_id = row.get("id") |
| 226 | spu_id = "" if raw_id is None else str(raw_id).strip() | 257 | spu_id = "" if raw_id is None else str(raw_id).strip() |
| @@ -228,69 +259,45 @@ class SPUDocumentTransformer: | @@ -228,69 +259,45 @@ class SPUDocumentTransformer: | ||
| 228 | if not spu_id or not title: | 259 | if not spu_id or not title: |
| 229 | continue | 260 | continue |
| 230 | id_to_idx[spu_id] = i | 261 | id_to_idx[spu_id] = i |
| 231 | - products.append({"id": spu_id, "title": title}) | ||
| 232 | - if not products: | 262 | + items.append( |
| 263 | + { | ||
| 264 | + "id": spu_id, | ||
| 265 | + "title": title, | ||
| 266 | + "brief": str(row.get("brief") or "").strip(), | ||
| 267 | + "description": str(row.get("description") or "").strip(), | ||
| 268 | + "image_url": str(row.get("image_src") or "").strip(), | ||
| 269 | + } | ||
| 270 | + ) | ||
| 271 | + if not items: | ||
| 233 | return | 272 | return |
| 234 | 273 | ||
| 235 | tenant_id = str(docs[0].get("tenant_id") or "").strip() or None | 274 | tenant_id = str(docs[0].get("tenant_id") or "").strip() or None |
| 275 | + try: | ||
| 276 | + results = build_index_content_fields(items=items, tenant_id=tenant_id) | ||
| 277 | + except Exception as e: | ||
| 278 | + logger.warning("LLM batch attribute fill failed: %s", e) | ||
| 279 | + return | ||
| 236 | 280 | ||
| 237 | - dim_keys = [ | ||
| 238 | - "tags", | ||
| 239 | - "target_audience", | ||
| 240 | - "usage_scene", | ||
| 241 | - "season", | ||
| 242 | - "key_attributes", | ||
| 243 | - "material", | ||
| 244 | - "features", | ||
| 245 | - ] | ||
| 246 | - | ||
| 247 | - for lang in llm_langs: | ||
| 248 | - try: | ||
| 249 | - rows = analyze_products( | ||
| 250 | - products=products, | ||
| 251 | - target_lang=lang, | ||
| 252 | - batch_size=20, | ||
| 253 | - tenant_id=tenant_id, | ||
| 254 | - ) | ||
| 255 | - except Exception as e: | ||
| 256 | - logger.warning("LLM batch attribute fill failed (lang=%s): %s", lang, e) | 281 | + for result in results: |
| 282 | + spu_id = str(result.get("id") or "").strip() | ||
| 283 | + if not spu_id: | ||
| 257 | continue | 284 | continue |
| 285 | + idx = id_to_idx.get(spu_id) | ||
| 286 | + if idx is None: | ||
| 287 | + continue | ||
| 288 | + self._apply_content_enrichment(docs[idx], result) | ||
| 258 | 289 | ||
| 259 | - for row in rows or []: | ||
| 260 | - spu_id = str(row.get("id") or "").strip() | ||
| 261 | - if not spu_id: | ||
| 262 | - continue | ||
| 263 | - idx = id_to_idx.get(spu_id) | ||
| 264 | - if idx is None: | ||
| 265 | - continue | ||
| 266 | - self._apply_llm_row(docs[idx], row=row, lang=lang, dim_keys=dim_keys) | ||
| 267 | - | ||
| 268 | - def _apply_llm_row(self, doc: Dict[str, Any], row: Dict[str, Any], lang: str, dim_keys: List[str]) -> None: | ||
| 269 | - """将单条 LLM 输出 row 按既定结构写入 doc(不抛异常)。""" | 290 | + def _apply_content_enrichment(self, doc: Dict[str, Any], enrichment: Dict[str, Any]) -> None: |
| 291 | + """将 product_enrich 产出的 ES-ready 内容字段写入 doc。""" | ||
| 270 | try: | 292 | try: |
| 271 | - if row.get("error"): | ||
| 272 | - return | ||
| 273 | - | ||
| 274 | - semantic_list = doc.get("enriched_attributes") or [] | ||
| 275 | - qanchors_obj = doc.get("qanchors") or {} | ||
| 276 | - | ||
| 277 | - anchor_text = str(row.get("anchor_text") or "").strip() | ||
| 278 | - if anchor_text: | ||
| 279 | - qanchors_obj[lang] = anchor_text | ||
| 280 | - | ||
| 281 | - for name in dim_keys: | ||
| 282 | - raw = row.get(name) | ||
| 283 | - if not raw: | ||
| 284 | - continue | ||
| 285 | - for value in split_multi_value_field(str(raw)): | ||
| 286 | - semantic_list.append({"lang": lang, "name": name, "value": value}) | ||
| 287 | - | ||
| 288 | - if qanchors_obj: | ||
| 289 | - doc["qanchors"] = qanchors_obj | ||
| 290 | - if semantic_list: | ||
| 291 | - doc["enriched_attributes"] = semantic_list | 293 | + if enrichment.get("qanchors"): |
| 294 | + doc["qanchors"] = enrichment["qanchors"] | ||
| 295 | + if enrichment.get("tags"): | ||
| 296 | + doc["tags"] = enrichment["tags"] | ||
| 297 | + if enrichment.get("enriched_attributes"): | ||
| 298 | + doc["enriched_attributes"] = enrichment["enriched_attributes"] | ||
| 292 | except Exception as e: | 299 | except Exception as e: |
| 293 | - logger.warning("Failed to apply LLM row to doc (spu_id=%s, lang=%s): %s", doc.get("spu_id"), lang, e) | 300 | + logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e) |
| 294 | 301 | ||
| 295 | def _fill_text_fields( | 302 | def _fill_text_fields( |
| 296 | self, | 303 | self, |
| @@ -544,6 +551,23 @@ class SPUDocumentTransformer: | @@ -544,6 +551,23 @@ class SPUDocumentTransformer: | ||
| 544 | if pd.notna(position) and pd.notna(name): | 551 | if pd.notna(position) and pd.notna(name): |
| 545 | option_name_map[int(position)] = str(name) | 552 | option_name_map[int(position)] = str(name) |
| 546 | 553 | ||
| 554 | + primary_lang = self.tenant_config.get('primary_language', 'en') | ||
| 555 | + | ||
| 556 | + def _build_specification(name: str, raw_value: Any, sku_id: str) -> Optional[Dict[str, Any]]: | ||
| 557 | + value = "" if raw_value is None else str(raw_value).strip() | ||
| 558 | + if not value: | ||
| 559 | + return None | ||
| 560 | + return { | ||
| 561 | + 'sku_id': sku_id, | ||
| 562 | + 'name': name, | ||
| 563 | + 'value_keyword': value, | ||
| 564 | + 'value_text': self._build_core_language_text_object( | ||
| 565 | + value, | ||
| 566 | + source_lang=primary_lang, | ||
| 567 | + scene="general", | ||
| 568 | + ) or normalize_core_text_field_value(value, primary_lang), | ||
| 569 | + } | ||
| 570 | + | ||
| 547 | for _, sku_row in skus.iterrows(): | 571 | for _, sku_row in skus.iterrows(): |
| 548 | sku_data = self._transform_sku_row(sku_row, option_name_map) | 572 | sku_data = self._transform_sku_row(sku_row, option_name_map) |
| 549 | if sku_data: | 573 | if sku_data: |
| @@ -584,23 +608,17 @@ class SPUDocumentTransformer: | @@ -584,23 +608,17 @@ class SPUDocumentTransformer: | ||
| 584 | # 构建specifications(从SKU的option值和option表的name) | 608 | # 构建specifications(从SKU的option值和option表的name) |
| 585 | sku_id = str(sku_row['id']) | 609 | sku_id = str(sku_row['id']) |
| 586 | if pd.notna(sku_row.get('option1')) and 1 in option_name_map: | 610 | if pd.notna(sku_row.get('option1')) and 1 in option_name_map: |
| 587 | - specifications.append({ | ||
| 588 | - 'sku_id': sku_id, | ||
| 589 | - 'name': option_name_map[1], | ||
| 590 | - 'value': str(sku_row['option1']) | ||
| 591 | - }) | 611 | + spec = _build_specification(option_name_map[1], sku_row['option1'], sku_id) |
| 612 | + if spec: | ||
| 613 | + specifications.append(spec) | ||
| 592 | if pd.notna(sku_row.get('option2')) and 2 in option_name_map: | 614 | if pd.notna(sku_row.get('option2')) and 2 in option_name_map: |
| 593 | - specifications.append({ | ||
| 594 | - 'sku_id': sku_id, | ||
| 595 | - 'name': option_name_map[2], | ||
| 596 | - 'value': str(sku_row['option2']) | ||
| 597 | - }) | 615 | + spec = _build_specification(option_name_map[2], sku_row['option2'], sku_id) |
| 616 | + if spec: | ||
| 617 | + specifications.append(spec) | ||
| 598 | if pd.notna(sku_row.get('option3')) and 3 in option_name_map: | 618 | if pd.notna(sku_row.get('option3')) and 3 in option_name_map: |
| 599 | - specifications.append({ | ||
| 600 | - 'sku_id': sku_id, | ||
| 601 | - 'name': option_name_map[3], | ||
| 602 | - 'value': str(sku_row['option3']) | ||
| 603 | - }) | 619 | + spec = _build_specification(option_name_map[3], sku_row['option3'], sku_id) |
| 620 | + if spec: | ||
| 621 | + specifications.append(spec) | ||
| 604 | 622 | ||
| 605 | return skus_list, prices, compare_prices, sku_prices, sku_weights, sku_weight_units, total_inventory, specifications | 623 | return skus_list, prices, compare_prices, sku_prices, sku_weights, sku_weight_units, total_inventory, specifications |
| 606 | 624 | ||
| @@ -636,82 +654,36 @@ class SPUDocumentTransformer: | @@ -636,82 +654,36 @@ class SPUDocumentTransformer: | ||
| 636 | 654 | ||
| 637 | def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None: | 655 | def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None: |
| 638 | """ | 656 | """ |
| 639 | - 调用 indexer.product_enrich.analyze_products,为当前 SPU 填充: | 657 | + 调用 indexer.product_enrich 的高层内容理解入口,为当前 SPU 填充: |
| 640 | - qanchors.{lang} | 658 | - qanchors.{lang} |
| 641 | - - enriched_attributes (lang/name/value) | 659 | + - tags.{lang} |
| 660 | + - enriched_attributes[].value.{lang} | ||
| 642 | """ | 661 | """ |
| 643 | - try: | ||
| 644 | - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] | ||
| 645 | - except Exception: | ||
| 646 | - index_langs = ["en", "zh"] | ||
| 647 | - | ||
| 648 | - # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用 | ||
| 649 | - llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序 | ||
| 650 | - | ||
| 651 | spu_id = str(spu_row.get("id") or "").strip() | 662 | spu_id = str(spu_row.get("id") or "").strip() |
| 652 | title = str(spu_row.get("title") or "").strip() | 663 | title = str(spu_row.get("title") or "").strip() |
| 653 | if not spu_id or not title: | 664 | if not spu_id or not title: |
| 654 | return | 665 | return |
| 655 | 666 | ||
| 656 | - semantic_list = doc.get("enriched_attributes") or [] | ||
| 657 | - qanchors_obj = doc.get("qanchors") or {} | ||
| 658 | - | ||
| 659 | - dim_keys = [ | ||
| 660 | - "tags", | ||
| 661 | - "target_audience", | ||
| 662 | - "usage_scene", | ||
| 663 | - "season", | ||
| 664 | - "key_attributes", | ||
| 665 | - "material", | ||
| 666 | - "features", | ||
| 667 | - ] | ||
| 668 | - | ||
| 669 | tenant_id = doc.get("tenant_id") | 667 | tenant_id = doc.get("tenant_id") |
| 668 | + try: | ||
| 669 | + results = build_index_content_fields( | ||
| 670 | + items=[ | ||
| 671 | + { | ||
| 672 | + "id": spu_id, | ||
| 673 | + "title": title, | ||
| 674 | + "brief": str(spu_row.get("brief") or "").strip(), | ||
| 675 | + "description": str(spu_row.get("description") or "").strip(), | ||
| 676 | + "image_url": str(spu_row.get("image_src") or "").strip(), | ||
| 677 | + } | ||
| 678 | + ], | ||
| 679 | + tenant_id=str(tenant_id), | ||
| 680 | + ) | ||
| 681 | + except Exception as e: | ||
| 682 | + logger.warning("LLM attribute fill failed for SPU %s: %s", spu_id, e) | ||
| 683 | + return | ||
| 670 | 684 | ||
| 671 | - for lang in llm_langs: | ||
| 672 | - try: | ||
| 673 | - rows = analyze_products( | ||
| 674 | - products=[{"id": spu_id, "title": title}], | ||
| 675 | - target_lang=lang, | ||
| 676 | - batch_size=1, | ||
| 677 | - tenant_id=str(tenant_id), | ||
| 678 | - ) | ||
| 679 | - except Exception as e: | ||
| 680 | - logger.warning( | ||
| 681 | - "LLM attribute fill failed for SPU %s, lang=%s: %s", | ||
| 682 | - spu_id, | ||
| 683 | - lang, | ||
| 684 | - e, | ||
| 685 | - ) | ||
| 686 | - continue | ||
| 687 | - | ||
| 688 | - if not rows: | ||
| 689 | - continue | ||
| 690 | - row = rows[0] or {} | ||
| 691 | - | ||
| 692 | - # qanchors.{lang} | ||
| 693 | - anchor_text = str(row.get("anchor_text") or "").strip() | ||
| 694 | - if anchor_text: | ||
| 695 | - qanchors_obj[lang] = anchor_text | ||
| 696 | - | ||
| 697 | - # 语义属性:按各维度拆分为短语 | ||
| 698 | - for name in dim_keys: | ||
| 699 | - raw = row.get(name) | ||
| 700 | - if not raw: | ||
| 701 | - continue | ||
| 702 | - for value in split_multi_value_field(str(raw)): | ||
| 703 | - semantic_list.append( | ||
| 704 | - { | ||
| 705 | - "lang": lang, | ||
| 706 | - "name": name, | ||
| 707 | - "value": value, | ||
| 708 | - } | ||
| 709 | - ) | ||
| 710 | - | ||
| 711 | - if qanchors_obj: | ||
| 712 | - doc["qanchors"] = qanchors_obj | ||
| 713 | - if semantic_list: | ||
| 714 | - doc["enriched_attributes"] = semantic_list | 685 | + if results: |
| 686 | + self._apply_content_enrichment(doc, results[0]) | ||
| 715 | 687 | ||
| 716 | def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: | 688 | def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: |
| 717 | """ | 689 | """ |
indexer/product_enrich.py
| @@ -146,6 +146,16 @@ if _missing_prompt_langs: | @@ -146,6 +146,16 @@ if _missing_prompt_langs: | ||
| 146 | 146 | ||
| 147 | # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 | 147 | # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 |
| 148 | _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") | 148 | _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") |
| 149 | +_CORE_INDEX_LANGUAGES = ("zh", "en") | ||
| 150 | +_ENRICHED_ATTRIBUTE_DIMENSIONS = ( | ||
| 151 | + "enriched_tags", | ||
| 152 | + "target_audience", | ||
| 153 | + "usage_scene", | ||
| 154 | + "season", | ||
| 155 | + "key_attributes", | ||
| 156 | + "material", | ||
| 157 | + "features", | ||
| 158 | +) | ||
| 149 | 159 | ||
| 150 | 160 | ||
| 151 | def split_multi_value_field(text: Optional[str]) -> List[str]: | 161 | def split_multi_value_field(text: Optional[str]) -> List[str]: |
| @@ -158,6 +168,124 @@ def split_multi_value_field(text: Optional[str]) -> List[str]: | @@ -158,6 +168,124 @@ def split_multi_value_field(text: Optional[str]) -> List[str]: | ||
| 158 | return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()] | 168 | return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()] |
| 159 | 169 | ||
| 160 | 170 | ||
| 171 | +def _append_lang_phrase_map(target: Dict[str, List[str]], lang: str, raw_value: Any) -> None: | ||
| 172 | + parts = split_multi_value_field(raw_value) | ||
| 173 | + if not parts: | ||
| 174 | + return | ||
| 175 | + existing = target.get(lang) or [] | ||
| 176 | + merged = list(dict.fromkeys([str(x).strip() for x in existing if str(x).strip()] + parts)) | ||
| 177 | + if merged: | ||
| 178 | + target[lang] = merged | ||
| 179 | + | ||
| 180 | + | ||
| 181 | +def _append_enriched_attribute( | ||
| 182 | + target: List[Dict[str, Any]], | ||
| 183 | + name: str, | ||
| 184 | + lang: str, | ||
| 185 | + raw_value: Any, | ||
| 186 | +) -> None: | ||
| 187 | + for value in split_multi_value_field(raw_value): | ||
| 188 | + if any( | ||
| 189 | + item.get("name") == name | ||
| 190 | + and isinstance(item.get("value"), dict) | ||
| 191 | + and item["value"].get(lang) == value | ||
| 192 | + for item in target | ||
| 193 | + ): | ||
| 194 | + continue | ||
| 195 | + target.append({"name": name, "value": {lang: value}}) | ||
| 196 | + | ||
| 197 | + | ||
| 198 | +def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: | ||
| 199 | + if not row or row.get("error"): | ||
| 200 | + return | ||
| 201 | + | ||
| 202 | + anchor_text = str(row.get("anchor_text") or "").strip() | ||
| 203 | + if anchor_text: | ||
| 204 | + _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text) | ||
| 205 | + | ||
| 206 | + for name in _ENRICHED_ATTRIBUTE_DIMENSIONS: | ||
| 207 | + raw = row.get(name) | ||
| 208 | + if not raw: | ||
| 209 | + continue | ||
| 210 | + _append_enriched_attribute(result["enriched_attributes"], name=name, lang=lang, raw_value=raw) | ||
| 211 | + if name == "enriched_tags": | ||
| 212 | + _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) | ||
| 213 | + | ||
| 214 | + | ||
| 215 | +def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: | ||
| 216 | + item_id = str(item.get("id") or item.get("spu_id") or "").strip() | ||
| 217 | + return { | ||
| 218 | + "id": item_id, | ||
| 219 | + "title": str(item.get("title") or "").strip(), | ||
| 220 | + "brief": str(item.get("brief") or "").strip(), | ||
| 221 | + "description": str(item.get("description") or "").strip(), | ||
| 222 | + "image_url": str(item.get("image_url") or "").strip(), | ||
| 223 | + } | ||
| 224 | + | ||
| 225 | + | ||
| 226 | +def build_index_content_fields( | ||
| 227 | + items: List[Dict[str, Any]], | ||
| 228 | + tenant_id: Optional[str] = None, | ||
| 229 | +) -> List[Dict[str, Any]]: | ||
| 230 | + """ | ||
| 231 | + 高层入口:生成与 ES mapping 对齐的内容理解字段。 | ||
| 232 | + | ||
| 233 | + 输入项需包含: | ||
| 234 | + - `id` 或 `spu_id` | ||
| 235 | + - `title` | ||
| 236 | + - 可选 `brief` / `description` / `image_url` | ||
| 237 | + | ||
| 238 | + 返回项结构: | ||
| 239 | + - `id` | ||
| 240 | + - `qanchors` | ||
| 241 | + - `enriched_tags` | ||
| 242 | + - `enriched_attributes` | ||
| 243 | + - 可选 `error` | ||
| 244 | + | ||
| 245 | + 其中: | ||
| 246 | + - `qanchors.{lang}` 为短语数组 | ||
| 247 | + - `enriched_tags.{lang}` 为标签数组 | ||
| 248 | + """ | ||
| 249 | + normalized_items = [_normalize_index_content_item(item) for item in items] | ||
| 250 | + if not normalized_items: | ||
| 251 | + return [] | ||
| 252 | + | ||
| 253 | + results_by_id: Dict[str, Dict[str, Any]] = { | ||
| 254 | + item["id"]: { | ||
| 255 | + "id": item["id"], | ||
| 256 | + "qanchors": {}, | ||
| 257 | + "enriched_tags": {}, | ||
| 258 | + "enriched_attributes": [], | ||
| 259 | + } | ||
| 260 | + for item in normalized_items | ||
| 261 | + } | ||
| 262 | + | ||
| 263 | + for lang in _CORE_INDEX_LANGUAGES: | ||
| 264 | + try: | ||
| 265 | + rows = analyze_products( | ||
| 266 | + products=normalized_items, | ||
| 267 | + target_lang=lang, | ||
| 268 | + batch_size=BATCH_SIZE, | ||
| 269 | + tenant_id=tenant_id, | ||
| 270 | + ) | ||
| 271 | + except Exception as e: | ||
| 272 | + logger.warning("build_index_content_fields failed for lang=%s: %s", lang, e) | ||
| 273 | + for item in normalized_items: | ||
| 274 | + results_by_id[item["id"]].setdefault("error", str(e)) | ||
| 275 | + continue | ||
| 276 | + | ||
| 277 | + for row in rows or []: | ||
| 278 | + item_id = str(row.get("id") or "").strip() | ||
| 279 | + if not item_id or item_id not in results_by_id: | ||
| 280 | + continue | ||
| 281 | + if row.get("error"): | ||
| 282 | + results_by_id[item_id].setdefault("error", row["error"]) | ||
| 283 | + continue | ||
| 284 | + _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) | ||
| 285 | + | ||
| 286 | + return [results_by_id[item["id"]] for item in normalized_items] | ||
| 287 | + | ||
| 288 | + | ||
| 161 | def _normalize_space(text: str) -> str: | 289 | def _normalize_space(text: str) -> str: |
| 162 | return re.sub(r"\s+", " ", (text or "").strip()) | 290 | return re.sub(r"\s+", " ", (text or "").strip()) |
| 163 | 291 | ||
| @@ -526,7 +654,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: | @@ -526,7 +654,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: | ||
| 526 | "seq_no": parts[0], | 654 | "seq_no": parts[0], |
| 527 | "title": parts[1], # 商品标题(按目标语言) | 655 | "title": parts[1], # 商品标题(按目标语言) |
| 528 | "category_path": parts[2] if len(parts) > 2 else "", # 品类路径 | 656 | "category_path": parts[2] if len(parts) > 2 else "", # 品类路径 |
| 529 | - "tags": parts[3] if len(parts) > 3 else "", # 细分标签 | 657 | + "enriched_tags": parts[3] if len(parts) > 3 else "", # 细分标签 |
| 530 | "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群 | 658 | "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群 |
| 531 | "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景 | 659 | "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景 |
| 532 | "season": parts[6] if len(parts) > 6 else "", # 适用季节 | 660 | "season": parts[6] if len(parts) > 6 else "", # 适用季节 |
| @@ -603,7 +731,7 @@ def process_batch( | @@ -603,7 +731,7 @@ def process_batch( | ||
| 603 | "title_input": item.get("title", ""), | 731 | "title_input": item.get("title", ""), |
| 604 | "title": "", | 732 | "title": "", |
| 605 | "category_path": "", | 733 | "category_path": "", |
| 606 | - "tags": "", | 734 | + "enriched_tags": "", |
| 607 | "target_audience": "", | 735 | "target_audience": "", |
| 608 | "usage_scene": "", | 736 | "usage_scene": "", |
| 609 | "season": "", | 737 | "season": "", |
| @@ -643,7 +771,7 @@ def process_batch( | @@ -643,7 +771,7 @@ def process_batch( | ||
| 643 | "title_input": batch_data[i]["title"], # 原始输入标题 | 771 | "title_input": batch_data[i]["title"], # 原始输入标题 |
| 644 | "title": parsed_item.get("title", ""), # 模型生成的标题 | 772 | "title": parsed_item.get("title", ""), # 模型生成的标题 |
| 645 | "category_path": parsed_item.get("category_path", ""), # 品类路径 | 773 | "category_path": parsed_item.get("category_path", ""), # 品类路径 |
| 646 | - "tags": parsed_item.get("tags", ""), # 细分标签 | 774 | + "enriched_tags": parsed_item.get("enriched_tags", ""), # 细分标签 |
| 647 | "target_audience": parsed_item.get("target_audience", ""), # 适用人群 | 775 | "target_audience": parsed_item.get("target_audience", ""), # 适用人群 |
| 648 | "usage_scene": parsed_item.get("usage_scene", ""), # 使用场景 | 776 | "usage_scene": parsed_item.get("usage_scene", ""), # 使用场景 |
| 649 | "season": parsed_item.get("season", ""), # 适用季节 | 777 | "season": parsed_item.get("season", ""), # 适用季节 |
| @@ -686,7 +814,7 @@ def process_batch( | @@ -686,7 +814,7 @@ def process_batch( | ||
| 686 | "title_input": item["title"], | 814 | "title_input": item["title"], |
| 687 | "title": "", | 815 | "title": "", |
| 688 | "category_path": "", | 816 | "category_path": "", |
| 689 | - "tags": "", | 817 | + "enriched_tags": "", |
| 690 | "target_audience": "", | 818 | "target_audience": "", |
| 691 | "usage_scene": "", | 819 | "usage_scene": "", |
| 692 | "season": "", | 820 | "season": "", |
mappings/README.md
| @@ -34,8 +34,8 @@ | @@ -34,8 +34,8 @@ | ||
| 34 | 34 | ||
| 35 | 当前字段大致分为几类: | 35 | 当前字段大致分为几类: |
| 36 | 36 | ||
| 37 | -- 全语言字段:`title`、`keywords`、`brief`、`description`、`vendor`、`category_path`、`category_name_text`、`specifications.value` | ||
| 38 | -- 核心索引语言字段:`qanchors`、`tags`、`option1_values`、`option2_values`、`option3_values`、`enriched_attributes.value` | 37 | +- 全语言字段:`title`、`keywords`、`brief`、`description`、`vendor`、`category_path`、`category_name_text` |
| 38 | +- 核心索引语言字段:`qanchors`、`enriched_tags`、`option1_values`、`option2_values`、`option3_values`、`enriched_attributes.value`、`specifications.value_text` | ||
| 39 | - 复合嵌套字段:`image_embedding`、`specifications`、`enriched_attributes`、`skus` | 39 | - 复合嵌套字段:`image_embedding`、`specifications`、`enriched_attributes`、`skus` |
| 40 | - 其他标量字段:`tenant_id`、`spu_id`、价格、库存、类目等 | 40 | - 其他标量字段:`tenant_id`、`spu_id`、价格、库存、类目等 |
| 41 | 41 | ||
| @@ -63,11 +63,12 @@ | @@ -63,11 +63,12 @@ | ||
| 63 | 典型字段: | 63 | 典型字段: |
| 64 | 64 | ||
| 65 | - `qanchors` | 65 | - `qanchors` |
| 66 | -- `tags` | 66 | +- `enriched_tags` |
| 67 | - `option1_values` | 67 | - `option1_values` |
| 68 | - `option2_values` | 68 | - `option2_values` |
| 69 | - `option3_values` | 69 | - `option3_values` |
| 70 | - `enriched_attributes.value` | 70 | - `enriched_attributes.value` |
| 71 | +- `specifications.value_text` | ||
| 71 | 72 | ||
| 72 | 以 `category_path` 和 `option*_values` 为例,核心语言灌入结果应至少包含: | 73 | 以 `category_path` 和 `option*_values` 为例,核心语言灌入结果应至少包含: |
| 73 | 74 | ||
| @@ -118,7 +119,6 @@ | @@ -118,7 +119,6 @@ | ||
| 118 | - `vendor` | 119 | - `vendor` |
| 119 | - `category_path` | 120 | - `category_path` |
| 120 | - `category_name_text` | 121 | - `category_name_text` |
| 121 | -- `specifications.value` | ||
| 122 | 122 | ||
| 123 | 灌入规则: | 123 | 灌入规则: |
| 124 | 124 | ||
| @@ -151,7 +151,7 @@ | @@ -151,7 +151,7 @@ | ||
| 151 | } | 151 | } |
| 152 | ``` | 152 | ``` |
| 153 | 153 | ||
| 154 | -示例:规格值 `specifications.value` | 154 | +示例:规格值 `specifications.value_text` / `specifications.value_keyword` |
| 155 | 155 | ||
| 156 | ```json | 156 | ```json |
| 157 | { | 157 | { |
| @@ -159,16 +159,21 @@ | @@ -159,16 +159,21 @@ | ||
| 159 | { | 159 | { |
| 160 | "sku_id": "sku-red-s", | 160 | "sku_id": "sku-red-s", |
| 161 | "name": "color", | 161 | "name": "color", |
| 162 | - "value": { | 162 | + "value_keyword": "красный", |
| 163 | + "value_text": { | ||
| 163 | "zh": "红色", | 164 | "zh": "红色", |
| 164 | - "en": "red", | ||
| 165 | - "ru": "красный" | 165 | + "en": "red" |
| 166 | } | 166 | } |
| 167 | } | 167 | } |
| 168 | ] | 168 | ] |
| 169 | } | 169 | } |
| 170 | ``` | 170 | ``` |
| 171 | 171 | ||
| 172 | +其中: | ||
| 173 | + | ||
| 174 | +- `specifications.value_keyword` 保存原始规格值,用于精确过滤 / 分面 | ||
| 175 | +- `specifications.value_text` 保存 `zh/en` 两个核心索引语言版本,用于检索召回 | ||
| 176 | + | ||
| 172 | ### 原始语言为中文或英文时 | 177 | ### 原始语言为中文或英文时 |
| 173 | 178 | ||
| 174 | 如果原始语言就是核心索引语言之一,不需要额外再写第三份语言字段。 | 179 | 如果原始语言就是核心索引语言之一,不需要额外再写第三份语言字段。 |
| @@ -210,7 +215,7 @@ | @@ -210,7 +215,7 @@ | ||
| 210 | - 标量字段:直接写固定值,例如 `tenant_id`、`spu_id`、`min_price` | 215 | - 标量字段:直接写固定值,例如 `tenant_id`、`spu_id`、`min_price` |
| 211 | - 核心索引语言字段:只生成 `zh/en` | 216 | - 核心索引语言字段:只生成 `zh/en` |
| 212 | - 全语言字段:生成 `zh/en`,再按原始语言补一个对应语种字段 | 217 | - 全语言字段:生成 `zh/en`,再按原始语言补一个对应语种字段 |
| 213 | -- 嵌套字段:对每个元素内部重复应用同样规则,例如 `specifications[].value` | 218 | +- 嵌套字段:对每个元素内部重复应用同样规则,例如 `specifications[].value_text`、`enriched_attributes[].value` |
| 214 | 219 | ||
| 215 | ### 推荐灌入流程 | 220 | ### 推荐灌入流程 |
| 216 | 221 |
mappings/generate_search_products_mapping.py
| @@ -194,8 +194,7 @@ FIELD_SPECS = [ | @@ -194,8 +194,7 @@ FIELD_SPECS = [ | ||
| 194 | ), | 194 | ), |
| 195 | text_field("category_path", "all_language_text_with_keyword"), | 195 | text_field("category_path", "all_language_text_with_keyword"), |
| 196 | text_field("category_name_text", "all_language_text_with_keyword"), | 196 | text_field("category_name_text", "all_language_text_with_keyword"), |
| 197 | - text_field("qanchors", "core_language_text"), | ||
| 198 | - text_field("tags", "core_language_text_with_keyword"), | 197 | + text_field("tags", "all_language_text_with_keyword"), |
| 199 | scalar_field("category_id", "keyword"), | 198 | scalar_field("category_id", "keyword"), |
| 200 | scalar_field("category_name", "keyword"), | 199 | scalar_field("category_name", "keyword"), |
| 201 | scalar_field("category_level", "integer"), | 200 | scalar_field("category_level", "integer"), |
| @@ -209,6 +208,8 @@ FIELD_SPECS = [ | @@ -209,6 +208,8 @@ FIELD_SPECS = [ | ||
| 209 | scalar_field("value_keyword", "keyword"), | 208 | scalar_field("value_keyword", "keyword"), |
| 210 | text_field("value_text", "core_language_text_with_keyword"), | 209 | text_field("value_text", "core_language_text_with_keyword"), |
| 211 | ), | 210 | ), |
| 211 | + text_field("qanchors", "core_language_text"), | ||
| 212 | + text_field("enriched_tags", "core_language_text_with_keyword"), | ||
| 212 | nested_field( | 213 | nested_field( |
| 213 | "enriched_attributes", | 214 | "enriched_attributes", |
| 214 | scalar_field("name", "keyword"), | 215 | scalar_field("name", "keyword"), |
suggestion/builder.py
| @@ -166,6 +166,29 @@ class SuggestionIndexBuilder: | @@ -166,6 +166,29 @@ class SuggestionIndexBuilder: | ||
| 166 | out = [p.strip() for p in parts if p and p.strip()] | 166 | out = [p.strip() for p in parts if p and p.strip()] |
| 167 | return out if out else [s] | 167 | return out if out else [s] |
| 168 | 168 | ||
| 169 | + def _iter_multilang_product_tags( | ||
| 170 | + self, | ||
| 171 | + raw: Any, | ||
| 172 | + index_languages: List[str], | ||
| 173 | + primary_language: str, | ||
| 174 | + ) -> List[Tuple[str, str]]: | ||
| 175 | + if isinstance(raw, dict): | ||
| 176 | + pairs: List[Tuple[str, str]] = [] | ||
| 177 | + for lang in index_languages: | ||
| 178 | + for tag in self._iter_product_tags(raw.get(lang)): | ||
| 179 | + pairs.append((lang, tag)) | ||
| 180 | + return pairs | ||
| 181 | + | ||
| 182 | + pairs = [] | ||
| 183 | + for tag in self._iter_product_tags(raw): | ||
| 184 | + tag_lang, _, _ = detect_text_language_for_suggestions( | ||
| 185 | + tag, | ||
| 186 | + index_languages=index_languages, | ||
| 187 | + primary_language=primary_language, | ||
| 188 | + ) | ||
| 189 | + pairs.append((tag_lang, tag)) | ||
| 190 | + return pairs | ||
| 191 | + | ||
| 169 | @staticmethod | 192 | @staticmethod |
| 170 | def _looks_noise(text_value: str) -> bool: | 193 | def _looks_noise(text_value: str) -> bool: |
| 171 | if not text_value: | 194 | if not text_value: |
| @@ -487,12 +510,11 @@ class SuggestionIndexBuilder: | @@ -487,12 +510,11 @@ class SuggestionIndexBuilder: | ||
| 487 | key_to_candidate[key] = c | 510 | key_to_candidate[key] = c |
| 488 | c.add_product("qanchor", spu_id=product_id) | 511 | c.add_product("qanchor", spu_id=product_id) |
| 489 | 512 | ||
| 490 | - for tag in self._iter_product_tags(src.get("tags")): | ||
| 491 | - tag_lang, _, _ = detect_text_language_for_suggestions( | ||
| 492 | - tag, | ||
| 493 | - index_languages=index_languages, | ||
| 494 | - primary_language=primary_language, | ||
| 495 | - ) | 513 | + for tag_lang, tag in self._iter_multilang_product_tags( |
| 514 | + src.get("tags"), | ||
| 515 | + index_languages=index_languages, | ||
| 516 | + primary_language=primary_language, | ||
| 517 | + ): | ||
| 496 | text_norm = self._normalize_text(tag) | 518 | text_norm = self._normalize_text(tag) |
| 497 | if self._looks_noise(text_norm): | 519 | if self._looks_noise(text_norm): |
| 498 | continue | 520 | continue |
tests/ci/test_service_api_contracts.py
| @@ -345,33 +345,25 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient): | @@ -345,33 +345,25 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient): | ||
| 345 | def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch): | 345 | def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch): |
| 346 | import indexer.product_enrich as process_products | 346 | import indexer.product_enrich as process_products |
| 347 | 347 | ||
| 348 | - def _fake_analyze_products( | ||
| 349 | - products: List[Dict[str, str]], | ||
| 350 | - target_lang: str = "zh", | ||
| 351 | - batch_size: int | None = None, | ||
| 352 | - tenant_id: str | None = None, | ||
| 353 | - ): | ||
| 354 | - assert batch_size == 20 | 348 | + def _fake_build_index_content_fields(items: List[Dict[str, str]], tenant_id: str | None = None): |
| 349 | + assert tenant_id == "162" | ||
| 355 | return [ | 350 | return [ |
| 356 | { | 351 | { |
| 357 | - "id": p["id"], | ||
| 358 | - "lang": target_lang, | ||
| 359 | - "title_input": p["title"], | ||
| 360 | - "title": p["title"], | ||
| 361 | - "category_path": "", | ||
| 362 | - "tags": "tag1,tag2", | ||
| 363 | - "target_audience": "", | ||
| 364 | - "usage_scene": "", | ||
| 365 | - "season": "", | ||
| 366 | - "key_attributes": "", | ||
| 367 | - "material": "", | ||
| 368 | - "features": "", | ||
| 369 | - "anchor_text": f"{target_lang}-anchor-{p['id']}", | 352 | + "id": p["spu_id"], |
| 353 | + "qanchors": { | ||
| 354 | + "zh": [f"zh-anchor-{p['spu_id']}"], | ||
| 355 | + "en": [f"en-anchor-{p['spu_id']}"], | ||
| 356 | + }, | ||
| 357 | + "enriched_tags": {"zh": ["tag1", "tag2"], "en": ["tag1", "tag2"]}, | ||
| 358 | + "enriched_attributes": [ | ||
| 359 | + {"name": "enriched_tags", "value": {"zh": "tag1"}}, | ||
| 360 | + {"name": "enriched_tags", "value": {"en": "tag1"}}, | ||
| 361 | + ], | ||
| 370 | } | 362 | } |
| 371 | - for p in products | 363 | + for p in items |
| 372 | ] | 364 | ] |
| 373 | 365 | ||
| 374 | - monkeypatch.setattr(process_products, "analyze_products", _fake_analyze_products) | 366 | + monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields) |
| 375 | 367 | ||
| 376 | response = indexer_client.post( | 368 | response = indexer_client.post( |
| 377 | "/indexer/enrich-content", | 369 | "/indexer/enrich-content", |
| @@ -381,7 +373,6 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch | @@ -381,7 +373,6 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch | ||
| 381 | {"spu_id": "1001", "title": "T-shirt"}, | 373 | {"spu_id": "1001", "title": "T-shirt"}, |
| 382 | {"spu_id": "1002", "title": "Toy"}, | 374 | {"spu_id": "1002", "title": "Toy"}, |
| 383 | ], | 375 | ], |
| 384 | - "languages": ["zh", "en"], | ||
| 385 | }, | 376 | }, |
| 386 | ) | 377 | ) |
| 387 | assert response.status_code == 200 | 378 | assert response.status_code == 200 |
| @@ -390,9 +381,14 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch | @@ -390,9 +381,14 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch | ||
| 390 | assert data["total"] == 2 | 381 | assert data["total"] == 2 |
| 391 | assert len(data["results"]) == 2 | 382 | assert len(data["results"]) == 2 |
| 392 | assert data["results"][0]["spu_id"] == "1001" | 383 | assert data["results"][0]["spu_id"] == "1001" |
| 393 | - assert data["results"][0]["qanchors"]["zh"] == "zh-anchor-1001" | ||
| 394 | - assert data["results"][0]["qanchors"]["en"] == "en-anchor-1001" | ||
| 395 | - assert "tag1" in data["results"][0]["tags"] | 384 | + assert data["results"][0]["qanchors"]["zh"] == ["zh-anchor-1001"] |
| 385 | + assert data["results"][0]["qanchors"]["en"] == ["en-anchor-1001"] | ||
| 386 | + assert data["results"][0]["enriched_tags"]["zh"] == ["tag1", "tag2"] | ||
| 387 | + assert data["results"][0]["enriched_tags"]["en"] == ["tag1", "tag2"] | ||
| 388 | + assert data["results"][0]["enriched_attributes"][0] == { | ||
| 389 | + "name": "enriched_tags", | ||
| 390 | + "value": {"zh": "tag1"}, | ||
| 391 | + } | ||
| 396 | 392 | ||
| 397 | 393 | ||
| 398 | def test_indexer_documents_contract(indexer_client: TestClient): | 394 | def test_indexer_documents_contract(indexer_client: TestClient): |
| @@ -515,7 +511,6 @@ def test_indexer_enrich_content_validation_max_items(indexer_client: TestClient) | @@ -515,7 +511,6 @@ def test_indexer_enrich_content_validation_max_items(indexer_client: TestClient) | ||
| 515 | json={ | 511 | json={ |
| 516 | "tenant_id": "162", | 512 | "tenant_id": "162", |
| 517 | "items": [{"spu_id": str(i), "title": "x"} for i in range(51)], | 513 | "items": [{"spu_id": str(i), "title": "x"} for i in range(51)], |
| 518 | - "languages": ["zh"], | ||
| 519 | }, | 514 | }, |
| 520 | ) | 515 | ) |
| 521 | assert response.status_code == 400 | 516 | assert response.status_code == 400 |
tests/test_llm_enrichment_batch_fill.py
| @@ -7,33 +7,30 @@ import pandas as pd | @@ -7,33 +7,30 @@ import pandas as pd | ||
| 7 | from indexer.document_transformer import SPUDocumentTransformer | 7 | from indexer.document_transformer import SPUDocumentTransformer |
| 8 | 8 | ||
| 9 | 9 | ||
| 10 | -def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch): | 10 | +def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch): |
| 11 | seen_calls: List[Dict[str, Any]] = [] | 11 | seen_calls: List[Dict[str, Any]] = [] |
| 12 | 12 | ||
| 13 | - def _fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None): | ||
| 14 | - # should always request batch_size=20 and pass full list; internal splitter handles >20 | ||
| 15 | - seen_calls.append( | ||
| 16 | - { | ||
| 17 | - "n": len(products), | ||
| 18 | - "target_lang": target_lang, | ||
| 19 | - "batch_size": batch_size, | ||
| 20 | - "tenant_id": tenant_id, | ||
| 21 | - } | ||
| 22 | - ) | 13 | + def _fake_build_index_content_fields(items, tenant_id=None): |
| 14 | + seen_calls.append({"n": len(items), "tenant_id": tenant_id}) | ||
| 23 | return [ | 15 | return [ |
| 24 | { | 16 | { |
| 25 | - "id": p["id"], | ||
| 26 | - "lang": target_lang, | ||
| 27 | - "title_input": p["title"], | ||
| 28 | - "tags": "t1,t2", | ||
| 29 | - "anchor_text": f"{target_lang}-anchor-{p['id']}", | 17 | + "id": item["id"], |
| 18 | + "qanchors": { | ||
| 19 | + "zh": [f"zh-anchor-{item['id']}"], | ||
| 20 | + "en": [f"en-anchor-{item['id']}"], | ||
| 21 | + }, | ||
| 22 | + "tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]}, | ||
| 23 | + "enriched_attributes": [ | ||
| 24 | + {"name": "tags", "value": {"zh": "t1"}}, | ||
| 25 | + {"name": "tags", "value": {"en": "t1"}}, | ||
| 26 | + ], | ||
| 30 | } | 27 | } |
| 31 | - for p in products | 28 | + for item in items |
| 32 | ] | 29 | ] |
| 33 | 30 | ||
| 34 | import indexer.document_transformer as doc_tr | 31 | import indexer.document_transformer as doc_tr |
| 35 | 32 | ||
| 36 | - monkeypatch.setattr(doc_tr, "analyze_products", _fake_analyze_products) | 33 | + monkeypatch.setattr(doc_tr, "build_index_content_fields", _fake_build_index_content_fields) |
| 37 | 34 | ||
| 38 | transformer = SPUDocumentTransformer( | 35 | transformer = SPUDocumentTransformer( |
| 39 | category_id_to_name={}, | 36 | category_id_to_name={}, |
| @@ -54,11 +51,11 @@ def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch): | @@ -54,11 +51,11 @@ def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch): | ||
| 54 | 51 | ||
| 55 | transformer.fill_llm_attributes_batch(docs, rows) | 52 | transformer.fill_llm_attributes_batch(docs, rows) |
| 56 | 53 | ||
| 57 | - # called once per language, with full list; analyze_products handles splitting | ||
| 58 | - assert seen_calls == [ | ||
| 59 | - {"n": 45, "target_lang": "zh", "batch_size": 20, "tenant_id": "162"}, | ||
| 60 | - {"n": 45, "target_lang": "en", "batch_size": 20, "tenant_id": "162"}, | ||
| 61 | - ] | 54 | + assert seen_calls == [{"n": 45, "tenant_id": "162"}] |
| 62 | 55 | ||
| 63 | - assert docs[0]["qanchors"]["zh"] == "zh-anchor-0" | ||
| 64 | - assert docs[0]["qanchors"]["en"] == "en-anchor-0" | 56 | + assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"] |
| 57 | + assert docs[0]["qanchors"]["en"] == ["en-anchor-0"] | ||
| 58 | + assert docs[0]["tags"]["zh"] == ["t1", "t2"] | ||
| 59 | + assert docs[0]["tags"]["en"] == ["t1", "t2"] | ||
| 60 | + assert {"name": "tags", "value": {"zh": "t1"}} in docs[0]["enriched_attributes"] | ||
| 61 | + assert {"name": "tags", "value": {"en": "t1"}} in docs[0]["enriched_attributes"] |
tests/test_suggestions.py
| @@ -403,10 +403,13 @@ def test_build_full_candidates_tags_and_qanchor_phrases(monkeypatch): | @@ -403,10 +403,13 @@ def test_build_full_candidates_tags_and_qanchor_phrases(monkeypatch): | ||
| 403 | "spu_id": "900", | 403 | "spu_id": "900", |
| 404 | "title": {"en": "Tee", "zh": "T恤"}, | 404 | "title": {"en": "Tee", "zh": "T恤"}, |
| 405 | "qanchors": { | 405 | "qanchors": { |
| 406 | - "en": "slim fit, sporty casual", | ||
| 407 | - "zh": "修身, 显瘦", | 406 | + "en": ["slim fit", "sporty casual"], |
| 407 | + "zh": ["修身", "显瘦"], | ||
| 408 | + }, | ||
| 409 | + "tags": { | ||
| 410 | + "en": ["Classic", "ribbed neckline"], | ||
| 411 | + "zh": ["辣妹风"], | ||
| 408 | }, | 412 | }, |
| 409 | - "tags": ["Classic", "辣妹风", "ribbed neckline"], | ||
| 410 | }, | 413 | }, |
| 411 | } | 414 | } |
| 412 | ] | 415 | ] |