Commit d350861ffff80413a1e1be71dfbe478d514ee925

Authored by tangwang
1 parent fca871fb

索引结构修改

api/routes/indexer.py
@@ -80,7 +80,7 @@ class BuildDocsFromDbRequest(BaseModel): @@ -80,7 +80,7 @@ class BuildDocsFromDbRequest(BaseModel):
80 class EnrichContentItem(BaseModel): 80 class EnrichContentItem(BaseModel):
81 """单条待生成内容理解字段的商品。""" 81 """单条待生成内容理解字段的商品。"""
82 spu_id: str = Field(..., description="SPU ID") 82 spu_id: str = Field(..., description="SPU ID")
83 - title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / tags 等") 83 + title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / enriched_tags 等")
84 image_url: Optional[str] = Field(None, description="商品主图 URL(预留给多模态/内容理解扩展)") 84 image_url: Optional[str] = Field(None, description="商品主图 URL(预留给多模态/内容理解扩展)")
85 brief: Optional[str] = Field(None, description="商品简介/短描述") 85 brief: Optional[str] = Field(None, description="商品简介/短描述")
86 description: Optional[str] = Field(None, description="商品详情/长描述") 86 description: Optional[str] = Field(None, description="商品详情/长描述")
@@ -93,10 +93,6 @@ class EnrichContentRequest(BaseModel): @@ -93,10 +93,6 @@ class EnrichContentRequest(BaseModel):
93 """ 93 """
94 tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") 94 tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键")
95 items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)") 95 items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)")
96 - languages: List[str] = Field(  
97 - default_factory=lambda: ["zh", "en"],  
98 - description="目标语言列表,需在支持范围内(zh/en/de/ru/fr),默认 zh, en",  
99 - )  
100 96
101 97
102 @router.post("/reindex") 98 @router.post("/reindex")
@@ -444,92 +440,25 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): @@ -444,92 +440,25 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest):
444 raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") 440 raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
445 441
446 442
447 -def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]: 443 +def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]]) -> List[Dict[str, Any]]:
448 """ 444 """
449 - 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM,  
450 - 再聚合成每 SPU 的 qanchors、enriched_attributes、tags。供 run_in_executor 调用。 445 + 同步执行内容理解,返回与 ES mapping 对齐的字段结构。
  446 + 语言策略由 product_enrich 内部统一决定,路由层不参与。
451 """ 447 """
452 - from indexer.product_enrich import analyze_products, split_multi_value_field 448 + from indexer.product_enrich import build_index_content_fields
453 449
454 - llm_langs = list(dict.fromkeys(languages)) or ["en"]  
455 -  
456 - products = [ 450 + results = build_index_content_fields(items=items, tenant_id=tenant_id)
  451 + return [
457 { 452 {
458 - "id": it["spu_id"],  
459 - "title": (it.get("title") or "").strip(),  
460 - "brief": (it.get("brief") or "").strip(),  
461 - "description": (it.get("description") or "").strip(),  
462 - "image_url": (it.get("image_url") or "").strip(), 453 + "spu_id": item["id"],
  454 + "qanchors": item["qanchors"],
  455 + "enriched_attributes": item["enriched_attributes"],
  456 + "enriched_tags": item["enriched_tags"],
  457 + **({"error": item["error"]} if item.get("error") else {}),
463 } 458 }
464 - for it in items  
465 - ]  
466 - dim_keys = [  
467 - "tags",  
468 - "target_audience",  
469 - "usage_scene",  
470 - "season",  
471 - "key_attributes",  
472 - "material",  
473 - "features", 459 + for item in results
474 ] 460 ]
475 461
476 - # 按 spu_id 聚合:qanchors[lang], enriched_attributes[], tags[]  
477 - by_spu: Dict[str, Dict[str, Any]] = {}  
478 - for it in items:  
479 - sid = str(it["spu_id"])  
480 - by_spu[sid] = {"qanchors": {}, "enriched_attributes": [], "tags": []}  
481 -  
482 - for lang in llm_langs:  
483 - try:  
484 - rows = analyze_products(  
485 - products=products,  
486 - target_lang=lang,  
487 - batch_size=20,  
488 - tenant_id=tenant_id,  
489 - )  
490 - except Exception as e:  
491 - logger.warning("enrich-content analyze_products failed for lang=%s: %s", lang, e)  
492 - for it in items:  
493 - sid = str(it["spu_id"])  
494 - if "error" not in by_spu[sid]:  
495 - by_spu[sid]["error"] = str(e)  
496 - continue  
497 -  
498 - for row in rows:  
499 - spu_id = str(row.get("id") or "")  
500 - if spu_id not in by_spu:  
501 - continue  
502 - rec = by_spu[spu_id]  
503 - if row.get("error"):  
504 - rec["error"] = row["error"]  
505 - continue  
506 - anchor_text = str(row.get("anchor_text") or "").strip()  
507 - if anchor_text:  
508 - rec["qanchors"][lang] = anchor_text  
509 - for name in dim_keys:  
510 - raw = row.get(name)  
511 - if not raw:  
512 - continue  
513 - for value in split_multi_value_field(str(raw)):  
514 - rec["enriched_attributes"].append({"lang": lang, "name": name, "value": value})  
515 - if name == "tags":  
516 - rec["tags"].append(value)  
517 -  
518 - # 去重 tags(保持顺序)  
519 - out = []  
520 - for it in items:  
521 - sid = str(it["spu_id"])  
522 - rec = by_spu[sid]  
523 - tags = list(dict.fromkeys(rec["tags"]))  
524 - out.append({  
525 - "spu_id": sid,  
526 - "qanchors": rec["qanchors"],  
527 - "enriched_attributes": rec["enriched_attributes"],  
528 - "tags": tags,  
529 - **({"error": rec["error"]} if rec.get("error") else {}),  
530 - })  
531 - return out  
532 -  
533 462
534 @router.post("/enrich-content") 463 @router.post("/enrich-content")
535 async def enrich_content(request: EnrichContentRequest): 464 async def enrich_content(request: EnrichContentRequest):
@@ -540,7 +469,7 @@ async def enrich_content(request: EnrichContentRequest): @@ -540,7 +469,7 @@ async def enrich_content(request: EnrichContentRequest):
540 - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 469 - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的
541 锚文本与语义属性,再与翻译、向量化结果合并写入 ES。 470 锚文本与语义属性,再与翻译、向量化结果合并写入 ES。
542 - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可 471 - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可
543 - 先拿不含 qanchors/tags 的 doc,再异步或离线补齐本接口结果后更新 ES。 472 + 先拿不含 qanchors/enriched_tags 的 doc,再异步或离线补齐本接口结果后更新 ES。
544 473
545 实现逻辑与 indexer.product_enrich.analyze_products 一致,支持多语言与 Redis 缓存。 474 实现逻辑与 indexer.product_enrich.build_index_content_fields 一致,语言策略由其内部决定,支持 Redis 缓存。
546 """ 475 """
@@ -568,8 +497,7 @@ async def enrich_content(request: EnrichContentRequest): @@ -568,8 +497,7 @@ async def enrich_content(request: EnrichContentRequest):
568 None, 497 None,
569 lambda: _run_enrich_content( 498 lambda: _run_enrich_content(
570 tenant_id=request.tenant_id, 499 tenant_id=request.tenant_id,
571 - items=items_payload,  
572 - languages=request.languages or ["zh", "en"], 500 + items=items_payload
573 ), 501 ),
574 ) 502 )
575 return { 503 return {
docs/suggestion索引构建.md
@@ -169,7 +169,7 @@ @@ -169,7 +169,7 @@
169 169
170 ##### 4.1 从商品索引收集 title / qanchors / tags(Step 1) 170 ##### 4.1 从商品索引收集 title / qanchors / enriched_tags(Step 1)
171 171
172 - - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"`, `"tags"`(按 `spu_id`、`id.keyword` 升序,便于 `search_after` 稳定分页) 172 + - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"`, `"enriched_tags"`(按 `spu_id`、`id.keyword` 升序,便于 `search_after` 稳定分页)
173 173
174 - 对每个商品文档: 174 - 对每个商品文档:
175 175
@@ -207,7 +207,7 @@ @@ -207,7 +207,7 @@
207 - **qanchors 处理**: 207 - **qanchors 处理**:
208 - `qanchors` 字段同样为多语言对象: 208 - `qanchors` 字段同样为多语言对象:
209 ```json 209 ```json
210 - "qanchors": { "en": "...", "zh": "..." } 210 + "qanchors": { "en": ["slim fit", "sporty casual"], "zh": ["修身", "显瘦"] }
211 ``` 211 ```
212 - 取 `q_raw = qanchors[lang]` 212 - 取 `q_raw = qanchors[lang]`
213 - 通过 `_split_qanchors(q_raw)` 拆分为若干字符串: 213 - 通过 `_split_qanchors(q_raw)` 拆分为若干字符串:
@@ -217,10 +217,14 @@ @@ -217,10 +217,14 @@
217 - `text_norm = _normalize_text(q_text)`,再用 `_looks_noise` 过滤 217 - `text_norm = _normalize_text(q_text)`,再用 `_looks_noise` 过滤
218 - 同样按 `(lang, text_norm)` 合并为 `SuggestionCandidate`,调用 `add_product("qanchor", spu_id=product_id)`。 218 - 同样按 `(lang, text_norm)` 合并为 `SuggestionCandidate`,调用 `add_product("qanchor", spu_id=product_id)`。
219 219
220 - 4. **tags 处理**(与 `index_languages` 循环并列,每个商品只做一次):  
221 - - `tags` 可为字符串数组,或逗号等分隔的单个字符串;经 `_iter_product_tags` 展开为若干条。  
222 - - 每条 tag **无语言字段**:使用 `query.query_parser.detect_text_language_for_suggestions`(与 `QueryParser` 相同的 `LanguageDetector`)判定语言,并约束在租户的 `index_languages` 内。  
223 - - 通过 `_looks_noise` 后按 `(detected_lang, text_norm)` 合并,调用 `add_product("tag", spu_id=product_id)`。 220 + 4. **enriched_tags 处理**(与 `index_languages` 循环并列):
  221 + - `enriched_tags` 现为多语言对象,例如:
  222 + ```json
  223 + "enriched_tags": { "en": ["Classic", "ribbed neckline"], "zh": ["辣妹风"] }
  224 + ```
  225 + - 优先读取 `enriched_tags[lang]`,每个值可为字符串数组,或逗号等分隔的单个字符串;经 `_iter_product_tags` 展开为若干条。
  226 + - 对历史旧数据,若 `enriched_tags` 仍是单层字符串 / 数组,则继续走语言检测兜底,并约束在租户的 `index_languages` 内。
  227 + - 通过 `_looks_noise` 后按 `(lang, text_norm)` 合并,调用 `add_product("tag", spu_id=product_id)`。
224 228
225 ##### 4.2 从查询日志收集用户 query(Step 2) 229 ##### 4.2 从查询日志收集用户 query(Step 2)
226 230
docs/搜索API对接指南-05-索引接口(Indexer).md
@@ -13,7 +13,7 @@ @@ -13,7 +13,7 @@
13 | 查询文档 | POST | `/indexer/documents` | 按 SPU ID 列表查询 ES 文档,不写入 ES | 13 | 查询文档 | POST | `/indexer/documents` | 按 SPU ID 列表查询 ES 文档,不写入 ES |
14 | 构建 ES 文档(正式) | POST | `/indexer/build-docs` | 由上游提供 MySQL 行数据,返回 ES-ready 文档,不写 ES | 14 | 构建 ES 文档(正式) | POST | `/indexer/build-docs` | 由上游提供 MySQL 行数据,返回 ES-ready 文档,不写 ES |
15 | 构建 ES 文档(测试) | POST | `/indexer/build-docs-from-db` | 由本服务查库并构建文档,仅测试/调试用 | 15 | 构建 ES 文档(测试) | POST | `/indexer/build-docs-from-db` | 由本服务查库并构建文档,仅测试/调试用 |
16 -| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、semantic_attributes、tags(供微服务组合方式使用) | 16 +| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、enriched_attributes、enriched_tags(供微服务组合方式使用) |
17 | 索引健康检查 | GET | `/indexer/health` | 检查索引服务与数据库连接状态 | 17 | 索引健康检查 | GET | `/indexer/health` | 检查索引服务与数据库连接状态 |
18 18
19 #### 5.0 支撑外部 indexer 的三种方式 19 #### 5.0 支撑外部 indexer 的三种方式
@@ -510,7 +510,6 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ @@ -510,7 +510,6 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \
510 { 510 {
511 "spu": { 511 "spu": {
512 "id": 10001, 512 "id": 10001,
513 - "tenant_id": "162",  
514 "title": "测试T恤 纯棉短袖", 513 "title": "测试T恤 纯棉短袖",
515 "brief": "舒适纯棉,多色可选", 514 "brief": "舒适纯棉,多色可选",
516 "description": "这是一款适合日常穿着的纯棉T恤,透气吸汗。", 515 "description": "这是一款适合日常穿着的纯棉T恤,透气吸汗。",
@@ -521,7 +520,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ @@ -521,7 +520,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \
521 "category_path": "服装/上衣/T恤", 520 "category_path": "服装/上衣/T恤",
522 "fake_sales": 1280, 521 "fake_sales": 1280,
523 "image_src": "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg", 522 "image_src": "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg",
524 - "tags": "T恤,纯棉,短袖,夏季", 523 + "enriched_tags": ["T恤", "纯棉"],
525 "create_time": "2024-01-01T00:00:00Z", 524 "create_time": "2024-01-01T00:00:00Z",
526 "update_time": "2024-01-01T00:00:00Z" 525 "update_time": "2024-01-01T00:00:00Z"
527 }, 526 },
@@ -570,7 +569,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ @@ -570,7 +569,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \
570 "tenant_id": "170", 569 "tenant_id": "170",
571 "spu_id": "223167", 570 "spu_id": "223167",
572 "title": { "en": "...", "zh": "..." }, 571 "title": { "en": "...", "zh": "..." },
573 - "tags": ["Floerns", "Clothing", "Shoes & Jewelry"], 572 + "enriched_tags": ["Floerns", "Clothing", "Shoes & Jewelry"],
574 "skus": [ 573 "skus": [
575 { 574 {
576 "sku_id": "3988393", 575 "sku_id": "3988393",
@@ -649,7 +648,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -649,7 +648,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
649 ### 5.8 内容理解字段生成接口 648 ### 5.8 内容理解字段生成接口
650 649
651 - **端点**: `POST /indexer/enrich-content` 650 - **端点**: `POST /indexer/enrich-content`
652 -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_enrich` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 651 +- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**enriched_tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。接口只暴露商品内容输入,语言选择、分析维度与最终字段结构统一由 `indexer.product_enrich` 内部决定;当前返回结果与 `search_products` mapping 保持一致。单次请求在线程池中执行,避免阻塞其他接口。
653 652
654 #### 请求参数 653 #### 请求参数
655 654
@@ -669,8 +668,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -669,8 +668,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
669 "title": "12PCS Dolls with Bottles", 668 "title": "12PCS Dolls with Bottles",
670 "image_url": "https://example.com/images/223168.jpg" 669 "image_url": "https://example.com/images/223168.jpg"
671 } 670 }
672 - ],  
673 - "languages": ["zh", "en"] 671 + ]
674 } 672 }
675 ``` 673 ```
676 674
@@ -678,7 +676,6 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -678,7 +676,6 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
678 |------|------|------|--------|------| 676 |------|------|------|--------|------|
679 | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| 677 | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用|
680 | `items` | array | Y | - | 待分析列表;**单次最多 50 条** | 678 | `items` | array | Y | - | 待分析列表;**单次最多 50 条** |
681 -| `languages` | array[string] | N | `["zh", "en"]` | 目标语言,需在支持范围内:`zh`、`en`、`de`、`ru`、`fr` |  
682 679
683 `items[]` 字段说明: 680 `items[]` 字段说明:
684 681
@@ -696,6 +693,12 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -696,6 +693,12 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
696 - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。 693 - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。
697 - 因此,输入内容不变时可跨请求直接命中缓存;任一输入字段变化时,会自然落到新的缓存 key。 694 - 因此,输入内容不变时可跨请求直接命中缓存;任一输入字段变化时,会自然落到新的缓存 key。
698 695
  696 +语言说明:
  697 +
  698 +- 接口不接受语言控制参数。
  699 +- 返回哪些语言、返回哪些语义维度,统一由 `indexer.product_enrich` 内部逻辑决定。
  700 +- 当前为了与 `search_products` mapping 对齐,返回结果只包含核心索引语言 `zh`、`en`。
  701 +
699 批量请求建议: 702 批量请求建议:
700 - **全量**:强烈建议 尽可能 **20 个 SPU/doc** 攒成一个批次后再请求一次。 703 - **全量**:强烈建议 尽可能 **20 个 SPU/doc** 攒成一个批次后再请求一次。
701 - **增量**:可按时效要求设置时间窗口(例如 **5 分钟**),在窗口内尽可能攒到 **20 个**;达到 20 或窗口到期就发送一次请求。 704 - **增量**:可按时效要求设置时间窗口(例如 **5 分钟**),在窗口内尽可能攒到 **20 个**;达到 20 或窗口到期就发送一次请求。
@@ -711,21 +714,28 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -711,21 +714,28 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
711 { 714 {
712 "spu_id": "223167", 715 "spu_id": "223167",
713 "qanchors": { 716 "qanchors": {
714 - "zh": "短袖T恤,纯棉,男装,夏季",  
715 - "en": "cotton t-shirt, short sleeve, men, summer" 717 + "zh": ["短袖T恤", "纯棉", "男装", "夏季"],
  718 + "en": ["cotton t-shirt", "short sleeve", "men", "summer"]
  719 + },
  720 + "enriched_tags": {
  721 + "zh": ["纯棉", "短袖", "男装"],
  722 + "en": ["cotton", "short sleeve", "men"]
716 }, 723 },
717 "enriched_attributes": [ 724 "enriched_attributes": [
718 - { "lang": "zh", "name": "tags", "value": "纯棉" },  
719 - { "lang": "zh", "name": "usage_scene", "value": "日常" },  
720 - { "lang": "en", "name": "tags", "value": "cotton" }  
721 - ],  
722 - "tags": ["纯棉", "短袖", "男装", "cotton", "short sleeve"] 725 + { "name": "enriched_tags", "value": { "zh": "纯棉" } },
  726 + { "name": "usage_scene", "value": { "zh": "日常" } },
  727 + { "name": "enriched_tags", "value": { "en": "cotton" } }
  728 + ]
723 }, 729 },
724 { 730 {
725 "spu_id": "223168", 731 "spu_id": "223168",
726 - "qanchors": { "en": "dolls, toys, 12pcs" },  
727 - "enriched_attributes": [],  
728 - "tags": ["dolls", "toys"] 732 + "qanchors": {
  733 + "en": ["dolls", "toys", "12pcs"]
  734 + },
  735 + "enriched_tags": {
  736 + "en": ["dolls", "toys"]
  737 + },
  738 + "enriched_attributes": []
729 } 739 }
730 ] 740 ]
731 } 741 }
@@ -733,10 +743,10 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -733,10 +743,10 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
733 743
734 | 字段 | 类型 | 说明 | 744 | 字段 | 类型 | 说明 |
735 |------|------|------| 745 |------|------|------|
736 -| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`tags` |  
737 -| `results[].qanchors` | object | 按语言键的锚文本(逗号分隔短语),可写入 ES 文档的 `qanchors.{lang}` |  
738 -| `results[].enriched_attributes` | array | 语义属性列表,每项为 `{ "lang", "name", "value" }`,可写入 ES 的 `enriched_attributes` nested 字段 |  
739 -| `results[].tags` | array | 从语义属性中抽取的 `name=tags` 的 value 集合,可与业务原有 `tags` 合并后写入 ES 的 `tags` 字段 | 746 +| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags` |
  747 +| `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 |
  748 +| `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 |
  749 +| `results[].enriched_attributes` | array | 与 ES `enriched_attributes` nested 字段同结构,每项为 `{ "name", "value": { "zh"?: "...", "en"?: "..." } }` |
740 | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 | 750 | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 |
741 751
742 **错误响应**: 752 **错误响应**:
@@ -758,8 +768,7 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ @@ -758,8 +768,7 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \
758 "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", 768 "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。",
759 "image_url": "https://example.com/images/223167.jpg" 769 "image_url": "https://example.com/images/223167.jpg"
760 } 770 }
761 - ],  
762 - "languages": ["zh", "en"] 771 + ]
763 }' 772 }'
764 ``` 773 ```
765 774
indexer/document_transformer.py
@@ -11,9 +11,8 @@ SPU文档转换器 - 公共转换逻辑。 @@ -11,9 +11,8 @@ SPU文档转换器 - 公共转换逻辑。
11 import pandas as pd 11 import pandas as pd
12 import numpy as np 12 import numpy as np
13 import logging 13 import logging
14 -import re  
15 from typing import Dict, Any, Optional, List 14 from typing import Dict, Any, Optional, List
16 -from indexer.product_enrich import analyze_products, split_multi_value_field 15 +from indexer.product_enrich import build_index_content_fields
17 16
18 logger = logging.getLogger(__name__) 17 logger = logging.getLogger(__name__)
19 18
@@ -75,6 +74,39 @@ class SPUDocumentTransformer: @@ -75,6 +74,39 @@ class SPUDocumentTransformer:
75 ) 74 )
76 return translations 75 return translations
77 76
  77 + def _build_core_language_text_object(
  78 + self,
  79 + text: Optional[str],
  80 + source_lang: str,
  81 + scene: str = "general",
  82 + ) -> Dict[str, str]:
  83 + """
  84 + 构建与 mapping 中 core_language_text(_with_keyword) 对齐的对象。
  85 + 当前核心语言固定为 zh/en。
  86 + """
  87 + if not text or not str(text).strip():
  88 + return {}
  89 +
  90 + source_text = str(text).strip()
  91 + obj: Dict[str, str] = {}
  92 +
  93 + if source_lang in CORE_INDEX_LANGUAGES:
  94 + obj[source_lang] = source_text
  95 +
  96 + if self.translator:
  97 + translations = self._translate_index_languages(
  98 + text=source_text,
  99 + source_lang=source_lang,
  100 + index_languages=CORE_INDEX_LANGUAGES,
  101 + scene=scene,
  102 + )
  103 + for lang in CORE_INDEX_LANGUAGES:
  104 + val = translations.get(lang)
  105 + if val and str(val).strip():
  106 + obj[lang] = str(val).strip()
  107 +
  108 + return obj
  109 +
78 def transform_spu_to_doc( 110 def transform_spu_to_doc(
79 self, 111 self,
80 tenant_id: str, 112 tenant_id: str,
@@ -118,10 +150,16 @@ class SPUDocumentTransformer: @@ -118,10 +150,16 @@ class SPUDocumentTransformer:
118 if self.enable_title_embedding and self.encoder: 150 if self.enable_title_embedding and self.encoder:
119 self._fill_title_embedding(doc) 151 self._fill_title_embedding(doc)
120 152
121 - # Tags 153 + # Tags:统一转成与 mapping 一致的 core-language object
122 if pd.notna(spu_row.get('tags')): 154 if pd.notna(spu_row.get('tags')):
123 tags_str = str(spu_row['tags']) 155 tags_str = str(spu_row['tags'])
124 - doc['tags'] = split_multi_value_field(tags_str) 156 + tags_obj = self._build_core_language_text_object(
  157 + tags_str,
  158 + source_lang=primary_lang,
  159 + scene="general",
  160 + )
  161 + if tags_obj:
  162 + doc['tags'] = tags_obj
125 163
126 # Category相关字段 164 # Category相关字段
127 self._fill_category_fields(doc, spu_row) 165 self._fill_category_fields(doc, spu_row)
@@ -202,7 +240,8 @@ class SPUDocumentTransformer: @@ -202,7 +240,8 @@ class SPUDocumentTransformer:
202 """ 240 """
203 批量调用 LLM,为一批 doc 填充: 241 批量调用 LLM,为一批 doc 填充:
204 - qanchors.{lang} 242 - qanchors.{lang}
205 - - enriched_attributes (lang/name/value) 243 + - tags.{lang}
  244 + - enriched_attributes[].value.{lang}
206 245
207 设计目标: 246 设计目标:
208 - 尽可能攒批调用 LLM; 247 - 尽可能攒批调用 LLM;
@@ -211,16 +250,8 @@ class SPUDocumentTransformer: @@ -211,16 +250,8 @@ class SPUDocumentTransformer:
211 if not docs or not spu_rows or len(docs) != len(spu_rows): 250 if not docs or not spu_rows or len(docs) != len(spu_rows):
212 return 251 return
213 252
214 - try:  
215 - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"]  
216 - except Exception:  
217 - index_langs = ["en", "zh"]  
218 - # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用  
219 - llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序  
220 -  
221 - # 只对有 title 的 SPU 参与 LLM;其余跳过  
222 id_to_idx: Dict[str, int] = {} 253 id_to_idx: Dict[str, int] = {}
223 - products: List[Dict[str, str]] = [] 254 + items: List[Dict[str, str]] = []
224 for i, row in enumerate(spu_rows): 255 for i, row in enumerate(spu_rows):
225 raw_id = row.get("id") 256 raw_id = row.get("id")
226 spu_id = "" if raw_id is None else str(raw_id).strip() 257 spu_id = "" if raw_id is None else str(raw_id).strip()
@@ -228,69 +259,45 @@ class SPUDocumentTransformer: @@ -228,69 +259,45 @@ class SPUDocumentTransformer:
228 if not spu_id or not title: 259 if not spu_id or not title:
229 continue 260 continue
230 id_to_idx[spu_id] = i 261 id_to_idx[spu_id] = i
231 - products.append({"id": spu_id, "title": title})  
232 - if not products: 262 + items.append(
  263 + {
  264 + "id": spu_id,
  265 + "title": title,
  266 + "brief": str(row.get("brief") or "").strip(),
  267 + "description": str(row.get("description") or "").strip(),
  268 + "image_url": str(row.get("image_src") or "").strip(),
  269 + }
  270 + )
  271 + if not items:
233 return 272 return
234 273
235 tenant_id = str(docs[0].get("tenant_id") or "").strip() or None 274 tenant_id = str(docs[0].get("tenant_id") or "").strip() or None
  275 + try:
  276 + results = build_index_content_fields(items=items, tenant_id=tenant_id)
  277 + except Exception as e:
  278 + logger.warning("LLM batch attribute fill failed: %s", e)
  279 + return
236 280
237 - dim_keys = [  
238 - "tags",  
239 - "target_audience",  
240 - "usage_scene",  
241 - "season",  
242 - "key_attributes",  
243 - "material",  
244 - "features",  
245 - ]  
246 -  
247 - for lang in llm_langs:  
248 - try:  
249 - rows = analyze_products(  
250 - products=products,  
251 - target_lang=lang,  
252 - batch_size=20,  
253 - tenant_id=tenant_id,  
254 - )  
255 - except Exception as e:  
256 - logger.warning("LLM batch attribute fill failed (lang=%s): %s", lang, e) 281 + for result in results:
  282 + spu_id = str(result.get("id") or "").strip()
  283 + if not spu_id:
257 continue 284 continue
  285 + idx = id_to_idx.get(spu_id)
  286 + if idx is None:
  287 + continue
  288 + self._apply_content_enrichment(docs[idx], result)
258 289
259 - for row in rows or []:  
260 - spu_id = str(row.get("id") or "").strip()  
261 - if not spu_id:  
262 - continue  
263 - idx = id_to_idx.get(spu_id)  
264 - if idx is None:  
265 - continue  
266 - self._apply_llm_row(docs[idx], row=row, lang=lang, dim_keys=dim_keys)  
267 -  
268 - def _apply_llm_row(self, doc: Dict[str, Any], row: Dict[str, Any], lang: str, dim_keys: List[str]) -> None:  
269 - """将单条 LLM 输出 row 按既定结构写入 doc(不抛异常)。""" 290 + def _apply_content_enrichment(self, doc: Dict[str, Any], enrichment: Dict[str, Any]) -> None:
  291 + """将 product_enrich 产出的 ES-ready 内容字段写入 doc。"""
270 try: 292 try:
271 - if row.get("error"):  
272 - return  
273 -  
274 - semantic_list = doc.get("enriched_attributes") or []  
275 - qanchors_obj = doc.get("qanchors") or {}  
276 -  
277 - anchor_text = str(row.get("anchor_text") or "").strip()  
278 - if anchor_text:  
279 - qanchors_obj[lang] = anchor_text  
280 -  
281 - for name in dim_keys:  
282 - raw = row.get(name)  
283 - if not raw:  
284 - continue  
285 - for value in split_multi_value_field(str(raw)):  
286 - semantic_list.append({"lang": lang, "name": name, "value": value})  
287 -  
288 - if qanchors_obj:  
289 - doc["qanchors"] = qanchors_obj  
290 - if semantic_list:  
291 - doc["enriched_attributes"] = semantic_list 293 + if enrichment.get("qanchors"):
  294 + doc["qanchors"] = enrichment["qanchors"]
  295 + if enrichment.get("tags"):
  296 + doc["tags"] = enrichment["tags"]
  297 + if enrichment.get("enriched_attributes"):
  298 + doc["enriched_attributes"] = enrichment["enriched_attributes"]
292 except Exception as e: 299 except Exception as e:
293 - logger.warning("Failed to apply LLM row to doc (spu_id=%s, lang=%s): %s", doc.get("spu_id"), lang, e) 300 + logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e)
294 301
295 def _fill_text_fields( 302 def _fill_text_fields(
296 self, 303 self,
@@ -544,6 +551,23 @@ class SPUDocumentTransformer: @@ -544,6 +551,23 @@ class SPUDocumentTransformer:
544 if pd.notna(position) and pd.notna(name): 551 if pd.notna(position) and pd.notna(name):
545 option_name_map[int(position)] = str(name) 552 option_name_map[int(position)] = str(name)
546 553
  554 + primary_lang = self.tenant_config.get('primary_language', 'en')
  555 +
  556 + def _build_specification(name: str, raw_value: Any, sku_id: str) -> Optional[Dict[str, Any]]:
  557 + value = "" if raw_value is None else str(raw_value).strip()
  558 + if not value:
  559 + return None
  560 + return {
  561 + 'sku_id': sku_id,
  562 + 'name': name,
  563 + 'value_keyword': value,
  564 + 'value_text': self._build_core_language_text_object(
  565 + value,
  566 + source_lang=primary_lang,
  567 + scene="general",
  568 + ) or normalize_core_text_field_value(value, primary_lang),
  569 + }
  570 +
547 for _, sku_row in skus.iterrows(): 571 for _, sku_row in skus.iterrows():
548 sku_data = self._transform_sku_row(sku_row, option_name_map) 572 sku_data = self._transform_sku_row(sku_row, option_name_map)
549 if sku_data: 573 if sku_data:
@@ -584,23 +608,17 @@ class SPUDocumentTransformer: @@ -584,23 +608,17 @@ class SPUDocumentTransformer:
584 # 构建specifications(从SKU的option值和option表的name) 608 # 构建specifications(从SKU的option值和option表的name)
585 sku_id = str(sku_row['id']) 609 sku_id = str(sku_row['id'])
586 if pd.notna(sku_row.get('option1')) and 1 in option_name_map: 610 if pd.notna(sku_row.get('option1')) and 1 in option_name_map:
587 - specifications.append({  
588 - 'sku_id': sku_id,  
589 - 'name': option_name_map[1],  
590 - 'value': str(sku_row['option1'])  
591 - }) 611 + spec = _build_specification(option_name_map[1], sku_row['option1'], sku_id)
  612 + if spec:
  613 + specifications.append(spec)
592 if pd.notna(sku_row.get('option2')) and 2 in option_name_map: 614 if pd.notna(sku_row.get('option2')) and 2 in option_name_map:
593 - specifications.append({  
594 - 'sku_id': sku_id,  
595 - 'name': option_name_map[2],  
596 - 'value': str(sku_row['option2'])  
597 - }) 615 + spec = _build_specification(option_name_map[2], sku_row['option2'], sku_id)
  616 + if spec:
  617 + specifications.append(spec)
598 if pd.notna(sku_row.get('option3')) and 3 in option_name_map: 618 if pd.notna(sku_row.get('option3')) and 3 in option_name_map:
599 - specifications.append({  
600 - 'sku_id': sku_id,  
601 - 'name': option_name_map[3],  
602 - 'value': str(sku_row['option3'])  
603 - }) 619 + spec = _build_specification(option_name_map[3], sku_row['option3'], sku_id)
  620 + if spec:
  621 + specifications.append(spec)
604 622
605 return skus_list, prices, compare_prices, sku_prices, sku_weights, sku_weight_units, total_inventory, specifications 623 return skus_list, prices, compare_prices, sku_prices, sku_weights, sku_weight_units, total_inventory, specifications
606 624
@@ -636,82 +654,36 @@ class SPUDocumentTransformer: @@ -636,82 +654,36 @@ class SPUDocumentTransformer:
636 654
637 def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None: 655 def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None:
638 """ 656 """
639 - 调用 indexer.product_enrich.analyze_products,为当前 SPU 填充: 657 + 调用 indexer.product_enrich 的高层内容理解入口,为当前 SPU 填充:
640 - qanchors.{lang} 658 - qanchors.{lang}
641 - - enriched_attributes (lang/name/value) 659 + - tags.{lang}
  660 + - enriched_attributes[].value.{lang}
642 """ 661 """
643 - try:  
644 - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"]  
645 - except Exception:  
646 - index_langs = ["en", "zh"]  
647 -  
648 - # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用  
649 - llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序  
650 -  
651 spu_id = str(spu_row.get("id") or "").strip() 662 spu_id = str(spu_row.get("id") or "").strip()
652 title = str(spu_row.get("title") or "").strip() 663 title = str(spu_row.get("title") or "").strip()
653 if not spu_id or not title: 664 if not spu_id or not title:
654 return 665 return
655 666
656 - semantic_list = doc.get("enriched_attributes") or []  
657 - qanchors_obj = doc.get("qanchors") or {}  
658 -  
659 - dim_keys = [  
660 - "tags",  
661 - "target_audience",  
662 - "usage_scene",  
663 - "season",  
664 - "key_attributes",  
665 - "material",  
666 - "features",  
667 - ]  
668 -  
669 tenant_id = doc.get("tenant_id") 667 tenant_id = doc.get("tenant_id")
  668 + try:
  669 + results = build_index_content_fields(
  670 + items=[
  671 + {
  672 + "id": spu_id,
  673 + "title": title,
  674 + "brief": str(spu_row.get("brief") or "").strip(),
  675 + "description": str(spu_row.get("description") or "").strip(),
  676 + "image_url": str(spu_row.get("image_src") or "").strip(),
  677 + }
  678 + ],
  679 + tenant_id=str(tenant_id),
  680 + )
  681 + except Exception as e:
  682 + logger.warning("LLM attribute fill failed for SPU %s: %s", spu_id, e)
  683 + return
670 684
671 - for lang in llm_langs:  
672 - try:  
673 - rows = analyze_products(  
674 - products=[{"id": spu_id, "title": title}],  
675 - target_lang=lang,  
676 - batch_size=1,  
677 - tenant_id=str(tenant_id),  
678 - )  
679 - except Exception as e:  
680 - logger.warning(  
681 - "LLM attribute fill failed for SPU %s, lang=%s: %s",  
682 - spu_id,  
683 - lang,  
684 - e,  
685 - )  
686 - continue  
687 -  
688 - if not rows:  
689 - continue  
690 - row = rows[0] or {}  
691 -  
692 - # qanchors.{lang}  
693 - anchor_text = str(row.get("anchor_text") or "").strip()  
694 - if anchor_text:  
695 - qanchors_obj[lang] = anchor_text  
696 -  
697 - # 语义属性:按各维度拆分为短语  
698 - for name in dim_keys:  
699 - raw = row.get(name)  
700 - if not raw:  
701 - continue  
702 - for value in split_multi_value_field(str(raw)):  
703 - semantic_list.append(  
704 - {  
705 - "lang": lang,  
706 - "name": name,  
707 - "value": value,  
708 - }  
709 - )  
710 -  
711 - if qanchors_obj:  
712 - doc["qanchors"] = qanchors_obj  
713 - if semantic_list:  
714 - doc["enriched_attributes"] = semantic_list 685 + if results:
  686 + self._apply_content_enrichment(doc, results[0])
715 687
716 def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: 688 def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]:
717 """ 689 """
indexer/product_enrich.py
@@ -146,6 +146,16 @@ if _missing_prompt_langs: @@ -146,6 +146,16 @@ if _missing_prompt_langs:
146 146
147 # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 147 # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白
148 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") 148 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+")
# Languages the generated content-understanding fields are produced for
# (the "core index languages" of the ES mapping — see mappings/README).
_CORE_INDEX_LANGUAGES = ("zh", "en")
# analyze_products output columns that are folded into `enriched_attributes`;
# `enriched_tags` is additionally mirrored into the top-level per-language
# `enriched_tags` map by `_apply_index_content_row`.
_ENRICHED_ATTRIBUTE_DIMENSIONS = (
    "enriched_tags",
    "target_audience",
    "usage_scene",
    "season",
    "key_attributes",
    "material",
    "features",
)
149 159
150 160
151 def split_multi_value_field(text: Optional[str]) -> List[str]: 161 def split_multi_value_field(text: Optional[str]) -> List[str]:
@@ -158,6 +168,124 @@ def split_multi_value_field(text: Optional[str]) -> List[str]: @@ -158,6 +168,124 @@ def split_multi_value_field(text: Optional[str]) -> List[str]:
158 return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()] 168 return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()]
159 169
160 170
def _append_lang_phrase_map(target: Dict[str, List[str]], lang: str, raw_value: Any) -> None:
    """Merge phrases split out of *raw_value* into ``target[lang]``.

    Existing phrases keep their position; new ones are appended. Duplicates
    are dropped while preserving first-seen order. When *raw_value* yields
    no phrases the map is left untouched (no empty-list entry is created).
    """
    incoming = split_multi_value_field(raw_value)
    if not incoming:
        return
    # Plain dict keys double as an ordered set (insertion order is guaranteed).
    ordered: Dict[str, None] = {}
    for phrase in target.get(lang) or []:
        cleaned = str(phrase).strip()
        if cleaned:
            ordered[cleaned] = None
    for phrase in incoming:
        ordered[phrase] = None
    if ordered:
        target[lang] = list(ordered)
  179 +
  180 +
def _append_enriched_attribute(
    target: List[Dict[str, Any]],
    name: str,
    lang: str,
    raw_value: Any,
) -> None:
    """Append one ``{"name": ..., "value": {lang: ...}}`` entry per phrase in *raw_value*.

    Entries already present in *target* for the same ``(name, lang, value)``
    triple are skipped, so repeated calls stay idempotent.
    """
    # Snapshot the values already recorded for this (name, lang) pair once,
    # instead of rescanning the whole list for every candidate phrase.
    seen = {
        entry["value"].get(lang)
        for entry in target
        if entry.get("name") == name and isinstance(entry.get("value"), dict)
    }
    for phrase in split_multi_value_field(raw_value):
        if phrase in seen:
            continue
        seen.add(phrase)
        target.append({"name": name, "value": {lang: phrase}})
  196 +
  197 +
def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None:
    """Fold a single ``analyze_products`` row for *lang* into the aggregate *result*."""
    if not row:
        return
    if row.get("error"):
        # Error-flagged rows carry no usable content for this language.
        return

    anchor = str(row.get("anchor_text") or "").strip()
    if anchor:
        _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor)

    for dimension in _ENRICHED_ATTRIBUTE_DIMENSIONS:
        value = row.get(dimension)
        if not value:
            continue
        _append_enriched_attribute(
            result["enriched_attributes"],
            name=dimension,
            lang=lang,
            raw_value=value,
        )
        if dimension == "enriched_tags":
            # Tags are also surfaced as a top-level per-language list.
            _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=value)
  213 +
  214 +
  215 +def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]:
  216 + item_id = str(item.get("id") or item.get("spu_id") or "").strip()
  217 + return {
  218 + "id": item_id,
  219 + "title": str(item.get("title") or "").strip(),
  220 + "brief": str(item.get("brief") or "").strip(),
  221 + "description": str(item.get("description") or "").strip(),
  222 + "image_url": str(item.get("image_url") or "").strip(),
  223 + }
  224 +
  225 +
def build_index_content_fields(
    items: List[Dict[str, Any]],
    tenant_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """High-level entry: generate content-understanding fields aligned with the ES mapping.

    Each input item must provide:
      - ``id`` or ``spu_id``
      - ``title``
      - optionally ``brief`` / ``description`` / ``image_url``

    Each returned item (one per input, same order) contains:
      - ``id``
      - ``qanchors`` -- ``{lang: [phrase, ...]}``
      - ``enriched_tags`` -- ``{lang: [tag, ...]}``
      - ``enriched_attributes``
      - ``error`` (optional, first failure seen for that item)
    """
    normalized = [_normalize_index_content_item(entry) for entry in items]
    if not normalized:
        return []

    # One mutable aggregate per item id; filled in language by language.
    aggregated: Dict[str, Dict[str, Any]] = {}
    for entry in normalized:
        aggregated[entry["id"]] = {
            "id": entry["id"],
            "qanchors": {},
            "enriched_tags": {},
            "enriched_attributes": [],
        }

    for lang in _CORE_INDEX_LANGUAGES:
        try:
            rows = analyze_products(
                products=normalized,
                target_lang=lang,
                batch_size=BATCH_SIZE,
                tenant_id=tenant_id,
            )
        except Exception as e:
            # A failure in one language must not abort the others; record the
            # first error per item and keep going.
            logger.warning("build_index_content_fields failed for lang=%s: %s", lang, e)
            for entry in normalized:
                aggregated[entry["id"]].setdefault("error", str(e))
            continue

        for row in rows or []:
            row_id = str(row.get("id") or "").strip()
            if not row_id or row_id not in aggregated:
                # Ignore rows we cannot attribute to a requested item.
                continue
            if row.get("error"):
                aggregated[row_id].setdefault("error", row["error"])
                continue
            _apply_index_content_row(aggregated[row_id], row=row, lang=lang)

    return [aggregated[entry["id"]] for entry in normalized]
  287 +
  288 +
161 def _normalize_space(text: str) -> str: 289 def _normalize_space(text: str) -> str:
162 return re.sub(r"\s+", " ", (text or "").strip()) 290 return re.sub(r"\s+", " ", (text or "").strip())
163 291
@@ -526,7 +654,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: @@ -526,7 +654,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:
526 "seq_no": parts[0], 654 "seq_no": parts[0],
527 "title": parts[1], # 商品标题(按目标语言) 655 "title": parts[1], # 商品标题(按目标语言)
528 "category_path": parts[2] if len(parts) > 2 else "", # 品类路径 656 "category_path": parts[2] if len(parts) > 2 else "", # 品类路径
529 - "tags": parts[3] if len(parts) > 3 else "", # 细分标签 657 + "enriched_tags": parts[3] if len(parts) > 3 else "", # 细分标签
530 "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群 658 "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群
531 "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景 659 "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景
532 "season": parts[6] if len(parts) > 6 else "", # 适用季节 660 "season": parts[6] if len(parts) > 6 else "", # 适用季节
@@ -603,7 +731,7 @@ def process_batch( @@ -603,7 +731,7 @@ def process_batch(
603 "title_input": item.get("title", ""), 731 "title_input": item.get("title", ""),
604 "title": "", 732 "title": "",
605 "category_path": "", 733 "category_path": "",
606 - "tags": "", 734 + "enriched_tags": "",
607 "target_audience": "", 735 "target_audience": "",
608 "usage_scene": "", 736 "usage_scene": "",
609 "season": "", 737 "season": "",
@@ -643,7 +771,7 @@ def process_batch( @@ -643,7 +771,7 @@ def process_batch(
643 "title_input": batch_data[i]["title"], # 原始输入标题 771 "title_input": batch_data[i]["title"], # 原始输入标题
644 "title": parsed_item.get("title", ""), # 模型生成的标题 772 "title": parsed_item.get("title", ""), # 模型生成的标题
645 "category_path": parsed_item.get("category_path", ""), # 品类路径 773 "category_path": parsed_item.get("category_path", ""), # 品类路径
646 - "tags": parsed_item.get("tags", ""), # 细分标签 774 + "enriched_tags": parsed_item.get("enriched_tags", ""), # 细分标签
647 "target_audience": parsed_item.get("target_audience", ""), # 适用人群 775 "target_audience": parsed_item.get("target_audience", ""), # 适用人群
648 "usage_scene": parsed_item.get("usage_scene", ""), # 使用场景 776 "usage_scene": parsed_item.get("usage_scene", ""), # 使用场景
649 "season": parsed_item.get("season", ""), # 适用季节 777 "season": parsed_item.get("season", ""), # 适用季节
@@ -686,7 +814,7 @@ def process_batch( @@ -686,7 +814,7 @@ def process_batch(
686 "title_input": item["title"], 814 "title_input": item["title"],
687 "title": "", 815 "title": "",
688 "category_path": "", 816 "category_path": "",
689 - "tags": "", 817 + "enriched_tags": "",
690 "target_audience": "", 818 "target_audience": "",
691 "usage_scene": "", 819 "usage_scene": "",
692 "season": "", 820 "season": "",
mappings/README.md
@@ -34,8 +34,8 @@ @@ -34,8 +34,8 @@
34 34
35 当前字段大致分为几类: 35 当前字段大致分为几类:
36 36
37 -- 全语言字段:`title`、`keywords`、`brief`、`description`、`vendor`、`category_path`、`category_name_text`、`specifications.value`  
38 -- 核心索引语言字段:`qanchors`、`tags`、`option1_values`、`option2_values`、`option3_values`、`enriched_attributes.value` 37 +- 全语言字段:`title`、`keywords`、`brief`、`description`、`vendor`、`category_path`、`category_name_text`
  38 +- 核心索引语言字段:`qanchors`、`enriched_tags`、`option1_values`、`option2_values`、`option3_values`、`enriched_attributes.value`
39 - 复合嵌套字段:`image_embedding`、`specifications`、`enriched_attributes`、`skus` 39 - 复合嵌套字段:`image_embedding`、`specifications`、`enriched_attributes`、`skus`
40 - 其他标量字段:`tenant_id`、`spu_id`、价格、库存、类目等 40 - 其他标量字段:`tenant_id`、`spu_id`、价格、库存、类目等
41 41
@@ -63,11 +63,12 @@ @@ -63,11 +63,12 @@
63 典型字段: 63 典型字段:
64 64
65 - `qanchors` 65 - `qanchors`
66 -- `tags` 66 +- `enriched_tags`
67 - `option1_values` 67 - `option1_values`
68 - `option2_values` 68 - `option2_values`
69 - `option3_values` 69 - `option3_values`
70 - `enriched_attributes.value` 70 - `enriched_attributes.value`
  71 +- `specifications.value_text`
71 72
72 以 `category_path` 和 `option*_values` 为例,核心语言灌入结果应至少包含: 73 以 `category_path` 和 `option*_values` 为例,核心语言灌入结果应至少包含:
73 74
@@ -118,7 +119,6 @@ @@ -118,7 +119,6 @@
118 - `vendor` 119 - `vendor`
119 - `category_path` 120 - `category_path`
120 - `category_name_text` 121 - `category_name_text`
121 -- `specifications.value`  
122 122
123 灌入规则: 123 灌入规则:
124 124
@@ -151,7 +151,7 @@ @@ -151,7 +151,7 @@
151 } 151 }
152 ``` 152 ```
153 153
154 -示例:规格值 `specifications.value` 154 +示例:规格值 `specifications.value_text` / `specifications.value_keyword`
155 155
156 ```json 156 ```json
157 { 157 {
@@ -159,16 +159,21 @@ @@ -159,16 +159,21 @@
159 { 159 {
160 "sku_id": "sku-red-s", 160 "sku_id": "sku-red-s",
161 "name": "color", 161 "name": "color",
162 - "value": { 162 + "value_keyword": "красный",
  163 + "value_text": {
163 "zh": "红色", 164 "zh": "红色",
164 - "en": "red",  
165 - "ru": "красный" 165 + "en": "red"
166 } 166 }
167 } 167 }
168 ] 168 ]
169 } 169 }
170 ``` 170 ```
171 171
  172 +其中:
  173 +
  174 +- `specifications.value_keyword` 保存原始规格值,用于精确过滤 / 分面
  175 +- `specifications.value_text` 保存 `zh/en` 两个核心索引语言版本,用于检索召回
  176 +
172 ### 原始语言为中文或英文时 177 ### 原始语言为中文或英文时
173 178
174 如果原始语言就是核心索引语言之一,不需要额外再写第三份语言字段。 179 如果原始语言就是核心索引语言之一,不需要额外再写第三份语言字段。
@@ -210,7 +215,7 @@ @@ -210,7 +215,7 @@
210 - 标量字段:直接写固定值,例如 `tenant_id`、`spu_id`、`min_price` 215 - 标量字段:直接写固定值,例如 `tenant_id`、`spu_id`、`min_price`
211 - 核心索引语言字段:只生成 `zh/en` 216 - 核心索引语言字段:只生成 `zh/en`
212 - 全语言字段:生成 `zh/en`,再按原始语言补一个对应语种字段 217 - 全语言字段:生成 `zh/en`,再按原始语言补一个对应语种字段
213 -- 嵌套字段:对每个元素内部重复应用同样规则,例如 `specifications[].value` 218 +- 嵌套字段:对每个元素内部重复应用同样规则,例如 `specifications[].value_text`、`enriched_attributes[].value`
214 219
215 ### 推荐灌入流程 220 ### 推荐灌入流程
216 221
mappings/generate_search_products_mapping.py
@@ -194,8 +194,7 @@ FIELD_SPECS = [ @@ -194,8 +194,7 @@ FIELD_SPECS = [
194 ), 194 ),
195 text_field("category_path", "all_language_text_with_keyword"), 195 text_field("category_path", "all_language_text_with_keyword"),
196 text_field("category_name_text", "all_language_text_with_keyword"), 196 text_field("category_name_text", "all_language_text_with_keyword"),
197 - text_field("qanchors", "core_language_text"),  
198 - text_field("tags", "core_language_text_with_keyword"), 197 + text_field("tags", "all_language_text_with_keyword"),
199 scalar_field("category_id", "keyword"), 198 scalar_field("category_id", "keyword"),
200 scalar_field("category_name", "keyword"), 199 scalar_field("category_name", "keyword"),
201 scalar_field("category_level", "integer"), 200 scalar_field("category_level", "integer"),
@@ -209,6 +208,8 @@ FIELD_SPECS = [ @@ -209,6 +208,8 @@ FIELD_SPECS = [
209 scalar_field("value_keyword", "keyword"), 208 scalar_field("value_keyword", "keyword"),
210 text_field("value_text", "core_language_text_with_keyword"), 209 text_field("value_text", "core_language_text_with_keyword"),
211 ), 210 ),
  211 + text_field("qanchors", "core_language_text"),
  212 + text_field("enriched_tags", "core_language_text_with_keyword"),
212 nested_field( 213 nested_field(
213 "enriched_attributes", 214 "enriched_attributes",
214 scalar_field("name", "keyword"), 215 scalar_field("name", "keyword"),
suggestion/builder.py
@@ -166,6 +166,29 @@ class SuggestionIndexBuilder: @@ -166,6 +166,29 @@ class SuggestionIndexBuilder:
166 out = [p.strip() for p in parts if p and p.strip()] 166 out = [p.strip() for p in parts if p and p.strip()]
167 return out if out else [s] 167 return out if out else [s]
168 168
  169 + def _iter_multilang_product_tags(
  170 + self,
  171 + raw: Any,
  172 + index_languages: List[str],
  173 + primary_language: str,
  174 + ) -> List[Tuple[str, str]]:
  175 + if isinstance(raw, dict):
  176 + pairs: List[Tuple[str, str]] = []
  177 + for lang in index_languages:
  178 + for tag in self._iter_product_tags(raw.get(lang)):
  179 + pairs.append((lang, tag))
  180 + return pairs
  181 +
  182 + pairs = []
  183 + for tag in self._iter_product_tags(raw):
  184 + tag_lang, _, _ = detect_text_language_for_suggestions(
  185 + tag,
  186 + index_languages=index_languages,
  187 + primary_language=primary_language,
  188 + )
  189 + pairs.append((tag_lang, tag))
  190 + return pairs
  191 +
169 @staticmethod 192 @staticmethod
170 def _looks_noise(text_value: str) -> bool: 193 def _looks_noise(text_value: str) -> bool:
171 if not text_value: 194 if not text_value:
@@ -487,12 +510,11 @@ class SuggestionIndexBuilder: @@ -487,12 +510,11 @@ class SuggestionIndexBuilder:
487 key_to_candidate[key] = c 510 key_to_candidate[key] = c
488 c.add_product("qanchor", spu_id=product_id) 511 c.add_product("qanchor", spu_id=product_id)
489 512
490 - for tag in self._iter_product_tags(src.get("tags")):  
491 - tag_lang, _, _ = detect_text_language_for_suggestions(  
492 - tag,  
493 - index_languages=index_languages,  
494 - primary_language=primary_language,  
495 - ) 513 + for tag_lang, tag in self._iter_multilang_product_tags(
  514 + src.get("tags"),
  515 + index_languages=index_languages,
  516 + primary_language=primary_language,
  517 + ):
496 text_norm = self._normalize_text(tag) 518 text_norm = self._normalize_text(tag)
497 if self._looks_noise(text_norm): 519 if self._looks_noise(text_norm):
498 continue 520 continue
tests/ci/test_service_api_contracts.py
@@ -345,33 +345,25 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient): @@ -345,33 +345,25 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient):
345 def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch): 345 def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch):
346 import indexer.product_enrich as process_products 346 import indexer.product_enrich as process_products
347 347
348 - def _fake_analyze_products(  
349 - products: List[Dict[str, str]],  
350 - target_lang: str = "zh",  
351 - batch_size: int | None = None,  
352 - tenant_id: str | None = None,  
353 - ):  
354 - assert batch_size == 20 348 + def _fake_build_index_content_fields(items: List[Dict[str, str]], tenant_id: str | None = None):
  349 + assert tenant_id == "162"
355 return [ 350 return [
356 { 351 {
357 - "id": p["id"],  
358 - "lang": target_lang,  
359 - "title_input": p["title"],  
360 - "title": p["title"],  
361 - "category_path": "",  
362 - "tags": "tag1,tag2",  
363 - "target_audience": "",  
364 - "usage_scene": "",  
365 - "season": "",  
366 - "key_attributes": "",  
367 - "material": "",  
368 - "features": "",  
369 - "anchor_text": f"{target_lang}-anchor-{p['id']}", 352 + "id": p["spu_id"],
  353 + "qanchors": {
  354 + "zh": [f"zh-anchor-{p['spu_id']}"],
  355 + "en": [f"en-anchor-{p['spu_id']}"],
  356 + },
  357 + "enriched_tags": {"zh": ["tag1", "tag2"], "en": ["tag1", "tag2"]},
  358 + "enriched_attributes": [
  359 + {"name": "enriched_tags", "value": {"zh": "tag1"}},
  360 + {"name": "enriched_tags", "value": {"en": "tag1"}},
  361 + ],
370 } 362 }
371 - for p in products 363 + for p in items
372 ] 364 ]
373 365
374 - monkeypatch.setattr(process_products, "analyze_products", _fake_analyze_products) 366 + monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields)
375 367
376 response = indexer_client.post( 368 response = indexer_client.post(
377 "/indexer/enrich-content", 369 "/indexer/enrich-content",
@@ -381,7 +373,6 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch @@ -381,7 +373,6 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
381 {"spu_id": "1001", "title": "T-shirt"}, 373 {"spu_id": "1001", "title": "T-shirt"},
382 {"spu_id": "1002", "title": "Toy"}, 374 {"spu_id": "1002", "title": "Toy"},
383 ], 375 ],
384 - "languages": ["zh", "en"],  
385 }, 376 },
386 ) 377 )
387 assert response.status_code == 200 378 assert response.status_code == 200
@@ -390,9 +381,14 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch @@ -390,9 +381,14 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
390 assert data["total"] == 2 381 assert data["total"] == 2
391 assert len(data["results"]) == 2 382 assert len(data["results"]) == 2
392 assert data["results"][0]["spu_id"] == "1001" 383 assert data["results"][0]["spu_id"] == "1001"
393 - assert data["results"][0]["qanchors"]["zh"] == "zh-anchor-1001"  
394 - assert data["results"][0]["qanchors"]["en"] == "en-anchor-1001"  
395 - assert "tag1" in data["results"][0]["tags"] 384 + assert data["results"][0]["qanchors"]["zh"] == ["zh-anchor-1001"]
  385 + assert data["results"][0]["qanchors"]["en"] == ["en-anchor-1001"]
  386 + assert data["results"][0]["enriched_tags"]["zh"] == ["tag1", "tag2"]
  387 + assert data["results"][0]["enriched_tags"]["en"] == ["tag1", "tag2"]
  388 + assert data["results"][0]["enriched_attributes"][0] == {
  389 + "name": "enriched_tags",
  390 + "value": {"zh": "tag1"},
  391 + }
396 392
397 393
398 def test_indexer_documents_contract(indexer_client: TestClient): 394 def test_indexer_documents_contract(indexer_client: TestClient):
@@ -515,7 +511,6 @@ def test_indexer_enrich_content_validation_max_items(indexer_client: TestClient) @@ -515,7 +511,6 @@ def test_indexer_enrich_content_validation_max_items(indexer_client: TestClient)
515 json={ 511 json={
516 "tenant_id": "162", 512 "tenant_id": "162",
517 "items": [{"spu_id": str(i), "title": "x"} for i in range(51)], 513 "items": [{"spu_id": str(i), "title": "x"} for i in range(51)],
518 - "languages": ["zh"],  
519 }, 514 },
520 ) 515 )
521 assert response.status_code == 400 516 assert response.status_code == 400
tests/test_llm_enrichment_batch_fill.py
@@ -7,33 +7,30 @@ import pandas as pd @@ -7,33 +7,30 @@ import pandas as pd
7 from indexer.document_transformer import SPUDocumentTransformer 7 from indexer.document_transformer import SPUDocumentTransformer
8 8
9 9
10 -def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch): 10 +def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch):
11 seen_calls: List[Dict[str, Any]] = [] 11 seen_calls: List[Dict[str, Any]] = []
12 12
13 - def _fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None):  
14 - # should always request batch_size=20 and pass full list; internal splitter handles >20  
15 - seen_calls.append(  
16 - {  
17 - "n": len(products),  
18 - "target_lang": target_lang,  
19 - "batch_size": batch_size,  
20 - "tenant_id": tenant_id,  
21 - }  
22 - ) 13 + def _fake_build_index_content_fields(items, tenant_id=None):
  14 + seen_calls.append({"n": len(items), "tenant_id": tenant_id})
23 return [ 15 return [
24 { 16 {
25 - "id": p["id"],  
26 - "lang": target_lang,  
27 - "title_input": p["title"],  
28 - "tags": "t1,t2",  
29 - "anchor_text": f"{target_lang}-anchor-{p['id']}", 17 + "id": item["id"],
  18 + "qanchors": {
  19 + "zh": [f"zh-anchor-{item['id']}"],
  20 + "en": [f"en-anchor-{item['id']}"],
  21 + },
  22 + "tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]},
  23 + "enriched_attributes": [
  24 + {"name": "tags", "value": {"zh": "t1"}},
  25 + {"name": "tags", "value": {"en": "t1"}},
  26 + ],
30 } 27 }
31 - for p in products 28 + for item in items
32 ] 29 ]
33 30
34 import indexer.document_transformer as doc_tr 31 import indexer.document_transformer as doc_tr
35 32
36 - monkeypatch.setattr(doc_tr, "analyze_products", _fake_analyze_products) 33 + monkeypatch.setattr(doc_tr, "build_index_content_fields", _fake_build_index_content_fields)
37 34
38 transformer = SPUDocumentTransformer( 35 transformer = SPUDocumentTransformer(
39 category_id_to_name={}, 36 category_id_to_name={},
@@ -54,11 +51,11 @@ def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch): @@ -54,11 +51,11 @@ def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch):
54 51
55 transformer.fill_llm_attributes_batch(docs, rows) 52 transformer.fill_llm_attributes_batch(docs, rows)
56 53
57 - # called once per language, with full list; analyze_products handles splitting  
58 - assert seen_calls == [  
59 - {"n": 45, "target_lang": "zh", "batch_size": 20, "tenant_id": "162"},  
60 - {"n": 45, "target_lang": "en", "batch_size": 20, "tenant_id": "162"},  
61 - ] 54 + assert seen_calls == [{"n": 45, "tenant_id": "162"}]
62 55
63 - assert docs[0]["qanchors"]["zh"] == "zh-anchor-0"  
64 - assert docs[0]["qanchors"]["en"] == "en-anchor-0" 56 + assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"]
  57 + assert docs[0]["qanchors"]["en"] == ["en-anchor-0"]
  58 + assert docs[0]["tags"]["zh"] == ["t1", "t2"]
  59 + assert docs[0]["tags"]["en"] == ["t1", "t2"]
  60 + assert {"name": "tags", "value": {"zh": "t1"}} in docs[0]["enriched_attributes"]
  61 + assert {"name": "tags", "value": {"en": "t1"}} in docs[0]["enriched_attributes"]
tests/test_suggestions.py
@@ -403,10 +403,13 @@ def test_build_full_candidates_tags_and_qanchor_phrases(monkeypatch): @@ -403,10 +403,13 @@ def test_build_full_candidates_tags_and_qanchor_phrases(monkeypatch):
403 "spu_id": "900", 403 "spu_id": "900",
404 "title": {"en": "Tee", "zh": "T恤"}, 404 "title": {"en": "Tee", "zh": "T恤"},
405 "qanchors": { 405 "qanchors": {
406 - "en": "slim fit, sporty casual",  
407 - "zh": "修身, 显瘦", 406 + "en": ["slim fit", "sporty casual"],
  407 + "zh": ["修身", "显瘦"],
  408 + },
  409 + "tags": {
  410 + "en": ["Classic", "ribbed neckline"],
  411 + "zh": ["辣妹风"],
408 }, 412 },
409 - "tags": ["Classic", "辣妹风", "ribbed neckline"],  
410 }, 413 },
411 } 414 }
412 ] 415 ]