Commit d350861ffff80413a1e1be71dfbe478d514ee925

Authored by tangwang
1 parent fca871fb

索引结构修改

api/routes/indexer.py
... ... @@ -80,7 +80,7 @@ class BuildDocsFromDbRequest(BaseModel):
80 80 class EnrichContentItem(BaseModel):
81 81 """单条待生成内容理解字段的商品。"""
82 82 spu_id: str = Field(..., description="SPU ID")
83   - title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / tags 等")
  83 + title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / enriched_tags 等")
84 84 image_url: Optional[str] = Field(None, description="商品主图 URL(预留给多模态/内容理解扩展)")
85 85 brief: Optional[str] = Field(None, description="商品简介/短描述")
86 86 description: Optional[str] = Field(None, description="商品详情/长描述")
... ... @@ -93,10 +93,6 @@ class EnrichContentRequest(BaseModel):
93 93 """
94 94 tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键")
95 95 items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)")
96   - languages: List[str] = Field(
97   - default_factory=lambda: ["zh", "en"],
98   - description="目标语言列表,需在支持范围内(zh/en/de/ru/fr),默认 zh, en",
99   - )
100 96  
101 97  
102 98 @router.post("/reindex")
... ... @@ -444,92 +440,25 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest):
444 440 raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
445 441  
446 442  
447   -def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]:
  443 +def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]]) -> List[Dict[str, Any]]:
448 444 """
449   - 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM,
450   - 再聚合成每 SPU 的 qanchors、enriched_attributes、tags。供 run_in_executor 调用。
  445 + 同步执行内容理解,返回与 ES mapping 对齐的字段结构。
  446 + 语言策略由 product_enrich 内部统一决定,路由层不参与。
451 447 """
452   - from indexer.product_enrich import analyze_products, split_multi_value_field
  448 + from indexer.product_enrich import build_index_content_fields
453 449  
454   - llm_langs = list(dict.fromkeys(languages)) or ["en"]
455   -
456   - products = [
  450 + results = build_index_content_fields(items=items, tenant_id=tenant_id)
  451 + return [
457 452 {
458   - "id": it["spu_id"],
459   - "title": (it.get("title") or "").strip(),
460   - "brief": (it.get("brief") or "").strip(),
461   - "description": (it.get("description") or "").strip(),
462   - "image_url": (it.get("image_url") or "").strip(),
  453 + "spu_id": item["id"],
  454 + "qanchors": item["qanchors"],
  455 + "enriched_attributes": item["enriched_attributes"],
  456 + "enriched_tags": item["enriched_tags"],
  457 + **({"error": item["error"]} if item.get("error") else {}),
463 458 }
464   - for it in items
465   - ]
466   - dim_keys = [
467   - "tags",
468   - "target_audience",
469   - "usage_scene",
470   - "season",
471   - "key_attributes",
472   - "material",
473   - "features",
  459 + for item in results
474 460 ]
475 461  
476   - # 按 spu_id 聚合:qanchors[lang], enriched_attributes[], tags[]
477   - by_spu: Dict[str, Dict[str, Any]] = {}
478   - for it in items:
479   - sid = str(it["spu_id"])
480   - by_spu[sid] = {"qanchors": {}, "enriched_attributes": [], "tags": []}
481   -
482   - for lang in llm_langs:
483   - try:
484   - rows = analyze_products(
485   - products=products,
486   - target_lang=lang,
487   - batch_size=20,
488   - tenant_id=tenant_id,
489   - )
490   - except Exception as e:
491   - logger.warning("enrich-content analyze_products failed for lang=%s: %s", lang, e)
492   - for it in items:
493   - sid = str(it["spu_id"])
494   - if "error" not in by_spu[sid]:
495   - by_spu[sid]["error"] = str(e)
496   - continue
497   -
498   - for row in rows:
499   - spu_id = str(row.get("id") or "")
500   - if spu_id not in by_spu:
501   - continue
502   - rec = by_spu[spu_id]
503   - if row.get("error"):
504   - rec["error"] = row["error"]
505   - continue
506   - anchor_text = str(row.get("anchor_text") or "").strip()
507   - if anchor_text:
508   - rec["qanchors"][lang] = anchor_text
509   - for name in dim_keys:
510   - raw = row.get(name)
511   - if not raw:
512   - continue
513   - for value in split_multi_value_field(str(raw)):
514   - rec["enriched_attributes"].append({"lang": lang, "name": name, "value": value})
515   - if name == "tags":
516   - rec["tags"].append(value)
517   -
518   - # 去重 tags(保持顺序)
519   - out = []
520   - for it in items:
521   - sid = str(it["spu_id"])
522   - rec = by_spu[sid]
523   - tags = list(dict.fromkeys(rec["tags"]))
524   - out.append({
525   - "spu_id": sid,
526   - "qanchors": rec["qanchors"],
527   - "enriched_attributes": rec["enriched_attributes"],
528   - "tags": tags,
529   - **({"error": rec["error"]} if rec.get("error") else {}),
530   - })
531   - return out
532   -
533 462  
534 463 @router.post("/enrich-content")
535 464 async def enrich_content(request: EnrichContentRequest):
... ... @@ -540,7 +469,7 @@ async def enrich_content(request: EnrichContentRequest):
540 469 - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的
541 470 锚文本与语义属性,再与翻译、向量化结果合并写入 ES。
542 471 - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可
543   - 先拿不含 qanchors/tags 的 doc,再异步或离线补齐本接口结果后更新 ES。
  472 + 先拿不含 qanchors/enriched_tags 的 doc,再异步或离线补齐本接口结果后更新 ES。
544 473  
545 474 实现逻辑与 indexer.product_enrich.analyze_products 一致,支持多语言与 Redis 缓存。
546 475 """
... ... @@ -568,8 +497,7 @@ async def enrich_content(request: EnrichContentRequest):
568 497 None,
569 498 lambda: _run_enrich_content(
570 499 tenant_id=request.tenant_id,
571   - items=items_payload,
572   - languages=request.languages or ["zh", "en"],
  500 + items=items_payload
573 501 ),
574 502 )
575 503 return {
... ...
docs/suggestion索引构建.md
... ... @@ -169,7 +169,7 @@
169 169  
170 170 ##### 4.1 从商品索引收集 title / qanchors / tags(Step 1)
171 171  
172   - - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"`, `"tags"`(按 `spu_id`、`id.keyword` 升序,便于 `search_after` 稳定分页)
  172 + - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"`, `"enriched_tags"`(按 `spu_id`、`id.keyword` 升序,便于 `search_after` 稳定分页)
173 173  
174 174 - 对每个商品文档:
175 175  
... ... @@ -207,7 +207,7 @@
207 207 - **qanchors 处理**:
208 208 - `qanchors` 字段同样为多语言对象:
209 209 ```json
210   - "qanchors": { "en": "...", "zh": "..." }
  210 + "qanchors": { "en": ["slim fit", "sporty casual"], "zh": ["修身", "显瘦"] }
211 211 ```
212 212 - 取 `q_raw = qanchors[lang]`
213 213 - 通过 `_split_qanchors(q_raw)` 拆分为若干字符串:
... ... @@ -217,10 +217,14 @@
217 217 - `text_norm = _normalize_text(q_text)`,再用 `_looks_noise` 过滤
218 218 - 同样按 `(lang, text_norm)` 合并为 `SuggestionCandidate`,调用 `add_product("qanchor", spu_id=product_id)`。
219 219  
220   - 4. **tags 处理**(与 `index_languages` 循环并列,每个商品只做一次):
221   - - `tags` 可为字符串数组,或逗号等分隔的单个字符串;经 `_iter_product_tags` 展开为若干条。
222   - - 每条 tag **无语言字段**:使用 `query.query_parser.detect_text_language_for_suggestions`(与 `QueryParser` 相同的 `LanguageDetector`)判定语言,并约束在租户的 `index_languages` 内。
223   - - 通过 `_looks_noise` 后按 `(detected_lang, text_norm)` 合并,调用 `add_product("tag", spu_id=product_id)`。
  220 + 4. **enriched_tags 处理**(与 `index_languages` 循环并列):
  221 + - `enriched_tags` 现为多语言对象,例如:
  222 + ```json
  223 + "enriched_tags": { "en": ["Classic", "ribbed neckline"], "zh": ["辣妹风"] }
  224 + ```
  225 + - 优先读取 `enriched_tags[lang]`,每个值可为字符串数组,或逗号等分隔的单个字符串;经 `_iter_product_tags` 展开为若干条。
  226 + - 对历史旧数据,若 `enriched_tags` 仍是单层字符串 / 数组,则继续走语言检测兜底,并约束在租户的 `index_languages` 内。
  227 + - 通过 `_looks_noise` 后按 `(lang, text_norm)` 合并,调用 `add_product("tag", spu_id=product_id)`。
224 228  
225 229 ##### 4.2 从查询日志收集用户 query(Step 2)
226 230  
... ...
docs/搜索API对接指南-05-索引接口(Indexer).md
... ... @@ -13,7 +13,7 @@
13 13 | 查询文档 | POST | `/indexer/documents` | 按 SPU ID 列表查询 ES 文档,不写入 ES |
14 14 | 构建 ES 文档(正式) | POST | `/indexer/build-docs` | 由上游提供 MySQL 行数据,返回 ES-ready 文档,不写 ES |
15 15 | 构建 ES 文档(测试) | POST | `/indexer/build-docs-from-db` | 由本服务查库并构建文档,仅测试/调试用 |
16   -| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、semantic_attributes、tags(供微服务组合方式使用) |
  16 +| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、enriched_attributes、enriched_tags(供微服务组合方式使用) |
17 17 | 索引健康检查 | GET | `/indexer/health` | 检查索引服务与数据库连接状态 |
18 18  
19 19 #### 5.0 支撑外部 indexer 的三种方式
... ... @@ -510,7 +510,6 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \
510 510 {
511 511 "spu": {
512 512 "id": 10001,
513   - "tenant_id": "162",
514 513 "title": "测试T恤 纯棉短袖",
515 514 "brief": "舒适纯棉,多色可选",
516 515 "description": "这是一款适合日常穿着的纯棉T恤,透气吸汗。",
... ... @@ -521,7 +520,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \
521 520 "category_path": "服装/上衣/T恤",
522 521 "fake_sales": 1280,
523 522 "image_src": "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg",
524   - "tags": "T恤,纯棉,短袖,夏季",
  523 + "enriched_tags": ["T恤", "纯棉"],
525 524 "create_time": "2024-01-01T00:00:00Z",
526 525 "update_time": "2024-01-01T00:00:00Z"
527 526 },
... ... @@ -570,7 +569,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \
570 569 "tenant_id": "170",
571 570 "spu_id": "223167",
572 571 "title": { "en": "...", "zh": "..." },
573   - "tags": ["Floerns", "Clothing", "Shoes & Jewelry"],
  572 + "enriched_tags": ["Floerns", "Clothing", "Shoes & Jewelry"],
574 573 "skus": [
575 574 {
576 575 "sku_id": "3988393",
... ... @@ -649,7 +648,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
649 648 ### 5.8 内容理解字段生成接口
650 649  
651 650 - **端点**: `POST /indexer/enrich-content`
652   -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_enrich` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。
  651 +- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**enriched_tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。接口只暴露商品内容输入,语言选择、分析维度与最终字段结构统一由 `indexer.product_enrich` 内部决定;当前返回结果与 `search_products` mapping 保持一致。单次请求在线程池中执行,避免阻塞其他接口。
653 652  
654 653 #### 请求参数
655 654  
... ... @@ -669,8 +668,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
669 668 "title": "12PCS Dolls with Bottles",
670 669 "image_url": "https://example.com/images/223168.jpg"
671 670 }
672   - ],
673   - "languages": ["zh", "en"]
  671 + ]
674 672 }
675 673 ```
676 674  
... ... @@ -678,7 +676,6 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
678 676 |------|------|------|--------|------|
679 677 | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用|
680 678 | `items` | array | Y | - | 待分析列表;**单次最多 50 条** |
681   -| `languages` | array[string] | N | `["zh", "en"]` | 目标语言,需在支持范围内:`zh`、`en`、`de`、`ru`、`fr` |
682 679  
683 680 `items[]` 字段说明:
684 681  
... ... @@ -696,6 +693,12 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
696 693 - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。
697 694 - 因此,输入内容不变时可跨请求直接命中缓存;任一输入字段变化时,会自然落到新的缓存 key。
698 695  
  696 +语言说明:
  697 +
  698 +- 接口不接受语言控制参数。
  699 +- 返回哪些语言、返回哪些语义维度,统一由 `indexer.product_enrich` 内部逻辑决定。
  700 +- 当前为了与 `search_products` mapping 对齐,返回结果只包含核心索引语言 `zh`、`en`。
  701 +
699 702 批量请求建议:
700 703 - **全量**:强烈建议 尽可能 **20 个 SPU/doc** 攒成一个批次后再请求一次。
701 704 - **增量**:可按时效要求设置时间窗口(例如 **5 分钟**),在窗口内尽可能攒到 **20 个**;达到 20 或窗口到期就发送一次请求。
... ... @@ -711,21 +714,28 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
711 714 {
712 715 "spu_id": "223167",
713 716 "qanchors": {
714   - "zh": "短袖T恤,纯棉,男装,夏季",
715   - "en": "cotton t-shirt, short sleeve, men, summer"
  717 + "zh": ["短袖T恤", "纯棉", "男装", "夏季"],
  718 + "en": ["cotton t-shirt", "short sleeve", "men", "summer"]
  719 + },
  720 + "enriched_tags": {
  721 + "zh": ["纯棉", "短袖", "男装"],
  722 + "en": ["cotton", "short sleeve", "men"]
716 723 },
717 724 "enriched_attributes": [
718   - { "lang": "zh", "name": "tags", "value": "纯棉" },
719   - { "lang": "zh", "name": "usage_scene", "value": "日常" },
720   - { "lang": "en", "name": "tags", "value": "cotton" }
721   - ],
722   - "tags": ["纯棉", "短袖", "男装", "cotton", "short sleeve"]
  725 + { "name": "enriched_tags", "value": { "zh": "纯棉" } },
  726 + { "name": "usage_scene", "value": { "zh": "日常" } },
  727 + { "name": "enriched_tags", "value": { "en": "cotton" } }
  728 + ]
723 729 },
724 730 {
725 731 "spu_id": "223168",
726   - "qanchors": { "en": "dolls, toys, 12pcs" },
727   - "enriched_attributes": [],
728   - "tags": ["dolls", "toys"]
  732 + "qanchors": {
  733 + "en": ["dolls", "toys", "12pcs"]
  734 + },
  735 + "enriched_tags": {
  736 + "en": ["dolls", "toys"]
  737 + },
  738 + "enriched_attributes": []
729 739 }
730 740 ]
731 741 }
... ... @@ -733,10 +743,10 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
733 743  
734 744 | 字段 | 类型 | 说明 |
735 745 |------|------|------|
736   -| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`tags` |
737   -| `results[].qanchors` | object | 按语言键的锚文本(逗号分隔短语),可写入 ES 文档的 `qanchors.{lang}` |
738   -| `results[].enriched_attributes` | array | 语义属性列表,每项为 `{ "lang", "name", "value" }`,可写入 ES 的 `enriched_attributes` nested 字段 |
739   -| `results[].tags` | array | 从语义属性中抽取的 `name=tags` 的 value 集合,可与业务原有 `tags` 合并后写入 ES 的 `tags` 字段 |
  746 +| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags` |
  747 +| `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 |
  748 +| `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 |
  749 +| `results[].enriched_attributes` | array | 与 ES `enriched_attributes` nested 字段同结构,每项为 `{ "name", "value": { "zh"?: "...", "en"?: "..." } }` |
740 750 | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 |
741 751  
742 752 **错误响应**:
... ... @@ -758,8 +768,7 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \
758 768 "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。",
759 769 "image_url": "https://example.com/images/223167.jpg"
760 770 }
761   - ],
762   - "languages": ["zh", "en"]
  771 + ]
763 772 }'
764 773 ```
765 774  
... ...
indexer/document_transformer.py
... ... @@ -11,9 +11,8 @@ SPU文档转换器 - 公共转换逻辑。
11 11 import pandas as pd
12 12 import numpy as np
13 13 import logging
14   -import re
15 14 from typing import Dict, Any, Optional, List
16   -from indexer.product_enrich import analyze_products, split_multi_value_field
  15 +from indexer.product_enrich import build_index_content_fields
17 16  
18 17 logger = logging.getLogger(__name__)
19 18  
... ... @@ -75,6 +74,39 @@ class SPUDocumentTransformer:
75 74 )
76 75 return translations
77 76  
  77 + def _build_core_language_text_object(
  78 + self,
  79 + text: Optional[str],
  80 + source_lang: str,
  81 + scene: str = "general",
  82 + ) -> Dict[str, str]:
  83 + """
  84 + 构建与 mapping 中 core_language_text(_with_keyword) 对齐的对象。
  85 + 当前核心语言固定为 zh/en。
  86 + """
  87 + if not text or not str(text).strip():
  88 + return {}
  89 +
  90 + source_text = str(text).strip()
  91 + obj: Dict[str, str] = {}
  92 +
  93 + if source_lang in CORE_INDEX_LANGUAGES:
  94 + obj[source_lang] = source_text
  95 +
  96 + if self.translator:
  97 + translations = self._translate_index_languages(
  98 + text=source_text,
  99 + source_lang=source_lang,
  100 + index_languages=CORE_INDEX_LANGUAGES,
  101 + scene=scene,
  102 + )
  103 + for lang in CORE_INDEX_LANGUAGES:
  104 + val = translations.get(lang)
  105 + if val and str(val).strip():
  106 + obj[lang] = str(val).strip()
  107 +
  108 + return obj
  109 +
78 110 def transform_spu_to_doc(
79 111 self,
80 112 tenant_id: str,
... ... @@ -118,10 +150,16 @@ class SPUDocumentTransformer:
118 150 if self.enable_title_embedding and self.encoder:
119 151 self._fill_title_embedding(doc)
120 152  
121   - # Tags
  153 + # Tags:统一转成与 mapping 一致的 core-language object
122 154 if pd.notna(spu_row.get('tags')):
123 155 tags_str = str(spu_row['tags'])
124   - doc['tags'] = split_multi_value_field(tags_str)
  156 + tags_obj = self._build_core_language_text_object(
  157 + tags_str,
  158 + source_lang=primary_lang,
  159 + scene="general",
  160 + )
  161 + if tags_obj:
  162 + doc['tags'] = tags_obj
125 163  
126 164 # Category相关字段
127 165 self._fill_category_fields(doc, spu_row)
... ... @@ -202,7 +240,8 @@ class SPUDocumentTransformer:
202 240 """
203 241 批量调用 LLM,为一批 doc 填充:
204 242 - qanchors.{lang}
205   - - enriched_attributes (lang/name/value)
  243 + - enriched_tags.{lang}
  244 + - enriched_attributes[].value.{lang}
206 245  
207 246 设计目标:
208 247 - 尽可能攒批调用 LLM;
... ... @@ -211,16 +250,8 @@ class SPUDocumentTransformer:
211 250 if not docs or not spu_rows or len(docs) != len(spu_rows):
212 251 return
213 252  
214   - try:
215   - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"]
216   - except Exception:
217   - index_langs = ["en", "zh"]
218   - # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用
219   - llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序
220   -
221   - # 只对有 title 的 SPU 参与 LLM;其余跳过
222 253 id_to_idx: Dict[str, int] = {}
223   - products: List[Dict[str, str]] = []
  254 + items: List[Dict[str, str]] = []
224 255 for i, row in enumerate(spu_rows):
225 256 raw_id = row.get("id")
226 257 spu_id = "" if raw_id is None else str(raw_id).strip()
... ... @@ -228,69 +259,45 @@ class SPUDocumentTransformer:
228 259 if not spu_id or not title:
229 260 continue
230 261 id_to_idx[spu_id] = i
231   - products.append({"id": spu_id, "title": title})
232   - if not products:
  262 + items.append(
  263 + {
  264 + "id": spu_id,
  265 + "title": title,
  266 + "brief": str(row.get("brief") or "").strip(),
  267 + "description": str(row.get("description") or "").strip(),
  268 + "image_url": str(row.get("image_src") or "").strip(),
  269 + }
  270 + )
  271 + if not items:
233 272 return
234 273  
235 274 tenant_id = str(docs[0].get("tenant_id") or "").strip() or None
  275 + try:
  276 + results = build_index_content_fields(items=items, tenant_id=tenant_id)
  277 + except Exception as e:
  278 + logger.warning("LLM batch attribute fill failed: %s", e)
  279 + return
236 280  
237   - dim_keys = [
238   - "tags",
239   - "target_audience",
240   - "usage_scene",
241   - "season",
242   - "key_attributes",
243   - "material",
244   - "features",
245   - ]
246   -
247   - for lang in llm_langs:
248   - try:
249   - rows = analyze_products(
250   - products=products,
251   - target_lang=lang,
252   - batch_size=20,
253   - tenant_id=tenant_id,
254   - )
255   - except Exception as e:
256   - logger.warning("LLM batch attribute fill failed (lang=%s): %s", lang, e)
  281 + for result in results:
  282 + spu_id = str(result.get("id") or "").strip()
  283 + if not spu_id:
257 284 continue
  285 + idx = id_to_idx.get(spu_id)
  286 + if idx is None:
  287 + continue
  288 + self._apply_content_enrichment(docs[idx], result)
258 289  
259   - for row in rows or []:
260   - spu_id = str(row.get("id") or "").strip()
261   - if not spu_id:
262   - continue
263   - idx = id_to_idx.get(spu_id)
264   - if idx is None:
265   - continue
266   - self._apply_llm_row(docs[idx], row=row, lang=lang, dim_keys=dim_keys)
267   -
268   - def _apply_llm_row(self, doc: Dict[str, Any], row: Dict[str, Any], lang: str, dim_keys: List[str]) -> None:
269   - """将单条 LLM 输出 row 按既定结构写入 doc(不抛异常)。"""
  290 + def _apply_content_enrichment(self, doc: Dict[str, Any], enrichment: Dict[str, Any]) -> None:
  291 + """将 product_enrich 产出的 ES-ready 内容字段写入 doc。"""
270 292 try:
271   - if row.get("error"):
272   - return
273   -
274   - semantic_list = doc.get("enriched_attributes") or []
275   - qanchors_obj = doc.get("qanchors") or {}
276   -
277   - anchor_text = str(row.get("anchor_text") or "").strip()
278   - if anchor_text:
279   - qanchors_obj[lang] = anchor_text
280   -
281   - for name in dim_keys:
282   - raw = row.get(name)
283   - if not raw:
284   - continue
285   - for value in split_multi_value_field(str(raw)):
286   - semantic_list.append({"lang": lang, "name": name, "value": value})
287   -
288   - if qanchors_obj:
289   - doc["qanchors"] = qanchors_obj
290   - if semantic_list:
291   - doc["enriched_attributes"] = semantic_list
  293 + if enrichment.get("qanchors"):
  294 + doc["qanchors"] = enrichment["qanchors"]
  295 + if enrichment.get("enriched_tags"):
  296 + doc["enriched_tags"] = enrichment["enriched_tags"]
  297 + if enrichment.get("enriched_attributes"):
  298 + doc["enriched_attributes"] = enrichment["enriched_attributes"]
292 299 except Exception as e:
293   - logger.warning("Failed to apply LLM row to doc (spu_id=%s, lang=%s): %s", doc.get("spu_id"), lang, e)
  300 + logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e)
294 301  
295 302 def _fill_text_fields(
296 303 self,
... ... @@ -544,6 +551,23 @@ class SPUDocumentTransformer:
544 551 if pd.notna(position) and pd.notna(name):
545 552 option_name_map[int(position)] = str(name)
546 553  
  554 + primary_lang = self.tenant_config.get('primary_language', 'en')
  555 +
  556 + def _build_specification(name: str, raw_value: Any, sku_id: str) -> Optional[Dict[str, Any]]:
  557 + value = "" if raw_value is None else str(raw_value).strip()
  558 + if not value:
  559 + return None
  560 + return {
  561 + 'sku_id': sku_id,
  562 + 'name': name,
  563 + 'value_keyword': value,
  564 + 'value_text': self._build_core_language_text_object(
  565 + value,
  566 + source_lang=primary_lang,
  567 + scene="general",
  568 + ) or normalize_core_text_field_value(value, primary_lang),
  569 + }
  570 +
547 571 for _, sku_row in skus.iterrows():
548 572 sku_data = self._transform_sku_row(sku_row, option_name_map)
549 573 if sku_data:
... ... @@ -584,23 +608,17 @@ class SPUDocumentTransformer:
584 608 # 构建specifications(从SKU的option值和option表的name)
585 609 sku_id = str(sku_row['id'])
586 610 if pd.notna(sku_row.get('option1')) and 1 in option_name_map:
587   - specifications.append({
588   - 'sku_id': sku_id,
589   - 'name': option_name_map[1],
590   - 'value': str(sku_row['option1'])
591   - })
  611 + spec = _build_specification(option_name_map[1], sku_row['option1'], sku_id)
  612 + if spec:
  613 + specifications.append(spec)
592 614 if pd.notna(sku_row.get('option2')) and 2 in option_name_map:
593   - specifications.append({
594   - 'sku_id': sku_id,
595   - 'name': option_name_map[2],
596   - 'value': str(sku_row['option2'])
597   - })
  615 + spec = _build_specification(option_name_map[2], sku_row['option2'], sku_id)
  616 + if spec:
  617 + specifications.append(spec)
598 618 if pd.notna(sku_row.get('option3')) and 3 in option_name_map:
599   - specifications.append({
600   - 'sku_id': sku_id,
601   - 'name': option_name_map[3],
602   - 'value': str(sku_row['option3'])
603   - })
  619 + spec = _build_specification(option_name_map[3], sku_row['option3'], sku_id)
  620 + if spec:
  621 + specifications.append(spec)
604 622  
605 623 return skus_list, prices, compare_prices, sku_prices, sku_weights, sku_weight_units, total_inventory, specifications
606 624  
... ... @@ -636,82 +654,36 @@ class SPUDocumentTransformer:
636 654  
637 655 def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None:
638 656 """
639   - 调用 indexer.product_enrich.analyze_products,为当前 SPU 填充:
  657 + 调用 indexer.product_enrich 的高层内容理解入口,为当前 SPU 填充:
640 658 - qanchors.{lang}
641   - - enriched_attributes (lang/name/value)
  660 + - enriched_tags.{lang}
  660 + - enriched_attributes[].value.{lang}
642 661 """
643   - try:
644   - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"]
645   - except Exception:
646   - index_langs = ["en", "zh"]
647   -
648   - # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用
649   - llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序
650   -
651 662 spu_id = str(spu_row.get("id") or "").strip()
652 663 title = str(spu_row.get("title") or "").strip()
653 664 if not spu_id or not title:
654 665 return
655 666  
656   - semantic_list = doc.get("enriched_attributes") or []
657   - qanchors_obj = doc.get("qanchors") or {}
658   -
659   - dim_keys = [
660   - "tags",
661   - "target_audience",
662   - "usage_scene",
663   - "season",
664   - "key_attributes",
665   - "material",
666   - "features",
667   - ]
668   -
669 667 tenant_id = doc.get("tenant_id")
  668 + try:
  669 + results = build_index_content_fields(
  670 + items=[
  671 + {
  672 + "id": spu_id,
  673 + "title": title,
  674 + "brief": str(spu_row.get("brief") or "").strip(),
  675 + "description": str(spu_row.get("description") or "").strip(),
  676 + "image_url": str(spu_row.get("image_src") or "").strip(),
  677 + }
  678 + ],
  679 + tenant_id=str(tenant_id),
  680 + )
  681 + except Exception as e:
  682 + logger.warning("LLM attribute fill failed for SPU %s: %s", spu_id, e)
  683 + return
670 684  
671   - for lang in llm_langs:
672   - try:
673   - rows = analyze_products(
674   - products=[{"id": spu_id, "title": title}],
675   - target_lang=lang,
676   - batch_size=1,
677   - tenant_id=str(tenant_id),
678   - )
679   - except Exception as e:
680   - logger.warning(
681   - "LLM attribute fill failed for SPU %s, lang=%s: %s",
682   - spu_id,
683   - lang,
684   - e,
685   - )
686   - continue
687   -
688   - if not rows:
689   - continue
690   - row = rows[0] or {}
691   -
692   - # qanchors.{lang}
693   - anchor_text = str(row.get("anchor_text") or "").strip()
694   - if anchor_text:
695   - qanchors_obj[lang] = anchor_text
696   -
697   - # 语义属性:按各维度拆分为短语
698   - for name in dim_keys:
699   - raw = row.get(name)
700   - if not raw:
701   - continue
702   - for value in split_multi_value_field(str(raw)):
703   - semantic_list.append(
704   - {
705   - "lang": lang,
706   - "name": name,
707   - "value": value,
708   - }
709   - )
710   -
711   - if qanchors_obj:
712   - doc["qanchors"] = qanchors_obj
713   - if semantic_list:
714   - doc["enriched_attributes"] = semantic_list
  685 + if results:
  686 + self._apply_content_enrichment(doc, results[0])
715 687  
716 688 def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]:
717 689 """
... ...
indexer/product_enrich.py
... ... @@ -146,6 +146,16 @@ if _missing_prompt_langs:
146 146  
147 147 # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白
148 148 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+")
  149 +_CORE_INDEX_LANGUAGES = ("zh", "en")
  150 +_ENRICHED_ATTRIBUTE_DIMENSIONS = (
  151 + "enriched_tags",
  152 + "target_audience",
  153 + "usage_scene",
  154 + "season",
  155 + "key_attributes",
  156 + "material",
  157 + "features",
  158 +)
149 159  
150 160  
151 161 def split_multi_value_field(text: Optional[str]) -> List[str]:
... ... @@ -158,6 +168,124 @@ def split_multi_value_field(text: Optional[str]) -> List[str]:
158 168 return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()]
159 169  
160 170  
  171 +def _append_lang_phrase_map(target: Dict[str, List[str]], lang: str, raw_value: Any) -> None:
  172 + parts = split_multi_value_field(raw_value)
  173 + if not parts:
  174 + return
  175 + existing = target.get(lang) or []
  176 + merged = list(dict.fromkeys([str(x).strip() for x in existing if str(x).strip()] + parts))
  177 + if merged:
  178 + target[lang] = merged
  179 +
  180 +
  181 +def _append_enriched_attribute(
  182 + target: List[Dict[str, Any]],
  183 + name: str,
  184 + lang: str,
  185 + raw_value: Any,
  186 +) -> None:
  187 + for value in split_multi_value_field(raw_value):
  188 + if any(
  189 + item.get("name") == name
  190 + and isinstance(item.get("value"), dict)
  191 + and item["value"].get(lang) == value
  192 + for item in target
  193 + ):
  194 + continue
  195 + target.append({"name": name, "value": {lang: value}})
  196 +
  197 +
  198 +def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None:
  199 + if not row or row.get("error"):
  200 + return
  201 +
  202 + anchor_text = str(row.get("anchor_text") or "").strip()
  203 + if anchor_text:
  204 + _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text)
  205 +
  206 + for name in _ENRICHED_ATTRIBUTE_DIMENSIONS:
  207 + raw = row.get(name)
  208 + if not raw:
  209 + continue
  210 + _append_enriched_attribute(result["enriched_attributes"], name=name, lang=lang, raw_value=raw)
  211 + if name == "enriched_tags":
  212 + _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw)
  213 +
  214 +
  215 +def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]:
  216 + item_id = str(item.get("id") or item.get("spu_id") or "").strip()
  217 + return {
  218 + "id": item_id,
  219 + "title": str(item.get("title") or "").strip(),
  220 + "brief": str(item.get("brief") or "").strip(),
  221 + "description": str(item.get("description") or "").strip(),
  222 + "image_url": str(item.get("image_url") or "").strip(),
  223 + }
  224 +
  225 +
  226 +def build_index_content_fields(
  227 + items: List[Dict[str, Any]],
  228 + tenant_id: Optional[str] = None,
  229 +) -> List[Dict[str, Any]]:
  230 + """
  231 + 高层入口:生成与 ES mapping 对齐的内容理解字段。
  232 +
  233 + 输入项需包含:
  234 + - `id` 或 `spu_id`
  235 + - `title`
  236 + - 可选 `brief` / `description` / `image_url`
  237 +
  238 + 返回项结构:
  239 + - `id`
  240 + - `qanchors`
  241 + - `enriched_tags`
  242 + - `enriched_attributes`
  243 + - 可选 `error`
  244 +
  245 + 其中:
  246 + - `qanchors.{lang}` 为短语数组
  247 + - `enriched_tags.{lang}` 为标签数组
  248 + """
  249 + normalized_items = [_normalize_index_content_item(item) for item in items]
  250 + if not normalized_items:
  251 + return []
  252 +
  253 + results_by_id: Dict[str, Dict[str, Any]] = {
  254 + item["id"]: {
  255 + "id": item["id"],
  256 + "qanchors": {},
  257 + "enriched_tags": {},
  258 + "enriched_attributes": [],
  259 + }
  260 + for item in normalized_items
  261 + }
  262 +
  263 + for lang in _CORE_INDEX_LANGUAGES:
  264 + try:
  265 + rows = analyze_products(
  266 + products=normalized_items,
  267 + target_lang=lang,
  268 + batch_size=BATCH_SIZE,
  269 + tenant_id=tenant_id,
  270 + )
  271 + except Exception as e:
  272 + logger.warning("build_index_content_fields failed for lang=%s: %s", lang, e)
  273 + for item in normalized_items:
  274 + results_by_id[item["id"]].setdefault("error", str(e))
  275 + continue
  276 +
  277 + for row in rows or []:
  278 + item_id = str(row.get("id") or "").strip()
  279 + if not item_id or item_id not in results_by_id:
  280 + continue
  281 + if row.get("error"):
  282 + results_by_id[item_id].setdefault("error", row["error"])
  283 + continue
  284 + _apply_index_content_row(results_by_id[item_id], row=row, lang=lang)
  285 +
  286 + return [results_by_id[item["id"]] for item in normalized_items]
  287 +
  288 +
161 289 def _normalize_space(text: str) -> str:
162 290 return re.sub(r"\s+", " ", (text or "").strip())
163 291  
... ... @@ -526,7 +654,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:
526 654 "seq_no": parts[0],
527 655 "title": parts[1], # 商品标题(按目标语言)
528 656 "category_path": parts[2] if len(parts) > 2 else "", # 品类路径
529   - "tags": parts[3] if len(parts) > 3 else "", # 细分标签
  657 + "enriched_tags": parts[3] if len(parts) > 3 else "", # 细分标签
530 658 "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群
531 659 "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景
532 660 "season": parts[6] if len(parts) > 6 else "", # 适用季节
... ... @@ -603,7 +731,7 @@ def process_batch(
603 731 "title_input": item.get("title", ""),
604 732 "title": "",
605 733 "category_path": "",
606   - "tags": "",
  734 + "enriched_tags": "",
607 735 "target_audience": "",
608 736 "usage_scene": "",
609 737 "season": "",
... ... @@ -643,7 +771,7 @@ def process_batch(
643 771 "title_input": batch_data[i]["title"], # 原始输入标题
644 772 "title": parsed_item.get("title", ""), # 模型生成的标题
645 773 "category_path": parsed_item.get("category_path", ""), # 品类路径
646   - "tags": parsed_item.get("tags", ""), # 细分标签
  774 + "enriched_tags": parsed_item.get("enriched_tags", ""), # 细分标签
647 775 "target_audience": parsed_item.get("target_audience", ""), # 适用人群
648 776 "usage_scene": parsed_item.get("usage_scene", ""), # 使用场景
649 777 "season": parsed_item.get("season", ""), # 适用季节
... ... @@ -686,7 +814,7 @@ def process_batch(
686 814 "title_input": item["title"],
687 815 "title": "",
688 816 "category_path": "",
689   - "tags": "",
  817 + "enriched_tags": "",
690 818 "target_audience": "",
691 819 "usage_scene": "",
692 820 "season": "",
... ...
mappings/README.md
... ... @@ -34,8 +34,8 @@
34 34  
35 35 当前字段大致分为几类:
36 36  
37   -- 全语言字段:`title`、`keywords`、`brief`、`description`、`vendor`、`category_path`、`category_name_text`、`specifications.value`
38   -- 核心索引语言字段:`qanchors`、`tags`、`option1_values`、`option2_values`、`option3_values`、`enriched_attributes.value`
  37 +- 全语言字段:`title`、`keywords`、`brief`、`description`、`vendor`、`category_path`、`category_name_text`
  38 +- 核心索引语言字段:`qanchors`、`enriched_tags`、`option1_values`、`option2_values`、`option3_values`、`enriched_attributes.value`、`specifications.value_text`
39 39 - 复合嵌套字段:`image_embedding`、`specifications`、`enriched_attributes`、`skus`
40 40 - 其他标量字段:`tenant_id`、`spu_id`、价格、库存、类目等
41 41  
... ... @@ -63,11 +63,12 @@
63 63 典型字段:
64 64  
65 65 - `qanchors`
66   -- `tags`
  66 +- `enriched_tags`
67 67 - `option1_values`
68 68 - `option2_values`
69 69 - `option3_values`
70 70 - `enriched_attributes.value`
  71 +- `specifications.value_text`
71 72  
72 73 以 `category_path` 和 `option*_values` 为例,核心语言灌入结果应至少包含:
73 74  
... ... @@ -118,7 +119,6 @@
118 119 - `vendor`
119 120 - `category_path`
120 121 - `category_name_text`
121   -- `specifications.value`
122 122  
123 123 灌入规则:
124 124  
... ... @@ -151,7 +151,7 @@
151 151 }
152 152 ```
153 153  
154   -示例:规格值 `specifications.value`
  154 +示例:规格值 `specifications.value_text` / `specifications.value_keyword`
155 155  
156 156 ```json
157 157 {
... ... @@ -159,16 +159,21 @@
159 159 {
160 160 "sku_id": "sku-red-s",
161 161 "name": "color",
162   - "value": {
  162 + "value_keyword": "красный",
  163 + "value_text": {
163 164 "zh": "红色",
164   - "en": "red",
165   - "ru": "красный"
  165 + "en": "red"
166 166 }
167 167 }
168 168 ]
169 169 }
170 170 ```
171 171  
  172 +其中:
  173 +
  174 +- `specifications.value_keyword` 保存原始规格值,用于精确过滤 / 分面
  175 +- `specifications.value_text` 保存 `zh/en` 两个核心索引语言版本,用于检索召回
  176 +
172 177 ### 原始语言为中文或英文时
173 178  
174 179 如果原始语言就是核心索引语言之一,不需要额外再写第三份语言字段。
... ... @@ -210,7 +215,7 @@
210 215 - 标量字段:直接写固定值,例如 `tenant_id`、`spu_id`、`min_price`
211 216 - 核心索引语言字段:只生成 `zh/en`
212 217 - 全语言字段:生成 `zh/en`,再按原始语言补一个对应语种字段
213   -- 嵌套字段:对每个元素内部重复应用同样规则,例如 `specifications[].value`
  218 +- 嵌套字段:对每个元素内部重复应用同样规则,例如 `specifications[].value_text`、`enriched_attributes[].value`
214 219  
215 220 ### 推荐灌入流程
216 221  
... ...
mappings/generate_search_products_mapping.py
... ... @@ -194,8 +194,7 @@ FIELD_SPECS = [
194 194 ),
195 195 text_field("category_path", "all_language_text_with_keyword"),
196 196 text_field("category_name_text", "all_language_text_with_keyword"),
197   - text_field("qanchors", "core_language_text"),
198   - text_field("tags", "core_language_text_with_keyword"),
  197 + text_field("tags", "all_language_text_with_keyword"),
199 198 scalar_field("category_id", "keyword"),
200 199 scalar_field("category_name", "keyword"),
201 200 scalar_field("category_level", "integer"),
... ... @@ -209,6 +208,8 @@ FIELD_SPECS = [
209 208 scalar_field("value_keyword", "keyword"),
210 209 text_field("value_text", "core_language_text_with_keyword"),
211 210 ),
  211 + text_field("qanchors", "core_language_text"),
  212 + text_field("enriched_tags", "core_language_text_with_keyword"),
212 213 nested_field(
213 214 "enriched_attributes",
214 215 scalar_field("name", "keyword"),
... ...
suggestion/builder.py
... ... @@ -166,6 +166,29 @@ class SuggestionIndexBuilder:
166 166 out = [p.strip() for p in parts if p and p.strip()]
167 167 return out if out else [s]
168 168  
  169 + def _iter_multilang_product_tags(
  170 + self,
  171 + raw: Any,
  172 + index_languages: List[str],
  173 + primary_language: str,
  174 + ) -> List[Tuple[str, str]]:
  175 + if isinstance(raw, dict):
  176 + pairs: List[Tuple[str, str]] = []
  177 + for lang in index_languages:
  178 + for tag in self._iter_product_tags(raw.get(lang)):
  179 + pairs.append((lang, tag))
  180 + return pairs
  181 +
  182 + pairs = []
  183 + for tag in self._iter_product_tags(raw):
  184 + tag_lang, _, _ = detect_text_language_for_suggestions(
  185 + tag,
  186 + index_languages=index_languages,
  187 + primary_language=primary_language,
  188 + )
  189 + pairs.append((tag_lang, tag))
  190 + return pairs
  191 +
169 192 @staticmethod
170 193 def _looks_noise(text_value: str) -> bool:
171 194 if not text_value:
... ... @@ -487,12 +510,11 @@ class SuggestionIndexBuilder:
487 510 key_to_candidate[key] = c
488 511 c.add_product("qanchor", spu_id=product_id)
489 512  
490   - for tag in self._iter_product_tags(src.get("tags")):
491   - tag_lang, _, _ = detect_text_language_for_suggestions(
492   - tag,
493   - index_languages=index_languages,
494   - primary_language=primary_language,
495   - )
  513 + for tag_lang, tag in self._iter_multilang_product_tags(
  514 + src.get("tags"),
  515 + index_languages=index_languages,
  516 + primary_language=primary_language,
  517 + ):
496 518 text_norm = self._normalize_text(tag)
497 519 if self._looks_noise(text_norm):
498 520 continue
... ...
tests/ci/test_service_api_contracts.py
... ... @@ -345,33 +345,25 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient):
345 345 def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch):
346 346 import indexer.product_enrich as process_products
347 347  
348   - def _fake_analyze_products(
349   - products: List[Dict[str, str]],
350   - target_lang: str = "zh",
351   - batch_size: int | None = None,
352   - tenant_id: str | None = None,
353   - ):
354   - assert batch_size == 20
  348 + def _fake_build_index_content_fields(items: List[Dict[str, str]], tenant_id: str | None = None):
  349 + assert tenant_id == "162"
355 350 return [
356 351 {
357   - "id": p["id"],
358   - "lang": target_lang,
359   - "title_input": p["title"],
360   - "title": p["title"],
361   - "category_path": "",
362   - "tags": "tag1,tag2",
363   - "target_audience": "",
364   - "usage_scene": "",
365   - "season": "",
366   - "key_attributes": "",
367   - "material": "",
368   - "features": "",
369   - "anchor_text": f"{target_lang}-anchor-{p['id']}",
  352 + "id": p["spu_id"],
  353 + "qanchors": {
  354 + "zh": [f"zh-anchor-{p['spu_id']}"],
  355 + "en": [f"en-anchor-{p['spu_id']}"],
  356 + },
  357 + "enriched_tags": {"zh": ["tag1", "tag2"], "en": ["tag1", "tag2"]},
  358 + "enriched_attributes": [
  359 + {"name": "enriched_tags", "value": {"zh": "tag1"}},
  360 + {"name": "enriched_tags", "value": {"en": "tag1"}},
  361 + ],
370 362 }
371   - for p in products
  363 + for p in items
372 364 ]
373 365  
374   - monkeypatch.setattr(process_products, "analyze_products", _fake_analyze_products)
  366 + monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields)
375 367  
376 368 response = indexer_client.post(
377 369 "/indexer/enrich-content",
... ... @@ -381,7 +373,6 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
381 373 {"spu_id": "1001", "title": "T-shirt"},
382 374 {"spu_id": "1002", "title": "Toy"},
383 375 ],
384   - "languages": ["zh", "en"],
385 376 },
386 377 )
387 378 assert response.status_code == 200
... ... @@ -390,9 +381,14 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
390 381 assert data["total"] == 2
391 382 assert len(data["results"]) == 2
392 383 assert data["results"][0]["spu_id"] == "1001"
393   - assert data["results"][0]["qanchors"]["zh"] == "zh-anchor-1001"
394   - assert data["results"][0]["qanchors"]["en"] == "en-anchor-1001"
395   - assert "tag1" in data["results"][0]["tags"]
  384 + assert data["results"][0]["qanchors"]["zh"] == ["zh-anchor-1001"]
  385 + assert data["results"][0]["qanchors"]["en"] == ["en-anchor-1001"]
  386 + assert data["results"][0]["enriched_tags"]["zh"] == ["tag1", "tag2"]
  387 + assert data["results"][0]["enriched_tags"]["en"] == ["tag1", "tag2"]
  388 + assert data["results"][0]["enriched_attributes"][0] == {
  389 + "name": "enriched_tags",
  390 + "value": {"zh": "tag1"},
  391 + }
396 392  
397 393  
398 394 def test_indexer_documents_contract(indexer_client: TestClient):
... ... @@ -515,7 +511,6 @@ def test_indexer_enrich_content_validation_max_items(indexer_client: TestClient)
515 511 json={
516 512 "tenant_id": "162",
517 513 "items": [{"spu_id": str(i), "title": "x"} for i in range(51)],
518   - "languages": ["zh"],
519 514 },
520 515 )
521 516 assert response.status_code == 400
... ...
tests/test_llm_enrichment_batch_fill.py
... ... @@ -7,33 +7,30 @@ import pandas as pd
7 7 from indexer.document_transformer import SPUDocumentTransformer
8 8  
9 9  
10   -def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch):
  10 +def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch):
11 11 seen_calls: List[Dict[str, Any]] = []
12 12  
13   - def _fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None):
14   - # should always request batch_size=20 and pass full list; internal splitter handles >20
15   - seen_calls.append(
16   - {
17   - "n": len(products),
18   - "target_lang": target_lang,
19   - "batch_size": batch_size,
20   - "tenant_id": tenant_id,
21   - }
22   - )
  13 + def _fake_build_index_content_fields(items, tenant_id=None):
  14 + seen_calls.append({"n": len(items), "tenant_id": tenant_id})
23 15 return [
24 16 {
25   - "id": p["id"],
26   - "lang": target_lang,
27   - "title_input": p["title"],
28   - "tags": "t1,t2",
29   - "anchor_text": f"{target_lang}-anchor-{p['id']}",
  17 + "id": item["id"],
  18 + "qanchors": {
  19 + "zh": [f"zh-anchor-{item['id']}"],
  20 + "en": [f"en-anchor-{item['id']}"],
  21 + },
  22 + "tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]},
  23 + "enriched_attributes": [
  24 + {"name": "tags", "value": {"zh": "t1"}},
  25 + {"name": "tags", "value": {"en": "t1"}},
  26 + ],
30 27 }
31   - for p in products
  28 + for item in items
32 29 ]
33 30  
34 31 import indexer.document_transformer as doc_tr
35 32  
36   - monkeypatch.setattr(doc_tr, "analyze_products", _fake_analyze_products)
  33 + monkeypatch.setattr(doc_tr, "build_index_content_fields", _fake_build_index_content_fields)
37 34  
38 35 transformer = SPUDocumentTransformer(
39 36 category_id_to_name={},
... ... @@ -54,11 +51,11 @@ def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch):
54 51  
55 52 transformer.fill_llm_attributes_batch(docs, rows)
56 53  
57   - # called once per language, with full list; analyze_products handles splitting
58   - assert seen_calls == [
59   - {"n": 45, "target_lang": "zh", "batch_size": 20, "tenant_id": "162"},
60   - {"n": 45, "target_lang": "en", "batch_size": 20, "tenant_id": "162"},
61   - ]
  54 + assert seen_calls == [{"n": 45, "tenant_id": "162"}]
62 55  
63   - assert docs[0]["qanchors"]["zh"] == "zh-anchor-0"
64   - assert docs[0]["qanchors"]["en"] == "en-anchor-0"
  56 + assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"]
  57 + assert docs[0]["qanchors"]["en"] == ["en-anchor-0"]
  58 + assert docs[0]["tags"]["zh"] == ["t1", "t2"]
  59 + assert docs[0]["tags"]["en"] == ["t1", "t2"]
  60 + assert {"name": "tags", "value": {"zh": "t1"}} in docs[0]["enriched_attributes"]
  61 + assert {"name": "tags", "value": {"en": "t1"}} in docs[0]["enriched_attributes"]
... ...
tests/test_suggestions.py
... ... @@ -403,10 +403,13 @@ def test_build_full_candidates_tags_and_qanchor_phrases(monkeypatch):
403 403 "spu_id": "900",
404 404 "title": {"en": "Tee", "zh": "T恤"},
405 405 "qanchors": {
406   - "en": "slim fit, sporty casual",
407   - "zh": "修身, 显瘦",
  406 + "en": ["slim fit", "sporty casual"],
  407 + "zh": ["修身", "显瘦"],
  408 + },
  409 + "tags": {
  410 + "en": ["Classic", "ribbed neckline"],
  411 + "zh": ["辣妹风"],
408 412 },
409   - "tags": ["Classic", "辣妹风", "ribbed neckline"],
410 413 },
411 414 }
412 415 ]
... ...