Commit d350861ffff80413a1e1be71dfbe478d514ee925
1 parent
fca871fb
索引结构修改
Showing
11 changed files
with
408 additions
and
344 deletions
Show diff stats
api/routes/indexer.py
| ... | ... | @@ -80,7 +80,7 @@ class BuildDocsFromDbRequest(BaseModel): |
| 80 | 80 | class EnrichContentItem(BaseModel): |
| 81 | 81 | """单条待生成内容理解字段的商品。""" |
| 82 | 82 | spu_id: str = Field(..., description="SPU ID") |
| 83 | - title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / tags 等") | |
| 83 | + title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / enriched_tags 等") | |
| 84 | 84 | image_url: Optional[str] = Field(None, description="商品主图 URL(预留给多模态/内容理解扩展)") |
| 85 | 85 | brief: Optional[str] = Field(None, description="商品简介/短描述") |
| 86 | 86 | description: Optional[str] = Field(None, description="商品详情/长描述") |
| ... | ... | @@ -93,10 +93,6 @@ class EnrichContentRequest(BaseModel): |
| 93 | 93 | """ |
| 94 | 94 | tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") |
| 95 | 95 | items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)") |
| 96 | - languages: List[str] = Field( | |
| 97 | - default_factory=lambda: ["zh", "en"], | |
| 98 | - description="目标语言列表,需在支持范围内(zh/en/de/ru/fr),默认 zh, en", | |
| 99 | - ) | |
| 100 | 96 | |
| 101 | 97 | |
| 102 | 98 | @router.post("/reindex") |
| ... | ... | @@ -444,92 +440,25 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): |
| 444 | 440 | raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") |
| 445 | 441 | |
| 446 | 442 | |
| 447 | -def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]: | |
| 443 | +def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]]) -> List[Dict[str, Any]]: | |
| 448 | 444 | """ |
| 449 | - 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM, | |
| 450 | - 再聚合成每 SPU 的 qanchors、enriched_attributes、tags。供 run_in_executor 调用。 | |
| 445 | + 同步执行内容理解,返回与 ES mapping 对齐的字段结构。 | |
| 446 | + 语言策略由 product_enrich 内部统一决定,路由层不参与。 | |
| 451 | 447 | """ |
| 452 | - from indexer.product_enrich import analyze_products, split_multi_value_field | |
| 448 | + from indexer.product_enrich import build_index_content_fields | |
| 453 | 449 | |
| 454 | - llm_langs = list(dict.fromkeys(languages)) or ["en"] | |
| 455 | - | |
| 456 | - products = [ | |
| 450 | + results = build_index_content_fields(items=items, tenant_id=tenant_id) | |
| 451 | + return [ | |
| 457 | 452 | { |
| 458 | - "id": it["spu_id"], | |
| 459 | - "title": (it.get("title") or "").strip(), | |
| 460 | - "brief": (it.get("brief") or "").strip(), | |
| 461 | - "description": (it.get("description") or "").strip(), | |
| 462 | - "image_url": (it.get("image_url") or "").strip(), | |
| 453 | + "spu_id": item["id"], | |
| 454 | + "qanchors": item["qanchors"], | |
| 455 | + "enriched_attributes": item["enriched_attributes"], | |
| 456 | + "enriched_tags": item["enriched_tags"], | |
| 457 | + **({"error": item["error"]} if item.get("error") else {}), | |
| 463 | 458 | } |
| 464 | - for it in items | |
| 465 | - ] | |
| 466 | - dim_keys = [ | |
| 467 | - "tags", | |
| 468 | - "target_audience", | |
| 469 | - "usage_scene", | |
| 470 | - "season", | |
| 471 | - "key_attributes", | |
| 472 | - "material", | |
| 473 | - "features", | |
| 459 | + for item in results | |
| 474 | 460 | ] |
| 475 | 461 | |
| 476 | - # 按 spu_id 聚合:qanchors[lang], enriched_attributes[], tags[] | |
| 477 | - by_spu: Dict[str, Dict[str, Any]] = {} | |
| 478 | - for it in items: | |
| 479 | - sid = str(it["spu_id"]) | |
| 480 | - by_spu[sid] = {"qanchors": {}, "enriched_attributes": [], "tags": []} | |
| 481 | - | |
| 482 | - for lang in llm_langs: | |
| 483 | - try: | |
| 484 | - rows = analyze_products( | |
| 485 | - products=products, | |
| 486 | - target_lang=lang, | |
| 487 | - batch_size=20, | |
| 488 | - tenant_id=tenant_id, | |
| 489 | - ) | |
| 490 | - except Exception as e: | |
| 491 | - logger.warning("enrich-content analyze_products failed for lang=%s: %s", lang, e) | |
| 492 | - for it in items: | |
| 493 | - sid = str(it["spu_id"]) | |
| 494 | - if "error" not in by_spu[sid]: | |
| 495 | - by_spu[sid]["error"] = str(e) | |
| 496 | - continue | |
| 497 | - | |
| 498 | - for row in rows: | |
| 499 | - spu_id = str(row.get("id") or "") | |
| 500 | - if spu_id not in by_spu: | |
| 501 | - continue | |
| 502 | - rec = by_spu[spu_id] | |
| 503 | - if row.get("error"): | |
| 504 | - rec["error"] = row["error"] | |
| 505 | - continue | |
| 506 | - anchor_text = str(row.get("anchor_text") or "").strip() | |
| 507 | - if anchor_text: | |
| 508 | - rec["qanchors"][lang] = anchor_text | |
| 509 | - for name in dim_keys: | |
| 510 | - raw = row.get(name) | |
| 511 | - if not raw: | |
| 512 | - continue | |
| 513 | - for value in split_multi_value_field(str(raw)): | |
| 514 | - rec["enriched_attributes"].append({"lang": lang, "name": name, "value": value}) | |
| 515 | - if name == "tags": | |
| 516 | - rec["tags"].append(value) | |
| 517 | - | |
| 518 | - # 去重 tags(保持顺序) | |
| 519 | - out = [] | |
| 520 | - for it in items: | |
| 521 | - sid = str(it["spu_id"]) | |
| 522 | - rec = by_spu[sid] | |
| 523 | - tags = list(dict.fromkeys(rec["tags"])) | |
| 524 | - out.append({ | |
| 525 | - "spu_id": sid, | |
| 526 | - "qanchors": rec["qanchors"], | |
| 527 | - "enriched_attributes": rec["enriched_attributes"], | |
| 528 | - "tags": tags, | |
| 529 | - **({"error": rec["error"]} if rec.get("error") else {}), | |
| 530 | - }) | |
| 531 | - return out | |
| 532 | - | |
| 533 | 462 | |
| 534 | 463 | @router.post("/enrich-content") |
| 535 | 464 | async def enrich_content(request: EnrichContentRequest): |
| ... | ... | @@ -540,7 +469,7 @@ async def enrich_content(request: EnrichContentRequest): |
| 540 | 469 | - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 |
| 541 | 470 | 锚文本与语义属性,再与翻译、向量化结果合并写入 ES。 |
| 542 | 471 | - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可 |
| 543 | - 先拿不含 qanchors/tags 的 doc,再异步或离线补齐本接口结果后更新 ES。 | |
| 472 | + 先拿不含 qanchors/enriched_tags 的 doc,再异步或离线补齐本接口结果后更新 ES。 | |
| 544 | 473 | |
| 545 | 474 | 实现逻辑与 indexer.product_enrich.analyze_products 一致,支持多语言与 Redis 缓存。 |
| 546 | 475 | """ |
| ... | ... | @@ -568,8 +497,7 @@ async def enrich_content(request: EnrichContentRequest): |
| 568 | 497 | None, |
| 569 | 498 | lambda: _run_enrich_content( |
| 570 | 499 | tenant_id=request.tenant_id, |
| 571 | - items=items_payload, | |
| 572 | - languages=request.languages or ["zh", "en"], | |
| 500 | + items=items_payload | |
| 573 | 501 | ), |
| 574 | 502 | ) |
| 575 | 503 | return { | ... | ... |
docs/suggestion索引构建.md
| ... | ... | @@ -169,7 +169,7 @@ |
| 169 | 169 | |
| 170 | 170 | ##### 4.1 从商品索引收集 title / qanchors / tags(Step 1) |
| 171 | 171 | |
| 172 | - - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"`, `"tags"`(按 `spu_id`、`id.keyword` 升序,便于 `search_after` 稳定分页) | |
| 172 | + - 遍历店铺的所有商品:获取每个商品的 `"spu_id"`, `"title"`, `"qanchors"`, `"enriched_tags"`(按 `spu_id`、`id.keyword` 升序,便于 `search_after` 稳定分页) | |
| 173 | 173 | |
| 174 | 174 | - 对每个商品文档: |
| 175 | 175 | |
| ... | ... | @@ -207,7 +207,7 @@ |
| 207 | 207 | - **qanchors 处理**: |
| 208 | 208 | - `qanchors` 字段同样为多语言对象: |
| 209 | 209 | ```json |
| 210 | - "qanchors": { "en": "...", "zh": "..." } | |
| 210 | + "qanchors": { "en": ["slim fit", "sporty casual"], "zh": ["修身", "显瘦"] } | |
| 211 | 211 | ``` |
| 212 | 212 | - 取 `q_raw = qanchors[lang]` |
| 213 | 213 | - 通过 `_split_qanchors(q_raw)` 拆分为若干字符串: |
| ... | ... | @@ -217,10 +217,14 @@ |
| 217 | 217 | - `text_norm = _normalize_text(q_text)`,再用 `_looks_noise` 过滤 |
| 218 | 218 | - 同样按 `(lang, text_norm)` 合并为 `SuggestionCandidate`,调用 `add_product("qanchor", spu_id=product_id)`。 |
| 219 | 219 | |
| 220 | - 4. **tags 处理**(与 `index_languages` 循环并列,每个商品只做一次): | |
| 221 | - - `tags` 可为字符串数组,或逗号等分隔的单个字符串;经 `_iter_product_tags` 展开为若干条。 | |
| 222 | - - 每条 tag **无语言字段**:使用 `query.query_parser.detect_text_language_for_suggestions`(与 `QueryParser` 相同的 `LanguageDetector`)判定语言,并约束在租户的 `index_languages` 内。 | |
| 223 | - - 通过 `_looks_noise` 后按 `(detected_lang, text_norm)` 合并,调用 `add_product("tag", spu_id=product_id)`。 | |
| 220 | + 4. **enriched_tags 处理**(与 `index_languages` 循环并列): | |
| 221 | + - `enriched_tags` 现为多语言对象,例如: | |
| 222 | + ```json | |
| 223 | + "enriched_tags": { "en": ["Classic", "ribbed neckline"], "zh": ["辣妹风"] } | |
| 224 | + ``` | |
| 225 | + - 优先读取 `enriched_tags[lang]`,每个值可为字符串数组,或逗号等分隔的单个字符串;经 `_iter_product_tags` 展开为若干条。 | |
| 226 | + - 对历史旧数据,若 `enriched_tags` 仍是单层字符串 / 数组,则继续走语言检测兜底,并约束在租户的 `index_languages` 内。 | |
| 227 | + - 通过 `_looks_noise` 后按 `(lang, text_norm)` 合并,调用 `add_product("tag", spu_id=product_id)`。 | |
| 224 | 228 | |
| 225 | 229 | ##### 4.2 从查询日志收集用户 query(Step 2) |
| 226 | 230 | ... | ... |
docs/搜索API对接指南-05-索引接口(Indexer).md
| ... | ... | @@ -13,7 +13,7 @@ |
| 13 | 13 | | 查询文档 | POST | `/indexer/documents` | 按 SPU ID 列表查询 ES 文档,不写入 ES | |
| 14 | 14 | | 构建 ES 文档(正式) | POST | `/indexer/build-docs` | 由上游提供 MySQL 行数据,返回 ES-ready 文档,不写 ES | |
| 15 | 15 | | 构建 ES 文档(测试) | POST | `/indexer/build-docs-from-db` | 由本服务查库并构建文档,仅测试/调试用 | |
| 16 | -| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、semantic_attributes、tags(供微服务组合方式使用) | | |
| 16 | +| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、enriched_attributes、enriched_tags(供微服务组合方式使用) | |
| 17 | 17 | | 索引健康检查 | GET | `/indexer/health` | 检查索引服务与数据库连接状态 | |
| 18 | 18 | |
| 19 | 19 | #### 5.0 支撑外部 indexer 的三种方式 |
| ... | ... | @@ -510,7 +510,6 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ |
| 510 | 510 | { |
| 511 | 511 | "spu": { |
| 512 | 512 | "id": 10001, |
| 513 | - "tenant_id": "162", | |
| 514 | 513 | "title": "测试T恤 纯棉短袖", |
| 515 | 514 | "brief": "舒适纯棉,多色可选", |
| 516 | 515 | "description": "这是一款适合日常穿着的纯棉T恤,透气吸汗。", |
| ... | ... | @@ -521,7 +520,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ |
| 521 | 520 | "category_path": "服装/上衣/T恤", |
| 522 | 521 | "fake_sales": 1280, |
| 523 | 522 | "image_src": "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg", |
| 524 | - "tags": "T恤,纯棉,短袖,夏季", | |
| 523 | + "enriched_tags": ["T恤", "纯棉"], | |
| 525 | 524 | "create_time": "2024-01-01T00:00:00Z", |
| 526 | 525 | "update_time": "2024-01-01T00:00:00Z" |
| 527 | 526 | }, |
| ... | ... | @@ -570,7 +569,7 @@ curl -X POST "http://localhost:6004/indexer/build-docs" \ |
| 570 | 569 | "tenant_id": "170", |
| 571 | 570 | "spu_id": "223167", |
| 572 | 571 | "title": { "en": "...", "zh": "..." }, |
| 573 | - "tags": ["Floerns", "Clothing", "Shoes & Jewelry"], | |
| 572 | + "enriched_tags": ["Floerns", "Clothing", "Shoes & Jewelry"], | |
| 574 | 573 | "skus": [ |
| 575 | 574 | { |
| 576 | 575 | "sku_id": "3988393", |
| ... | ... | @@ -649,7 +648,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 649 | 648 | ### 5.8 内容理解字段生成接口 |
| 650 | 649 | |
| 651 | 650 | - **端点**: `POST /indexer/enrich-content` |
| 652 | -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_enrich` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 | |
| 651 | +- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**enriched_tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。接口只暴露商品内容输入,语言选择、分析维度与最终字段结构统一由 `indexer.product_enrich` 内部决定;当前返回结果与 `search_products` mapping 保持一致。单次请求在线程池中执行,避免阻塞其他接口。 | |
| 653 | 652 | |
| 654 | 653 | #### 请求参数 |
| 655 | 654 | |
| ... | ... | @@ -669,8 +668,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 669 | 668 | "title": "12PCS Dolls with Bottles", |
| 670 | 669 | "image_url": "https://example.com/images/223168.jpg" |
| 671 | 670 | } |
| 672 | - ], | |
| 673 | - "languages": ["zh", "en"] | |
| 671 | + ] | |
| 674 | 672 | } |
| 675 | 673 | ``` |
| 676 | 674 | |
| ... | ... | @@ -678,7 +676,6 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 678 | 676 | |------|------|------|--------|------| |
| 679 | 677 | | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| |
| 680 | 678 | | `items` | array | Y | - | 待分析列表;**单次最多 50 条** | |
| 681 | -| `languages` | array[string] | N | `["zh", "en"]` | 目标语言,需在支持范围内:`zh`、`en`、`de`、`ru`、`fr` | | |
| 682 | 679 | |
| 683 | 680 | `items[]` 字段说明: |
| 684 | 681 | |
| ... | ... | @@ -696,6 +693,12 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 696 | 693 | - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。 |
| 697 | 694 | - 因此,输入内容不变时可跨请求直接命中缓存;任一输入字段变化时,会自然落到新的缓存 key。 |
| 698 | 695 | |
| 696 | +语言说明: | |
| 697 | + | |
| 698 | +- 接口不接受语言控制参数。 | |
| 699 | +- 返回哪些语言、返回哪些语义维度,统一由 `indexer.product_enrich` 内部逻辑决定。 | |
| 700 | +- 当前为了与 `search_products` mapping 对齐,返回结果只包含核心索引语言 `zh`、`en`。 | |
| 701 | + | |
| 699 | 702 | 批量请求建议: |
| 700 | 703 | - **全量**:强烈建议 尽可能 **20 个 SPU/doc** 攒成一个批次后再请求一次。 |
| 701 | 704 | - **增量**:可按时效要求设置时间窗口(例如 **5 分钟**),在窗口内尽可能攒到 **20 个**;达到 20 或窗口到期就发送一次请求。 |
| ... | ... | @@ -711,21 +714,28 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 711 | 714 | { |
| 712 | 715 | "spu_id": "223167", |
| 713 | 716 | "qanchors": { |
| 714 | - "zh": "短袖T恤,纯棉,男装,夏季", | |
| 715 | - "en": "cotton t-shirt, short sleeve, men, summer" | |
| 717 | + "zh": ["短袖T恤", "纯棉", "男装", "夏季"], | |
| 718 | + "en": ["cotton t-shirt", "short sleeve", "men", "summer"] | |
| 719 | + }, | |
| 720 | + "enriched_tags": { | |
| 721 | + "zh": ["纯棉", "短袖", "男装"], | |
| 722 | + "en": ["cotton", "short sleeve", "men"] | |
| 716 | 723 | }, |
| 717 | 724 | "enriched_attributes": [ |
| 718 | - { "lang": "zh", "name": "tags", "value": "纯棉" }, | |
| 719 | - { "lang": "zh", "name": "usage_scene", "value": "日常" }, | |
| 720 | - { "lang": "en", "name": "tags", "value": "cotton" } | |
| 721 | - ], | |
| 722 | - "tags": ["纯棉", "短袖", "男装", "cotton", "short sleeve"] | |
| 725 | + { "name": "enriched_tags", "value": { "zh": "纯棉" } }, | |
| 726 | + { "name": "usage_scene", "value": { "zh": "日常" } }, | |
| 727 | + { "name": "enriched_tags", "value": { "en": "cotton" } } | |
| 728 | + ] | |
| 723 | 729 | }, |
| 724 | 730 | { |
| 725 | 731 | "spu_id": "223168", |
| 726 | - "qanchors": { "en": "dolls, toys, 12pcs" }, | |
| 727 | - "enriched_attributes": [], | |
| 728 | - "tags": ["dolls", "toys"] | |
| 732 | + "qanchors": { | |
| 733 | + "en": ["dolls", "toys", "12pcs"] | |
| 734 | + }, | |
| 735 | + "enriched_tags": { | |
| 736 | + "en": ["dolls", "toys"] | |
| 737 | + }, | |
| 738 | + "enriched_attributes": [] | |
| 729 | 739 | } |
| 730 | 740 | ] |
| 731 | 741 | } |
| ... | ... | @@ -733,10 +743,10 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 733 | 743 | |
| 734 | 744 | | 字段 | 类型 | 说明 | |
| 735 | 745 | |------|------|------| |
| 736 | -| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`tags` | | |
| 737 | -| `results[].qanchors` | object | 按语言键的锚文本(逗号分隔短语),可写入 ES 文档的 `qanchors.{lang}` | | |
| 738 | -| `results[].enriched_attributes` | array | 语义属性列表,每项为 `{ "lang", "name", "value" }`,可写入 ES 的 `enriched_attributes` nested 字段 | | |
| 739 | -| `results[].tags` | array | 从语义属性中抽取的 `name=tags` 的 value 集合,可与业务原有 `tags` 合并后写入 ES 的 `tags` 字段 | | |
| 746 | +| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags` | | |
| 747 | +| `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 | | |
| 748 | +| `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 | | |
| 749 | +| `results[].enriched_attributes` | array | 与 ES `enriched_attributes` nested 字段同结构,每项为 `{ "name", "value": { "zh"?: "...", "en"?: "..." } }` | | |
| 740 | 750 | | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 | |
| 741 | 751 | |
| 742 | 752 | **错误响应**: |
| ... | ... | @@ -758,8 +768,7 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ |
| 758 | 768 | "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", |
| 759 | 769 | "image_url": "https://example.com/images/223167.jpg" |
| 760 | 770 | } |
| 761 | - ], | |
| 762 | - "languages": ["zh", "en"] | |
| 771 | + ] | |
| 763 | 772 | }' |
| 764 | 773 | ``` |
| 765 | 774 | ... | ... |
indexer/document_transformer.py
| ... | ... | @@ -11,9 +11,8 @@ SPU文档转换器 - 公共转换逻辑。 |
| 11 | 11 | import pandas as pd |
| 12 | 12 | import numpy as np |
| 13 | 13 | import logging |
| 14 | -import re | |
| 15 | 14 | from typing import Dict, Any, Optional, List |
| 16 | -from indexer.product_enrich import analyze_products, split_multi_value_field | |
| 15 | +from indexer.product_enrich import build_index_content_fields | |
| 17 | 16 | |
| 18 | 17 | logger = logging.getLogger(__name__) |
| 19 | 18 | |
| ... | ... | @@ -75,6 +74,39 @@ class SPUDocumentTransformer: |
| 75 | 74 | ) |
| 76 | 75 | return translations |
| 77 | 76 | |
| 77 | + def _build_core_language_text_object( | |
| 78 | + self, | |
| 79 | + text: Optional[str], | |
| 80 | + source_lang: str, | |
| 81 | + scene: str = "general", | |
| 82 | + ) -> Dict[str, str]: | |
| 83 | + """ | |
| 84 | + 构建与 mapping 中 core_language_text(_with_keyword) 对齐的对象。 | |
| 85 | + 当前核心语言固定为 zh/en。 | |
| 86 | + """ | |
| 87 | + if not text or not str(text).strip(): | |
| 88 | + return {} | |
| 89 | + | |
| 90 | + source_text = str(text).strip() | |
| 91 | + obj: Dict[str, str] = {} | |
| 92 | + | |
| 93 | + if source_lang in CORE_INDEX_LANGUAGES: | |
| 94 | + obj[source_lang] = source_text | |
| 95 | + | |
| 96 | + if self.translator: | |
| 97 | + translations = self._translate_index_languages( | |
| 98 | + text=source_text, | |
| 99 | + source_lang=source_lang, | |
| 100 | + index_languages=CORE_INDEX_LANGUAGES, | |
| 101 | + scene=scene, | |
| 102 | + ) | |
| 103 | + for lang in CORE_INDEX_LANGUAGES: | |
| 104 | + val = translations.get(lang) | |
| 105 | + if val and str(val).strip(): | |
| 106 | + obj[lang] = str(val).strip() | |
| 107 | + | |
| 108 | + return obj | |
| 109 | + | |
| 78 | 110 | def transform_spu_to_doc( |
| 79 | 111 | self, |
| 80 | 112 | tenant_id: str, |
| ... | ... | @@ -118,10 +150,16 @@ class SPUDocumentTransformer: |
| 118 | 150 | if self.enable_title_embedding and self.encoder: |
| 119 | 151 | self._fill_title_embedding(doc) |
| 120 | 152 | |
| 121 | - # Tags | |
| 153 | + # Tags:统一转成与 mapping 一致的 core-language object | |
| 122 | 154 | if pd.notna(spu_row.get('tags')): |
| 123 | 155 | tags_str = str(spu_row['tags']) |
| 124 | - doc['tags'] = split_multi_value_field(tags_str) | |
| 156 | + tags_obj = self._build_core_language_text_object( | |
| 157 | + tags_str, | |
| 158 | + source_lang=primary_lang, | |
| 159 | + scene="general", | |
| 160 | + ) | |
| 161 | + if tags_obj: | |
| 162 | + doc['tags'] = tags_obj | |
| 125 | 163 | |
| 126 | 164 | # Category相关字段 |
| 127 | 165 | self._fill_category_fields(doc, spu_row) |
| ... | ... | @@ -202,7 +240,8 @@ class SPUDocumentTransformer: |
| 202 | 240 | """ |
| 203 | 241 | 批量调用 LLM,为一批 doc 填充: |
| 204 | 242 | - qanchors.{lang} |
| 205 | - - enriched_attributes (lang/name/value) | |
| 243 | + - tags.{lang} | |
| 244 | + - enriched_attributes[].value.{lang} | |
| 206 | 245 | |
| 207 | 246 | 设计目标: |
| 208 | 247 | - 尽可能攒批调用 LLM; |
| ... | ... | @@ -211,16 +250,8 @@ class SPUDocumentTransformer: |
| 211 | 250 | if not docs or not spu_rows or len(docs) != len(spu_rows): |
| 212 | 251 | return |
| 213 | 252 | |
| 214 | - try: | |
| 215 | - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] | |
| 216 | - except Exception: | |
| 217 | - index_langs = ["en", "zh"] | |
| 218 | - # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用 | |
| 219 | - llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序 | |
| 220 | - | |
| 221 | - # 只对有 title 的 SPU 参与 LLM;其余跳过 | |
| 222 | 253 | id_to_idx: Dict[str, int] = {} |
| 223 | - products: List[Dict[str, str]] = [] | |
| 254 | + items: List[Dict[str, str]] = [] | |
| 224 | 255 | for i, row in enumerate(spu_rows): |
| 225 | 256 | raw_id = row.get("id") |
| 226 | 257 | spu_id = "" if raw_id is None else str(raw_id).strip() |
| ... | ... | @@ -228,69 +259,45 @@ class SPUDocumentTransformer: |
| 228 | 259 | if not spu_id or not title: |
| 229 | 260 | continue |
| 230 | 261 | id_to_idx[spu_id] = i |
| 231 | - products.append({"id": spu_id, "title": title}) | |
| 232 | - if not products: | |
| 262 | + items.append( | |
| 263 | + { | |
| 264 | + "id": spu_id, | |
| 265 | + "title": title, | |
| 266 | + "brief": str(row.get("brief") or "").strip(), | |
| 267 | + "description": str(row.get("description") or "").strip(), | |
| 268 | + "image_url": str(row.get("image_src") or "").strip(), | |
| 269 | + } | |
| 270 | + ) | |
| 271 | + if not items: | |
| 233 | 272 | return |
| 234 | 273 | |
| 235 | 274 | tenant_id = str(docs[0].get("tenant_id") or "").strip() or None |
| 275 | + try: | |
| 276 | + results = build_index_content_fields(items=items, tenant_id=tenant_id) | |
| 277 | + except Exception as e: | |
| 278 | + logger.warning("LLM batch attribute fill failed: %s", e) | |
| 279 | + return | |
| 236 | 280 | |
| 237 | - dim_keys = [ | |
| 238 | - "tags", | |
| 239 | - "target_audience", | |
| 240 | - "usage_scene", | |
| 241 | - "season", | |
| 242 | - "key_attributes", | |
| 243 | - "material", | |
| 244 | - "features", | |
| 245 | - ] | |
| 246 | - | |
| 247 | - for lang in llm_langs: | |
| 248 | - try: | |
| 249 | - rows = analyze_products( | |
| 250 | - products=products, | |
| 251 | - target_lang=lang, | |
| 252 | - batch_size=20, | |
| 253 | - tenant_id=tenant_id, | |
| 254 | - ) | |
| 255 | - except Exception as e: | |
| 256 | - logger.warning("LLM batch attribute fill failed (lang=%s): %s", lang, e) | |
| 281 | + for result in results: | |
| 282 | + spu_id = str(result.get("id") or "").strip() | |
| 283 | + if not spu_id: | |
| 257 | 284 | continue |
| 285 | + idx = id_to_idx.get(spu_id) | |
| 286 | + if idx is None: | |
| 287 | + continue | |
| 288 | + self._apply_content_enrichment(docs[idx], result) | |
| 258 | 289 | |
| 259 | - for row in rows or []: | |
| 260 | - spu_id = str(row.get("id") or "").strip() | |
| 261 | - if not spu_id: | |
| 262 | - continue | |
| 263 | - idx = id_to_idx.get(spu_id) | |
| 264 | - if idx is None: | |
| 265 | - continue | |
| 266 | - self._apply_llm_row(docs[idx], row=row, lang=lang, dim_keys=dim_keys) | |
| 267 | - | |
| 268 | - def _apply_llm_row(self, doc: Dict[str, Any], row: Dict[str, Any], lang: str, dim_keys: List[str]) -> None: | |
| 269 | - """将单条 LLM 输出 row 按既定结构写入 doc(不抛异常)。""" | |
| 290 | + def _apply_content_enrichment(self, doc: Dict[str, Any], enrichment: Dict[str, Any]) -> None: | |
| 291 | + """将 product_enrich 产出的 ES-ready 内容字段写入 doc。""" | |
| 270 | 292 | try: |
| 271 | - if row.get("error"): | |
| 272 | - return | |
| 273 | - | |
| 274 | - semantic_list = doc.get("enriched_attributes") or [] | |
| 275 | - qanchors_obj = doc.get("qanchors") or {} | |
| 276 | - | |
| 277 | - anchor_text = str(row.get("anchor_text") or "").strip() | |
| 278 | - if anchor_text: | |
| 279 | - qanchors_obj[lang] = anchor_text | |
| 280 | - | |
| 281 | - for name in dim_keys: | |
| 282 | - raw = row.get(name) | |
| 283 | - if not raw: | |
| 284 | - continue | |
| 285 | - for value in split_multi_value_field(str(raw)): | |
| 286 | - semantic_list.append({"lang": lang, "name": name, "value": value}) | |
| 287 | - | |
| 288 | - if qanchors_obj: | |
| 289 | - doc["qanchors"] = qanchors_obj | |
| 290 | - if semantic_list: | |
| 291 | - doc["enriched_attributes"] = semantic_list | |
| 293 | + if enrichment.get("qanchors"): | |
| 294 | + doc["qanchors"] = enrichment["qanchors"] | |
| 295 | + if enrichment.get("tags"): | |
| 296 | + doc["tags"] = enrichment["tags"] | |
| 297 | + if enrichment.get("enriched_attributes"): | |
| 298 | + doc["enriched_attributes"] = enrichment["enriched_attributes"] | |
| 292 | 299 | except Exception as e: |
| 293 | - logger.warning("Failed to apply LLM row to doc (spu_id=%s, lang=%s): %s", doc.get("spu_id"), lang, e) | |
| 300 | + logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e) | |
| 294 | 301 | |
| 295 | 302 | def _fill_text_fields( |
| 296 | 303 | self, |
| ... | ... | @@ -544,6 +551,23 @@ class SPUDocumentTransformer: |
| 544 | 551 | if pd.notna(position) and pd.notna(name): |
| 545 | 552 | option_name_map[int(position)] = str(name) |
| 546 | 553 | |
| 554 | + primary_lang = self.tenant_config.get('primary_language', 'en') | |
| 555 | + | |
| 556 | + def _build_specification(name: str, raw_value: Any, sku_id: str) -> Optional[Dict[str, Any]]: | |
| 557 | + value = "" if raw_value is None else str(raw_value).strip() | |
| 558 | + if not value: | |
| 559 | + return None | |
| 560 | + return { | |
| 561 | + 'sku_id': sku_id, | |
| 562 | + 'name': name, | |
| 563 | + 'value_keyword': value, | |
| 564 | + 'value_text': self._build_core_language_text_object( | |
| 565 | + value, | |
| 566 | + source_lang=primary_lang, | |
| 567 | + scene="general", | |
| 568 | + ) or normalize_core_text_field_value(value, primary_lang), | |
| 569 | + } | |
| 570 | + | |
| 547 | 571 | for _, sku_row in skus.iterrows(): |
| 548 | 572 | sku_data = self._transform_sku_row(sku_row, option_name_map) |
| 549 | 573 | if sku_data: |
| ... | ... | @@ -584,23 +608,17 @@ class SPUDocumentTransformer: |
| 584 | 608 | # 构建specifications(从SKU的option值和option表的name) |
| 585 | 609 | sku_id = str(sku_row['id']) |
| 586 | 610 | if pd.notna(sku_row.get('option1')) and 1 in option_name_map: |
| 587 | - specifications.append({ | |
| 588 | - 'sku_id': sku_id, | |
| 589 | - 'name': option_name_map[1], | |
| 590 | - 'value': str(sku_row['option1']) | |
| 591 | - }) | |
| 611 | + spec = _build_specification(option_name_map[1], sku_row['option1'], sku_id) | |
| 612 | + if spec: | |
| 613 | + specifications.append(spec) | |
| 592 | 614 | if pd.notna(sku_row.get('option2')) and 2 in option_name_map: |
| 593 | - specifications.append({ | |
| 594 | - 'sku_id': sku_id, | |
| 595 | - 'name': option_name_map[2], | |
| 596 | - 'value': str(sku_row['option2']) | |
| 597 | - }) | |
| 615 | + spec = _build_specification(option_name_map[2], sku_row['option2'], sku_id) | |
| 616 | + if spec: | |
| 617 | + specifications.append(spec) | |
| 598 | 618 | if pd.notna(sku_row.get('option3')) and 3 in option_name_map: |
| 599 | - specifications.append({ | |
| 600 | - 'sku_id': sku_id, | |
| 601 | - 'name': option_name_map[3], | |
| 602 | - 'value': str(sku_row['option3']) | |
| 603 | - }) | |
| 619 | + spec = _build_specification(option_name_map[3], sku_row['option3'], sku_id) | |
| 620 | + if spec: | |
| 621 | + specifications.append(spec) | |
| 604 | 622 | |
| 605 | 623 | return skus_list, prices, compare_prices, sku_prices, sku_weights, sku_weight_units, total_inventory, specifications |
| 606 | 624 | |
| ... | ... | @@ -636,82 +654,36 @@ class SPUDocumentTransformer: |
| 636 | 654 | |
| 637 | 655 | def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None: |
| 638 | 656 | """ |
| 639 | - 调用 indexer.product_enrich.analyze_products,为当前 SPU 填充: | |
| 657 | + 调用 indexer.product_enrich 的高层内容理解入口,为当前 SPU 填充: | |
| 640 | 658 | - qanchors.{lang} |
| 641 | - - enriched_attributes (lang/name/value) | |
| 659 | + - tags.{lang} | |
| 660 | + - enriched_attributes[].value.{lang} | |
| 642 | 661 | """ |
| 643 | - try: | |
| 644 | - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] | |
| 645 | - except Exception: | |
| 646 | - index_langs = ["en", "zh"] | |
| 647 | - | |
| 648 | - # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用 | |
| 649 | - llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序 | |
| 650 | - | |
| 651 | 662 | spu_id = str(spu_row.get("id") or "").strip() |
| 652 | 663 | title = str(spu_row.get("title") or "").strip() |
| 653 | 664 | if not spu_id or not title: |
| 654 | 665 | return |
| 655 | 666 | |
| 656 | - semantic_list = doc.get("enriched_attributes") or [] | |
| 657 | - qanchors_obj = doc.get("qanchors") or {} | |
| 658 | - | |
| 659 | - dim_keys = [ | |
| 660 | - "tags", | |
| 661 | - "target_audience", | |
| 662 | - "usage_scene", | |
| 663 | - "season", | |
| 664 | - "key_attributes", | |
| 665 | - "material", | |
| 666 | - "features", | |
| 667 | - ] | |
| 668 | - | |
| 669 | 667 | tenant_id = doc.get("tenant_id") |
| 668 | + try: | |
| 669 | + results = build_index_content_fields( | |
| 670 | + items=[ | |
| 671 | + { | |
| 672 | + "id": spu_id, | |
| 673 | + "title": title, | |
| 674 | + "brief": str(spu_row.get("brief") or "").strip(), | |
| 675 | + "description": str(spu_row.get("description") or "").strip(), | |
| 676 | + "image_url": str(spu_row.get("image_src") or "").strip(), | |
| 677 | + } | |
| 678 | + ], | |
| 679 | + tenant_id=str(tenant_id), | |
| 680 | + ) | |
| 681 | + except Exception as e: | |
| 682 | + logger.warning("LLM attribute fill failed for SPU %s: %s", spu_id, e) | |
| 683 | + return | |
| 670 | 684 | |
| 671 | - for lang in llm_langs: | |
| 672 | - try: | |
| 673 | - rows = analyze_products( | |
| 674 | - products=[{"id": spu_id, "title": title}], | |
| 675 | - target_lang=lang, | |
| 676 | - batch_size=1, | |
| 677 | - tenant_id=str(tenant_id), | |
| 678 | - ) | |
| 679 | - except Exception as e: | |
| 680 | - logger.warning( | |
| 681 | - "LLM attribute fill failed for SPU %s, lang=%s: %s", | |
| 682 | - spu_id, | |
| 683 | - lang, | |
| 684 | - e, | |
| 685 | - ) | |
| 686 | - continue | |
| 687 | - | |
| 688 | - if not rows: | |
| 689 | - continue | |
| 690 | - row = rows[0] or {} | |
| 691 | - | |
| 692 | - # qanchors.{lang} | |
| 693 | - anchor_text = str(row.get("anchor_text") or "").strip() | |
| 694 | - if anchor_text: | |
| 695 | - qanchors_obj[lang] = anchor_text | |
| 696 | - | |
| 697 | - # 语义属性:按各维度拆分为短语 | |
| 698 | - for name in dim_keys: | |
| 699 | - raw = row.get(name) | |
| 700 | - if not raw: | |
| 701 | - continue | |
| 702 | - for value in split_multi_value_field(str(raw)): | |
| 703 | - semantic_list.append( | |
| 704 | - { | |
| 705 | - "lang": lang, | |
| 706 | - "name": name, | |
| 707 | - "value": value, | |
| 708 | - } | |
| 709 | - ) | |
| 710 | - | |
| 711 | - if qanchors_obj: | |
| 712 | - doc["qanchors"] = qanchors_obj | |
| 713 | - if semantic_list: | |
| 714 | - doc["enriched_attributes"] = semantic_list | |
| 685 | + if results: | |
| 686 | + self._apply_content_enrichment(doc, results[0]) | |
| 715 | 687 | |
| 716 | 688 | def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: |
| 717 | 689 | """ | ... | ... |
indexer/product_enrich.py
| ... | ... | @@ -146,6 +146,16 @@ if _missing_prompt_langs: |
| 146 | 146 | |
| 147 | 147 | # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 |
| 148 | 148 | _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") |
| 149 | +_CORE_INDEX_LANGUAGES = ("zh", "en") | |
| 150 | +_ENRICHED_ATTRIBUTE_DIMENSIONS = ( | |
| 151 | + "enriched_tags", | |
| 152 | + "target_audience", | |
| 153 | + "usage_scene", | |
| 154 | + "season", | |
| 155 | + "key_attributes", | |
| 156 | + "material", | |
| 157 | + "features", | |
| 158 | +) | |
| 149 | 159 | |
| 150 | 160 | |
| 151 | 161 | def split_multi_value_field(text: Optional[str]) -> List[str]: |
| ... | ... | @@ -158,6 +168,124 @@ def split_multi_value_field(text: Optional[str]) -> List[str]: |
| 158 | 168 | return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()] |
| 159 | 169 | |
| 160 | 170 | |
| 171 | +def _append_lang_phrase_map(target: Dict[str, List[str]], lang: str, raw_value: Any) -> None: | |
| 172 | + parts = split_multi_value_field(raw_value) | |
| 173 | + if not parts: | |
| 174 | + return | |
| 175 | + existing = target.get(lang) or [] | |
| 176 | + merged = list(dict.fromkeys([str(x).strip() for x in existing if str(x).strip()] + parts)) | |
| 177 | + if merged: | |
| 178 | + target[lang] = merged | |
| 179 | + | |
| 180 | + | |
def _append_enriched_attribute(
    target: List[Dict[str, Any]],
    name: str,
    lang: str,
    raw_value: Any,
) -> None:
    """Append one {name, value: {lang: v}} entry per phrase split from *raw_value*.

    An entry is skipped when *target* already holds the same (name, lang, value)
    triple; entries for other languages are left untouched, so each language
    gets its own parallel entry.
    """
    for value in split_multi_value_field(raw_value):
        duplicate = False
        for entry in target:
            value_map = entry.get("value")
            if (
                entry.get("name") == name
                and isinstance(value_map, dict)
                and value_map.get(lang) == value
            ):
                duplicate = True
                break
        if not duplicate:
            target.append({"name": name, "value": {lang: value}})
| 196 | + | |
| 197 | + | |
def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None:
    """Fold one per-language LLM result row into the aggregated per-SPU result.

    Rows that are empty or carry an `error` key are ignored; error bookkeeping
    is handled by the caller.
    """
    if not row:
        return
    if row.get("error"):
        return

    anchor = str(row.get("anchor_text") or "").strip()
    if anchor:
        _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor)

    for dimension in _ENRICHED_ATTRIBUTE_DIMENSIONS:
        raw_value = row.get(dimension)
        if not raw_value:
            continue
        _append_enriched_attribute(
            result["enriched_attributes"], name=dimension, lang=lang, raw_value=raw_value
        )
        # enriched_tags is mirrored into the top-level per-language tag map as well.
        if dimension == "enriched_tags":
            _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw_value)
| 213 | + | |
| 214 | + | |
| 215 | +def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: | |
| 216 | + item_id = str(item.get("id") or item.get("spu_id") or "").strip() | |
| 217 | + return { | |
| 218 | + "id": item_id, | |
| 219 | + "title": str(item.get("title") or "").strip(), | |
| 220 | + "brief": str(item.get("brief") or "").strip(), | |
| 221 | + "description": str(item.get("description") or "").strip(), | |
| 222 | + "image_url": str(item.get("image_url") or "").strip(), | |
| 223 | + } | |
| 224 | + | |
| 225 | + | |
def build_index_content_fields(
    items: List[Dict[str, Any]],
    tenant_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    High-level entry point: generate content-understanding fields aligned with the ES mapping.

    Each input item must contain:
    - `id` or `spu_id`
    - `title`
    - optional `brief` / `description` / `image_url`

    Each returned item contains:
    - `id`
    - `qanchors`
    - `enriched_tags`
    - `enriched_attributes`
    - optional `error`

    Where:
    - `qanchors.{lang}` is a list of phrases
    - `enriched_tags.{lang}` is a list of tags
    """
    normalized_items = [_normalize_index_content_item(item) for item in items]
    if not normalized_items:
        return []

    # One aggregated result per item id; the per-language passes below merge
    # into these dicts. NOTE(review): items with a missing/duplicate id share a
    # single entry keyed by the same string — confirm callers always send
    # unique, non-empty ids (the API layer requires spu_id).
    results_by_id: Dict[str, Dict[str, Any]] = {
        item["id"]: {
            "id": item["id"],
            "qanchors": {},
            "enriched_tags": {},
            "enriched_attributes": [],
        }
        for item in normalized_items
    }

    # One full analyze_products pass per core language (zh, en).
    for lang in _CORE_INDEX_LANGUAGES:
        try:
            rows = analyze_products(
                products=normalized_items,
                target_lang=lang,
                batch_size=BATCH_SIZE,
                tenant_id=tenant_id,
            )
        except Exception as e:
            # A language-level failure is non-fatal: record the first error per
            # item (setdefault keeps an earlier language's error) and keep going
            # so the other language can still produce fields.
            logger.warning("build_index_content_fields failed for lang=%s: %s", lang, e)
            for item in normalized_items:
                results_by_id[item["id"]].setdefault("error", str(e))
            continue

        for row in rows or []:
            # Rows are matched back to input items by id; unknown ids are dropped.
            item_id = str(row.get("id") or "").strip()
            if not item_id or item_id not in results_by_id:
                continue
            if row.get("error"):
                # Row-level error: record once, skip merging this row's fields.
                results_by_id[item_id].setdefault("error", row["error"])
                continue
            _apply_index_content_row(results_by_id[item_id], row=row, lang=lang)

    # Preserve the input order in the output.
    return [results_by_id[item["id"]] for item in normalized_items]
| 287 | + | |
| 288 | + | |
| 161 | 289 | def _normalize_space(text: str) -> str: |
| 162 | 290 | return re.sub(r"\s+", " ", (text or "").strip()) |
| 163 | 291 | |
| ... | ... | @@ -526,7 +654,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: |
| 526 | 654 | "seq_no": parts[0], |
| 527 | 655 | "title": parts[1], # 商品标题(按目标语言) |
| 528 | 656 | "category_path": parts[2] if len(parts) > 2 else "", # 品类路径 |
| 529 | - "tags": parts[3] if len(parts) > 3 else "", # 细分标签 | |
| 657 | + "enriched_tags": parts[3] if len(parts) > 3 else "", # 细分标签 | |
| 530 | 658 | "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群 |
| 531 | 659 | "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景 |
| 532 | 660 | "season": parts[6] if len(parts) > 6 else "", # 适用季节 |
| ... | ... | @@ -603,7 +731,7 @@ def process_batch( |
| 603 | 731 | "title_input": item.get("title", ""), |
| 604 | 732 | "title": "", |
| 605 | 733 | "category_path": "", |
| 606 | - "tags": "", | |
| 734 | + "enriched_tags": "", | |
| 607 | 735 | "target_audience": "", |
| 608 | 736 | "usage_scene": "", |
| 609 | 737 | "season": "", |
| ... | ... | @@ -643,7 +771,7 @@ def process_batch( |
| 643 | 771 | "title_input": batch_data[i]["title"], # 原始输入标题 |
| 644 | 772 | "title": parsed_item.get("title", ""), # 模型生成的标题 |
| 645 | 773 | "category_path": parsed_item.get("category_path", ""), # 品类路径 |
| 646 | - "tags": parsed_item.get("tags", ""), # 细分标签 | |
| 774 | + "enriched_tags": parsed_item.get("enriched_tags", ""), # 细分标签 | |
| 647 | 775 | "target_audience": parsed_item.get("target_audience", ""), # 适用人群 |
| 648 | 776 | "usage_scene": parsed_item.get("usage_scene", ""), # 使用场景 |
| 649 | 777 | "season": parsed_item.get("season", ""), # 适用季节 |
| ... | ... | @@ -686,7 +814,7 @@ def process_batch( |
| 686 | 814 | "title_input": item["title"], |
| 687 | 815 | "title": "", |
| 688 | 816 | "category_path": "", |
| 689 | - "tags": "", | |
| 817 | + "enriched_tags": "", | |
| 690 | 818 | "target_audience": "", |
| 691 | 819 | "usage_scene": "", |
| 692 | 820 | "season": "", | ... | ... |
mappings/README.md
| ... | ... | @@ -34,8 +34,8 @@ |
| 34 | 34 | |
| 35 | 35 | 当前字段大致分为几类: |
| 36 | 36 | |
| 37 | -- 全语言字段:`title`、`keywords`、`brief`、`description`、`vendor`、`category_path`、`category_name_text`、`specifications.value` | |
| 38 | -- 核心索引语言字段:`qanchors`、`tags`、`option1_values`、`option2_values`、`option3_values`、`enriched_attributes.value` | |
| 37 | +- 全语言字段:`title`、`keywords`、`brief`、`description`、`vendor`、`category_path`、`category_name_text`、`tags` | 
| 38 | +- 核心索引语言字段:`qanchors`、`enriched_tags`、`option1_values`、`option2_values`、`option3_values`、`enriched_attributes.value` | |
| 39 | 39 | - 复合嵌套字段:`image_embedding`、`specifications`、`enriched_attributes`、`skus` |
| 40 | 40 | - 其他标量字段:`tenant_id`、`spu_id`、价格、库存、类目等 |
| 41 | 41 | |
| ... | ... | @@ -63,11 +63,12 @@ |
| 63 | 63 | 典型字段: |
| 64 | 64 | |
| 65 | 65 | - `qanchors` |
| 66 | -- `tags` | |
| 66 | +- `enriched_tags` | |
| 67 | 67 | - `option1_values` |
| 68 | 68 | - `option2_values` |
| 69 | 69 | - `option3_values` |
| 70 | 70 | - `enriched_attributes.value` |
| 71 | +- `specifications.value_text` | |
| 71 | 72 | |
| 72 | 73 | 以 `category_path` 和 `option*_values` 为例,核心语言灌入结果应至少包含: |
| 73 | 74 | |
| ... | ... | @@ -118,7 +119,6 @@ |
| 118 | 119 | - `vendor` |
| 119 | 120 | - `category_path` |
| 120 | 121 | - `category_name_text` |
| 121 | -- `specifications.value` | |
| 122 | 122 | |
| 123 | 123 | 灌入规则: |
| 124 | 124 | |
| ... | ... | @@ -151,7 +151,7 @@ |
| 151 | 151 | } |
| 152 | 152 | ``` |
| 153 | 153 | |
| 154 | -示例:规格值 `specifications.value` | |
| 154 | +示例:规格值 `specifications.value_text` / `specifications.value_keyword` | |
| 155 | 155 | |
| 156 | 156 | ```json |
| 157 | 157 | { |
| ... | ... | @@ -159,16 +159,21 @@ |
| 159 | 159 | { |
| 160 | 160 | "sku_id": "sku-red-s", |
| 161 | 161 | "name": "color", |
| 162 | - "value": { | |
| 162 | + "value_keyword": "красный", | |
| 163 | + "value_text": { | |
| 163 | 164 | "zh": "红色", |
| 164 | - "en": "red", | |
| 165 | - "ru": "красный" | |
| 165 | + "en": "red" | |
| 166 | 166 | } |
| 167 | 167 | } |
| 168 | 168 | ] |
| 169 | 169 | } |
| 170 | 170 | ``` |
| 171 | 171 | |
| 172 | +其中: | |
| 173 | + | |
| 174 | +- `specifications.value_keyword` 保存原始规格值,用于精确过滤 / 分面 | |
| 175 | +- `specifications.value_text` 保存 `zh/en` 两个核心索引语言版本,用于检索召回 | |
| 176 | + | |
| 172 | 177 | ### 原始语言为中文或英文时 |
| 173 | 178 | |
| 174 | 179 | 如果原始语言就是核心索引语言之一,不需要额外再写第三份语言字段。 |
| ... | ... | @@ -210,7 +215,7 @@ |
| 210 | 215 | - 标量字段:直接写固定值,例如 `tenant_id`、`spu_id`、`min_price` |
| 211 | 216 | - 核心索引语言字段:只生成 `zh/en` |
| 212 | 217 | - 全语言字段:生成 `zh/en`,再按原始语言补一个对应语种字段 |
| 213 | -- 嵌套字段:对每个元素内部重复应用同样规则,例如 `specifications[].value` | |
| 218 | +- 嵌套字段:对每个元素内部重复应用同样规则,例如 `specifications[].value_text`、`enriched_attributes[].value` | |
| 214 | 219 | |
| 215 | 220 | ### 推荐灌入流程 |
| 216 | 221 | ... | ... |
mappings/generate_search_products_mapping.py
| ... | ... | @@ -194,8 +194,7 @@ FIELD_SPECS = [ |
| 194 | 194 | ), |
| 195 | 195 | text_field("category_path", "all_language_text_with_keyword"), |
| 196 | 196 | text_field("category_name_text", "all_language_text_with_keyword"), |
| 197 | - text_field("qanchors", "core_language_text"), | |
| 198 | - text_field("tags", "core_language_text_with_keyword"), | |
| 197 | + text_field("tags", "all_language_text_with_keyword"), | |
| 199 | 198 | scalar_field("category_id", "keyword"), |
| 200 | 199 | scalar_field("category_name", "keyword"), |
| 201 | 200 | scalar_field("category_level", "integer"), |
| ... | ... | @@ -209,6 +208,8 @@ FIELD_SPECS = [ |
| 209 | 208 | scalar_field("value_keyword", "keyword"), |
| 210 | 209 | text_field("value_text", "core_language_text_with_keyword"), |
| 211 | 210 | ), |
| 211 | + text_field("qanchors", "core_language_text"), | |
| 212 | + text_field("enriched_tags", "core_language_text_with_keyword"), | |
| 212 | 213 | nested_field( |
| 213 | 214 | "enriched_attributes", |
| 214 | 215 | scalar_field("name", "keyword"), | ... | ... |
suggestion/builder.py
| ... | ... | @@ -166,6 +166,29 @@ class SuggestionIndexBuilder: |
| 166 | 166 | out = [p.strip() for p in parts if p and p.strip()] |
| 167 | 167 | return out if out else [s] |
| 168 | 168 | |
| 169 | + def _iter_multilang_product_tags( | |
| 170 | + self, | |
| 171 | + raw: Any, | |
| 172 | + index_languages: List[str], | |
| 173 | + primary_language: str, | |
| 174 | + ) -> List[Tuple[str, str]]: | |
| 175 | + if isinstance(raw, dict): | |
| 176 | + pairs: List[Tuple[str, str]] = [] | |
| 177 | + for lang in index_languages: | |
| 178 | + for tag in self._iter_product_tags(raw.get(lang)): | |
| 179 | + pairs.append((lang, tag)) | |
| 180 | + return pairs | |
| 181 | + | |
| 182 | + pairs = [] | |
| 183 | + for tag in self._iter_product_tags(raw): | |
| 184 | + tag_lang, _, _ = detect_text_language_for_suggestions( | |
| 185 | + tag, | |
| 186 | + index_languages=index_languages, | |
| 187 | + primary_language=primary_language, | |
| 188 | + ) | |
| 189 | + pairs.append((tag_lang, tag)) | |
| 190 | + return pairs | |
| 191 | + | |
| 169 | 192 | @staticmethod |
| 170 | 193 | def _looks_noise(text_value: str) -> bool: |
| 171 | 194 | if not text_value: |
| ... | ... | @@ -487,12 +510,11 @@ class SuggestionIndexBuilder: |
| 487 | 510 | key_to_candidate[key] = c |
| 488 | 511 | c.add_product("qanchor", spu_id=product_id) |
| 489 | 512 | |
| 490 | - for tag in self._iter_product_tags(src.get("tags")): | |
| 491 | - tag_lang, _, _ = detect_text_language_for_suggestions( | |
| 492 | - tag, | |
| 493 | - index_languages=index_languages, | |
| 494 | - primary_language=primary_language, | |
| 495 | - ) | |
| 513 | + for tag_lang, tag in self._iter_multilang_product_tags( | |
| 514 | + src.get("tags"), | |
| 515 | + index_languages=index_languages, | |
| 516 | + primary_language=primary_language, | |
| 517 | + ): | |
| 496 | 518 | text_norm = self._normalize_text(tag) |
| 497 | 519 | if self._looks_noise(text_norm): |
| 498 | 520 | continue | ... | ... |
tests/ci/test_service_api_contracts.py
| ... | ... | @@ -345,33 +345,25 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient): |
| 345 | 345 | def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch): |
| 346 | 346 | import indexer.product_enrich as process_products |
| 347 | 347 | |
| 348 | - def _fake_analyze_products( | |
| 349 | - products: List[Dict[str, str]], | |
| 350 | - target_lang: str = "zh", | |
| 351 | - batch_size: int | None = None, | |
| 352 | - tenant_id: str | None = None, | |
| 353 | - ): | |
| 354 | - assert batch_size == 20 | |
| 348 | + def _fake_build_index_content_fields(items: List[Dict[str, str]], tenant_id: str | None = None): | |
| 349 | + assert tenant_id == "162" | |
| 355 | 350 | return [ |
| 356 | 351 | { |
| 357 | - "id": p["id"], | |
| 358 | - "lang": target_lang, | |
| 359 | - "title_input": p["title"], | |
| 360 | - "title": p["title"], | |
| 361 | - "category_path": "", | |
| 362 | - "tags": "tag1,tag2", | |
| 363 | - "target_audience": "", | |
| 364 | - "usage_scene": "", | |
| 365 | - "season": "", | |
| 366 | - "key_attributes": "", | |
| 367 | - "material": "", | |
| 368 | - "features": "", | |
| 369 | - "anchor_text": f"{target_lang}-anchor-{p['id']}", | |
| 352 | + "id": p["spu_id"], | |
| 353 | + "qanchors": { | |
| 354 | + "zh": [f"zh-anchor-{p['spu_id']}"], | |
| 355 | + "en": [f"en-anchor-{p['spu_id']}"], | |
| 356 | + }, | |
| 357 | + "enriched_tags": {"zh": ["tag1", "tag2"], "en": ["tag1", "tag2"]}, | |
| 358 | + "enriched_attributes": [ | |
| 359 | + {"name": "enriched_tags", "value": {"zh": "tag1"}}, | |
| 360 | + {"name": "enriched_tags", "value": {"en": "tag1"}}, | |
| 361 | + ], | |
| 370 | 362 | } |
| 371 | - for p in products | |
| 363 | + for p in items | |
| 372 | 364 | ] |
| 373 | 365 | |
| 374 | - monkeypatch.setattr(process_products, "analyze_products", _fake_analyze_products) | |
| 366 | + monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields) | |
| 375 | 367 | |
| 376 | 368 | response = indexer_client.post( |
| 377 | 369 | "/indexer/enrich-content", |
| ... | ... | @@ -381,7 +373,6 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 381 | 373 | {"spu_id": "1001", "title": "T-shirt"}, |
| 382 | 374 | {"spu_id": "1002", "title": "Toy"}, |
| 383 | 375 | ], |
| 384 | - "languages": ["zh", "en"], | |
| 385 | 376 | }, |
| 386 | 377 | ) |
| 387 | 378 | assert response.status_code == 200 |
| ... | ... | @@ -390,9 +381,14 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 390 | 381 | assert data["total"] == 2 |
| 391 | 382 | assert len(data["results"]) == 2 |
| 392 | 383 | assert data["results"][0]["spu_id"] == "1001" |
| 393 | - assert data["results"][0]["qanchors"]["zh"] == "zh-anchor-1001" | |
| 394 | - assert data["results"][0]["qanchors"]["en"] == "en-anchor-1001" | |
| 395 | - assert "tag1" in data["results"][0]["tags"] | |
| 384 | + assert data["results"][0]["qanchors"]["zh"] == ["zh-anchor-1001"] | |
| 385 | + assert data["results"][0]["qanchors"]["en"] == ["en-anchor-1001"] | |
| 386 | + assert data["results"][0]["enriched_tags"]["zh"] == ["tag1", "tag2"] | |
| 387 | + assert data["results"][0]["enriched_tags"]["en"] == ["tag1", "tag2"] | |
| 388 | + assert data["results"][0]["enriched_attributes"][0] == { | |
| 389 | + "name": "enriched_tags", | |
| 390 | + "value": {"zh": "tag1"}, | |
| 391 | + } | |
| 396 | 392 | |
| 397 | 393 | |
| 398 | 394 | def test_indexer_documents_contract(indexer_client: TestClient): |
| ... | ... | @@ -515,7 +511,6 @@ def test_indexer_enrich_content_validation_max_items(indexer_client: TestClient) |
| 515 | 511 | json={ |
| 516 | 512 | "tenant_id": "162", |
| 517 | 513 | "items": [{"spu_id": str(i), "title": "x"} for i in range(51)], |
| 518 | - "languages": ["zh"], | |
| 519 | 514 | }, |
| 520 | 515 | ) |
| 521 | 516 | assert response.status_code == 400 | ... | ... |
tests/test_llm_enrichment_batch_fill.py
| ... | ... | @@ -7,33 +7,30 @@ import pandas as pd |
| 7 | 7 | from indexer.document_transformer import SPUDocumentTransformer |
| 8 | 8 | |
| 9 | 9 | |
| 10 | -def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch): | |
| 10 | +def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch): | |
| 11 | 11 | seen_calls: List[Dict[str, Any]] = [] |
| 12 | 12 | |
| 13 | - def _fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None): | |
| 14 | - # should always request batch_size=20 and pass full list; internal splitter handles >20 | |
| 15 | - seen_calls.append( | |
| 16 | - { | |
| 17 | - "n": len(products), | |
| 18 | - "target_lang": target_lang, | |
| 19 | - "batch_size": batch_size, | |
| 20 | - "tenant_id": tenant_id, | |
| 21 | - } | |
| 22 | - ) | |
| 13 | + def _fake_build_index_content_fields(items, tenant_id=None): | |
| 14 | + seen_calls.append({"n": len(items), "tenant_id": tenant_id}) | |
| 23 | 15 | return [ |
| 24 | 16 | { |
| 25 | - "id": p["id"], | |
| 26 | - "lang": target_lang, | |
| 27 | - "title_input": p["title"], | |
| 28 | - "tags": "t1,t2", | |
| 29 | - "anchor_text": f"{target_lang}-anchor-{p['id']}", | |
| 17 | + "id": item["id"], | |
| 18 | + "qanchors": { | |
| 19 | + "zh": [f"zh-anchor-{item['id']}"], | |
| 20 | + "en": [f"en-anchor-{item['id']}"], | |
| 21 | + }, | |
| 22 | + "tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]}, | |
| 23 | + "enriched_attributes": [ | |
| 24 | + {"name": "tags", "value": {"zh": "t1"}}, | |
| 25 | + {"name": "tags", "value": {"en": "t1"}}, | |
| 26 | + ], | |
| 30 | 27 | } |
| 31 | - for p in products | |
| 28 | + for item in items | |
| 32 | 29 | ] |
| 33 | 30 | |
| 34 | 31 | import indexer.document_transformer as doc_tr |
| 35 | 32 | |
| 36 | - monkeypatch.setattr(doc_tr, "analyze_products", _fake_analyze_products) | |
| 33 | + monkeypatch.setattr(doc_tr, "build_index_content_fields", _fake_build_index_content_fields) | |
| 37 | 34 | |
| 38 | 35 | transformer = SPUDocumentTransformer( |
| 39 | 36 | category_id_to_name={}, |
| ... | ... | @@ -54,11 +51,11 @@ def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch): |
| 54 | 51 | |
| 55 | 52 | transformer.fill_llm_attributes_batch(docs, rows) |
| 56 | 53 | |
| 57 | - # called once per language, with full list; analyze_products handles splitting | |
| 58 | - assert seen_calls == [ | |
| 59 | - {"n": 45, "target_lang": "zh", "batch_size": 20, "tenant_id": "162"}, | |
| 60 | - {"n": 45, "target_lang": "en", "batch_size": 20, "tenant_id": "162"}, | |
| 61 | - ] | |
| 54 | + assert seen_calls == [{"n": 45, "tenant_id": "162"}] | |
| 62 | 55 | |
| 63 | - assert docs[0]["qanchors"]["zh"] == "zh-anchor-0" | |
| 64 | - assert docs[0]["qanchors"]["en"] == "en-anchor-0" | |
| 56 | + assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"] | |
| 57 | + assert docs[0]["qanchors"]["en"] == ["en-anchor-0"] | |
| 58 | + assert docs[0]["tags"]["zh"] == ["t1", "t2"] | |
| 59 | + assert docs[0]["tags"]["en"] == ["t1", "t2"] | |
| 60 | + assert {"name": "tags", "value": {"zh": "t1"}} in docs[0]["enriched_attributes"] | |
| 61 | + assert {"name": "tags", "value": {"en": "t1"}} in docs[0]["enriched_attributes"] | ... | ... |
tests/test_suggestions.py
| ... | ... | @@ -403,10 +403,13 @@ def test_build_full_candidates_tags_and_qanchor_phrases(monkeypatch): |
| 403 | 403 | "spu_id": "900", |
| 404 | 404 | "title": {"en": "Tee", "zh": "T恤"}, |
| 405 | 405 | "qanchors": { |
| 406 | - "en": "slim fit, sporty casual", | |
| 407 | - "zh": "修身, 显瘦", | |
| 406 | + "en": ["slim fit", "sporty casual"], | |
| 407 | + "zh": ["修身", "显瘦"], | |
| 408 | + }, | |
| 409 | + "tags": { | |
| 410 | + "en": ["Classic", "ribbed neckline"], | |
| 411 | + "zh": ["辣妹风"], | |
| 408 | 412 | }, |
| 409 | - "tags": ["Classic", "辣妹风", "ribbed neckline"], | |
| 410 | 413 | }, |
| 411 | 414 | } |
| 412 | 415 | ] | ... | ... |