Commit 2703b6ea8b5d2ecdc442f11ad26279cc2dce41e6
1 parent
5aaf0c7d
refactor(indexer): 将 analysis_kinds 拆分为 enrichment_scopes +
category_taxonomy_profile - 原 analysis_kinds 混用了“增强类型”(content/taxonomy)与“品类特定配置”,不利于扩展不同品类的 taxonomy 分析(如 3C、家居等) - 新增 enrichment_scopes 参数:支持 generic(通用增强,产出 qanchors/enriched_tags/enriched_attributes)和 category_taxonomy(品类增强,产出 enriched_taxonomy_attributes) - 新增 category_taxonomy_profile 参数:指定品类增强使用哪套 profile(当前内置 apparel),每套 profile 包含独立的 prompt、输出列定义、解析规则及缓存版本 - 保留 analysis_kinds 作为兼容别名,避免破坏现有调用方 - 重构内部 taxonomy 分析为 profile registry 模式:新增 _get_taxonomy_schema(profile_name) 函数,根据 profile 动态返回对应的 AnalysisSchema - 缓存 key 现在按“分析类型 + profile + schema 指纹 + 输入字段哈希”隔离,确保不同品类、不同 prompt 版本自动失效 - 更新 API 文档及微服务接口文档,明确新参数语义与使用示例 技术细节: - 修改入口:api/routes/indexer.py 中 enrich-content 端点,解析新参数并向下传递 - 核心逻辑:indexer/product_enrich.py 中 enrich_products_batch 增加 profile 参数;_process_batch_for_schema 根据 scope 和 profile 动态获取 schema - 兼容层:若请求同时提供 analysis_kinds,则映射为 enrichment_scopes(content→generic,taxonomy→category_taxonomy),category_taxonomy_profile 默认为 "apparel" - 测试覆盖:新增 enrichment_scopes 组合、profile 切换及兼容模式测试
Showing
9 changed files
with
368 additions
and
71 deletions
Show diff stats
api/routes/indexer.py
| ... | ... | @@ -88,20 +88,41 @@ class EnrichContentItem(BaseModel): |
| 88 | 88 | |
| 89 | 89 | class EnrichContentRequest(BaseModel): |
| 90 | 90 | """ |
| 91 | - 内容理解字段生成请求:根据商品标题批量生成 qanchors、enriched_attributes、tags、taxonomy attributes。 | |
| 91 | + 内容理解字段生成请求:根据商品标题批量生成通用增强字段与品类 taxonomy 字段。 | |
| 92 | 92 | 供外部 indexer 在自行组织 doc 时调用,与翻译、向量化等微服务并列。 |
| 93 | 93 | """ |
| 94 | 94 | tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") |
| 95 | 95 | items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)") |
| 96 | - analysis_kinds: List[Literal["content", "taxonomy"]] = Field( | |
| 97 | - default_factory=lambda: ["content", "taxonomy"], | |
| 96 | + enrichment_scopes: Optional[List[Literal["generic", "category_taxonomy"]]] = Field( | |
| 97 | + default=None, | |
| 98 | 98 | description=( |
| 99 | - "要执行的分析族。" | |
| 100 | - "`content` 返回 qanchors/enriched_tags/enriched_attributes;" | |
| 101 | - "`taxonomy` 返回 enriched_taxonomy_attributes。" | |
| 99 | + "要执行的增强范围。" | |
| 100 | + "`generic` 返回 qanchors/enriched_tags/enriched_attributes;" | |
| 101 | + "`category_taxonomy` 返回 enriched_taxonomy_attributes。" | |
| 102 | 102 | "默认两者都执行。" |
| 103 | 103 | ), |
| 104 | 104 | ) |
| 105 | + category_taxonomy_profile: str = Field( | |
| 106 | + "apparel", | |
| 107 | + description=( | |
| 108 | + "品类 taxonomy profile。当前默认且已支持的是 `apparel`。" | |
| 109 | + "未来可扩展为 `electronics` 等。" | |
| 110 | + ), | |
| 111 | + ) | |
| 112 | + analysis_kinds: Optional[List[Literal["content", "taxonomy"]]] = Field( | |
| 113 | + default=None, | |
| 114 | + description="Deprecated alias of enrichment_scopes. `content` -> `generic`, `taxonomy` -> `category_taxonomy`.", | |
| 115 | + ) | |
| 116 | + | |
| 117 | + def resolved_enrichment_scopes(self) -> List[str]: | |
| 118 | + if self.enrichment_scopes: | |
| 119 | + return list(self.enrichment_scopes) | |
| 120 | + if self.analysis_kinds: | |
| 121 | + mapped = [] | |
| 122 | + for item in self.analysis_kinds: | |
| 123 | + mapped.append("generic" if item == "content" else "category_taxonomy") | |
| 124 | + return mapped | |
| 125 | + return ["generic", "category_taxonomy"] | |
| 105 | 126 | |
| 106 | 127 | |
| 107 | 128 | @router.post("/reindex") |
| ... | ... | @@ -452,7 +473,8 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): |
| 452 | 473 | def _run_enrich_content( |
| 453 | 474 | tenant_id: str, |
| 454 | 475 | items: List[Dict[str, str]], |
| 455 | - analysis_kinds: Optional[List[str]] = None, | |
| 476 | + enrichment_scopes: Optional[List[str]] = None, | |
| 477 | + category_taxonomy_profile: str = "apparel", | |
| 456 | 478 | ) -> List[Dict[str, Any]]: |
| 457 | 479 | """ |
| 458 | 480 | 同步执行内容理解,返回与 ES mapping 对齐的字段结构。 |
| ... | ... | @@ -463,7 +485,8 @@ def _run_enrich_content( |
| 463 | 485 | results = build_index_content_fields( |
| 464 | 486 | items=items, |
| 465 | 487 | tenant_id=tenant_id, |
| 466 | - analysis_kinds=analysis_kinds, | |
| 488 | + enrichment_scopes=enrichment_scopes, | |
| 489 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 467 | 490 | ) |
| 468 | 491 | return [ |
| 469 | 492 | { |
| ... | ... | @@ -481,7 +504,7 @@ def _run_enrich_content( |
| 481 | 504 | @router.post("/enrich-content") |
| 482 | 505 | async def enrich_content(request: EnrichContentRequest): |
| 483 | 506 | """ |
| 484 | - 内容理解字段生成接口:根据商品标题批量生成 qanchors、enriched_attributes、tags、taxonomy attributes。 | |
| 507 | + 内容理解字段生成接口:根据商品标题批量生成通用增强字段与品类 taxonomy 字段。 | |
| 485 | 508 | |
| 486 | 509 | 使用场景: |
| 487 | 510 | - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 |
| ... | ... | @@ -511,17 +534,20 @@ async def enrich_content(request: EnrichContentRequest): |
| 511 | 534 | for it in request.items |
| 512 | 535 | ] |
| 513 | 536 | loop = asyncio.get_event_loop() |
| 537 | + enrichment_scopes = request.resolved_enrichment_scopes() | |
| 514 | 538 | result = await loop.run_in_executor( |
| 515 | 539 | None, |
| 516 | 540 | lambda: _run_enrich_content( |
| 517 | 541 | tenant_id=request.tenant_id, |
| 518 | 542 | items=items_payload, |
| 519 | - analysis_kinds=request.analysis_kinds, | |
| 543 | + enrichment_scopes=enrichment_scopes, | |
| 544 | + category_taxonomy_profile=request.category_taxonomy_profile, | |
| 520 | 545 | ), |
| 521 | 546 | ) |
| 522 | 547 | return { |
| 523 | 548 | "tenant_id": request.tenant_id, |
| 524 | - "analysis_kinds": request.analysis_kinds, | |
| 549 | + "enrichment_scopes": enrichment_scopes, | |
| 550 | + "category_taxonomy_profile": request.category_taxonomy_profile, | |
| 525 | 551 | "results": result, |
| 526 | 552 | "total": len(result), |
| 527 | 553 | } | ... | ... |
docs/issues/issue-2026-04-08-eval框架主指标ERR的问题以及bm25调参-done-0408.md
| ... | ... | @@ -92,3 +92,29 @@ curl -s -u "$AUTH" -X GET "$ES/${INDEX}/_settings?filter_path=**.similarity&pret |
| 92 | 92 | ```bash |
| 93 | 93 | curl -s -u "$AUTH" -X GET "$ES/_cat/indices/search_products_tenant_*?h=index&v" |
| 94 | 94 | ``` |
| 95 | + | |
| 96 | + | |
| 97 | + | |
| 98 | +实验结果: | |
| 99 | +已完成四轮实验,并把 `search_products_tenant_163` 当前索引参数留在了最优组:`b=0.5, k1=1.0`。刚刚已校验生效。 | |
| 100 | + | |
| 101 | +对比结果如下: | |
| 102 | + | |
| 103 | +- 当前线上基线组 `b=0.10, k1=0.30`:`0.604555` | |
| 104 | + 报告:[batch_20260408T055948Z_00b6a8aa3d.md](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T055948Z_00b6a8aa3d.md) | |
| 105 | +- 复跑 `b=0.10, k1=0.30`:`0.604298` | |
| 106 | + 报告:[batch_20260408T064011Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064011Z_00b6a8aa3d.json) | |
| 107 | +- 实验 `b=0.20, k1=0.60`:`0.604274` | |
| 108 | + 报告:[batch_20260408T064123Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064123Z_00b6a8aa3d.json) | |
| 109 | +- 实验 `b=0.50, k1=1.0`:`0.606041` | |
| 110 | + 报告:[batch_20260408T064216Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064216Z_00b6a8aa3d.json) | |
| 111 | +- 实验 `b=0.10, k1=0.75`:`0.603438` | |
| 112 | + 报告:[batch_20260408T064304Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064304Z_00b6a8aa3d.json) | |
| 113 | +- 老基线 `b=0.0, k1=0.0`:`0.602598` | |
| 114 | + 报告:[batch_20260407T150946Z_00b6a8aa3d.md](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260407T150946Z_00b6a8aa3d.md) | |
| 115 | + | |
| 116 | +结论是这四组里 `b=0.50, k1=1.0` 最好,相比你给的当前基线 `0.604555` 提升了 `+0.001486`,相比 `0/0` 提升了 `+0.003443`。`0.20/0.60` 基本和 `0.10/0.30` 持平,`0.10/0.75` 反而更差,说明这轮里单独加大 `k1` 不如同时把 `b` 和 `k1` 一起放宽。 | |
| 117 | + | |
| 118 | +补充一点:我只改了现有索引 `search_products_tenant_163` 的 index-level setting,没有改仓库里的默认映射。[search_products.json](/data/saas-search/mappings/search_products.json) 现在仍是 `0.1/0.3`。如果你要,我下一步可以直接帮你把映射默认值也改成 `0.5/1.0`,并顺手批量同步其他 `search_products_tenant_*` 索引。 | |
| 119 | + | |
| 120 | + | ... | ... |
docs/搜索API对接指南-05-索引接口(Indexer).md
| ... | ... | @@ -655,7 +655,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 655 | 655 | ```json |
| 656 | 656 | { |
| 657 | 657 | "tenant_id": "170", |
| 658 | - "analysis_kinds": ["content", "taxonomy"], | |
| 658 | + "enrichment_scopes": ["generic", "category_taxonomy"], | |
| 659 | + "category_taxonomy_profile": "apparel", | |
| 659 | 660 | "items": [ |
| 660 | 661 | { |
| 661 | 662 | "spu_id": "223167", |
| ... | ... | @@ -676,7 +677,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 676 | 677 | | 参数 | 类型 | 必填 | 默认值 | 说明 | |
| 677 | 678 | |------|------|------|--------|------| |
| 678 | 679 | | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| |
| 679 | -| `analysis_kinds` | array[string] | N | `["content", "taxonomy"]` | 选择要执行的分析族。`content` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`,`taxonomy` 生成 `enriched_taxonomy_attributes` | | |
| 680 | +| `enrichment_scopes` | array[string] | N | `["generic", "category_taxonomy"]` | 选择要执行的增强范围。`generic` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`,`category_taxonomy` 生成 `enriched_taxonomy_attributes` | | |
| 681 | +| `category_taxonomy_profile` | string | N | `apparel` | 品类 taxonomy profile。当前内置为服装大类 `apparel`,后续可扩展到其他大类 | | |
| 680 | 682 | | `items` | array | Y | - | 待分析列表;**单次最多 50 条** | |
| 681 | 683 | |
| 682 | 684 | `items[]` 字段说明: |
| ... | ... | @@ -691,8 +693,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 691 | 693 | |
| 692 | 694 | 缓存说明: |
| 693 | 695 | |
| 694 | -- 内容缓存按 **分析族拆分**,即 `content` 与 `taxonomy` 使用不同的缓存命名空间,互不污染、可独立演进。 | |
| 695 | -- 缓存键由 `analysis_kind + target_lang + prompt/schema 版本指纹 + prompt 输入文本 hash` 构成。 | |
| 696 | +- 内容缓存按 **增强范围 + taxonomy profile** 拆分;`generic` 与 `category_taxonomy:apparel` 等使用不同缓存命名空间,互不污染、可独立演进。 | |
| 697 | +- 缓存键由 `analysis_kind + target_lang + prompt/schema 版本指纹 + prompt 输入文本 hash` 构成;对 category taxonomy 来说,profile 会进入 schema 标识与版本指纹。 | |
| 696 | 698 | - 当前真正参与 prompt 输入的字段是:`title`、`brief`、`description`;这些字段任一变化,都会落到新的缓存 key。 |
| 697 | 699 | - `prompt/schema 版本指纹` 会综合 system prompt、shared instruction、localized table headers、result fields、user instruction template 等信息生成;因此只要提示词或输出契约变化,旧缓存会自然失效。 |
| 698 | 700 | - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。 |
| ... | ... | @@ -714,7 +716,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 714 | 716 | ```json |
| 715 | 717 | { |
| 716 | 718 | "tenant_id": "170", |
| 717 | - "analysis_kinds": ["content", "taxonomy"], | |
| 719 | + "enrichment_scopes": ["generic", "category_taxonomy"], | |
| 720 | + "category_taxonomy_profile": "apparel", | |
| 718 | 721 | "total": 2, |
| 719 | 722 | "results": [ |
| 720 | 723 | { |
| ... | ... | @@ -755,7 +758,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 755 | 758 | |
| 756 | 759 | | 字段 | 类型 | 说明 | |
| 757 | 760 | |------|------|------| |
| 758 | -| `analysis_kinds` | array | 实际执行的分析族列表 | | |
| 761 | +| `enrichment_scopes` | array | 实际执行的增强范围列表 | | |
| 762 | +| `category_taxonomy_profile` | string | 实际使用的品类 taxonomy profile | | |
| 759 | 763 | | `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes` | |
| 760 | 764 | | `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 | |
| 761 | 765 | | `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 | |
| ... | ... | @@ -774,7 +778,8 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ |
| 774 | 778 | -H "Content-Type: application/json" \ |
| 775 | 779 | -d '{ |
| 776 | 780 | "tenant_id": "163", |
| 777 | - "analysis_kinds": ["content", "taxonomy"], | |
| 781 | + "enrichment_scopes": ["generic", "category_taxonomy"], | |
| 782 | + "category_taxonomy_profile": "apparel", | |
| 778 | 783 | "items": [ |
| 779 | 784 | { |
| 780 | 785 | "spu_id": "223167", | ... | ... |
docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md
| ... | ... | @@ -444,7 +444,7 @@ curl "http://localhost:6006/health" |
| 444 | 444 | |
| 445 | 445 | - **Base URL**: Indexer 服务地址,如 `http://localhost:6004` |
| 446 | 446 | - **路径**: `POST /indexer/enrich-content` |
| 447 | -- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `analysis_kinds` 选择执行 `content` / `taxonomy`;默认两者都执行。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 | |
| 447 | +- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `enrichment_scopes` 选择执行 `generic` / `category_taxonomy`,并通过 `category_taxonomy_profile` 选择对应大类的 taxonomy prompt/profile;默认执行 `generic + category_taxonomy(apparel)`。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 | |
| 448 | 448 | |
| 449 | 449 | 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 |
| 450 | 450 | ... | ... |
indexer/product_enrich.py
| ... | ... | @@ -151,7 +151,8 @@ if _missing_prompt_langs: |
| 151 | 151 | # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 |
| 152 | 152 | _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") |
| 153 | 153 | _CORE_INDEX_LANGUAGES = ("zh", "en") |
| 154 | -_DEFAULT_ANALYSIS_KINDS = ("content", "taxonomy") | |
| 154 | +_DEFAULT_ENRICHMENT_SCOPES = ("generic", "category_taxonomy") | |
| 155 | +_DEFAULT_CATEGORY_TAXONOMY_PROFILE = "apparel" | |
| 155 | 156 | _CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( |
| 156 | 157 | ("tags", "enriched_tags"), |
| 157 | 158 | ("target_audience", "target_audience"), |
| ... | ... | @@ -187,7 +188,7 @@ _CONTENT_ANALYSIS_FIELD_ALIASES = { |
| 187 | 188 | "tags": ("tags", "enriched_tags"), |
| 188 | 189 | } |
| 189 | 190 | _CONTENT_ANALYSIS_QUALITY_FIELDS = ("title", "category_path", "anchor_text") |
| 190 | -_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( | |
| 191 | +_APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP = ( | |
| 191 | 192 | ("product_type", "Product Type"), |
| 192 | 193 | ("target_gender", "Target Gender"), |
| 193 | 194 | ("age_group", "Age Group"), |
| ... | ... | @@ -215,8 +216,8 @@ _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( |
| 215 | 216 | ("occasion_end_use", "Occasion / End Use"), |
| 216 | 217 | ("style_aesthetic", "Style Aesthetic"), |
| 217 | 218 | ) |
| 218 | -_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple( | |
| 219 | - field_name for field_name, _ in _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP | |
| 219 | +_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple( | |
| 220 | + field_name for field_name, _ in _APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP | |
| 220 | 221 | ) |
| 221 | 222 | |
| 222 | 223 | |
| ... | ... | @@ -252,37 +253,66 @@ _ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = { |
| 252 | 253 | field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES, |
| 253 | 254 | quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS, |
| 254 | 255 | ), |
| 255 | - "taxonomy": AnalysisSchema( | |
| 256 | - name="taxonomy", | |
| 256 | +} | |
| 257 | + | |
| 258 | +_CATEGORY_TAXONOMY_PROFILE_SCHEMAS: Dict[str, AnalysisSchema] = { | |
| 259 | + "apparel": AnalysisSchema( | |
| 260 | + name="taxonomy:apparel", | |
| 257 | 261 | shared_instruction=TAXONOMY_SHARED_ANALYSIS_INSTRUCTION, |
| 258 | 262 | markdown_table_headers=TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS, |
| 259 | - result_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS, | |
| 260 | - meaningful_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS, | |
| 263 | + result_fields=_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS, | |
| 264 | + meaningful_fields=_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS, | |
| 261 | 265 | cache_version="v1", |
| 262 | 266 | fallback_headers=TAXONOMY_MARKDOWN_TABLE_HEADERS_EN, |
| 263 | 267 | ), |
| 264 | 268 | } |
| 265 | 269 | |
| 270 | +_CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS: Dict[str, Tuple[Tuple[str, str], ...]] = { | |
| 271 | + "apparel": _APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP, | |
| 272 | +} | |
| 273 | + | |
| 266 | 274 | |
| 267 | -def _get_analysis_schema(analysis_kind: str) -> AnalysisSchema: | |
| 268 | - schema = _ANALYSIS_SCHEMAS.get(analysis_kind) | |
| 269 | - if schema is None: | |
| 270 | - raise ValueError(f"Unsupported analysis_kind: {analysis_kind}") | |
| 271 | - return schema | |
| 275 | +def _normalize_category_taxonomy_profile(category_taxonomy_profile: Optional[str] = None) -> str: | |
| 276 | + profile = str(category_taxonomy_profile or _DEFAULT_CATEGORY_TAXONOMY_PROFILE).strip() | |
| 277 | + if profile not in _CATEGORY_TAXONOMY_PROFILE_SCHEMAS: | |
| 278 | + raise ValueError(f"Unsupported category_taxonomy_profile: {profile}") | |
| 279 | + return profile | |
| 272 | 280 | |
| 273 | 281 | |
| 274 | -def _normalize_analysis_kinds( | |
| 275 | - analysis_kinds: Optional[List[str]] = None, | |
| 282 | +def _get_analysis_schema( | |
| 283 | + analysis_kind: str, | |
| 284 | + *, | |
| 285 | + category_taxonomy_profile: Optional[str] = None, | |
| 286 | +) -> AnalysisSchema: | |
| 287 | + if analysis_kind == "content": | |
| 288 | + return _ANALYSIS_SCHEMAS["content"] | |
| 289 | + if analysis_kind == "taxonomy": | |
| 290 | + profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) | |
| 291 | + return _CATEGORY_TAXONOMY_PROFILE_SCHEMAS[profile] | |
| 292 | + raise ValueError(f"Unsupported analysis_kind: {analysis_kind}") | |
| 293 | + | |
| 294 | + | |
| 295 | +def _get_taxonomy_attribute_field_map( | |
| 296 | + category_taxonomy_profile: Optional[str] = None, | |
| 297 | +) -> Tuple[Tuple[str, str], ...]: | |
| 298 | + profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) | |
| 299 | + return _CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS[profile] | |
| 300 | + | |
| 301 | + | |
| 302 | +def _normalize_enrichment_scopes( | |
| 303 | + enrichment_scopes: Optional[List[str]] = None, | |
| 276 | 304 | ) -> Tuple[str, ...]: |
| 277 | - requested = _DEFAULT_ANALYSIS_KINDS if not analysis_kinds else tuple(analysis_kinds) | |
| 305 | + requested = _DEFAULT_ENRICHMENT_SCOPES if not enrichment_scopes else tuple(enrichment_scopes) | |
| 278 | 306 | normalized: List[str] = [] |
| 279 | 307 | seen = set() |
| 280 | - for analysis_kind in requested: | |
| 281 | - schema = _get_analysis_schema(str(analysis_kind).strip()) | |
| 282 | - if schema.name in seen: | |
| 308 | + for enrichment_scope in requested: | |
| 309 | + scope = str(enrichment_scope).strip() | |
| 310 | + if scope not in {"generic", "category_taxonomy"}: | |
| 311 | + raise ValueError(f"Unsupported enrichment_scope: {scope}") | |
| 312 | + if scope in seen: | |
| 283 | 313 | continue |
| 284 | - seen.add(schema.name) | |
| 285 | - normalized.append(schema.name) | |
| 314 | + seen.add(scope) | |
| 315 | + normalized.append(scope) | |
| 286 | 316 | return tuple(normalized) |
| 287 | 317 | |
| 288 | 318 | |
| ... | ... | @@ -448,7 +478,13 @@ def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: |
| 448 | 478 | _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) |
| 449 | 479 | |
| 450 | 480 | |
| 451 | -def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: | |
| 481 | +def _apply_index_taxonomy_row( | |
| 482 | + result: Dict[str, Any], | |
| 483 | + row: Dict[str, Any], | |
| 484 | + lang: str, | |
| 485 | + *, | |
| 486 | + category_taxonomy_profile: Optional[str] = None, | |
| 487 | +) -> None: | |
| 452 | 488 | if not row or row.get("error"): |
| 453 | 489 | return |
| 454 | 490 | |
| ... | ... | @@ -456,8 +492,11 @@ def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: |
| 456 | 492 | result["enriched_taxonomy_attributes"], |
| 457 | 493 | row=row, |
| 458 | 494 | lang=lang, |
| 459 | - schema=_get_analysis_schema("taxonomy"), | |
| 460 | - field_map=_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP, | |
| 495 | + schema=_get_analysis_schema( | |
| 496 | + "taxonomy", | |
| 497 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 498 | + ), | |
| 499 | + field_map=_get_taxonomy_attribute_field_map(category_taxonomy_profile), | |
| 461 | 500 | ) |
| 462 | 501 | |
| 463 | 502 | |
| ... | ... | @@ -475,7 +514,8 @@ def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: |
| 475 | 514 | def build_index_content_fields( |
| 476 | 515 | items: List[Dict[str, Any]], |
| 477 | 516 | tenant_id: Optional[str] = None, |
| 478 | - analysis_kinds: Optional[List[str]] = None, | |
| 517 | + enrichment_scopes: Optional[List[str]] = None, | |
| 518 | + category_taxonomy_profile: Optional[str] = None, | |
| 479 | 519 | ) -> List[Dict[str, Any]]: |
| 480 | 520 | """ |
| 481 | 521 | 高层入口:生成与 ES mapping 对齐的内容理解字段。 |
| ... | ... | @@ -484,7 +524,8 @@ def build_index_content_fields( |
| 484 | 524 | - `id` 或 `spu_id` |
| 485 | 525 | - `title` |
| 486 | 526 | - 可选 `brief` / `description` / `image_url` |
| 487 | - - 可选 `analysis_kinds`,默认同时执行 `content` 与 `taxonomy` | |
| 527 | + - 可选 `enrichment_scopes`,默认同时执行 `generic` 与 `category_taxonomy` | |
| 528 | + - 可选 `category_taxonomy_profile`,默认 `apparel` | |
| 488 | 529 | |
| 489 | 530 | 返回项结构: |
| 490 | 531 | - `id` |
| ... | ... | @@ -498,7 +539,8 @@ def build_index_content_fields( |
| 498 | 539 | - `qanchors.{lang}` 为短语数组 |
| 499 | 540 | - `enriched_tags.{lang}` 为标签数组 |
| 500 | 541 | """ |
| 501 | - requested_analysis_kinds = _normalize_analysis_kinds(analysis_kinds) | |
| 542 | + requested_enrichment_scopes = _normalize_enrichment_scopes(enrichment_scopes) | |
| 543 | + normalized_taxonomy_profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) | |
| 502 | 544 | normalized_items = [_normalize_index_content_item(item) for item in items] |
| 503 | 545 | if not normalized_items: |
| 504 | 546 | return [] |
| ... | ... | @@ -515,7 +557,7 @@ def build_index_content_fields( |
| 515 | 557 | } |
| 516 | 558 | |
| 517 | 559 | for lang in _CORE_INDEX_LANGUAGES: |
| 518 | - if "content" in requested_analysis_kinds: | |
| 560 | + if "generic" in requested_enrichment_scopes: | |
| 519 | 561 | try: |
| 520 | 562 | rows = analyze_products( |
| 521 | 563 | products=normalized_items, |
| ... | ... | @@ -523,6 +565,7 @@ def build_index_content_fields( |
| 523 | 565 | batch_size=BATCH_SIZE, |
| 524 | 566 | tenant_id=tenant_id, |
| 525 | 567 | analysis_kind="content", |
| 568 | + category_taxonomy_profile=normalized_taxonomy_profile, | |
| 526 | 569 | ) |
| 527 | 570 | except Exception as e: |
| 528 | 571 | logger.warning("build_index_content_fields content enrichment failed for lang=%s: %s", lang, e) |
| ... | ... | @@ -539,7 +582,7 @@ def build_index_content_fields( |
| 539 | 582 | continue |
| 540 | 583 | _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) |
| 541 | 584 | |
| 542 | - if "taxonomy" in requested_analysis_kinds: | |
| 585 | + if "category_taxonomy" in requested_enrichment_scopes: | |
| 543 | 586 | try: |
| 544 | 587 | taxonomy_rows = analyze_products( |
| 545 | 588 | products=normalized_items, |
| ... | ... | @@ -547,6 +590,7 @@ def build_index_content_fields( |
| 547 | 590 | batch_size=BATCH_SIZE, |
| 548 | 591 | tenant_id=tenant_id, |
| 549 | 592 | analysis_kind="taxonomy", |
| 593 | + category_taxonomy_profile=normalized_taxonomy_profile, | |
| 550 | 594 | ) |
| 551 | 595 | except Exception as e: |
| 552 | 596 | logger.warning( |
| ... | ... | @@ -565,7 +609,12 @@ def build_index_content_fields( |
| 565 | 609 | if row.get("error"): |
| 566 | 610 | results_by_id[item_id].setdefault("error", row["error"]) |
| 567 | 611 | continue |
| 568 | - _apply_index_taxonomy_row(results_by_id[item_id], row=row, lang=lang) | |
| 612 | + _apply_index_taxonomy_row( | |
| 613 | + results_by_id[item_id], | |
| 614 | + row=row, | |
| 615 | + lang=lang, | |
| 616 | + category_taxonomy_profile=normalized_taxonomy_profile, | |
| 617 | + ) | |
| 569 | 618 | |
| 570 | 619 | return [results_by_id[item["id"]] for item in normalized_items] |
| 571 | 620 | |
| ... | ... | @@ -636,9 +685,13 @@ def _make_analysis_cache_key( |
| 636 | 685 | product: Dict[str, Any], |
| 637 | 686 | target_lang: str, |
| 638 | 687 | analysis_kind: str, |
| 688 | + category_taxonomy_profile: Optional[str] = None, | |
| 639 | 689 | ) -> str: |
| 640 | 690 | """构造缓存 key,仅由分析类型、prompt 实际输入文本内容与目标语言决定。""" |
| 641 | - schema = _get_analysis_schema(analysis_kind) | |
| 691 | + schema = _get_analysis_schema( | |
| 692 | + analysis_kind, | |
| 693 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 694 | + ) | |
| 642 | 695 | prompt_input = _build_prompt_input_text(product) |
| 643 | 696 | h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest() |
| 644 | 697 | prompt_contract = { |
| ... | ... | @@ -672,12 +725,21 @@ def _get_cached_analysis_result( |
| 672 | 725 | product: Dict[str, Any], |
| 673 | 726 | target_lang: str, |
| 674 | 727 | analysis_kind: str, |
| 728 | + category_taxonomy_profile: Optional[str] = None, | |
| 675 | 729 | ) -> Optional[Dict[str, Any]]: |
| 676 | 730 | if not _anchor_redis: |
| 677 | 731 | return None |
| 678 | - schema = _get_analysis_schema(analysis_kind) | |
| 732 | + schema = _get_analysis_schema( | |
| 733 | + analysis_kind, | |
| 734 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 735 | + ) | |
| 679 | 736 | try: |
| 680 | - key = _make_analysis_cache_key(product, target_lang, analysis_kind) | |
| 737 | + key = _make_analysis_cache_key( | |
| 738 | + product, | |
| 739 | + target_lang, | |
| 740 | + analysis_kind, | |
| 741 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 742 | + ) | |
| 681 | 743 | raw = _anchor_redis.get(key) |
| 682 | 744 | if not raw: |
| 683 | 745 | return None |
| ... | ... | @@ -707,10 +769,14 @@ def _set_cached_analysis_result( |
| 707 | 769 | target_lang: str, |
| 708 | 770 | result: Dict[str, Any], |
| 709 | 771 | analysis_kind: str, |
| 772 | + category_taxonomy_profile: Optional[str] = None, | |
| 710 | 773 | ) -> None: |
| 711 | 774 | if not _anchor_redis: |
| 712 | 775 | return |
| 713 | - schema = _get_analysis_schema(analysis_kind) | |
| 776 | + schema = _get_analysis_schema( | |
| 777 | + analysis_kind, | |
| 778 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 779 | + ) | |
| 714 | 780 | try: |
| 715 | 781 | normalized = _normalize_analysis_result( |
| 716 | 782 | result, |
| ... | ... | @@ -720,7 +786,12 @@ def _set_cached_analysis_result( |
| 720 | 786 | ) |
| 721 | 787 | if not _has_meaningful_analysis_content(normalized, schema): |
| 722 | 788 | return |
| 723 | - key = _make_analysis_cache_key(product, target_lang, analysis_kind) | |
| 789 | + key = _make_analysis_cache_key( | |
| 790 | + product, | |
| 791 | + target_lang, | |
| 792 | + analysis_kind, | |
| 793 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 794 | + ) | |
| 724 | 795 | ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600 |
| 725 | 796 | _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False)) |
| 726 | 797 | except Exception as e: |
| ... | ... | @@ -775,9 +846,13 @@ def create_prompt( |
| 775 | 846 | products: List[Dict[str, str]], |
| 776 | 847 | target_lang: str = "zh", |
| 777 | 848 | analysis_kind: str = "content", |
| 849 | + category_taxonomy_profile: Optional[str] = None, | |
| 778 | 850 | ) -> Tuple[Optional[str], Optional[str], Optional[str]]: |
| 779 | 851 | """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。""" |
| 780 | - schema = _get_analysis_schema(analysis_kind) | |
| 852 | + schema = _get_analysis_schema( | |
| 853 | + analysis_kind, | |
| 854 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 855 | + ) | |
| 781 | 856 | markdown_table_headers = schema.get_headers(target_lang) |
| 782 | 857 | if not markdown_table_headers: |
| 783 | 858 | logger.warning( |
| ... | ... | @@ -978,9 +1053,13 @@ def call_llm( |
| 978 | 1053 | def parse_markdown_table( |
| 979 | 1054 | markdown_content: str, |
| 980 | 1055 | analysis_kind: str = "content", |
| 1056 | + category_taxonomy_profile: Optional[str] = None, | |
| 981 | 1057 | ) -> List[Dict[str, str]]: |
| 982 | 1058 | """解析markdown表格内容""" |
| 983 | - schema = _get_analysis_schema(analysis_kind) | |
| 1059 | + schema = _get_analysis_schema( | |
| 1060 | + analysis_kind, | |
| 1061 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1062 | + ) | |
| 984 | 1063 | lines = markdown_content.strip().split("\n") |
| 985 | 1064 | data = [] |
| 986 | 1065 | data_started = False |
| ... | ... | @@ -1025,8 +1104,12 @@ def _log_parsed_result_quality( |
| 1025 | 1104 | target_lang: str, |
| 1026 | 1105 | batch_num: int, |
| 1027 | 1106 | analysis_kind: str, |
| 1107 | + category_taxonomy_profile: Optional[str] = None, | |
| 1028 | 1108 | ) -> None: |
| 1029 | - schema = _get_analysis_schema(analysis_kind) | |
| 1109 | + schema = _get_analysis_schema( | |
| 1110 | + analysis_kind, | |
| 1111 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1112 | + ) | |
| 1030 | 1113 | expected = len(batch_data) |
| 1031 | 1114 | actual = len(parsed_results) |
| 1032 | 1115 | if actual != expected: |
| ... | ... | @@ -1071,9 +1154,13 @@ def process_batch( |
| 1071 | 1154 | batch_num: int, |
| 1072 | 1155 | target_lang: str = "zh", |
| 1073 | 1156 | analysis_kind: str = "content", |
| 1157 | + category_taxonomy_profile: Optional[str] = None, | |
| 1074 | 1158 | ) -> List[Dict[str, Any]]: |
| 1075 | 1159 | """处理一个批次的数据""" |
| 1076 | - schema = _get_analysis_schema(analysis_kind) | |
| 1160 | + schema = _get_analysis_schema( | |
| 1161 | + analysis_kind, | |
| 1162 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1163 | + ) | |
| 1077 | 1164 | logger.info(f"\n{'#' * 80}") |
| 1078 | 1165 | logger.info( |
| 1079 | 1166 | "Processing Batch %s (%s items, kind=%s)", |
| ... | ... | @@ -1087,6 +1174,7 @@ def process_batch( |
| 1087 | 1174 | batch_data, |
| 1088 | 1175 | target_lang=target_lang, |
| 1089 | 1176 | analysis_kind=analysis_kind, |
| 1177 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1090 | 1178 | ) |
| 1091 | 1179 | |
| 1092 | 1180 | # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM |
| ... | ... | @@ -1119,13 +1207,18 @@ def process_batch( |
| 1119 | 1207 | ) |
| 1120 | 1208 | |
| 1121 | 1209 | # 解析结果 |
| 1122 | - parsed_results = parse_markdown_table(raw_response, analysis_kind=analysis_kind) | |
| 1210 | + parsed_results = parse_markdown_table( | |
| 1211 | + raw_response, | |
| 1212 | + analysis_kind=analysis_kind, | |
| 1213 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1214 | + ) | |
| 1123 | 1215 | _log_parsed_result_quality( |
| 1124 | 1216 | batch_data, |
| 1125 | 1217 | parsed_results, |
| 1126 | 1218 | target_lang, |
| 1127 | 1219 | batch_num, |
| 1128 | 1220 | analysis_kind, |
| 1221 | + category_taxonomy_profile, | |
| 1129 | 1222 | ) |
| 1130 | 1223 | |
| 1131 | 1224 | logger.info(f"\nParsed Results ({len(parsed_results)} items):") |
| ... | ... | @@ -1190,6 +1283,7 @@ def analyze_products( |
| 1190 | 1283 | batch_size: Optional[int] = None, |
| 1191 | 1284 | tenant_id: Optional[str] = None, |
| 1192 | 1285 | analysis_kind: str = "content", |
| 1286 | + category_taxonomy_profile: Optional[str] = None, | |
| 1193 | 1287 | ) -> List[Dict[str, Any]]: |
| 1194 | 1288 | """ |
| 1195 | 1289 | 库调用入口:根据输入+语言,返回锚文本及各维度信息。 |
| ... | ... | @@ -1205,7 +1299,10 @@ def analyze_products( |
| 1205 | 1299 | if not products: |
| 1206 | 1300 | return [] |
| 1207 | 1301 | |
| 1208 | - _get_analysis_schema(analysis_kind) | |
| 1302 | + _get_analysis_schema( | |
| 1303 | + analysis_kind, | |
| 1304 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1305 | + ) | |
| 1209 | 1306 | results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products) |
| 1210 | 1307 | uncached_items: List[Tuple[int, Dict[str, str]]] = [] |
| 1211 | 1308 | |
| ... | ... | @@ -1215,7 +1312,12 @@ def analyze_products( |
| 1215 | 1312 | uncached_items.append((idx, product)) |
| 1216 | 1313 | continue |
| 1217 | 1314 | |
| 1218 | - cached = _get_cached_analysis_result(product, target_lang, analysis_kind) | |
| 1315 | + cached = _get_cached_analysis_result( | |
| 1316 | + product, | |
| 1317 | + target_lang, | |
| 1318 | + analysis_kind, | |
| 1319 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1320 | + ) | |
| 1219 | 1321 | if cached: |
| 1220 | 1322 | logger.info( |
| 1221 | 1323 | f"[analyze_products] Cache hit for title='{title[:50]}...', " |
| ... | ... | @@ -1255,6 +1357,7 @@ def analyze_products( |
| 1255 | 1357 | batch_num=batch_num, |
| 1256 | 1358 | target_lang=target_lang, |
| 1257 | 1359 | analysis_kind=analysis_kind, |
| 1360 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1258 | 1361 | ) |
| 1259 | 1362 | |
| 1260 | 1363 | for (original_idx, product), item in zip(batch_slice, batch_results): |
| ... | ... | @@ -1266,7 +1369,13 @@ def analyze_products( |
| 1266 | 1369 | # 不缓存错误结果,避免放大临时故障 |
| 1267 | 1370 | continue |
| 1268 | 1371 | try: |
| 1269 | - _set_cached_analysis_result(product, target_lang, item, analysis_kind) | |
| 1372 | + _set_cached_analysis_result( | |
| 1373 | + product, | |
| 1374 | + target_lang, | |
| 1375 | + item, | |
| 1376 | + analysis_kind, | |
| 1377 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1378 | + ) | |
| 1270 | 1379 | except Exception: |
| 1271 | 1380 | # 已在内部记录 warning |
| 1272 | 1381 | pass |
| ... | ... | @@ -1293,6 +1402,7 @@ def analyze_products( |
| 1293 | 1402 | batch_num=batch_num, |
| 1294 | 1403 | target_lang=target_lang, |
| 1295 | 1404 | analysis_kind=analysis_kind, |
| 1405 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1296 | 1406 | ) |
| 1297 | 1407 | |
| 1298 | 1408 | # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的) |
| ... | ... | @@ -1307,7 +1417,13 @@ def analyze_products( |
| 1307 | 1417 | # 不缓存错误结果,避免放大临时故障 |
| 1308 | 1418 | continue |
| 1309 | 1419 | try: |
| 1310 | - _set_cached_analysis_result(product, target_lang, item, analysis_kind) | |
| 1420 | + _set_cached_analysis_result( | |
| 1421 | + product, | |
| 1422 | + target_lang, | |
| 1423 | + item, | |
| 1424 | + analysis_kind, | |
| 1425 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1426 | + ) | |
| 1311 | 1427 | except Exception: |
| 1312 | 1428 | # 已在内部记录 warning |
| 1313 | 1429 | pass | ... | ... |
indexer/taxonomy.md
| 1 | 1 | |
| 2 | -服装大类的taxonomy | |
| 2 | +# Cross-Border E-commerce Core Categories 跨境电商核心大类 | |
| 3 | + | |
| 4 | +## 1. 3C | |
| 5 | +Phone accessories, computer peripherals, smart wearables, audio & video, smart home, gaming gear. 手机配件、电脑周边、智能穿戴、影音娱乐、智能家居、游戏设备。 | |
| 6 | + | |
| 7 | +## 2. Bags 包 | |
| 8 | +Handbags, backpacks, wallets, luggage, crossbody bags, tote bags. 手提包、双肩包、钱包、行李箱、斜挎包、托特包。 | |
| 9 | + | |
| 10 | +## 3. Pet Supplies 宠物用品 | |
| 11 | +Pet food, pet toys, pet care products, pet grooming, pet clothing, smart pet devices. 宠物食品、宠物玩具、宠物护理用品、宠物美容、宠物服装、智能宠物设备。 | |
| 12 | + | |
| 13 | +## 4. Electronics 电子产品 | |
| 14 | +Consumer electronics, home appliances, digital devices, cables & chargers, batteries, electronic components. 消费电子产品、家用电器、数码设备、线材充电器、电池、电子元器件。 | |
| 15 | + | |
| 16 | +## 5. Clothing 服装 | |
| 17 | +Women's wear, men's wear, kid's wear, underwear, outerwear, activewear. 女装、男装、童装、内衣、外套、运动服装。 | |
| 18 | + | |
| 19 | +## 6. Outdoor 户外用品 | |
| 20 | +Camping gear, hiking equipment, fishing supplies, outdoor clothing, travel accessories, survival tools. 露营装备、徒步用品、渔具、户外服装、旅行配件、求生工具。 | |
| 21 | + | |
| 22 | +## 7. Home Appliances 家电/电器 | |
| 23 | +Kitchen appliances, cleaning appliances, personal care appliances, heating & cooling, smart home devices. 厨房电器、清洁电器、个护电器、冷暖设备、智能家居设备。 | |
| 24 | + | |
| 25 | +## 8. Home & Living 家居 | |
| 26 | +Furniture, home textiles, lighting, kitchenware, storage, home decor. 家具、家纺、灯具、厨具、收纳、家居装饰。 | |
| 27 | + | |
| 28 | +## 9. Wigs 假发 | |
| 29 | + | |
| 30 | +## 10. Beauty & Cosmetics 美容美妆 | |
| 31 | +Skincare, makeup, nail care, beauty tools, hair care, fragrances. 护肤品、彩妆、美甲、美容工具、护发、香水。 | |
| 32 | + | |
| 33 | +## 11. Accessories 配饰 | |
| 34 | +Jewelry, watches, belts, scarves, hats, sunglasses, hair accessories. 珠宝、手表、腰带、围巾、帽子、太阳镜、发饰。 | |
| 35 | + | |
| 36 | +## 12. Toys 玩具 | |
| 37 | +Educational toys, plush toys, action figures, puzzles, outdoor toys, DIY toys. 益智玩具、毛绒玩具、可动人偶、拼图、户外玩具、DIY玩具。 | |
| 38 | + | |
| 39 | +## 13. Shoes 鞋子 | |
| 40 | +Sneakers, boots, sandals, heels, flats, sports shoes. 运动鞋、靴子、凉鞋、高跟鞋、平底鞋、球鞋。 | |
| 41 | + | |
| 42 | +## 14. Sports 运动产品 | |
| 43 | +Fitness equipment, sports gear, team sports, racquet sports, water sports, cycling. 健身器材、运动装备、团队运动、球拍运动、水上运动、骑行。 | |
| 44 | + | |
| 45 | +## 15. Others 其他 | |
| 46 | + | |
| 47 | +# Taxonomy for Each Category 各个大类的 taxonomy | |
| 48 | +## 1. Clothing 服装 | |
| 3 | 49 | |
| 4 | 50 | ### A. Product Classification |
| 5 | 51 | ... | ... |
tests/ci/test_service_api_contracts.py
| ... | ... | @@ -348,10 +348,12 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 348 | 348 | def _fake_build_index_content_fields( |
| 349 | 349 | items: List[Dict[str, str]], |
| 350 | 350 | tenant_id: str | None = None, |
| 351 | - analysis_kinds: List[str] | None = None, | |
| 351 | + enrichment_scopes: List[str] | None = None, | |
| 352 | + category_taxonomy_profile: str = "apparel", | |
| 352 | 353 | ): |
| 353 | 354 | assert tenant_id == "162" |
| 354 | - assert analysis_kinds == ["content", "taxonomy"] | |
| 355 | + assert enrichment_scopes == ["generic", "category_taxonomy"] | |
| 356 | + assert category_taxonomy_profile == "apparel" | |
| 355 | 357 | return [ |
| 356 | 358 | { |
| 357 | 359 | "id": p["spu_id"], |
| ... | ... | @@ -376,6 +378,8 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 376 | 378 | "/indexer/enrich-content", |
| 377 | 379 | json={ |
| 378 | 380 | "tenant_id": "162", |
| 381 | + "enrichment_scopes": ["generic", "category_taxonomy"], | |
| 382 | + "category_taxonomy_profile": "apparel", | |
| 379 | 383 | "items": [ |
| 380 | 384 | {"spu_id": "1001", "title": "T-shirt"}, |
| 381 | 385 | {"spu_id": "1002", "title": "Toy"}, |
| ... | ... | @@ -385,7 +389,8 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 385 | 389 | assert response.status_code == 200 |
| 386 | 390 | data = response.json() |
| 387 | 391 | assert data["tenant_id"] == "162" |
| 388 | - assert data["analysis_kinds"] == ["content", "taxonomy"] | |
| 392 | + assert data["enrichment_scopes"] == ["generic", "category_taxonomy"] | |
| 393 | + assert data["category_taxonomy_profile"] == "apparel" | |
| 389 | 394 | assert data["total"] == 2 |
| 390 | 395 | assert len(data["results"]) == 2 |
| 391 | 396 | assert data["results"][0]["spu_id"] == "1001" |
| ... | ... | @@ -403,6 +408,52 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 403 | 408 | } |
| 404 | 409 | |
| 405 | 410 | |
| 411 | +def test_indexer_enrich_content_contract_accepts_deprecated_analysis_kinds(indexer_client: TestClient, monkeypatch): | |
| 412 | + import indexer.product_enrich as process_products | |
| 413 | + | |
| 414 | + seen: Dict[str, Any] = {} | |
| 415 | + | |
| 416 | + def _fake_build_index_content_fields( | |
| 417 | + items: List[Dict[str, str]], | |
| 418 | + tenant_id: str | None = None, | |
| 419 | + enrichment_scopes: List[str] | None = None, | |
| 420 | + category_taxonomy_profile: str = "apparel", | |
| 421 | + ): | |
| 422 | + seen["tenant_id"] = tenant_id | |
| 423 | + seen["enrichment_scopes"] = enrichment_scopes | |
| 424 | + seen["category_taxonomy_profile"] = category_taxonomy_profile | |
| 425 | + return [ | |
| 426 | + { | |
| 427 | + "id": items[0]["spu_id"], | |
| 428 | + "qanchors": {}, | |
| 429 | + "enriched_tags": {}, | |
| 430 | + "enriched_attributes": [], | |
| 431 | + "enriched_taxonomy_attributes": [], | |
| 432 | + } | |
| 433 | + ] | |
| 434 | + | |
| 435 | + monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields) | |
| 436 | + | |
| 437 | + response = indexer_client.post( | |
| 438 | + "/indexer/enrich-content", | |
| 439 | + json={ | |
| 440 | + "tenant_id": "162", | |
| 441 | + "analysis_kinds": ["taxonomy"], | |
| 442 | + "items": [{"spu_id": "1001", "title": "T-shirt"}], | |
| 443 | + }, | |
| 444 | + ) | |
| 445 | + | |
| 446 | + assert response.status_code == 200 | |
| 447 | + data = response.json() | |
| 448 | + assert seen == { | |
| 449 | + "tenant_id": "162", | |
| 450 | + "enrichment_scopes": ["category_taxonomy"], | |
| 451 | + "category_taxonomy_profile": "apparel", | |
| 452 | + } | |
| 453 | + assert data["enrichment_scopes"] == ["category_taxonomy"] | |
| 454 | + assert data["category_taxonomy_profile"] == "apparel" | |
| 455 | + | |
| 456 | + | |
| 406 | 457 | def test_indexer_documents_contract(indexer_client: TestClient): |
| 407 | 458 | """POST /indexer/documents: tenant_id + spu_ids, returns success/failed lists (no ES write).""" |
| 408 | 459 | response = indexer_client.post( | ... | ... |
tests/test_process_products_batching.py
| ... | ... | @@ -18,8 +18,10 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): |
| 18 | 18 | batch_num: int, |
| 19 | 19 | target_lang: str = "zh", |
| 20 | 20 | analysis_kind: str = "content", |
| 21 | + category_taxonomy_profile=None, | |
| 21 | 22 | ): |
| 22 | 23 | assert analysis_kind == "content" |
| 24 | + assert category_taxonomy_profile is None | |
| 23 | 25 | seen_batch_sizes.append(len(batch_data)) |
| 24 | 26 | return [ |
| 25 | 27 | { |
| ... | ... | @@ -64,8 +66,10 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): |
| 64 | 66 | batch_num: int, |
| 65 | 67 | target_lang: str = "zh", |
| 66 | 68 | analysis_kind: str = "content", |
| 69 | + category_taxonomy_profile=None, | |
| 67 | 70 | ): |
| 68 | 71 | assert analysis_kind == "content" |
| 72 | + assert category_taxonomy_profile is None | |
| 69 | 73 | seen_batch_sizes.append(len(batch_data)) |
| 70 | 74 | return [ |
| 71 | 75 | { | ... | ... |
tests/test_product_enrich_partial_mode.py
| ... | ... | @@ -295,16 +295,36 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): |
| 295 | 295 | product.get("image_url", ""), |
| 296 | 296 | ) |
| 297 | 297 | |
| 298 | - def fake_get_cached_analysis_result(product, target_lang, analysis_kind="content"): | |
| 298 | + def fake_get_cached_analysis_result( | |
| 299 | + product, | |
| 300 | + target_lang, | |
| 301 | + analysis_kind="content", | |
| 302 | + category_taxonomy_profile=None, | |
| 303 | + ): | |
| 299 | 304 | assert analysis_kind == "content" |
| 305 | + assert category_taxonomy_profile is None | |
| 300 | 306 | return cache_store.get(_cache_key(product, target_lang)) |
| 301 | 307 | |
| 302 | - def fake_set_cached_analysis_result(product, target_lang, result, analysis_kind="content"): | |
| 308 | + def fake_set_cached_analysis_result( | |
| 309 | + product, | |
| 310 | + target_lang, | |
| 311 | + result, | |
| 312 | + analysis_kind="content", | |
| 313 | + category_taxonomy_profile=None, | |
| 314 | + ): | |
| 303 | 315 | assert analysis_kind == "content" |
| 316 | + assert category_taxonomy_profile is None | |
| 304 | 317 | cache_store[_cache_key(product, target_lang)] = result |
| 305 | 318 | |
| 306 | - def fake_process_batch(batch_data, batch_num, target_lang="zh", analysis_kind="content"): | |
| 319 | + def fake_process_batch( | |
| 320 | + batch_data, | |
| 321 | + batch_num, | |
| 322 | + target_lang="zh", | |
| 323 | + analysis_kind="content", | |
| 324 | + category_taxonomy_profile=None, | |
| 325 | + ): | |
| 307 | 326 | assert analysis_kind == "content" |
| 327 | + assert category_taxonomy_profile is None | |
| 308 | 328 | process_calls.append( |
| 309 | 329 | { |
| 310 | 330 | "batch_num": batch_num, |
| ... | ... | @@ -400,7 +420,7 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity(): |
| 400 | 420 | with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( |
| 401 | 421 | product_enrich, |
| 402 | 422 | "_get_cached_analysis_result", |
| 403 | - wraps=lambda product, target_lang, analysis_kind="content": product_enrich._normalize_analysis_result( | |
| 423 | + wraps=lambda product, target_lang, analysis_kind="content", category_taxonomy_profile=None: product_enrich._normalize_analysis_result( | |
| 404 | 424 | cached_result, |
| 405 | 425 | product=product, |
| 406 | 426 | target_lang=target_lang, |
| ... | ... | @@ -443,8 +463,10 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() |
| 443 | 463 | batch_size=None, |
| 444 | 464 | tenant_id=None, |
| 445 | 465 | analysis_kind="content", |
| 466 | + category_taxonomy_profile=None, | |
| 446 | 467 | ): |
| 447 | 468 | if analysis_kind == "taxonomy": |
| 469 | + assert category_taxonomy_profile == "apparel" | |
| 448 | 470 | return [ |
| 449 | 471 | { |
| 450 | 472 | "id": products[0]["id"], |
| ... | ... | @@ -478,6 +500,7 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() |
| 478 | 500 | "style_aesthetic": "", |
| 479 | 501 | } |
| 480 | 502 | ] |
| 503 | + assert category_taxonomy_profile == "apparel" | |
| 481 | 504 | return [ |
| 482 | 505 | { |
| 483 | 506 | "id": products[0]["id"], | ... | ... |