Commit 2703b6ea8b5d2ecdc442f11ad26279cc2dce41e6

Authored by tangwang
1 parent 5aaf0c7d

refactor(indexer): 将 analysis_kinds 拆分为 enrichment_scopes + category_taxonomy_profile

- 原 analysis_kinds
  混用了“增强类型”(content/taxonomy)与“品类特定配置”,不利于扩展不同品类的
taxonomy 分析(如 3C、家居等)
- 新增 enrichment_scopes 参数:支持 generic(通用增强,产出
  qanchors/enriched_tags/enriched_attributes)和
category_taxonomy(品类增强,产出 enriched_taxonomy_attributes)
- 新增 category_taxonomy_profile 参数:指定品类增强使用哪套
  profile(当前内置 apparel),每套 profile 包含独立的
prompt、输出列定义、解析规则及缓存版本
- 保留 analysis_kinds 作为兼容别名,避免破坏现有调用方
- 重构内部 taxonomy 分析为 profile registry 模式:新增
  _CATEGORY_TAXONOMY_PROFILE_SCHEMAS 注册表,并由
  _get_analysis_schema(analysis_kind, category_taxonomy_profile) 根据 profile 动态返回对应的
AnalysisSchema
- 缓存 key 现在按“分析类型 + profile + schema 指纹 +
  输入字段哈希”隔离,确保不同品类、不同 prompt 版本自动失效
- 更新 API 文档及微服务接口文档,明确新参数语义与使用示例

技术细节:
- 修改入口:api/routes/indexer.py 中 enrich-content
  端点,解析新参数并向下传递
- 核心逻辑:indexer/product_enrich.py 中 enrich_products_batch 增加
  profile 参数;_process_batch_for_schema 根据 scope 和 profile 动态获取
schema
- 兼容层:若请求仅提供 analysis_kinds(未提供 enrichment_scopes),则映射为
  enrichment_scopes(content→generic,taxonomy→category_taxonomy);两者同时提供时以 enrichment_scopes 为准。category_taxonomy_profile
默认为 "apparel"
- 测试覆盖:新增 enrichment_scopes 组合、profile 切换及兼容模式测试
api/routes/indexer.py
@@ -88,20 +88,41 @@ class EnrichContentItem(BaseModel): @@ -88,20 +88,41 @@ class EnrichContentItem(BaseModel):
88 88
89 class EnrichContentRequest(BaseModel): 89 class EnrichContentRequest(BaseModel):
90 """ 90 """
91 - 内容理解字段生成请求:根据商品标题批量生成 qanchors、enriched_attributes、tags、taxonomy attributes 91 + 内容理解字段生成请求:根据商品标题批量生成通用增强字段与品类 taxonomy 字段
92 供外部 indexer 在自行组织 doc 时调用,与翻译、向量化等微服务并列。 92 供外部 indexer 在自行组织 doc 时调用,与翻译、向量化等微服务并列。
93 """ 93 """
94 tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") 94 tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键")
95 items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)") 95 items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)")
96 - analysis_kinds: List[Literal["content", "taxonomy"]] = Field(  
97 - default_factory=lambda: ["content", "taxonomy"], 96 + enrichment_scopes: Optional[List[Literal["generic", "category_taxonomy"]]] = Field(
  97 + default=None,
98 description=( 98 description=(
99 - "要执行的分析族。"  
100 - "`content` 返回 qanchors/enriched_tags/enriched_attributes;"  
101 - "`taxonomy` 返回 enriched_taxonomy_attributes。" 99 + "要执行的增强范围。"
  100 + "`generic` 返回 qanchors/enriched_tags/enriched_attributes;"
  101 + "`category_taxonomy` 返回 enriched_taxonomy_attributes。"
102 "默认两者都执行。" 102 "默认两者都执行。"
103 ), 103 ),
104 ) 104 )
  105 + category_taxonomy_profile: str = Field(
  106 + "apparel",
  107 + description=(
  108 + "品类 taxonomy profile。当前默认且已支持的是 `apparel`。"
  109 + "未来可扩展为 `electronics` 等。"
  110 + ),
  111 + )
  112 + analysis_kinds: Optional[List[Literal["content", "taxonomy"]]] = Field(
  113 + default=None,
  114 + description="Deprecated alias of enrichment_scopes. `content` -> `generic`, `taxonomy` -> `category_taxonomy`.",
  115 + )
  116 +
  117 + def resolved_enrichment_scopes(self) -> List[str]:
  118 + if self.enrichment_scopes:
  119 + return list(self.enrichment_scopes)
  120 + if self.analysis_kinds:
  121 + mapped = []
  122 + for item in self.analysis_kinds:
  123 + mapped.append("generic" if item == "content" else "category_taxonomy")
  124 + return mapped
  125 + return ["generic", "category_taxonomy"]
105 126
106 127
107 @router.post("/reindex") 128 @router.post("/reindex")
@@ -452,7 +473,8 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): @@ -452,7 +473,8 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest):
452 def _run_enrich_content( 473 def _run_enrich_content(
453 tenant_id: str, 474 tenant_id: str,
454 items: List[Dict[str, str]], 475 items: List[Dict[str, str]],
455 - analysis_kinds: Optional[List[str]] = None, 476 + enrichment_scopes: Optional[List[str]] = None,
  477 + category_taxonomy_profile: str = "apparel",
456 ) -> List[Dict[str, Any]]: 478 ) -> List[Dict[str, Any]]:
457 """ 479 """
458 同步执行内容理解,返回与 ES mapping 对齐的字段结构。 480 同步执行内容理解,返回与 ES mapping 对齐的字段结构。
@@ -463,7 +485,8 @@ def _run_enrich_content( @@ -463,7 +485,8 @@ def _run_enrich_content(
463 results = build_index_content_fields( 485 results = build_index_content_fields(
464 items=items, 486 items=items,
465 tenant_id=tenant_id, 487 tenant_id=tenant_id,
466 - analysis_kinds=analysis_kinds, 488 + enrichment_scopes=enrichment_scopes,
  489 + category_taxonomy_profile=category_taxonomy_profile,
467 ) 490 )
468 return [ 491 return [
469 { 492 {
@@ -481,7 +504,7 @@ def _run_enrich_content( @@ -481,7 +504,7 @@ def _run_enrich_content(
481 @router.post("/enrich-content") 504 @router.post("/enrich-content")
482 async def enrich_content(request: EnrichContentRequest): 505 async def enrich_content(request: EnrichContentRequest):
483 """ 506 """
484 - 内容理解字段生成接口:根据商品标题批量生成 qanchors、enriched_attributes、tags、taxonomy attributes 507 + 内容理解字段生成接口:根据商品标题批量生成通用增强字段与品类 taxonomy 字段
485 508
486 使用场景: 509 使用场景:
487 - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 510 - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的
@@ -511,17 +534,20 @@ async def enrich_content(request: EnrichContentRequest): @@ -511,17 +534,20 @@ async def enrich_content(request: EnrichContentRequest):
511 for it in request.items 534 for it in request.items
512 ] 535 ]
513 loop = asyncio.get_event_loop() 536 loop = asyncio.get_event_loop()
  537 + enrichment_scopes = request.resolved_enrichment_scopes()
514 result = await loop.run_in_executor( 538 result = await loop.run_in_executor(
515 None, 539 None,
516 lambda: _run_enrich_content( 540 lambda: _run_enrich_content(
517 tenant_id=request.tenant_id, 541 tenant_id=request.tenant_id,
518 items=items_payload, 542 items=items_payload,
519 - analysis_kinds=request.analysis_kinds, 543 + enrichment_scopes=enrichment_scopes,
  544 + category_taxonomy_profile=request.category_taxonomy_profile,
520 ), 545 ),
521 ) 546 )
522 return { 547 return {
523 "tenant_id": request.tenant_id, 548 "tenant_id": request.tenant_id,
524 - "analysis_kinds": request.analysis_kinds, 549 + "enrichment_scopes": enrichment_scopes,
  550 + "category_taxonomy_profile": request.category_taxonomy_profile,
525 "results": result, 551 "results": result,
526 "total": len(result), 552 "total": len(result),
527 } 553 }
docs/issues/issue-2026-04-08-eval框架主指标ERR的问题以及bm25调参-done-0408.md
@@ -92,3 +92,29 @@ curl -s -u "$AUTH" -X GET "$ES/${INDEX}/_settings?filter_path=**.similarity&pret @@ -92,3 +92,29 @@ curl -s -u "$AUTH" -X GET "$ES/${INDEX}/_settings?filter_path=**.similarity&pret
92 ```bash 92 ```bash
93 curl -s -u "$AUTH" -X GET "$ES/_cat/indices/search_products_tenant_*?h=index&v" 93 curl -s -u "$AUTH" -X GET "$ES/_cat/indices/search_products_tenant_*?h=index&v"
94 ``` 94 ```
  95 +
  96 +
  97 +
  98 +实验结果:
  99 +已完成四轮实验,并把 `search_products_tenant_163` 当前索引参数留在了最优组:`b=0.5, k1=1.0`。刚刚已校验生效。
  100 +
  101 +对比结果如下:
  102 +
  103 +- 基线当前线上组 `b=0.10, k1=0.30`:`0.604555`
  104 + 报告:[batch_20260408T055948Z_00b6a8aa3d.md](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T055948Z_00b6a8aa3d.md)
  105 +- 复跑 `b=0.10, k1=0.30`:`0.604298`
  106 + 报告:[batch_20260408T064011Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064011Z_00b6a8aa3d.json)
  107 +- 实验 `b=0.20, k1=0.60`:`0.604274`
  108 + 报告:[batch_20260408T064123Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064123Z_00b6a8aa3d.json)
  109 +- 实验 `b=0.50, k1=1.0`:`0.606041`
  110 + 报告:[batch_20260408T064216Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064216Z_00b6a8aa3d.json)
  111 +- 实验 `b=0.10, k1=0.75`:`0.603438`
  112 + 报告:[batch_20260408T064304Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064304Z_00b6a8aa3d.json)
  113 +- 老基线 `b=0.0, k1=0.0`:`0.602598`
  114 + 报告:[batch_20260407T150946Z_00b6a8aa3d.md](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260407T150946Z_00b6a8aa3d.md)
  115 +
  116 +结论是这四组里 `b=0.50, k1=1.0` 最好,相比你给的当前基线 `0.604555` 提升了 `+0.001486`,相比 `0/0` 提升了 `+0.003443`。`0.20/0.60` 基本和 `0.10/0.30` 持平,`0.10/0.75` 反而更差,说明这轮里单独加大 `k1` 不如同时把 `b` 和 `k1` 一起放宽。
  117 +
  118 +补充一点:我只改了现有索引 `search_products_tenant_163` 的 index-level setting,没有改仓库里的默认映射。[search_products.json](/data/saas-search/mappings/search_products.json) 现在仍是 `0.1/0.3`。如果你要,我下一步可以直接帮你把映射默认值也改成 `0.5/1.0`,并顺手批量同步其他 `search_products_tenant_*` 索引。
  119 +
  120 +
docs/搜索API对接指南-05-索引接口(Indexer).md
@@ -655,7 +655,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -655,7 +655,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
655 ```json 655 ```json
656 { 656 {
657 "tenant_id": "170", 657 "tenant_id": "170",
658 - "analysis_kinds": ["content", "taxonomy"], 658 + "enrichment_scopes": ["generic", "category_taxonomy"],
  659 + "category_taxonomy_profile": "apparel",
659 "items": [ 660 "items": [
660 { 661 {
661 "spu_id": "223167", 662 "spu_id": "223167",
@@ -676,7 +677,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -676,7 +677,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
676 | 参数 | 类型 | 必填 | 默认值 | 说明 | 677 | 参数 | 类型 | 必填 | 默认值 | 说明 |
677 |------|------|------|--------|------| 678 |------|------|------|--------|------|
678 | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| 679 | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用|
679 -| `analysis_kinds` | array[string] | N | `["content", "taxonomy"]` | 选择要执行的分析族。`content` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`,`taxonomy` 生成 `enriched_taxonomy_attributes` | 680 +| `enrichment_scopes` | array[string] | N | `["generic", "category_taxonomy"]` | 选择要执行的增强范围。`generic` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`,`category_taxonomy` 生成 `enriched_taxonomy_attributes` |
  681 +| `category_taxonomy_profile` | string | N | `apparel` | 品类 taxonomy profile。当前内置为服装大类 `apparel`,后续可扩展到其他大类 |
680 | `items` | array | Y | - | 待分析列表;**单次最多 50 条** | 682 | `items` | array | Y | - | 待分析列表;**单次最多 50 条** |
681 683
682 `items[]` 字段说明: 684 `items[]` 字段说明:
@@ -691,8 +693,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -691,8 +693,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
691 693
692 缓存说明: 694 缓存说明:
693 695
694 -- 内容缓存按 **分析族拆分**,即 `content` 与 `taxonomy` 使用不同的缓存命名空间,互不污染、可独立演进。  
695 -- 缓存键由 `analysis_kind + target_lang + prompt/schema 版本指纹 + prompt 输入文本 hash` 构成。 696 +- 内容缓存按 **增强范围 + taxonomy profile** 拆分;`generic` 与 `category_taxonomy:apparel` 等使用不同缓存命名空间,互不污染、可独立演进。
  697 +- 缓存键由 `analysis_kind + target_lang + prompt/schema 版本指纹 + prompt 输入文本 hash` 构成;对 category taxonomy 来说,profile 会进入 schema 标识与版本指纹。
696 - 当前真正参与 prompt 输入的字段是:`title`、`brief`、`description`;这些字段任一变化,都会落到新的缓存 key。 698 - 当前真正参与 prompt 输入的字段是:`title`、`brief`、`description`;这些字段任一变化,都会落到新的缓存 key。
697 - `prompt/schema 版本指纹` 会综合 system prompt、shared instruction、localized table headers、result fields、user instruction template 等信息生成;因此只要提示词或输出契约变化,旧缓存会自然失效。 699 - `prompt/schema 版本指纹` 会综合 system prompt、shared instruction、localized table headers、result fields、user instruction template 等信息生成;因此只要提示词或输出契约变化,旧缓存会自然失效。
698 - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。 700 - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。
@@ -714,7 +716,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -714,7 +716,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
714 ```json 716 ```json
715 { 717 {
716 "tenant_id": "170", 718 "tenant_id": "170",
717 - "analysis_kinds": ["content", "taxonomy"], 719 + "enrichment_scopes": ["generic", "category_taxonomy"],
  720 + "category_taxonomy_profile": "apparel",
718 "total": 2, 721 "total": 2,
719 "results": [ 722 "results": [
720 { 723 {
@@ -755,7 +758,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -755,7 +758,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
755 758
756 | 字段 | 类型 | 说明 | 759 | 字段 | 类型 | 说明 |
757 |------|------|------| 760 |------|------|------|
758 -| `analysis_kinds` | array | 实际执行的分析族列表 | 761 +| `enrichment_scopes` | array | 实际执行的增强范围列表 |
  762 +| `category_taxonomy_profile` | string | 实际使用的品类 taxonomy profile |
759 | `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes` | 763 | `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes` |
760 | `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 | 764 | `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 |
761 | `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 | 765 | `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 |
@@ -774,7 +778,8 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ @@ -774,7 +778,8 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \
774 -H "Content-Type: application/json" \ 778 -H "Content-Type: application/json" \
775 -d '{ 779 -d '{
776 "tenant_id": "163", 780 "tenant_id": "163",
777 - "analysis_kinds": ["content", "taxonomy"], 781 + "enrichment_scopes": ["generic", "category_taxonomy"],
  782 + "category_taxonomy_profile": "apparel",
778 "items": [ 783 "items": [
779 { 784 {
780 "spu_id": "223167", 785 "spu_id": "223167",
docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md
@@ -444,7 +444,7 @@ curl "http://localhost:6006/health" @@ -444,7 +444,7 @@ curl "http://localhost:6006/health"
444 444
445 - **Base URL**: Indexer 服务地址,如 `http://localhost:6004` 445 - **Base URL**: Indexer 服务地址,如 `http://localhost:6004`
446 - **路径**: `POST /indexer/enrich-content` 446 - **路径**: `POST /indexer/enrich-content`
447 -- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `analysis_kinds` 选择执行 `content` / `taxonomy`;默认两者都执行。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 447 +- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `enrichment_scopes` 选择执行 `generic` / `category_taxonomy`,并通过 `category_taxonomy_profile` 选择对应大类的 taxonomy prompt/profile;默认执行 `generic + category_taxonomy(apparel)`。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。
448 448
449 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 449 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。
450 450
indexer/product_enrich.py
@@ -151,7 +151,8 @@ if _missing_prompt_langs: @@ -151,7 +151,8 @@ if _missing_prompt_langs:
151 # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 151 # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白
152 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") 152 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+")
153 _CORE_INDEX_LANGUAGES = ("zh", "en") 153 _CORE_INDEX_LANGUAGES = ("zh", "en")
154 -_DEFAULT_ANALYSIS_KINDS = ("content", "taxonomy") 154 +_DEFAULT_ENRICHMENT_SCOPES = ("generic", "category_taxonomy")
  155 +_DEFAULT_CATEGORY_TAXONOMY_PROFILE = "apparel"
155 _CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( 156 _CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP = (
156 ("tags", "enriched_tags"), 157 ("tags", "enriched_tags"),
157 ("target_audience", "target_audience"), 158 ("target_audience", "target_audience"),
@@ -187,7 +188,7 @@ _CONTENT_ANALYSIS_FIELD_ALIASES = { @@ -187,7 +188,7 @@ _CONTENT_ANALYSIS_FIELD_ALIASES = {
187 "tags": ("tags", "enriched_tags"), 188 "tags": ("tags", "enriched_tags"),
188 } 189 }
189 _CONTENT_ANALYSIS_QUALITY_FIELDS = ("title", "category_path", "anchor_text") 190 _CONTENT_ANALYSIS_QUALITY_FIELDS = ("title", "category_path", "anchor_text")
190 -_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( 191 +_APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP = (
191 ("product_type", "Product Type"), 192 ("product_type", "Product Type"),
192 ("target_gender", "Target Gender"), 193 ("target_gender", "Target Gender"),
193 ("age_group", "Age Group"), 194 ("age_group", "Age Group"),
@@ -215,8 +216,8 @@ _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( @@ -215,8 +216,8 @@ _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = (
215 ("occasion_end_use", "Occasion / End Use"), 216 ("occasion_end_use", "Occasion / End Use"),
216 ("style_aesthetic", "Style Aesthetic"), 217 ("style_aesthetic", "Style Aesthetic"),
217 ) 218 )
218 -_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple(  
219 - field_name for field_name, _ in _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP 219 +_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple(
  220 + field_name for field_name, _ in _APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP
220 ) 221 )
221 222
222 223
@@ -252,37 +253,66 @@ _ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = { @@ -252,37 +253,66 @@ _ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = {
252 field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES, 253 field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES,
253 quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS, 254 quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS,
254 ), 255 ),
255 - "taxonomy": AnalysisSchema(  
256 - name="taxonomy", 256 +}
  257 +
  258 +_CATEGORY_TAXONOMY_PROFILE_SCHEMAS: Dict[str, AnalysisSchema] = {
  259 + "apparel": AnalysisSchema(
  260 + name="taxonomy:apparel",
257 shared_instruction=TAXONOMY_SHARED_ANALYSIS_INSTRUCTION, 261 shared_instruction=TAXONOMY_SHARED_ANALYSIS_INSTRUCTION,
258 markdown_table_headers=TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS, 262 markdown_table_headers=TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS,
259 - result_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS,  
260 - meaningful_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS, 263 + result_fields=_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS,
  264 + meaningful_fields=_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS,
261 cache_version="v1", 265 cache_version="v1",
262 fallback_headers=TAXONOMY_MARKDOWN_TABLE_HEADERS_EN, 266 fallback_headers=TAXONOMY_MARKDOWN_TABLE_HEADERS_EN,
263 ), 267 ),
264 } 268 }
265 269
  270 +_CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS: Dict[str, Tuple[Tuple[str, str], ...]] = {
  271 + "apparel": _APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP,
  272 +}
  273 +
266 274
267 -def _get_analysis_schema(analysis_kind: str) -> AnalysisSchema:  
268 - schema = _ANALYSIS_SCHEMAS.get(analysis_kind)  
269 - if schema is None:  
270 - raise ValueError(f"Unsupported analysis_kind: {analysis_kind}")  
271 - return schema 275 +def _normalize_category_taxonomy_profile(category_taxonomy_profile: Optional[str] = None) -> str:
  276 + profile = str(category_taxonomy_profile or _DEFAULT_CATEGORY_TAXONOMY_PROFILE).strip()
  277 + if profile not in _CATEGORY_TAXONOMY_PROFILE_SCHEMAS:
  278 + raise ValueError(f"Unsupported category_taxonomy_profile: {profile}")
  279 + return profile
272 280
273 281
274 -def _normalize_analysis_kinds(  
275 - analysis_kinds: Optional[List[str]] = None, 282 +def _get_analysis_schema(
  283 + analysis_kind: str,
  284 + *,
  285 + category_taxonomy_profile: Optional[str] = None,
  286 +) -> AnalysisSchema:
  287 + if analysis_kind == "content":
  288 + return _ANALYSIS_SCHEMAS["content"]
  289 + if analysis_kind == "taxonomy":
  290 + profile = _normalize_category_taxonomy_profile(category_taxonomy_profile)
  291 + return _CATEGORY_TAXONOMY_PROFILE_SCHEMAS[profile]
  292 + raise ValueError(f"Unsupported analysis_kind: {analysis_kind}")
  293 +
  294 +
  295 +def _get_taxonomy_attribute_field_map(
  296 + category_taxonomy_profile: Optional[str] = None,
  297 +) -> Tuple[Tuple[str, str], ...]:
  298 + profile = _normalize_category_taxonomy_profile(category_taxonomy_profile)
  299 + return _CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS[profile]
  300 +
  301 +
  302 +def _normalize_enrichment_scopes(
  303 + enrichment_scopes: Optional[List[str]] = None,
276 ) -> Tuple[str, ...]: 304 ) -> Tuple[str, ...]:
277 - requested = _DEFAULT_ANALYSIS_KINDS if not analysis_kinds else tuple(analysis_kinds) 305 + requested = _DEFAULT_ENRICHMENT_SCOPES if not enrichment_scopes else tuple(enrichment_scopes)
278 normalized: List[str] = [] 306 normalized: List[str] = []
279 seen = set() 307 seen = set()
280 - for analysis_kind in requested:  
281 - schema = _get_analysis_schema(str(analysis_kind).strip())  
282 - if schema.name in seen: 308 + for enrichment_scope in requested:
  309 + scope = str(enrichment_scope).strip()
  310 + if scope not in {"generic", "category_taxonomy"}:
  311 + raise ValueError(f"Unsupported enrichment_scope: {scope}")
  312 + if scope in seen:
283 continue 313 continue
284 - seen.add(schema.name)  
285 - normalized.append(schema.name) 314 + seen.add(scope)
  315 + normalized.append(scope)
286 return tuple(normalized) 316 return tuple(normalized)
287 317
288 318
@@ -448,7 +478,13 @@ def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: @@ -448,7 +478,13 @@ def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang:
448 _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) 478 _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw)
449 479
450 480
451 -def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: 481 +def _apply_index_taxonomy_row(
  482 + result: Dict[str, Any],
  483 + row: Dict[str, Any],
  484 + lang: str,
  485 + *,
  486 + category_taxonomy_profile: Optional[str] = None,
  487 +) -> None:
452 if not row or row.get("error"): 488 if not row or row.get("error"):
453 return 489 return
454 490
@@ -456,8 +492,11 @@ def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: @@ -456,8 +492,11 @@ def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang:
456 result["enriched_taxonomy_attributes"], 492 result["enriched_taxonomy_attributes"],
457 row=row, 493 row=row,
458 lang=lang, 494 lang=lang,
459 - schema=_get_analysis_schema("taxonomy"),  
460 - field_map=_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP, 495 + schema=_get_analysis_schema(
  496 + "taxonomy",
  497 + category_taxonomy_profile=category_taxonomy_profile,
  498 + ),
  499 + field_map=_get_taxonomy_attribute_field_map(category_taxonomy_profile),
461 ) 500 )
462 501
463 502
@@ -475,7 +514,8 @@ def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: @@ -475,7 +514,8 @@ def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]:
475 def build_index_content_fields( 514 def build_index_content_fields(
476 items: List[Dict[str, Any]], 515 items: List[Dict[str, Any]],
477 tenant_id: Optional[str] = None, 516 tenant_id: Optional[str] = None,
478 - analysis_kinds: Optional[List[str]] = None, 517 + enrichment_scopes: Optional[List[str]] = None,
  518 + category_taxonomy_profile: Optional[str] = None,
479 ) -> List[Dict[str, Any]]: 519 ) -> List[Dict[str, Any]]:
480 """ 520 """
481 高层入口:生成与 ES mapping 对齐的内容理解字段。 521 高层入口:生成与 ES mapping 对齐的内容理解字段。
@@ -484,7 +524,8 @@ def build_index_content_fields( @@ -484,7 +524,8 @@ def build_index_content_fields(
484 - `id` 或 `spu_id` 524 - `id` 或 `spu_id`
485 - `title` 525 - `title`
486 - 可选 `brief` / `description` / `image_url` 526 - 可选 `brief` / `description` / `image_url`
487 - - 可选 `analysis_kinds`,默认同时执行 `content` 与 `taxonomy` 527 + - 可选 `enrichment_scopes`,默认同时执行 `generic` 与 `category_taxonomy`
  528 + - 可选 `category_taxonomy_profile`,默认 `apparel`
488 529
489 返回项结构: 530 返回项结构:
490 - `id` 531 - `id`
@@ -498,7 +539,8 @@ def build_index_content_fields( @@ -498,7 +539,8 @@ def build_index_content_fields(
498 - `qanchors.{lang}` 为短语数组 539 - `qanchors.{lang}` 为短语数组
499 - `enriched_tags.{lang}` 为标签数组 540 - `enriched_tags.{lang}` 为标签数组
500 """ 541 """
501 - requested_analysis_kinds = _normalize_analysis_kinds(analysis_kinds) 542 + requested_enrichment_scopes = _normalize_enrichment_scopes(enrichment_scopes)
  543 + normalized_taxonomy_profile = _normalize_category_taxonomy_profile(category_taxonomy_profile)
502 normalized_items = [_normalize_index_content_item(item) for item in items] 544 normalized_items = [_normalize_index_content_item(item) for item in items]
503 if not normalized_items: 545 if not normalized_items:
504 return [] 546 return []
@@ -515,7 +557,7 @@ def build_index_content_fields( @@ -515,7 +557,7 @@ def build_index_content_fields(
515 } 557 }
516 558
517 for lang in _CORE_INDEX_LANGUAGES: 559 for lang in _CORE_INDEX_LANGUAGES:
518 - if "content" in requested_analysis_kinds: 560 + if "generic" in requested_enrichment_scopes:
519 try: 561 try:
520 rows = analyze_products( 562 rows = analyze_products(
521 products=normalized_items, 563 products=normalized_items,
@@ -523,6 +565,7 @@ def build_index_content_fields( @@ -523,6 +565,7 @@ def build_index_content_fields(
523 batch_size=BATCH_SIZE, 565 batch_size=BATCH_SIZE,
524 tenant_id=tenant_id, 566 tenant_id=tenant_id,
525 analysis_kind="content", 567 analysis_kind="content",
  568 + category_taxonomy_profile=normalized_taxonomy_profile,
526 ) 569 )
527 except Exception as e: 570 except Exception as e:
528 logger.warning("build_index_content_fields content enrichment failed for lang=%s: %s", lang, e) 571 logger.warning("build_index_content_fields content enrichment failed for lang=%s: %s", lang, e)
@@ -539,7 +582,7 @@ def build_index_content_fields( @@ -539,7 +582,7 @@ def build_index_content_fields(
539 continue 582 continue
540 _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) 583 _apply_index_content_row(results_by_id[item_id], row=row, lang=lang)
541 584
542 - if "taxonomy" in requested_analysis_kinds: 585 + if "category_taxonomy" in requested_enrichment_scopes:
543 try: 586 try:
544 taxonomy_rows = analyze_products( 587 taxonomy_rows = analyze_products(
545 products=normalized_items, 588 products=normalized_items,
@@ -547,6 +590,7 @@ def build_index_content_fields( @@ -547,6 +590,7 @@ def build_index_content_fields(
547 batch_size=BATCH_SIZE, 590 batch_size=BATCH_SIZE,
548 tenant_id=tenant_id, 591 tenant_id=tenant_id,
549 analysis_kind="taxonomy", 592 analysis_kind="taxonomy",
  593 + category_taxonomy_profile=normalized_taxonomy_profile,
550 ) 594 )
551 except Exception as e: 595 except Exception as e:
552 logger.warning( 596 logger.warning(
@@ -565,7 +609,12 @@ def build_index_content_fields( @@ -565,7 +609,12 @@ def build_index_content_fields(
565 if row.get("error"): 609 if row.get("error"):
566 results_by_id[item_id].setdefault("error", row["error"]) 610 results_by_id[item_id].setdefault("error", row["error"])
567 continue 611 continue
568 - _apply_index_taxonomy_row(results_by_id[item_id], row=row, lang=lang) 612 + _apply_index_taxonomy_row(
  613 + results_by_id[item_id],
  614 + row=row,
  615 + lang=lang,
  616 + category_taxonomy_profile=normalized_taxonomy_profile,
  617 + )
569 618
570 return [results_by_id[item["id"]] for item in normalized_items] 619 return [results_by_id[item["id"]] for item in normalized_items]
571 620
@@ -636,9 +685,13 @@ def _make_analysis_cache_key( @@ -636,9 +685,13 @@ def _make_analysis_cache_key(
636 product: Dict[str, Any], 685 product: Dict[str, Any],
637 target_lang: str, 686 target_lang: str,
638 analysis_kind: str, 687 analysis_kind: str,
  688 + category_taxonomy_profile: Optional[str] = None,
639 ) -> str: 689 ) -> str:
640 """构造缓存 key,仅由分析类型、prompt 实际输入文本内容与目标语言决定。""" 690 """构造缓存 key,仅由分析类型、prompt 实际输入文本内容与目标语言决定。"""
641 - schema = _get_analysis_schema(analysis_kind) 691 + schema = _get_analysis_schema(
  692 + analysis_kind,
  693 + category_taxonomy_profile=category_taxonomy_profile,
  694 + )
642 prompt_input = _build_prompt_input_text(product) 695 prompt_input = _build_prompt_input_text(product)
643 h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest() 696 h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest()
644 prompt_contract = { 697 prompt_contract = {
@@ -672,12 +725,21 @@ def _get_cached_analysis_result( @@ -672,12 +725,21 @@ def _get_cached_analysis_result(
672 product: Dict[str, Any], 725 product: Dict[str, Any],
673 target_lang: str, 726 target_lang: str,
674 analysis_kind: str, 727 analysis_kind: str,
  728 + category_taxonomy_profile: Optional[str] = None,
675 ) -> Optional[Dict[str, Any]]: 729 ) -> Optional[Dict[str, Any]]:
676 if not _anchor_redis: 730 if not _anchor_redis:
677 return None 731 return None
678 - schema = _get_analysis_schema(analysis_kind) 732 + schema = _get_analysis_schema(
  733 + analysis_kind,
  734 + category_taxonomy_profile=category_taxonomy_profile,
  735 + )
679 try: 736 try:
680 - key = _make_analysis_cache_key(product, target_lang, analysis_kind) 737 + key = _make_analysis_cache_key(
  738 + product,
  739 + target_lang,
  740 + analysis_kind,
  741 + category_taxonomy_profile=category_taxonomy_profile,
  742 + )
681 raw = _anchor_redis.get(key) 743 raw = _anchor_redis.get(key)
682 if not raw: 744 if not raw:
683 return None 745 return None
@@ -707,10 +769,14 @@ def _set_cached_analysis_result( @@ -707,10 +769,14 @@ def _set_cached_analysis_result(
707 target_lang: str, 769 target_lang: str,
708 result: Dict[str, Any], 770 result: Dict[str, Any],
709 analysis_kind: str, 771 analysis_kind: str,
  772 + category_taxonomy_profile: Optional[str] = None,
710 ) -> None: 773 ) -> None:
711 if not _anchor_redis: 774 if not _anchor_redis:
712 return 775 return
713 - schema = _get_analysis_schema(analysis_kind) 776 + schema = _get_analysis_schema(
  777 + analysis_kind,
  778 + category_taxonomy_profile=category_taxonomy_profile,
  779 + )
714 try: 780 try:
715 normalized = _normalize_analysis_result( 781 normalized = _normalize_analysis_result(
716 result, 782 result,
@@ -720,7 +786,12 @@ def _set_cached_analysis_result( @@ -720,7 +786,12 @@ def _set_cached_analysis_result(
720 ) 786 )
721 if not _has_meaningful_analysis_content(normalized, schema): 787 if not _has_meaningful_analysis_content(normalized, schema):
722 return 788 return
723 - key = _make_analysis_cache_key(product, target_lang, analysis_kind) 789 + key = _make_analysis_cache_key(
  790 + product,
  791 + target_lang,
  792 + analysis_kind,
  793 + category_taxonomy_profile=category_taxonomy_profile,
  794 + )
724 ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600 795 ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600
725 _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False)) 796 _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False))
726 except Exception as e: 797 except Exception as e:
@@ -775,9 +846,13 @@ def create_prompt( @@ -775,9 +846,13 @@ def create_prompt(
775 products: List[Dict[str, str]], 846 products: List[Dict[str, str]],
776 target_lang: str = "zh", 847 target_lang: str = "zh",
777 analysis_kind: str = "content", 848 analysis_kind: str = "content",
  849 + category_taxonomy_profile: Optional[str] = None,
778 ) -> Tuple[Optional[str], Optional[str], Optional[str]]: 850 ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
779 """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。""" 851 """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。"""
780 - schema = _get_analysis_schema(analysis_kind) 852 + schema = _get_analysis_schema(
  853 + analysis_kind,
  854 + category_taxonomy_profile=category_taxonomy_profile,
  855 + )
781 markdown_table_headers = schema.get_headers(target_lang) 856 markdown_table_headers = schema.get_headers(target_lang)
782 if not markdown_table_headers: 857 if not markdown_table_headers:
783 logger.warning( 858 logger.warning(
@@ -978,9 +1053,13 @@ def call_llm( @@ -978,9 +1053,13 @@ def call_llm(
978 def parse_markdown_table( 1053 def parse_markdown_table(
979 markdown_content: str, 1054 markdown_content: str,
980 analysis_kind: str = "content", 1055 analysis_kind: str = "content",
  1056 + category_taxonomy_profile: Optional[str] = None,
981 ) -> List[Dict[str, str]]: 1057 ) -> List[Dict[str, str]]:
982 """解析markdown表格内容""" 1058 """解析markdown表格内容"""
983 - schema = _get_analysis_schema(analysis_kind) 1059 + schema = _get_analysis_schema(
  1060 + analysis_kind,
  1061 + category_taxonomy_profile=category_taxonomy_profile,
  1062 + )
984 lines = markdown_content.strip().split("\n") 1063 lines = markdown_content.strip().split("\n")
985 data = [] 1064 data = []
986 data_started = False 1065 data_started = False
@@ -1025,8 +1104,12 @@ def _log_parsed_result_quality( @@ -1025,8 +1104,12 @@ def _log_parsed_result_quality(
1025 target_lang: str, 1104 target_lang: str,
1026 batch_num: int, 1105 batch_num: int,
1027 analysis_kind: str, 1106 analysis_kind: str,
  1107 + category_taxonomy_profile: Optional[str] = None,
1028 ) -> None: 1108 ) -> None:
1029 - schema = _get_analysis_schema(analysis_kind) 1109 + schema = _get_analysis_schema(
  1110 + analysis_kind,
  1111 + category_taxonomy_profile=category_taxonomy_profile,
  1112 + )
1030 expected = len(batch_data) 1113 expected = len(batch_data)
1031 actual = len(parsed_results) 1114 actual = len(parsed_results)
1032 if actual != expected: 1115 if actual != expected:
@@ -1071,9 +1154,13 @@ def process_batch( @@ -1071,9 +1154,13 @@ def process_batch(
1071 batch_num: int, 1154 batch_num: int,
1072 target_lang: str = "zh", 1155 target_lang: str = "zh",
1073 analysis_kind: str = "content", 1156 analysis_kind: str = "content",
  1157 + category_taxonomy_profile: Optional[str] = None,
1074 ) -> List[Dict[str, Any]]: 1158 ) -> List[Dict[str, Any]]:
1075 """处理一个批次的数据""" 1159 """处理一个批次的数据"""
1076 - schema = _get_analysis_schema(analysis_kind) 1160 + schema = _get_analysis_schema(
  1161 + analysis_kind,
  1162 + category_taxonomy_profile=category_taxonomy_profile,
  1163 + )
1077 logger.info(f"\n{'#' * 80}") 1164 logger.info(f"\n{'#' * 80}")
1078 logger.info( 1165 logger.info(
1079 "Processing Batch %s (%s items, kind=%s)", 1166 "Processing Batch %s (%s items, kind=%s)",
@@ -1087,6 +1174,7 @@ def process_batch( @@ -1087,6 +1174,7 @@ def process_batch(
1087 batch_data, 1174 batch_data,
1088 target_lang=target_lang, 1175 target_lang=target_lang,
1089 analysis_kind=analysis_kind, 1176 analysis_kind=analysis_kind,
  1177 + category_taxonomy_profile=category_taxonomy_profile,
1090 ) 1178 )
1091 1179
1092 # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM 1180 # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM
@@ -1119,13 +1207,18 @@ def process_batch( @@ -1119,13 +1207,18 @@ def process_batch(
1119 ) 1207 )
1120 1208
1121 # 解析结果 1209 # 解析结果
1122 - parsed_results = parse_markdown_table(raw_response, analysis_kind=analysis_kind) 1210 + parsed_results = parse_markdown_table(
  1211 + raw_response,
  1212 + analysis_kind=analysis_kind,
  1213 + category_taxonomy_profile=category_taxonomy_profile,
  1214 + )
1123 _log_parsed_result_quality( 1215 _log_parsed_result_quality(
1124 batch_data, 1216 batch_data,
1125 parsed_results, 1217 parsed_results,
1126 target_lang, 1218 target_lang,
1127 batch_num, 1219 batch_num,
1128 analysis_kind, 1220 analysis_kind,
  1221 + category_taxonomy_profile,
1129 ) 1222 )
1130 1223
1131 logger.info(f"\nParsed Results ({len(parsed_results)} items):") 1224 logger.info(f"\nParsed Results ({len(parsed_results)} items):")
@@ -1190,6 +1283,7 @@ def analyze_products( @@ -1190,6 +1283,7 @@ def analyze_products(
1190 batch_size: Optional[int] = None, 1283 batch_size: Optional[int] = None,
1191 tenant_id: Optional[str] = None, 1284 tenant_id: Optional[str] = None,
1192 analysis_kind: str = "content", 1285 analysis_kind: str = "content",
  1286 + category_taxonomy_profile: Optional[str] = None,
1193 ) -> List[Dict[str, Any]]: 1287 ) -> List[Dict[str, Any]]:
1194 """ 1288 """
1195 库调用入口:根据输入+语言,返回锚文本及各维度信息。 1289 库调用入口:根据输入+语言,返回锚文本及各维度信息。
@@ -1205,7 +1299,10 @@ def analyze_products( @@ -1205,7 +1299,10 @@ def analyze_products(
1205 if not products: 1299 if not products:
1206 return [] 1300 return []
1207 1301
1208 - _get_analysis_schema(analysis_kind) 1302 + _get_analysis_schema(
  1303 + analysis_kind,
  1304 + category_taxonomy_profile=category_taxonomy_profile,
  1305 + )
1209 results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products) 1306 results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products)
1210 uncached_items: List[Tuple[int, Dict[str, str]]] = [] 1307 uncached_items: List[Tuple[int, Dict[str, str]]] = []
1211 1308
@@ -1215,7 +1312,12 @@ def analyze_products( @@ -1215,7 +1312,12 @@ def analyze_products(
1215 uncached_items.append((idx, product)) 1312 uncached_items.append((idx, product))
1216 continue 1313 continue
1217 1314
1218 - cached = _get_cached_analysis_result(product, target_lang, analysis_kind) 1315 + cached = _get_cached_analysis_result(
  1316 + product,
  1317 + target_lang,
  1318 + analysis_kind,
  1319 + category_taxonomy_profile=category_taxonomy_profile,
  1320 + )
1219 if cached: 1321 if cached:
1220 logger.info( 1322 logger.info(
1221 f"[analyze_products] Cache hit for title='{title[:50]}...', " 1323 f"[analyze_products] Cache hit for title='{title[:50]}...', "
@@ -1255,6 +1357,7 @@ def analyze_products( @@ -1255,6 +1357,7 @@ def analyze_products(
1255 batch_num=batch_num, 1357 batch_num=batch_num,
1256 target_lang=target_lang, 1358 target_lang=target_lang,
1257 analysis_kind=analysis_kind, 1359 analysis_kind=analysis_kind,
  1360 + category_taxonomy_profile=category_taxonomy_profile,
1258 ) 1361 )
1259 1362
1260 for (original_idx, product), item in zip(batch_slice, batch_results): 1363 for (original_idx, product), item in zip(batch_slice, batch_results):
@@ -1266,7 +1369,13 @@ def analyze_products( @@ -1266,7 +1369,13 @@ def analyze_products(
1266 # 不缓存错误结果,避免放大临时故障 1369 # 不缓存错误结果,避免放大临时故障
1267 continue 1370 continue
1268 try: 1371 try:
1269 - _set_cached_analysis_result(product, target_lang, item, analysis_kind) 1372 + _set_cached_analysis_result(
  1373 + product,
  1374 + target_lang,
  1375 + item,
  1376 + analysis_kind,
  1377 + category_taxonomy_profile=category_taxonomy_profile,
  1378 + )
1270 except Exception: 1379 except Exception:
1271 # 已在内部记录 warning 1380 # 已在内部记录 warning
1272 pass 1381 pass
@@ -1293,6 +1402,7 @@ def analyze_products( @@ -1293,6 +1402,7 @@ def analyze_products(
1293 batch_num=batch_num, 1402 batch_num=batch_num,
1294 target_lang=target_lang, 1403 target_lang=target_lang,
1295 analysis_kind=analysis_kind, 1404 analysis_kind=analysis_kind,
  1405 + category_taxonomy_profile=category_taxonomy_profile,
1296 ) 1406 )
1297 1407
1298 # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的) 1408 # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的)
@@ -1307,7 +1417,13 @@ def analyze_products( @@ -1307,7 +1417,13 @@ def analyze_products(
1307 # 不缓存错误结果,避免放大临时故障 1417 # 不缓存错误结果,避免放大临时故障
1308 continue 1418 continue
1309 try: 1419 try:
1310 - _set_cached_analysis_result(product, target_lang, item, analysis_kind) 1420 + _set_cached_analysis_result(
  1421 + product,
  1422 + target_lang,
  1423 + item,
  1424 + analysis_kind,
  1425 + category_taxonomy_profile=category_taxonomy_profile,
  1426 + )
1311 except Exception: 1427 except Exception:
1312 # 已在内部记录 warning 1428 # 已在内部记录 warning
1313 pass 1429 pass
indexer/taxonomy.md
1 1
2 -服装大类的taxonomy 2 +# Cross-Border E-commerce Core Categories 大类
  3 +
  4 +## 1. 3C
  5 +Phone accessories, computer peripherals, smart wearables, audio & video, smart home, gaming gear. 手机配件、电脑周边、智能穿戴、影音娱乐、智能家居、游戏设备。
  6 +
  7 +## 2. Bags 包
  8 +Handbags, backpacks, wallets, luggage, crossbody bags, tote bags. 手提包、双肩包、钱包、行李箱、斜挎包、托特包。
  9 +
  10 +## 3. Pet Supplies 宠物用品
  11 +Pet food, pet toys, pet care products, pet grooming, pet clothing, smart pet devices. 宠物食品、宠物玩具、宠物护理用品、宠物美容、宠物服装、智能宠物设备。
  12 +
  13 +## 4. Electronics 电子产品
  14 +Consumer electronics, home appliances, digital devices, cables & chargers, batteries, electronic components. 消费电子产品、家用电器、数码设备、线材充电器、电池、电子元器件。
  15 +
  16 +## 5. Clothing 服装
  17 +Women's wear, men's wear, kid's wear, underwear, outerwear, activewear. 女装、男装、童装、内衣、外套、运动服装。
  18 +
  19 +## 6. Outdoor 户外用品
  20 +Camping gear, hiking equipment, fishing supplies, outdoor clothing, travel accessories, survival tools. 露营装备、徒步用品、渔具、户外服装、旅行配件、求生工具。
  21 +
  22 +## 7. Home Appliances 家电/电器
  23 +Kitchen appliances, cleaning appliances, personal care appliances, heating & cooling, smart home devices. 厨房电器、清洁电器、个护电器、冷暖设备、智能家居设备。
  24 +
  25 +## 8. Home & Living 家居
  26 +Furniture, home textiles, lighting, kitchenware, storage, home decor. 家具、家纺、灯具、厨具、收纳、家居装饰。
  27 +
  28 +## 9. Wigs 假发
  29 +
  30 +## 10. Beauty & Cosmetics 美容美妆
  31 +Skincare, makeup, nail care, beauty tools, hair care, fragrances. 护肤品、彩妆、美甲、美容工具、护发、香水。
  32 +
  33 +## 11. Accessories 配饰
  34 +Jewelry, watches, belts, scarves, hats, sunglasses, hair accessories. 珠宝、手表、腰带、围巾、帽子、太阳镜、发饰。
  35 +
  36 +## 12. Toys 玩具
  37 +Educational toys, plush toys, action figures, puzzles, outdoor toys, DIY toys. 益智玩具、毛绒玩具、可动人偶、拼图、户外玩具、DIY玩具。
  38 +
  39 +## 13. Shoes 鞋子
  40 +Sneakers, boots, sandals, heels, flats, sports shoes. 运动鞋、靴子、凉鞋、高跟鞋、平底鞋、球鞋。
  41 +
  42 +## 14. Sports 运动产品
  43 +Fitness equipment, sports gear, team sports, racquet sports, water sports, cycling. 健身器材、运动装备、团队运动、球拍运动、水上运动、骑行。
  44 +
  45 +## 15. Others 其他
  46 +
  47 +# 各个大类的taxonomy
  48 +## 1. Clothing & Apparel 服装
3 49
4 ### A. Product Classification 50 ### A. Product Classification
5 51
tests/ci/test_service_api_contracts.py
@@ -348,10 +348,12 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch @@ -348,10 +348,12 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
348 def _fake_build_index_content_fields( 348 def _fake_build_index_content_fields(
349 items: List[Dict[str, str]], 349 items: List[Dict[str, str]],
350 tenant_id: str | None = None, 350 tenant_id: str | None = None,
351 - analysis_kinds: List[str] | None = None, 351 + enrichment_scopes: List[str] | None = None,
  352 + category_taxonomy_profile: str = "apparel",
352 ): 353 ):
353 assert tenant_id == "162" 354 assert tenant_id == "162"
354 - assert analysis_kinds == ["content", "taxonomy"] 355 + assert enrichment_scopes == ["generic", "category_taxonomy"]
  356 + assert category_taxonomy_profile == "apparel"
355 return [ 357 return [
356 { 358 {
357 "id": p["spu_id"], 359 "id": p["spu_id"],
@@ -376,6 +378,8 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch @@ -376,6 +378,8 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
376 "/indexer/enrich-content", 378 "/indexer/enrich-content",
377 json={ 379 json={
378 "tenant_id": "162", 380 "tenant_id": "162",
  381 + "enrichment_scopes": ["generic", "category_taxonomy"],
  382 + "category_taxonomy_profile": "apparel",
379 "items": [ 383 "items": [
380 {"spu_id": "1001", "title": "T-shirt"}, 384 {"spu_id": "1001", "title": "T-shirt"},
381 {"spu_id": "1002", "title": "Toy"}, 385 {"spu_id": "1002", "title": "Toy"},
@@ -385,7 +389,8 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch @@ -385,7 +389,8 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
385 assert response.status_code == 200 389 assert response.status_code == 200
386 data = response.json() 390 data = response.json()
387 assert data["tenant_id"] == "162" 391 assert data["tenant_id"] == "162"
388 - assert data["analysis_kinds"] == ["content", "taxonomy"] 392 + assert data["enrichment_scopes"] == ["generic", "category_taxonomy"]
  393 + assert data["category_taxonomy_profile"] == "apparel"
389 assert data["total"] == 2 394 assert data["total"] == 2
390 assert len(data["results"]) == 2 395 assert len(data["results"]) == 2
391 assert data["results"][0]["spu_id"] == "1001" 396 assert data["results"][0]["spu_id"] == "1001"
@@ -403,6 +408,52 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch @@ -403,6 +408,52 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
403 } 408 }
404 409
405 410
  411 +def test_indexer_enrich_content_contract_accepts_deprecated_analysis_kinds(indexer_client: TestClient, monkeypatch):
  412 + import indexer.product_enrich as process_products
  413 +
  414 + seen: Dict[str, Any] = {}
  415 +
  416 + def _fake_build_index_content_fields(
  417 + items: List[Dict[str, str]],
  418 + tenant_id: str | None = None,
  419 + enrichment_scopes: List[str] | None = None,
  420 + category_taxonomy_profile: str = "apparel",
  421 + ):
  422 + seen["tenant_id"] = tenant_id
  423 + seen["enrichment_scopes"] = enrichment_scopes
  424 + seen["category_taxonomy_profile"] = category_taxonomy_profile
  425 + return [
  426 + {
  427 + "id": items[0]["spu_id"],
  428 + "qanchors": {},
  429 + "enriched_tags": {},
  430 + "enriched_attributes": [],
  431 + "enriched_taxonomy_attributes": [],
  432 + }
  433 + ]
  434 +
  435 + monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields)
  436 +
  437 + response = indexer_client.post(
  438 + "/indexer/enrich-content",
  439 + json={
  440 + "tenant_id": "162",
  441 + "analysis_kinds": ["taxonomy"],
  442 + "items": [{"spu_id": "1001", "title": "T-shirt"}],
  443 + },
  444 + )
  445 +
  446 + assert response.status_code == 200
  447 + data = response.json()
  448 + assert seen == {
  449 + "tenant_id": "162",
  450 + "enrichment_scopes": ["category_taxonomy"],
  451 + "category_taxonomy_profile": "apparel",
  452 + }
  453 + assert data["enrichment_scopes"] == ["category_taxonomy"]
  454 + assert data["category_taxonomy_profile"] == "apparel"
  455 +
  456 +
406 def test_indexer_documents_contract(indexer_client: TestClient): 457 def test_indexer_documents_contract(indexer_client: TestClient):
407 """POST /indexer/documents: tenant_id + spu_ids, returns success/failed lists (no ES write).""" 458 """POST /indexer/documents: tenant_id + spu_ids, returns success/failed lists (no ES write)."""
408 response = indexer_client.post( 459 response = indexer_client.post(
tests/test_process_products_batching.py
@@ -18,8 +18,10 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): @@ -18,8 +18,10 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch):
18 batch_num: int, 18 batch_num: int,
19 target_lang: str = "zh", 19 target_lang: str = "zh",
20 analysis_kind: str = "content", 20 analysis_kind: str = "content",
  21 + category_taxonomy_profile=None,
21 ): 22 ):
22 assert analysis_kind == "content" 23 assert analysis_kind == "content"
  24 + assert category_taxonomy_profile is None
23 seen_batch_sizes.append(len(batch_data)) 25 seen_batch_sizes.append(len(batch_data))
24 return [ 26 return [
25 { 27 {
@@ -64,8 +66,10 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): @@ -64,8 +66,10 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch):
64 batch_num: int, 66 batch_num: int,
65 target_lang: str = "zh", 67 target_lang: str = "zh",
66 analysis_kind: str = "content", 68 analysis_kind: str = "content",
  69 + category_taxonomy_profile=None,
67 ): 70 ):
68 assert analysis_kind == "content" 71 assert analysis_kind == "content"
  72 + assert category_taxonomy_profile is None
69 seen_batch_sizes.append(len(batch_data)) 73 seen_batch_sizes.append(len(batch_data))
70 return [ 74 return [
71 { 75 {
tests/test_product_enrich_partial_mode.py
@@ -295,16 +295,36 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): @@ -295,16 +295,36 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests():
295 product.get("image_url", ""), 295 product.get("image_url", ""),
296 ) 296 )
297 297
298 - def fake_get_cached_analysis_result(product, target_lang, analysis_kind="content"): 298 + def fake_get_cached_analysis_result(
  299 + product,
  300 + target_lang,
  301 + analysis_kind="content",
  302 + category_taxonomy_profile=None,
  303 + ):
299 assert analysis_kind == "content" 304 assert analysis_kind == "content"
  305 + assert category_taxonomy_profile is None
300 return cache_store.get(_cache_key(product, target_lang)) 306 return cache_store.get(_cache_key(product, target_lang))
301 307
302 - def fake_set_cached_analysis_result(product, target_lang, result, analysis_kind="content"): 308 + def fake_set_cached_analysis_result(
  309 + product,
  310 + target_lang,
  311 + result,
  312 + analysis_kind="content",
  313 + category_taxonomy_profile=None,
  314 + ):
303 assert analysis_kind == "content" 315 assert analysis_kind == "content"
  316 + assert category_taxonomy_profile is None
304 cache_store[_cache_key(product, target_lang)] = result 317 cache_store[_cache_key(product, target_lang)] = result
305 318
306 - def fake_process_batch(batch_data, batch_num, target_lang="zh", analysis_kind="content"): 319 + def fake_process_batch(
  320 + batch_data,
  321 + batch_num,
  322 + target_lang="zh",
  323 + analysis_kind="content",
  324 + category_taxonomy_profile=None,
  325 + ):
307 assert analysis_kind == "content" 326 assert analysis_kind == "content"
  327 + assert category_taxonomy_profile is None
308 process_calls.append( 328 process_calls.append(
309 { 329 {
310 "batch_num": batch_num, 330 "batch_num": batch_num,
@@ -400,7 +420,7 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity(): @@ -400,7 +420,7 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity():
400 with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( 420 with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
401 product_enrich, 421 product_enrich,
402 "_get_cached_analysis_result", 422 "_get_cached_analysis_result",
403 - wraps=lambda product, target_lang, analysis_kind="content": product_enrich._normalize_analysis_result( 423 + wraps=lambda product, target_lang, analysis_kind="content", category_taxonomy_profile=None: product_enrich._normalize_analysis_result(
404 cached_result, 424 cached_result,
405 product=product, 425 product=product,
406 target_lang=target_lang, 426 target_lang=target_lang,
@@ -443,8 +463,10 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() @@ -443,8 +463,10 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output()
443 batch_size=None, 463 batch_size=None,
444 tenant_id=None, 464 tenant_id=None,
445 analysis_kind="content", 465 analysis_kind="content",
  466 + category_taxonomy_profile=None,
446 ): 467 ):
447 if analysis_kind == "taxonomy": 468 if analysis_kind == "taxonomy":
  469 + assert category_taxonomy_profile == "apparel"
448 return [ 470 return [
449 { 471 {
450 "id": products[0]["id"], 472 "id": products[0]["id"],
@@ -478,6 +500,7 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() @@ -478,6 +500,7 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output()
478 "style_aesthetic": "", 500 "style_aesthetic": "",
479 } 501 }
480 ] 502 ]
  503 + assert category_taxonomy_profile == "apparel"
481 return [ 504 return [
482 { 505 {
483 "id": products[0]["id"], 506 "id": products[0]["id"],