Commit 2703b6ea8b5d2ecdc442f11ad26279cc2dce41e6
1 parent
5aaf0c7d
refactor(indexer): 将 analysis_kinds 拆分为 enrichment_scopes +
category_taxonomy_profile - 原 analysis_kinds 混用了“增强类型”(content/taxonomy)与“品类特定配置”,不利于扩展不同品类的 taxonomy 分析(如 3C、家居等) - 新增 enrichment_scopes 参数:支持 generic(通用增强,产出 qanchors/enriched_tags/enriched_attributes)和 category_taxonomy(品类增强,产出 enriched_taxonomy_attributes) - 新增 category_taxonomy_profile 参数:指定品类增强使用哪套 profile(当前内置 apparel),每套 profile 包含独立的 prompt、输出列定义、解析规则及缓存版本 - 保留 analysis_kinds 作为兼容别名,避免破坏现有调用方 - 重构内部 taxonomy 分析为 profile registry 模式:新增 _get_taxonomy_schema(profile_name) 函数,根据 profile 动态返回对应的 AnalysisSchema - 缓存 key 现在按“分析类型 + profile + schema 指纹 + 输入字段哈希”隔离,确保不同品类、不同 prompt 版本自动失效 - 更新 API 文档及微服务接口文档,明确新参数语义与使用示例 技术细节: - 修改入口:api/routes/indexer.py 中 enrich-content 端点,解析新参数并向下传递 - 核心逻辑:indexer/product_enrich.py 中 enrich_products_batch 增加 profile 参数;_process_batch_for_schema 根据 scope 和 profile 动态获取 schema - 兼容层:若请求同时提供 analysis_kinds,则映射为 enrichment_scopes(content→generic,taxonomy→category_taxonomy),category_taxonomy_profile 默认为 "apparel" - 测试覆盖:新增 enrichment_scopes 组合、profile 切换及兼容模式测试
Showing
9 changed files
with
368 additions
and
71 deletions
Show diff stats
api/routes/indexer.py
| ... | ... | @@ -88,20 +88,41 @@ class EnrichContentItem(BaseModel): |
| 88 | 88 | |
| 89 | 89 | class EnrichContentRequest(BaseModel): |
| 90 | 90 | """ |
| 91 | - 内容理解字段生成请求:根据商品标题批量生成 qanchors、enriched_attributes、tags、taxonomy attributes。 | |
| 91 | + 内容理解字段生成请求:根据商品标题批量生成通用增强字段与品类 taxonomy 字段。 | |
| 92 | 92 | 供外部 indexer 在自行组织 doc 时调用,与翻译、向量化等微服务并列。 |
| 93 | 93 | """ |
| 94 | 94 | tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") |
| 95 | 95 | items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)") |
| 96 | - analysis_kinds: List[Literal["content", "taxonomy"]] = Field( | |
| 97 | - default_factory=lambda: ["content", "taxonomy"], | |
| 96 | + enrichment_scopes: Optional[List[Literal["generic", "category_taxonomy"]]] = Field( | |
| 97 | + default=None, | |
| 98 | 98 | description=( |
| 99 | - "要执行的分析族。" | |
| 100 | - "`content` 返回 qanchors/enriched_tags/enriched_attributes;" | |
| 101 | - "`taxonomy` 返回 enriched_taxonomy_attributes。" | |
| 99 | + "要执行的增强范围。" | |
| 100 | + "`generic` 返回 qanchors/enriched_tags/enriched_attributes;" | |
| 101 | + "`category_taxonomy` 返回 enriched_taxonomy_attributes。" | |
| 102 | 102 | "默认两者都执行。" |
| 103 | 103 | ), |
| 104 | 104 | ) |
| 105 | + category_taxonomy_profile: str = Field( | |
| 106 | + "apparel", | |
| 107 | + description=( | |
| 108 | + "品类 taxonomy profile。当前默认且已支持的是 `apparel`。" | |
| 109 | + "未来可扩展为 `electronics` 等。" | |
| 110 | + ), | |
| 111 | + ) | |
| 112 | + analysis_kinds: Optional[List[Literal["content", "taxonomy"]]] = Field( | |
| 113 | + default=None, | |
| 114 | + description="Deprecated alias of enrichment_scopes. `content` -> `generic`, `taxonomy` -> `category_taxonomy`.", | |
| 115 | + ) | |
| 116 | + | |
| 117 | + def resolved_enrichment_scopes(self) -> List[str]: | |
| 118 | + if self.enrichment_scopes: | |
| 119 | + return list(self.enrichment_scopes) | |
| 120 | + if self.analysis_kinds: | |
| 121 | + mapped = [] | |
| 122 | + for item in self.analysis_kinds: | |
| 123 | + mapped.append("generic" if item == "content" else "category_taxonomy") | |
| 124 | + return mapped | |
| 125 | + return ["generic", "category_taxonomy"] | |
| 105 | 126 | |
| 106 | 127 | |
| 107 | 128 | @router.post("/reindex") |
| ... | ... | @@ -452,7 +473,8 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): |
| 452 | 473 | def _run_enrich_content( |
| 453 | 474 | tenant_id: str, |
| 454 | 475 | items: List[Dict[str, str]], |
| 455 | - analysis_kinds: Optional[List[str]] = None, | |
| 476 | + enrichment_scopes: Optional[List[str]] = None, | |
| 477 | + category_taxonomy_profile: str = "apparel", | |
| 456 | 478 | ) -> List[Dict[str, Any]]: |
| 457 | 479 | """ |
| 458 | 480 | 同步执行内容理解,返回与 ES mapping 对齐的字段结构。 |
| ... | ... | @@ -463,7 +485,8 @@ def _run_enrich_content( |
| 463 | 485 | results = build_index_content_fields( |
| 464 | 486 | items=items, |
| 465 | 487 | tenant_id=tenant_id, |
| 466 | - analysis_kinds=analysis_kinds, | |
| 488 | + enrichment_scopes=enrichment_scopes, | |
| 489 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 467 | 490 | ) |
| 468 | 491 | return [ |
| 469 | 492 | { |
| ... | ... | @@ -481,7 +504,7 @@ def _run_enrich_content( |
| 481 | 504 | @router.post("/enrich-content") |
| 482 | 505 | async def enrich_content(request: EnrichContentRequest): |
| 483 | 506 | """ |
| 484 | - 内容理解字段生成接口:根据商品标题批量生成 qanchors、enriched_attributes、tags、taxonomy attributes。 | |
| 507 | + 内容理解字段生成接口:根据商品标题批量生成通用增强字段与品类 taxonomy 字段。 | |
| 485 | 508 | |
| 486 | 509 | 使用场景: |
| 487 | 510 | - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 |
| ... | ... | @@ -511,17 +534,20 @@ async def enrich_content(request: EnrichContentRequest): |
| 511 | 534 | for it in request.items |
| 512 | 535 | ] |
| 513 | 536 | loop = asyncio.get_event_loop() |
| 537 | + enrichment_scopes = request.resolved_enrichment_scopes() | |
| 514 | 538 | result = await loop.run_in_executor( |
| 515 | 539 | None, |
| 516 | 540 | lambda: _run_enrich_content( |
| 517 | 541 | tenant_id=request.tenant_id, |
| 518 | 542 | items=items_payload, |
| 519 | - analysis_kinds=request.analysis_kinds, | |
| 543 | + enrichment_scopes=enrichment_scopes, | |
| 544 | + category_taxonomy_profile=request.category_taxonomy_profile, | |
| 520 | 545 | ), |
| 521 | 546 | ) |
| 522 | 547 | return { |
| 523 | 548 | "tenant_id": request.tenant_id, |
| 524 | - "analysis_kinds": request.analysis_kinds, | |
| 549 | + "enrichment_scopes": enrichment_scopes, | |
| 550 | + "category_taxonomy_profile": request.category_taxonomy_profile, | |
| 525 | 551 | "results": result, |
| 526 | 552 | "total": len(result), |
| 527 | 553 | } | ... | ... |
docs/issues/issue-2026-04-08-eval框架主指标ERR的问题以及bm25调参-done-0408.md
| ... | ... | @@ -92,3 +92,29 @@ curl -s -u "$AUTH" -X GET "$ES/${INDEX}/_settings?filter_path=**.similarity&pret |
| 92 | 92 | ```bash |
| 93 | 93 | curl -s -u "$AUTH" -X GET "$ES/_cat/indices/search_products_tenant_*?h=index&v" |
| 94 | 94 | ``` |
| 95 | + | |
| 96 | + | |
| 97 | + | |
| 98 | +实验结果: | |
| 99 | +已完成四轮实验,并把 `search_products_tenant_163` 当前索引参数留在了最优组:`b=0.5, k1=1.0`。刚刚已校验生效。 | |
| 100 | + | |
| 101 | +对比结果如下: | |
| 102 | + | |
| 103 | +- 当前线上基线组 `b=0.10, k1=0.30`:`0.604555` | |
| 104 | + 报告:[batch_20260408T055948Z_00b6a8aa3d.md](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T055948Z_00b6a8aa3d.md) | |
| 105 | +- 复跑 `b=0.10, k1=0.30`:`0.604298` | |
| 106 | + 报告:[batch_20260408T064011Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064011Z_00b6a8aa3d.json) | |
| 107 | +- 实验 `b=0.20, k1=0.60`:`0.604274` | |
| 108 | + 报告:[batch_20260408T064123Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064123Z_00b6a8aa3d.json) | |
| 109 | +- 实验 `b=0.50, k1=1.0`:`0.606041` | |
| 110 | + 报告:[batch_20260408T064216Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064216Z_00b6a8aa3d.json) | |
| 111 | +- 实验 `b=0.10, k1=0.75`:`0.603438` | |
| 112 | + 报告:[batch_20260408T064304Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064304Z_00b6a8aa3d.json) | |
| 113 | +- 老基线 `b=0.0, k1=0.0`:`0.602598` | |
| 114 | + 报告:[batch_20260407T150946Z_00b6a8aa3d.md](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260407T150946Z_00b6a8aa3d.md) | |
| 115 | + | |
| 116 | +结论是这四组里 `b=0.50, k1=1.0` 最好,相比你给的当前基线 `0.604555` 提升了 `+0.001486`,相比 `0/0` 提升了 `+0.003443`。`0.20/0.60` 基本和 `0.10/0.30` 持平,`0.10/0.75` 反而更差,说明这轮里单独加大 `k1` 不如同时把 `b` 和 `k1` 一起放宽。 | |
| 117 | + | |
| 118 | +补充一点:我只改了现有索引 `search_products_tenant_163` 的 index-level setting,没有改仓库里的默认映射。[search_products.json](/data/saas-search/mappings/search_products.json) 现在仍是 `0.1/0.3`。如果你要,我下一步可以直接帮你把映射默认值也改成 `0.5/1.0`,并顺手批量同步其他 `search_products_tenant_*` 索引。 | |
| 119 | + | |
| 120 | + | ... | ... |
docs/搜索API对接指南-05-索引接口(Indexer).md
| ... | ... | @@ -655,7 +655,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 655 | 655 | ```json |
| 656 | 656 | { |
| 657 | 657 | "tenant_id": "170", |
| 658 | - "analysis_kinds": ["content", "taxonomy"], | |
| 658 | + "enrichment_scopes": ["generic", "category_taxonomy"], | |
| 659 | + "category_taxonomy_profile": "apparel", | |
| 659 | 660 | "items": [ |
| 660 | 661 | { |
| 661 | 662 | "spu_id": "223167", |
| ... | ... | @@ -676,7 +677,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 676 | 677 | | 参数 | 类型 | 必填 | 默认值 | 说明 | |
| 677 | 678 | |------|------|------|--------|------| |
| 678 | 679 | | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| |
| 679 | -| `analysis_kinds` | array[string] | N | `["content", "taxonomy"]` | 选择要执行的分析族。`content` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`,`taxonomy` 生成 `enriched_taxonomy_attributes` | | |
| 680 | +| `enrichment_scopes` | array[string] | N | `["generic", "category_taxonomy"]` | 选择要执行的增强范围。`generic` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`,`category_taxonomy` 生成 `enriched_taxonomy_attributes` | | |
| 681 | +| `category_taxonomy_profile` | string | N | `apparel` | 品类 taxonomy profile。当前内置为服装大类 `apparel`,后续可扩展到其他大类 | | |
| 680 | 682 | | `items` | array | Y | - | 待分析列表;**单次最多 50 条** | |
| 681 | 683 | |
| 682 | 684 | `items[]` 字段说明: |
| ... | ... | @@ -691,8 +693,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 691 | 693 | |
| 692 | 694 | 缓存说明: |
| 693 | 695 | |
| 694 | -- 内容缓存按 **分析族拆分**,即 `content` 与 `taxonomy` 使用不同的缓存命名空间,互不污染、可独立演进。 | |
| 695 | -- 缓存键由 `analysis_kind + target_lang + prompt/schema 版本指纹 + prompt 输入文本 hash` 构成。 | |
| 696 | +- 内容缓存按 **增强范围 + taxonomy profile** 拆分;`generic` 与 `category_taxonomy:apparel` 等使用不同缓存命名空间,互不污染、可独立演进。 | |
| 697 | +- 缓存键由 `analysis_kind + target_lang + prompt/schema 版本指纹 + prompt 输入文本 hash` 构成;对 category taxonomy 来说,profile 会进入 schema 标识与版本指纹。 | |
| 696 | 698 | - 当前真正参与 prompt 输入的字段是:`title`、`brief`、`description`;这些字段任一变化,都会落到新的缓存 key。 |
| 697 | 699 | - `prompt/schema 版本指纹` 会综合 system prompt、shared instruction、localized table headers、result fields、user instruction template 等信息生成;因此只要提示词或输出契约变化,旧缓存会自然失效。 |
| 698 | 700 | - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。 |
| ... | ... | @@ -714,7 +716,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 714 | 716 | ```json |
| 715 | 717 | { |
| 716 | 718 | "tenant_id": "170", |
| 717 | - "analysis_kinds": ["content", "taxonomy"], | |
| 719 | + "enrichment_scopes": ["generic", "category_taxonomy"], | |
| 720 | + "category_taxonomy_profile": "apparel", | |
| 718 | 721 | "total": 2, |
| 719 | 722 | "results": [ |
| 720 | 723 | { |
| ... | ... | @@ -755,7 +758,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |
| 755 | 758 | |
| 756 | 759 | | 字段 | 类型 | 说明 | |
| 757 | 760 | |------|------|------| |
| 758 | -| `analysis_kinds` | array | 实际执行的分析族列表 | | |
| 761 | +| `enrichment_scopes` | array | 实际执行的增强范围列表 | | |
| 762 | +| `category_taxonomy_profile` | string | 实际使用的品类 taxonomy profile | | |
| 759 | 763 | | `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes` | |
| 760 | 764 | | `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 | |
| 761 | 765 | | `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 | |
| ... | ... | @@ -774,7 +778,8 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ |
| 774 | 778 | -H "Content-Type: application/json" \ |
| 775 | 779 | -d '{ |
| 776 | 780 | "tenant_id": "163", |
| 777 | - "analysis_kinds": ["content", "taxonomy"], | |
| 781 | + "enrichment_scopes": ["generic", "category_taxonomy"], | |
| 782 | + "category_taxonomy_profile": "apparel", | |
| 778 | 783 | "items": [ |
| 779 | 784 | { |
| 780 | 785 | "spu_id": "223167", | ... | ... |
docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md
| ... | ... | @@ -444,7 +444,7 @@ curl "http://localhost:6006/health" |
| 444 | 444 | |
| 445 | 445 | - **Base URL**: Indexer 服务地址,如 `http://localhost:6004` |
| 446 | 446 | - **路径**: `POST /indexer/enrich-content` |
| 447 | -- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `analysis_kinds` 选择执行 `content` / `taxonomy`;默认两者都执行。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 | |
| 447 | +- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `enrichment_scopes` 选择执行 `generic` / `category_taxonomy`,并通过 `category_taxonomy_profile` 选择对应大类的 taxonomy prompt/profile;默认执行 `generic + category_taxonomy(apparel)`。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 | |
| 448 | 448 | |
| 449 | 449 | 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 |
| 450 | 450 | ... | ... |
indexer/product_enrich.py
| ... | ... | @@ -151,7 +151,8 @@ if _missing_prompt_langs: |
| 151 | 151 | # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 |
| 152 | 152 | _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") |
| 153 | 153 | _CORE_INDEX_LANGUAGES = ("zh", "en") |
| 154 | -_DEFAULT_ANALYSIS_KINDS = ("content", "taxonomy") | |
| 154 | +_DEFAULT_ENRICHMENT_SCOPES = ("generic", "category_taxonomy") | |
| 155 | +_DEFAULT_CATEGORY_TAXONOMY_PROFILE = "apparel" | |
| 155 | 156 | _CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( |
| 156 | 157 | ("tags", "enriched_tags"), |
| 157 | 158 | ("target_audience", "target_audience"), |
| ... | ... | @@ -187,7 +188,7 @@ _CONTENT_ANALYSIS_FIELD_ALIASES = { |
| 187 | 188 | "tags": ("tags", "enriched_tags"), |
| 188 | 189 | } |
| 189 | 190 | _CONTENT_ANALYSIS_QUALITY_FIELDS = ("title", "category_path", "anchor_text") |
| 190 | -_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( | |
| 191 | +_APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP = ( | |
| 191 | 192 | ("product_type", "Product Type"), |
| 192 | 193 | ("target_gender", "Target Gender"), |
| 193 | 194 | ("age_group", "Age Group"), |
| ... | ... | @@ -215,8 +216,8 @@ _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( |
| 215 | 216 | ("occasion_end_use", "Occasion / End Use"), |
| 216 | 217 | ("style_aesthetic", "Style Aesthetic"), |
| 217 | 218 | ) |
| 218 | -_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple( | |
| 219 | - field_name for field_name, _ in _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP | |
| 219 | +_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple( | |
| 220 | + field_name for field_name, _ in _APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP | |
| 220 | 221 | ) |
| 221 | 222 | |
| 222 | 223 | |
| ... | ... | @@ -252,37 +253,66 @@ _ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = { |
| 252 | 253 | field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES, |
| 253 | 254 | quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS, |
| 254 | 255 | ), |
| 255 | - "taxonomy": AnalysisSchema( | |
| 256 | - name="taxonomy", | |
| 256 | +} | |
| 257 | + | |
| 258 | +_CATEGORY_TAXONOMY_PROFILE_SCHEMAS: Dict[str, AnalysisSchema] = { | |
| 259 | + "apparel": AnalysisSchema( | |
| 260 | + name="taxonomy:apparel", | |
| 257 | 261 | shared_instruction=TAXONOMY_SHARED_ANALYSIS_INSTRUCTION, |
| 258 | 262 | markdown_table_headers=TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS, |
| 259 | - result_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS, | |
| 260 | - meaningful_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS, | |
| 263 | + result_fields=_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS, | |
| 264 | + meaningful_fields=_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS, | |
| 261 | 265 | cache_version="v1", |
| 262 | 266 | fallback_headers=TAXONOMY_MARKDOWN_TABLE_HEADERS_EN, |
| 263 | 267 | ), |
| 264 | 268 | } |
| 265 | 269 | |
| 270 | +_CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS: Dict[str, Tuple[Tuple[str, str], ...]] = { | |
| 271 | + "apparel": _APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP, | |
| 272 | +} | |
| 273 | + | |
| 266 | 274 | |
| 267 | -def _get_analysis_schema(analysis_kind: str) -> AnalysisSchema: | |
| 268 | - schema = _ANALYSIS_SCHEMAS.get(analysis_kind) | |
| 269 | - if schema is None: | |
| 270 | - raise ValueError(f"Unsupported analysis_kind: {analysis_kind}") | |
| 271 | - return schema | |
| 275 | +def _normalize_category_taxonomy_profile(category_taxonomy_profile: Optional[str] = None) -> str: | |
| 276 | + profile = str(category_taxonomy_profile or _DEFAULT_CATEGORY_TAXONOMY_PROFILE).strip() | |
| 277 | + if profile not in _CATEGORY_TAXONOMY_PROFILE_SCHEMAS: | |
| 278 | + raise ValueError(f"Unsupported category_taxonomy_profile: {profile}") | |
| 279 | + return profile | |
| 272 | 280 | |
| 273 | 281 | |
| 274 | -def _normalize_analysis_kinds( | |
| 275 | - analysis_kinds: Optional[List[str]] = None, | |
| 282 | +def _get_analysis_schema( | |
| 283 | + analysis_kind: str, | |
| 284 | + *, | |
| 285 | + category_taxonomy_profile: Optional[str] = None, | |
| 286 | +) -> AnalysisSchema: | |
| 287 | + if analysis_kind == "content": | |
| 288 | + return _ANALYSIS_SCHEMAS["content"] | |
| 289 | + if analysis_kind == "taxonomy": | |
| 290 | + profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) | |
| 291 | + return _CATEGORY_TAXONOMY_PROFILE_SCHEMAS[profile] | |
| 292 | + raise ValueError(f"Unsupported analysis_kind: {analysis_kind}") | |
| 293 | + | |
| 294 | + | |
| 295 | +def _get_taxonomy_attribute_field_map( | |
| 296 | + category_taxonomy_profile: Optional[str] = None, | |
| 297 | +) -> Tuple[Tuple[str, str], ...]: | |
| 298 | + profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) | |
| 299 | + return _CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS[profile] | |
| 300 | + | |
| 301 | + | |
| 302 | +def _normalize_enrichment_scopes( | |
| 303 | + enrichment_scopes: Optional[List[str]] = None, | |
| 276 | 304 | ) -> Tuple[str, ...]: |
| 277 | - requested = _DEFAULT_ANALYSIS_KINDS if not analysis_kinds else tuple(analysis_kinds) | |
| 305 | + requested = _DEFAULT_ENRICHMENT_SCOPES if not enrichment_scopes else tuple(enrichment_scopes) | |
| 278 | 306 | normalized: List[str] = [] |
| 279 | 307 | seen = set() |
| 280 | - for analysis_kind in requested: | |
| 281 | - schema = _get_analysis_schema(str(analysis_kind).strip()) | |
| 282 | - if schema.name in seen: | |
| 308 | + for enrichment_scope in requested: | |
| 309 | + scope = str(enrichment_scope).strip() | |
| 310 | + if scope not in {"generic", "category_taxonomy"}: | |
| 311 | + raise ValueError(f"Unsupported enrichment_scope: {scope}") | |
| 312 | + if scope in seen: | |
| 283 | 313 | continue |
| 284 | - seen.add(schema.name) | |
| 285 | - normalized.append(schema.name) | |
| 314 | + seen.add(scope) | |
| 315 | + normalized.append(scope) | |
| 286 | 316 | return tuple(normalized) |
| 287 | 317 | |
| 288 | 318 | |
| ... | ... | @@ -448,7 +478,13 @@ def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: |
| 448 | 478 | _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) |
| 449 | 479 | |
| 450 | 480 | |
| 451 | -def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: | |
| 481 | +def _apply_index_taxonomy_row( | |
| 482 | + result: Dict[str, Any], | |
| 483 | + row: Dict[str, Any], | |
| 484 | + lang: str, | |
| 485 | + *, | |
| 486 | + category_taxonomy_profile: Optional[str] = None, | |
| 487 | +) -> None: | |
| 452 | 488 | if not row or row.get("error"): |
| 453 | 489 | return |
| 454 | 490 | |
| ... | ... | @@ -456,8 +492,11 @@ def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: |
| 456 | 492 | result["enriched_taxonomy_attributes"], |
| 457 | 493 | row=row, |
| 458 | 494 | lang=lang, |
| 459 | - schema=_get_analysis_schema("taxonomy"), | |
| 460 | - field_map=_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP, | |
| 495 | + schema=_get_analysis_schema( | |
| 496 | + "taxonomy", | |
| 497 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 498 | + ), | |
| 499 | + field_map=_get_taxonomy_attribute_field_map(category_taxonomy_profile), | |
| 461 | 500 | ) |
| 462 | 501 | |
| 463 | 502 | |
| ... | ... | @@ -475,7 +514,8 @@ def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: |
| 475 | 514 | def build_index_content_fields( |
| 476 | 515 | items: List[Dict[str, Any]], |
| 477 | 516 | tenant_id: Optional[str] = None, |
| 478 | - analysis_kinds: Optional[List[str]] = None, | |
| 517 | + enrichment_scopes: Optional[List[str]] = None, | |
| 518 | + category_taxonomy_profile: Optional[str] = None, | |
| 479 | 519 | ) -> List[Dict[str, Any]]: |
| 480 | 520 | """ |
| 481 | 521 | 高层入口:生成与 ES mapping 对齐的内容理解字段。 |
| ... | ... | @@ -484,7 +524,8 @@ def build_index_content_fields( |
| 484 | 524 | - `id` 或 `spu_id` |
| 485 | 525 | - `title` |
| 486 | 526 | - 可选 `brief` / `description` / `image_url` |
| 487 | - - 可选 `analysis_kinds`,默认同时执行 `content` 与 `taxonomy` | |
| 527 | + - 可选 `enrichment_scopes`,默认同时执行 `generic` 与 `category_taxonomy` | |
| 528 | + - 可选 `category_taxonomy_profile`,默认 `apparel` | |
| 488 | 529 | |
| 489 | 530 | 返回项结构: |
| 490 | 531 | - `id` |
| ... | ... | @@ -498,7 +539,8 @@ def build_index_content_fields( |
| 498 | 539 | - `qanchors.{lang}` 为短语数组 |
| 499 | 540 | - `enriched_tags.{lang}` 为标签数组 |
| 500 | 541 | """ |
| 501 | - requested_analysis_kinds = _normalize_analysis_kinds(analysis_kinds) | |
| 542 | + requested_enrichment_scopes = _normalize_enrichment_scopes(enrichment_scopes) | |
| 543 | + normalized_taxonomy_profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) | |
| 502 | 544 | normalized_items = [_normalize_index_content_item(item) for item in items] |
| 503 | 545 | if not normalized_items: |
| 504 | 546 | return [] |
| ... | ... | @@ -515,7 +557,7 @@ def build_index_content_fields( |
| 515 | 557 | } |
| 516 | 558 | |
| 517 | 559 | for lang in _CORE_INDEX_LANGUAGES: |
| 518 | - if "content" in requested_analysis_kinds: | |
| 560 | + if "generic" in requested_enrichment_scopes: | |
| 519 | 561 | try: |
| 520 | 562 | rows = analyze_products( |
| 521 | 563 | products=normalized_items, |
| ... | ... | @@ -523,6 +565,7 @@ def build_index_content_fields( |
| 523 | 565 | batch_size=BATCH_SIZE, |
| 524 | 566 | tenant_id=tenant_id, |
| 525 | 567 | analysis_kind="content", |
| 568 | + category_taxonomy_profile=normalized_taxonomy_profile, | |
| 526 | 569 | ) |
| 527 | 570 | except Exception as e: |
| 528 | 571 | logger.warning("build_index_content_fields content enrichment failed for lang=%s: %s", lang, e) |
| ... | ... | @@ -539,7 +582,7 @@ def build_index_content_fields( |
| 539 | 582 | continue |
| 540 | 583 | _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) |
| 541 | 584 | |
| 542 | - if "taxonomy" in requested_analysis_kinds: | |
| 585 | + if "category_taxonomy" in requested_enrichment_scopes: | |
| 543 | 586 | try: |
| 544 | 587 | taxonomy_rows = analyze_products( |
| 545 | 588 | products=normalized_items, |
| ... | ... | @@ -547,6 +590,7 @@ def build_index_content_fields( |
| 547 | 590 | batch_size=BATCH_SIZE, |
| 548 | 591 | tenant_id=tenant_id, |
| 549 | 592 | analysis_kind="taxonomy", |
| 593 | + category_taxonomy_profile=normalized_taxonomy_profile, | |
| 550 | 594 | ) |
| 551 | 595 | except Exception as e: |
| 552 | 596 | logger.warning( |
| ... | ... | @@ -565,7 +609,12 @@ def build_index_content_fields( |
| 565 | 609 | if row.get("error"): |
| 566 | 610 | results_by_id[item_id].setdefault("error", row["error"]) |
| 567 | 611 | continue |
| 568 | - _apply_index_taxonomy_row(results_by_id[item_id], row=row, lang=lang) | |
| 612 | + _apply_index_taxonomy_row( | |
| 613 | + results_by_id[item_id], | |
| 614 | + row=row, | |
| 615 | + lang=lang, | |
| 616 | + category_taxonomy_profile=normalized_taxonomy_profile, | |
| 617 | + ) | |
| 569 | 618 | |
| 570 | 619 | return [results_by_id[item["id"]] for item in normalized_items] |
| 571 | 620 | |
| ... | ... | @@ -636,9 +685,13 @@ def _make_analysis_cache_key( |
| 636 | 685 | product: Dict[str, Any], |
| 637 | 686 | target_lang: str, |
| 638 | 687 | analysis_kind: str, |
| 688 | + category_taxonomy_profile: Optional[str] = None, | |
| 639 | 689 | ) -> str: |
| 640 | 690 | """构造缓存 key,仅由分析类型、prompt 实际输入文本内容与目标语言决定。""" |
| 641 | - schema = _get_analysis_schema(analysis_kind) | |
| 691 | + schema = _get_analysis_schema( | |
| 692 | + analysis_kind, | |
| 693 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 694 | + ) | |
| 642 | 695 | prompt_input = _build_prompt_input_text(product) |
| 643 | 696 | h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest() |
| 644 | 697 | prompt_contract = { |
| ... | ... | @@ -672,12 +725,21 @@ def _get_cached_analysis_result( |
| 672 | 725 | product: Dict[str, Any], |
| 673 | 726 | target_lang: str, |
| 674 | 727 | analysis_kind: str, |
| 728 | + category_taxonomy_profile: Optional[str] = None, | |
| 675 | 729 | ) -> Optional[Dict[str, Any]]: |
| 676 | 730 | if not _anchor_redis: |
| 677 | 731 | return None |
| 678 | - schema = _get_analysis_schema(analysis_kind) | |
| 732 | + schema = _get_analysis_schema( | |
| 733 | + analysis_kind, | |
| 734 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 735 | + ) | |
| 679 | 736 | try: |
| 680 | - key = _make_analysis_cache_key(product, target_lang, analysis_kind) | |
| 737 | + key = _make_analysis_cache_key( | |
| 738 | + product, | |
| 739 | + target_lang, | |
| 740 | + analysis_kind, | |
| 741 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 742 | + ) | |
| 681 | 743 | raw = _anchor_redis.get(key) |
| 682 | 744 | if not raw: |
| 683 | 745 | return None |
| ... | ... | @@ -707,10 +769,14 @@ def _set_cached_analysis_result( |
| 707 | 769 | target_lang: str, |
| 708 | 770 | result: Dict[str, Any], |
| 709 | 771 | analysis_kind: str, |
| 772 | + category_taxonomy_profile: Optional[str] = None, | |
| 710 | 773 | ) -> None: |
| 711 | 774 | if not _anchor_redis: |
| 712 | 775 | return |
| 713 | - schema = _get_analysis_schema(analysis_kind) | |
| 776 | + schema = _get_analysis_schema( | |
| 777 | + analysis_kind, | |
| 778 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 779 | + ) | |
| 714 | 780 | try: |
| 715 | 781 | normalized = _normalize_analysis_result( |
| 716 | 782 | result, |
| ... | ... | @@ -720,7 +786,12 @@ def _set_cached_analysis_result( |
| 720 | 786 | ) |
| 721 | 787 | if not _has_meaningful_analysis_content(normalized, schema): |
| 722 | 788 | return |
| 723 | - key = _make_analysis_cache_key(product, target_lang, analysis_kind) | |
| 789 | + key = _make_analysis_cache_key( | |
| 790 | + product, | |
| 791 | + target_lang, | |
| 792 | + analysis_kind, | |
| 793 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 794 | + ) | |
| 724 | 795 | ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600 |
| 725 | 796 | _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False)) |
| 726 | 797 | except Exception as e: |
| ... | ... | @@ -775,9 +846,13 @@ def create_prompt( |
| 775 | 846 | products: List[Dict[str, str]], |
| 776 | 847 | target_lang: str = "zh", |
| 777 | 848 | analysis_kind: str = "content", |
| 849 | + category_taxonomy_profile: Optional[str] = None, | |
| 778 | 850 | ) -> Tuple[Optional[str], Optional[str], Optional[str]]: |
| 779 | 851 | """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。""" |
| 780 | - schema = _get_analysis_schema(analysis_kind) | |
| 852 | + schema = _get_analysis_schema( | |
| 853 | + analysis_kind, | |
| 854 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 855 | + ) | |
| 781 | 856 | markdown_table_headers = schema.get_headers(target_lang) |
| 782 | 857 | if not markdown_table_headers: |
| 783 | 858 | logger.warning( |
| ... | ... | @@ -978,9 +1053,13 @@ def call_llm( |
| 978 | 1053 | def parse_markdown_table( |
| 979 | 1054 | markdown_content: str, |
| 980 | 1055 | analysis_kind: str = "content", |
| 1056 | + category_taxonomy_profile: Optional[str] = None, | |
| 981 | 1057 | ) -> List[Dict[str, str]]: |
| 982 | 1058 | """解析markdown表格内容""" |
| 983 | - schema = _get_analysis_schema(analysis_kind) | |
| 1059 | + schema = _get_analysis_schema( | |
| 1060 | + analysis_kind, | |
| 1061 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1062 | + ) | |
| 984 | 1063 | lines = markdown_content.strip().split("\n") |
| 985 | 1064 | data = [] |
| 986 | 1065 | data_started = False |
| ... | ... | @@ -1025,8 +1104,12 @@ def _log_parsed_result_quality( |
| 1025 | 1104 | target_lang: str, |
| 1026 | 1105 | batch_num: int, |
| 1027 | 1106 | analysis_kind: str, |
| 1107 | + category_taxonomy_profile: Optional[str] = None, | |
| 1028 | 1108 | ) -> None: |
| 1029 | - schema = _get_analysis_schema(analysis_kind) | |
| 1109 | + schema = _get_analysis_schema( | |
| 1110 | + analysis_kind, | |
| 1111 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1112 | + ) | |
| 1030 | 1113 | expected = len(batch_data) |
| 1031 | 1114 | actual = len(parsed_results) |
| 1032 | 1115 | if actual != expected: |
| ... | ... | @@ -1071,9 +1154,13 @@ def process_batch( |
| 1071 | 1154 | batch_num: int, |
| 1072 | 1155 | target_lang: str = "zh", |
| 1073 | 1156 | analysis_kind: str = "content", |
| 1157 | + category_taxonomy_profile: Optional[str] = None, | |
| 1074 | 1158 | ) -> List[Dict[str, Any]]: |
| 1075 | 1159 | """处理一个批次的数据""" |
| 1076 | - schema = _get_analysis_schema(analysis_kind) | |
| 1160 | + schema = _get_analysis_schema( | |
| 1161 | + analysis_kind, | |
| 1162 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1163 | + ) | |
| 1077 | 1164 | logger.info(f"\n{'#' * 80}") |
| 1078 | 1165 | logger.info( |
| 1079 | 1166 | "Processing Batch %s (%s items, kind=%s)", |
| ... | ... | @@ -1087,6 +1174,7 @@ def process_batch( |
| 1087 | 1174 | batch_data, |
| 1088 | 1175 | target_lang=target_lang, |
| 1089 | 1176 | analysis_kind=analysis_kind, |
| 1177 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1090 | 1178 | ) |
| 1091 | 1179 | |
| 1092 | 1180 | # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM |
| ... | ... | @@ -1119,13 +1207,18 @@ def process_batch( |
| 1119 | 1207 | ) |
| 1120 | 1208 | |
| 1121 | 1209 | # 解析结果 |
| 1122 | - parsed_results = parse_markdown_table(raw_response, analysis_kind=analysis_kind) | |
| 1210 | + parsed_results = parse_markdown_table( | |
| 1211 | + raw_response, | |
| 1212 | + analysis_kind=analysis_kind, | |
| 1213 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1214 | + ) | |
| 1123 | 1215 | _log_parsed_result_quality( |
| 1124 | 1216 | batch_data, |
| 1125 | 1217 | parsed_results, |
| 1126 | 1218 | target_lang, |
| 1127 | 1219 | batch_num, |
| 1128 | 1220 | analysis_kind, |
| 1221 | + category_taxonomy_profile, | |
| 1129 | 1222 | ) |
| 1130 | 1223 | |
| 1131 | 1224 | logger.info(f"\nParsed Results ({len(parsed_results)} items):") |
| ... | ... | @@ -1190,6 +1283,7 @@ def analyze_products( |
| 1190 | 1283 | batch_size: Optional[int] = None, |
| 1191 | 1284 | tenant_id: Optional[str] = None, |
| 1192 | 1285 | analysis_kind: str = "content", |
| 1286 | + category_taxonomy_profile: Optional[str] = None, | |
| 1193 | 1287 | ) -> List[Dict[str, Any]]: |
| 1194 | 1288 | """ |
| 1195 | 1289 | 库调用入口:根据输入+语言,返回锚文本及各维度信息。 |
| ... | ... | @@ -1205,7 +1299,10 @@ def analyze_products( |
| 1205 | 1299 | if not products: |
| 1206 | 1300 | return [] |
| 1207 | 1301 | |
| 1208 | - _get_analysis_schema(analysis_kind) | |
| 1302 | + _get_analysis_schema( | |
| 1303 | + analysis_kind, | |
| 1304 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1305 | + ) | |
| 1209 | 1306 | results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products) |
| 1210 | 1307 | uncached_items: List[Tuple[int, Dict[str, str]]] = [] |
| 1211 | 1308 | |
| ... | ... | @@ -1215,7 +1312,12 @@ def analyze_products( |
| 1215 | 1312 | uncached_items.append((idx, product)) |
| 1216 | 1313 | continue |
| 1217 | 1314 | |
| 1218 | - cached = _get_cached_analysis_result(product, target_lang, analysis_kind) | |
| 1315 | + cached = _get_cached_analysis_result( | |
| 1316 | + product, | |
| 1317 | + target_lang, | |
| 1318 | + analysis_kind, | |
| 1319 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1320 | + ) | |
| 1219 | 1321 | if cached: |
| 1220 | 1322 | logger.info( |
| 1221 | 1323 | f"[analyze_products] Cache hit for title='{title[:50]}...', " |
| ... | ... | @@ -1255,6 +1357,7 @@ def analyze_products( |
| 1255 | 1357 | batch_num=batch_num, |
| 1256 | 1358 | target_lang=target_lang, |
| 1257 | 1359 | analysis_kind=analysis_kind, |
| 1360 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1258 | 1361 | ) |
| 1259 | 1362 | |
| 1260 | 1363 | for (original_idx, product), item in zip(batch_slice, batch_results): |
| ... | ... | @@ -1266,7 +1369,13 @@ def analyze_products( |
| 1266 | 1369 | # 不缓存错误结果,避免放大临时故障 |
| 1267 | 1370 | continue |
| 1268 | 1371 | try: |
| 1269 | - _set_cached_analysis_result(product, target_lang, item, analysis_kind) | |
| 1372 | + _set_cached_analysis_result( | |
| 1373 | + product, | |
| 1374 | + target_lang, | |
| 1375 | + item, | |
| 1376 | + analysis_kind, | |
| 1377 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1378 | + ) | |
| 1270 | 1379 | except Exception: |
| 1271 | 1380 | # 已在内部记录 warning |
| 1272 | 1381 | pass |
| ... | ... | @@ -1293,6 +1402,7 @@ def analyze_products( |
| 1293 | 1402 | batch_num=batch_num, |
| 1294 | 1403 | target_lang=target_lang, |
| 1295 | 1404 | analysis_kind=analysis_kind, |
| 1405 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1296 | 1406 | ) |
| 1297 | 1407 | |
| 1298 | 1408 | # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的) |
| ... | ... | @@ -1307,7 +1417,13 @@ def analyze_products( |
| 1307 | 1417 | # 不缓存错误结果,避免放大临时故障 |
| 1308 | 1418 | continue |
| 1309 | 1419 | try: |
| 1310 | - _set_cached_analysis_result(product, target_lang, item, analysis_kind) | |
| 1420 | + _set_cached_analysis_result( | |
| 1421 | + product, | |
| 1422 | + target_lang, | |
| 1423 | + item, | |
| 1424 | + analysis_kind, | |
| 1425 | + category_taxonomy_profile=category_taxonomy_profile, | |
| 1426 | + ) | |
| 1311 | 1427 | except Exception: |
| 1312 | 1428 | # 已在内部记录 warning |
| 1313 | 1429 | pass | ... | ... |
indexer/taxonomy.md
| 1 | 1 | |
| 2 | -服装大类的taxonomy | |
| 2 | +# Cross-Border E-commerce Core Categories 跨境电商核心大类 | |
| 3 | + | |
| 4 | +## 1. 3C | |
| 5 | +Phone accessories, computer peripherals, smart wearables, audio & video, smart home, gaming gear. 手机配件、电脑周边、智能穿戴、影音娱乐、智能家居、游戏设备。 | |
| 6 | + | |
| 7 | +## 2. Bags 包 | |
| 8 | +Handbags, backpacks, wallets, luggage, crossbody bags, tote bags. 手提包、双肩包、钱包、行李箱、斜挎包、托特包。 | |
| 9 | + | |
| 10 | +## 3. Pet Supplies 宠物用品 | |
| 11 | +Pet food, pet toys, pet care products, pet grooming, pet clothing, smart pet devices. 宠物食品、宠物玩具、宠物护理用品、宠物美容、宠物服装、智能宠物设备。 | |
| 12 | + | |
| 13 | +## 4. Electronics 电子产品 | |
| 14 | +Consumer electronics, home appliances, digital devices, cables & chargers, batteries, electronic components. 消费电子产品、家用电器、数码设备、线材充电器、电池、电子元器件。 | |
| 15 | + | |
| 16 | +## 5. Clothing 服装 | |
| 17 | +Women's wear, men's wear, kid's wear, underwear, outerwear, activewear. 女装、男装、童装、内衣、外套、运动服装。 | |
| 18 | + | |
| 19 | +## 6. Outdoor 户外用品 | |
| 20 | +Camping gear, hiking equipment, fishing supplies, outdoor clothing, travel accessories, survival tools. 露营装备、徒步用品、渔具、户外服装、旅行配件、求生工具。 | |
| 21 | + | |
| 22 | +## 7. Home Appliances 家电/电器 | |
| 23 | +Kitchen appliances, cleaning appliances, personal care appliances, heating & cooling, smart home devices. 厨房电器、清洁电器、个护电器、冷暖设备、智能家居设备。 | |
| 24 | + | |
| 25 | +## 8. Home & Living 家居 | |
| 26 | +Furniture, home textiles, lighting, kitchenware, storage, home decor. 家具、家纺、灯具、厨具、收纳、家居装饰。 | |
| 27 | + | |
| 28 | +## 9. Wigs 假发 | |
| 29 | + | |
| 30 | +## 10. Beauty & Cosmetics 美容美妆 | |
| 31 | +Skincare, makeup, nail care, beauty tools, hair care, fragrances. 护肤品、彩妆、美甲、美容工具、护发、香水。 | |
| 32 | + | |
| 33 | +## 11. Accessories 配饰 | |
| 34 | +Jewelry, watches, belts, scarves, hats, sunglasses, hair accessories. 珠宝、手表、腰带、围巾、帽子、太阳镜、发饰。 | |
| 35 | + | |
| 36 | +## 12. Toys 玩具 | |
| 37 | +Educational toys, plush toys, action figures, puzzles, outdoor toys, DIY toys. 益智玩具、毛绒玩具、可动人偶、拼图、户外玩具、DIY玩具。 | |
| 38 | + | |
| 39 | +## 13. Shoes 鞋子 | |
| 40 | +Sneakers, boots, sandals, heels, flats, sports shoes. 运动鞋、靴子、凉鞋、高跟鞋、平底鞋、球鞋。 | |
| 41 | + | |
| 42 | +## 14. Sports 运动产品 | |
| 43 | +Fitness equipment, sports gear, team sports, racquet sports, water sports, cycling. 健身器材、运动装备、团队运动、球拍运动、水上运动、骑行。 | |
| 44 | + | |
| 45 | +## 15. Others 其他 | |
| 46 | + | |
| 47 | +# Taxonomy for Each Category 各个大类的 taxonomy | |
| 48 | +## 1. Clothing 服装 | |
| 3 | 49 | |
| 4 | 50 | ### A. Product Classification |
| 5 | 51 | ... | ... |
tests/ci/test_service_api_contracts.py
| ... | ... | @@ -348,10 +348,12 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 348 | 348 | def _fake_build_index_content_fields( |
| 349 | 349 | items: List[Dict[str, str]], |
| 350 | 350 | tenant_id: str | None = None, |
| 351 | - analysis_kinds: List[str] | None = None, | |
| 351 | + enrichment_scopes: List[str] | None = None, | |
| 352 | + category_taxonomy_profile: str = "apparel", | |
| 352 | 353 | ): |
| 353 | 354 | assert tenant_id == "162" |
| 354 | - assert analysis_kinds == ["content", "taxonomy"] | |
| 355 | + assert enrichment_scopes == ["generic", "category_taxonomy"] | |
| 356 | + assert category_taxonomy_profile == "apparel" | |
| 355 | 357 | return [ |
| 356 | 358 | { |
| 357 | 359 | "id": p["spu_id"], |
| ... | ... | @@ -376,6 +378,8 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 376 | 378 | "/indexer/enrich-content", |
| 377 | 379 | json={ |
| 378 | 380 | "tenant_id": "162", |
| 381 | + "enrichment_scopes": ["generic", "category_taxonomy"], | |
| 382 | + "category_taxonomy_profile": "apparel", | |
| 379 | 383 | "items": [ |
| 380 | 384 | {"spu_id": "1001", "title": "T-shirt"}, |
| 381 | 385 | {"spu_id": "1002", "title": "Toy"}, |
| ... | ... | @@ -385,7 +389,8 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 385 | 389 | assert response.status_code == 200 |
| 386 | 390 | data = response.json() |
| 387 | 391 | assert data["tenant_id"] == "162" |
| 388 | - assert data["analysis_kinds"] == ["content", "taxonomy"] | |
| 392 | + assert data["enrichment_scopes"] == ["generic", "category_taxonomy"] | |
| 393 | + assert data["category_taxonomy_profile"] == "apparel" | |
| 389 | 394 | assert data["total"] == 2 |
| 390 | 395 | assert len(data["results"]) == 2 |
| 391 | 396 | assert data["results"][0]["spu_id"] == "1001" |
| ... | ... | @@ -403,6 +408,52 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch |
| 403 | 408 | } |
| 404 | 409 | |
| 405 | 410 | |
| 411 | +def test_indexer_enrich_content_contract_accepts_deprecated_analysis_kinds(indexer_client: TestClient, monkeypatch): | |
| 412 | + import indexer.product_enrich as process_products | |
| 413 | + | |
| 414 | + seen: Dict[str, Any] = {} | |
| 415 | + | |
| 416 | + def _fake_build_index_content_fields( | |
| 417 | + items: List[Dict[str, str]], | |
| 418 | + tenant_id: str | None = None, | |
| 419 | + enrichment_scopes: List[str] | None = None, | |
| 420 | + category_taxonomy_profile: str = "apparel", | |
| 421 | + ): | |
| 422 | + seen["tenant_id"] = tenant_id | |
| 423 | + seen["enrichment_scopes"] = enrichment_scopes | |
| 424 | + seen["category_taxonomy_profile"] = category_taxonomy_profile | |
| 425 | + return [ | |
| 426 | + { | |
| 427 | + "id": items[0]["spu_id"], | |
| 428 | + "qanchors": {}, | |
| 429 | + "enriched_tags": {}, | |
| 430 | + "enriched_attributes": [], | |
| 431 | + "enriched_taxonomy_attributes": [], | |
| 432 | + } | |
| 433 | + ] | |
| 434 | + | |
| 435 | + monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields) | |
| 436 | + | |
| 437 | + response = indexer_client.post( | |
| 438 | + "/indexer/enrich-content", | |
| 439 | + json={ | |
| 440 | + "tenant_id": "162", | |
| 441 | + "analysis_kinds": ["taxonomy"], | |
| 442 | + "items": [{"spu_id": "1001", "title": "T-shirt"}], | |
| 443 | + }, | |
| 444 | + ) | |
| 445 | + | |
| 446 | + assert response.status_code == 200 | |
| 447 | + data = response.json() | |
| 448 | + assert seen == { | |
| 449 | + "tenant_id": "162", | |
| 450 | + "enrichment_scopes": ["category_taxonomy"], | |
| 451 | + "category_taxonomy_profile": "apparel", | |
| 452 | + } | |
| 453 | + assert data["enrichment_scopes"] == ["category_taxonomy"] | |
| 454 | + assert data["category_taxonomy_profile"] == "apparel" | |
| 455 | + | |
| 456 | + | |
| 406 | 457 | def test_indexer_documents_contract(indexer_client: TestClient): |
| 407 | 458 | """POST /indexer/documents: tenant_id + spu_ids, returns success/failed lists (no ES write).""" |
| 408 | 459 | response = indexer_client.post( | ... | ... |
tests/test_process_products_batching.py
| ... | ... | @@ -18,8 +18,10 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): |
| 18 | 18 | batch_num: int, |
| 19 | 19 | target_lang: str = "zh", |
| 20 | 20 | analysis_kind: str = "content", |
| 21 | + category_taxonomy_profile=None, | |
| 21 | 22 | ): |
| 22 | 23 | assert analysis_kind == "content" |
| 24 | + assert category_taxonomy_profile is None | |
| 23 | 25 | seen_batch_sizes.append(len(batch_data)) |
| 24 | 26 | return [ |
| 25 | 27 | { |
| ... | ... | @@ -64,8 +66,10 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): |
| 64 | 66 | batch_num: int, |
| 65 | 67 | target_lang: str = "zh", |
| 66 | 68 | analysis_kind: str = "content", |
| 69 | + category_taxonomy_profile=None, | |
| 67 | 70 | ): |
| 68 | 71 | assert analysis_kind == "content" |
| 72 | + assert category_taxonomy_profile is None | |
| 69 | 73 | seen_batch_sizes.append(len(batch_data)) |
| 70 | 74 | return [ |
| 71 | 75 | { | ... | ... |
tests/test_product_enrich_partial_mode.py
| ... | ... | @@ -295,16 +295,36 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): |
| 295 | 295 | product.get("image_url", ""), |
| 296 | 296 | ) |
| 297 | 297 | |
| 298 | - def fake_get_cached_analysis_result(product, target_lang, analysis_kind="content"): | |
| 298 | + def fake_get_cached_analysis_result( | |
| 299 | + product, | |
| 300 | + target_lang, | |
| 301 | + analysis_kind="content", | |
| 302 | + category_taxonomy_profile=None, | |
| 303 | + ): | |
| 299 | 304 | assert analysis_kind == "content" |
| 305 | + assert category_taxonomy_profile is None | |
| 300 | 306 | return cache_store.get(_cache_key(product, target_lang)) |
| 301 | 307 | |
| 302 | - def fake_set_cached_analysis_result(product, target_lang, result, analysis_kind="content"): | |
| 308 | + def fake_set_cached_analysis_result( | |
| 309 | + product, | |
| 310 | + target_lang, | |
| 311 | + result, | |
| 312 | + analysis_kind="content", | |
| 313 | + category_taxonomy_profile=None, | |
| 314 | + ): | |
| 303 | 315 | assert analysis_kind == "content" |
| 316 | + assert category_taxonomy_profile is None | |
| 304 | 317 | cache_store[_cache_key(product, target_lang)] = result |
| 305 | 318 | |
| 306 | - def fake_process_batch(batch_data, batch_num, target_lang="zh", analysis_kind="content"): | |
| 319 | + def fake_process_batch( | |
| 320 | + batch_data, | |
| 321 | + batch_num, | |
| 322 | + target_lang="zh", | |
| 323 | + analysis_kind="content", | |
| 324 | + category_taxonomy_profile=None, | |
| 325 | + ): | |
| 307 | 326 | assert analysis_kind == "content" |
| 327 | + assert category_taxonomy_profile is None | |
| 308 | 328 | process_calls.append( |
| 309 | 329 | { |
| 310 | 330 | "batch_num": batch_num, |
| ... | ... | @@ -400,7 +420,7 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity(): |
| 400 | 420 | with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( |
| 401 | 421 | product_enrich, |
| 402 | 422 | "_get_cached_analysis_result", |
| 403 | - wraps=lambda product, target_lang, analysis_kind="content": product_enrich._normalize_analysis_result( | |
| 423 | + wraps=lambda product, target_lang, analysis_kind="content", category_taxonomy_profile=None: product_enrich._normalize_analysis_result( | |
| 404 | 424 | cached_result, |
| 405 | 425 | product=product, |
| 406 | 426 | target_lang=target_lang, |
| ... | ... | @@ -443,8 +463,10 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() |
| 443 | 463 | batch_size=None, |
| 444 | 464 | tenant_id=None, |
| 445 | 465 | analysis_kind="content", |
| 466 | + category_taxonomy_profile=None, | |
| 446 | 467 | ): |
| 447 | 468 | if analysis_kind == "taxonomy": |
| 469 | + assert category_taxonomy_profile == "apparel" | |
| 448 | 470 | return [ |
| 449 | 471 | { |
| 450 | 472 | "id": products[0]["id"], |
| ... | ... | @@ -478,6 +500,7 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() |
| 478 | 500 | "style_aesthetic": "", |
| 479 | 501 | } |
| 480 | 502 | ] |
| 503 | + assert category_taxonomy_profile == "apparel" | |
| 481 | 504 | return [ |
| 482 | 505 | { |
| 483 | 506 | "id": products[0]["id"], | ... | ... |