feat(indexer): 完善 enriched_taxonomy_attributes 接口输出及缓存设计

- `/indexer/enrich-content` 路由将 `enriched_taxonomy_attributes` 与 `enriched_attributes` 一并返回
- 新增请求参数 `analysis_kinds`（可选，默认 `["content", "taxonomy"]`），允许调用方按需选择内容分析类型，为后续扩展和成本控制预留空间
- 重构缓存策略：将 `content` 与 `taxonomy` 两类分析的缓存完全隔离，缓存 key 包含 prompt 模板、表头、输出字段定义（即 schema 指纹），确保提示词或解析规则变更时自动失效
- 缓存 key 仅依赖真正参与 LLM 输入的字段（`title`、`brief`、`description`），`image_url`、`tenant_id`、`spu_id` 不再污染缓存键，提高缓存命中率
- 更新 API 文档（`docs/搜索API对接指南-05-索引接口（Indexer）.md`），说明新增参数与返回字段

技术细节：
- 路由层调整：在 `api/routes/indexer.py` 的 enrich-content 端点中，将 `product_enrich.build_index_content_fields` 返回的 `enriched_taxonomy_attributes` 字段显式加入 HTTP 响应体
- `analysis_kinds` 参数透传至底层 `build_index_content_fields`，支持按需跳过某一类分析（如仅需 taxonomy 时减少 LLM 调用）
- 缓存指纹计算位于 `product_enrich.py` 的 `_make_analysis_cache_key` 函数，对每种 `AnalysisSchema` 独立生成；版本号 `schema.cache_version` 与 prompt 内容一并计入契约哈希
- 测试覆盖：新增 `analysis_kinds` 组合场景及缓存隔离测试

feat(indexer): 完善 enriched_taxonomy_attributes 接口输出及缓存设计
- `/indexer/enrich-content` 路由将 `enriched_taxonomy_attributes` 与 `enriched_attributes` 一并返回
- 新增请求参数 `analysis_kinds`（可选，默认 `["content", "taxonomy"]`），允许调用方按需选择内容分析类型，为后续扩展和成本控制预留空间
- 重构缓存策略：将 `content` 与 `taxonomy` 两类分析的缓存完全隔离，缓存 key 包含 prompt 模板、表头、输出字段定义（即 schema 指纹），确保提示词或解析规则变更时自动失效
- 缓存 key 仅依赖真正参与 LLM 输入的字段（`title`、`brief`、`description`），`image_url`、`tenant_id`、`spu_id` 不再污染缓存键，提高缓存命中率
- 更新 API 文档（`docs/搜索API对接指南-05-索引接口（Indexer）.md`），说明新增参数与返回字段

技术细节：
- 路由层调整：在 `api/routes/indexer.py` 的 enrich-content 端点中，将 `product_enrich.build_index_content_fields` 返回的 `enriched_taxonomy_attributes` 字段显式加入 HTTP 响应体
- `analysis_kinds` 参数透传至底层 `build_index_content_fields`，支持按需跳过某一类分析（如仅需 taxonomy 时减少 LLM 调用）
- 缓存指纹计算位于 `product_enrich.py` 的 `_make_analysis_cache_key` 函数，对每种 `AnalysisSchema` 独立生成；版本号 `schema.cache_version` 与 prompt 内容一并计入契约哈希
- 测试覆盖：新增 `analysis_kinds` 组合场景及缓存隔离测试
tangwang
1 parent 36516857
Showing 9 changed files with 202 additions and 100 deletions Show diff stats
api/routes/indexer.py
docs/搜索API对接指南-05-索引接口（Indexer）.md
docs/搜索API对接指南-07-微服务接口（Embedding-Reranker-Translation）.md
docs/缓存与Redis使用说明.md
indexer/README.md
indexer/product_enrich.py
indexer/prompts.txt
tests/ci/test_service_api_contracts.py
tests/test_product_enrich_partial_mode.py
@@ -7,7 +7,7 @@
 import asyncio
 import re
 from fastapi import APIRouter, HTTPException
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Literal, Optional
 from pydantic import BaseModel, Field
 import logging
 from sqlalchemy import text
@@ -88,11 +88,20 @@ class EnrichContentItem(BaseModel):
 class EnrichContentRequest(BaseModel):
     """
-    内容理解字段生成请求：根据商品标题批量生成 qanchors、enriched_attributes、tags。
+    内容理解字段生成请求：根据商品标题批量生成 qanchors、enriched_attributes、tags、taxonomy attributes。
     供外部 indexer 在自行组织 doc 时调用，与翻译、向量化等微服务并列。
     """
     tenant_id: str = Field(..., description="租户 ID，用于请求路由与结果归属，不参与缓存键")
     items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表（spu_id + title，可附带 brief/description/image_url）")
+    analysis_kinds: List[Literal["content", "taxonomy"]] = Field(
+        default_factory=lambda: ["content", "taxonomy"],
+        description=(
+            "要执行的分析族。"
+            "`content` 返回 qanchors/enriched_tags/enriched_attributes；"
+            "`taxonomy` 返回 enriched_taxonomy_attributes。"
+            "默认两者都执行。"
+        ),
+    )
 @router.post("/reindex")
@@ -440,20 +449,29 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest):
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
-def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]]) -> List[Dict[str, Any]]:
+def _run_enrich_content(
+    tenant_id: str,
+    items: List[Dict[str, str]],
+    analysis_kinds: Optional[List[str]] = None,
+) -> List[Dict[str, Any]]:
     """
     同步执行内容理解，返回与 ES mapping 对齐的字段结构。
     语言策略由 product_enrich 内部统一决定，路由层不参与。
     """
     from indexer.product_enrich import build_index_content_fields
-    results = build_index_content_fields(items=items, tenant_id=tenant_id)
+    results = build_index_content_fields(
+        items=items,
+        tenant_id=tenant_id,
+        analysis_kinds=analysis_kinds,
+    )
     return [
         {
             "spu_id": item["id"],
             "qanchors": item["qanchors"],
             "enriched_attributes": item["enriched_attributes"],
             "enriched_tags": item["enriched_tags"],
+            "enriched_taxonomy_attributes": item["enriched_taxonomy_attributes"],
             **({"error": item["error"]} if item.get("error") else {}),
         }
         for item in results
@@ -463,15 +481,15 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]]) -> List[Dic
 @router.post("/enrich-content")
 async def enrich_content(request: EnrichContentRequest):
     """
-    内容理解字段生成接口：根据商品标题批量生成 qanchors、enriched_attributes、tags。
+    内容理解字段生成接口：根据商品标题批量生成 qanchors、enriched_attributes、tags、taxonomy attributes。
     使用场景：
     - 外部 indexer 采用「微服务组合」方式自己组织 doc 时，可调用本接口获取 LLM 生成的
       锚文本与语义属性，再与翻译、向量化结果合并写入 ES。
     - 与 /indexer/build-docs 解耦，避免 build-docs 因 LLM 耗时过长而阻塞；调用方可
-      先拿不含 qanchors/enriched_tags 的 doc，再异步或离线补齐本接口结果后更新 ES。
+      先拿不含 qanchors/enriched_tags/taxonomy attributes 的 doc，再异步或离线补齐本接口结果后更新 ES。
-    实现逻辑与 indexer.product_enrich.analyze_products 一致，支持多语言与 Redis 缓存。
+    实现逻辑与 indexer.product_enrich.build_index_content_fields 一致，支持多语言与 Redis 缓存。
     """
     try:
         if not request.items:
@@ -497,11 +515,13 @@ async def enrich_content(request: EnrichContentRequest):
             None,
             lambda: _run_enrich_content(
                 tenant_id=request.tenant_id,
-                items=items_payload
+                items=items_payload,
+                analysis_kinds=request.analysis_kinds,
             ),
         )
         return {
             "tenant_id": request.tenant_id,
+            "analysis_kinds": request.analysis_kinds,
             "results": result,
             "total": len(result),
         }
@@ -648,13 +648,14 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
 ### 5.8 内容理解字段生成接口
 - **端点**: `POST /indexer/enrich-content`
-- **描述**: 根据商品内容信息批量生成 **qanchors**（锚文本）、**enriched_attributes**（语义属性）、**enriched_tags**（细分标签），供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段（必填/可选见下表）。接口只暴露商品内容输入，语言选择、分析维度与最终字段结构统一由 `indexer.product_enrich` 内部决定；当前返回结果与 `search_products` mapping 保持一致。单次请求在线程池中执行，避免阻塞其他接口。
+- **描述**: 根据商品内容信息批量生成 **qanchors**（锚文本）、**enriched_attributes**（通用语义属性）、**enriched_tags**（细分标签）、**enriched_taxonomy_attributes**（taxonomy 结构化属性），供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段（必填/可选见下表）。接口只暴露商品内容输入，语言选择、分析维度与最终字段结构统一由 `indexer.product_enrich` 内部决定；当前返回结果与 `search_products` mapping 保持一致。单次请求在线程池中执行，避免阻塞其他接口。
 #### 请求参数
 ```json
 {
   "tenant_id": "170",
+  "analysis_kinds": ["content", "taxonomy"],
   "items": [
     {
       "spu_id": "223167",
@@ -675,6 +676,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
 | 参数 | 类型 | 必填 | 默认值 | 说明 |
 |------|------|------|--------|------|
 | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志，不产生实际作用|
+| `analysis_kinds` | array[string] | N | `["content", "taxonomy"]` | 选择要执行的分析族。`content` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`，`taxonomy` 生成 `enriched_taxonomy_attributes` |
 | `items` | array | Y | - | 待分析列表；**单次最多 50 条** |
 `items[]` 字段说明：
@@ -683,15 +685,18 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
 |------|------|------|------|
 | `spu_id` | string | Y | SPU ID，用于回填结果；目前仅用于记录日志，不产生实际作用|
 | `title` | string | Y | 商品标题 |
-| `image_url` | string | N | 商品主图 URL；当前会参与内容缓存键，后续可用于图像/多模态内容理解 |
-| `brief` | string | N | 商品简介/短描述；当前会参与内容缓存键 |
-| `description` | string | N | 商品详情/长描述；当前会参与内容缓存键 |
+| `image_url` | string | N | 商品主图 URL；当前仅透传，暂未参与 prompt 与缓存键，后续可用于图像/多模态内容理解 |
+| `brief` | string | N | 商品简介/短描述；当前会参与 prompt 与缓存键 |
+| `description` | string | N | 商品详情/长描述；当前会参与 prompt 与缓存键 |
 缓存说明：
-- 内容缓存键仅由 `target_lang + items[]` 中会影响内容理解结果的输入文本构成，目前包括：`title`、`brief`、`description`、`image_url` 的规范化内容 hash。
+- 内容缓存按 **分析族拆分**，即 `content` 与 `taxonomy` 使用不同的缓存命名空间，互不污染、可独立演进。
+- 缓存键由 `analysis_kind + target_lang + prompt/schema 版本指纹 + prompt 输入文本 hash` 构成。
+- 当前真正参与 prompt 输入的字段是：`title`、`brief`、`description`；这些字段任一变化，都会落到新的缓存 key。
+- `prompt/schema 版本指纹` 会综合 system prompt、shared instruction、localized table headers、result fields、user instruction template 等信息生成；因此只要提示词或输出契约变化，旧缓存会自然失效。
 - `tenant_id`、`spu_id` 只用于请求归属与结果回填，不参与缓存键。
-- 因此，输入内容不变时可跨请求直接命中缓存；任一输入字段变化时，会自然落到新的缓存 key。
+- 因此，输入内容与 prompt 契约都不变时可跨请求直接命中缓存；任一一侧变化，都会自然落到新的缓存 key。
 语言说明：
@@ -709,6 +714,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
 ```json
 {
   "tenant_id": "170",
+  "analysis_kinds": ["content", "taxonomy"],
   "total": 2,
   "results": [
     {
@@ -725,6 +731,11 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
         { "name": "enriched_tags", "value": { "zh": "纯棉" } },
         { "name": "usage_scene", "value": { "zh": "日常" } },
         { "name": "enriched_tags", "value": { "en": "cotton" } }
+      ],
+      "enriched_taxonomy_attributes": [
+        { "name": "Product Type", "value": { "zh": ["T恤"], "en": ["t-shirt"] } },
+        { "name": "Target Gender", "value": { "zh": ["男"], "en": ["men"] } },
+        { "name": "Season", "value": { "zh": ["夏季"], "en": ["summer"] } }
       ]
     },
     {
@@ -735,7 +746,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
       "enriched_tags": {
         "en": ["dolls", "toys"]
       },
-      "enriched_attributes": []
+      "enriched_attributes": [],
+      "enriched_taxonomy_attributes": []
     }
   ]
 }
@@ -743,10 +755,12 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
 | 字段 | 类型 | 说明 |
 |------|------|------|
-| `results` | array | 与请求 `items` 一一对应，每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags` |
+| `analysis_kinds` | array | 回显请求中的 `analysis_kinds`（未传时为默认的 `["content", "taxonomy"]`） |
+| `results` | array | 与请求 `items` 一一对应，每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes` |
 | `results[].qanchors` | object | 与 ES `qanchors` 字段同结构，按语言键返回短语数组 |
 | `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构，按语言键返回标签数组 |
 | `results[].enriched_attributes` | array | 与 ES `enriched_attributes` nested 字段同结构，每项为 `{ "name", "value": { "zh"?: "...", "en"?: "..." } }` |
+| `results[].enriched_taxonomy_attributes` | array | 与 ES `enriched_taxonomy_attributes` nested 字段同结构，每项为 `{ "name", "value": { "zh"?: [...], "en"?: [...] } }` |
 | `results[].error` | string | 若该条处理失败（如 LLM 异常），会在此字段返回错误信息 |
 **错误响应**:
@@ -760,6 +774,7 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \
   -H "Content-Type: application/json" \
   -d '{
     "tenant_id": "163",
+    "analysis_kinds": ["content", "taxonomy"],
     "items": [
       {
         "spu_id": "223167",
@@ -773,4 +788,3 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \
 ```
 ---
-
@@ -444,7 +444,7 @@ curl "http://localhost:6006/health"
 - **Base URL**: Indexer 服务地址，如 `http://localhost:6004`
 - **路径**: `POST /indexer/enrich-content`
-- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`tags`，用于拼装 ES 文档。内部使用大模型（需配置 `DASHSCOPE_API_KEY`），支持多语言与 Redis 缓存；单次最多 50 条，建议批量调用以提升效率。
+- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`，用于拼装 ES 文档。支持通过 `analysis_kinds` 选择执行 `content` / `taxonomy`；默认两者都执行。内部使用大模型（需配置 `DASHSCOPE_API_KEY`），支持多语言与 Redis 缓存；单次最多 50 条，建议批量调用以提升效率。
 请求/响应格式、示例及错误码见 [-05-索引接口（Indexer）](./搜索API对接指南-05-索引接口（Indexer）.md#58-内容理解字段生成接口)。
@@ -196,18 +196,25 @@ services:
 - 配置项：
   - `ANCHOR_CACHE_PREFIX = REDIS_CONFIG.get("anchor_cache_prefix", "product_anchors")`
   - `ANCHOR_CACHE_EXPIRE_DAYS = int(REDIS_CONFIG.get("anchor_cache_expire_days", 30))`
-- Key 构造函数：`_make_anchor_cache_key(title, target_lang, tenant_id)`
+- Key 构造函数：`_make_analysis_cache_key(product, target_lang, analysis_kind)`
 - 模板：
 ```text
-{ANCHOR_CACHE_PREFIX}:{tenant_or_global}:{target_lang}:{md5(title)}
+{ANCHOR_CACHE_PREFIX}:{analysis_kind}:{prompt_contract_hash}:{target_lang}:{prompt_input_prefix}{md5(prompt_input)}
 ```
 - 字段说明：
   - `ANCHOR_CACHE_PREFIX`：默认 `"product_anchors"`，可通过 `.env` 中的 `REDIS_ANCHOR_CACHE_PREFIX`（若存在）间接配置到 `REDIS_CONFIG`；
-  - `tenant_or_global`：`tenant_id` 去空白后的字符串，若为空则使用 `"global"`；
+  - `analysis_kind`：分析族，目前至少包括 `content` 与 `taxonomy`，两者缓存隔离；
+  - `prompt_contract_hash`：基于 system prompt、shared instruction、localized headers、result fields、user instruction template、schema cache version 等生成的短 hash；只要提示词或输出契约变化，缓存会自动失效；
   - `target_lang`：内容理解输出语言，例如 `zh`；
-  - `md5(title)`：对原始商品标题（UTF-8）做 MD5。
+  - `prompt_input_prefix + md5(prompt_input)`：对真正送入 prompt 的商品文本做前缀 + MD5；当前 prompt 输入来自 `title`、`brief`、`description` 的规范化拼接结果。
+
+设计原则：
+
+- 只让**实际影响 LLM 输出**的输入参与 key；
+- 不让 `tenant_id`、`spu_id` 这类“结果归属信息”污染缓存；
+- prompt 或 schema 变更时，不依赖人工清理 Redis，也能自然切换到新 key。
 ### 4.2 Value 与类型
@@ -229,6 +236,7 @@ services:
 ```
 - 读取时通过 `json.loads(raw)` 还原为 `Dict[str, Any]`。
+- `content` 与 `taxonomy` 的 value 结构会随各自 schema 不同而不同，但都会先通过统一的 normalize 逻辑再写缓存。
 ### 4.3 过期策略
@@ -8,7 +8,7 @@
 ### 1.1 系统角色划分
-- **Java 索引程序（/home/tw/saas-server）**
+- **Java 索引程序**
   - 负责“**什么时候、对哪些 SPU 做索引**”（调度 & 触发）。
   - 负责**商品/店铺/类目等基础数据同步**（写 MySQL）。
   - 负责**多租户环境下的全量/增量索引调度**，但不再关心具体 doc 字段细节。
@@ -151,6 +151,7 @@ if _missing_prompt_langs:
 # 多值字段分隔：英文逗号、中文逗号、顿号，及历史约定的 ; | / 与空白
 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[，、,;|/\n\t]+")
 _CORE_INDEX_LANGUAGES = ("zh", "en")
+_DEFAULT_ANALYSIS_KINDS = ("content", "taxonomy")
 _CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP = (
     ("tags", "enriched_tags"),
     ("target_audience", "target_audience"),
@@ -226,6 +227,7 @@ class AnalysisSchema:
     markdown_table_headers: Dict[str, List[str]]
     result_fields: Tuple[str, ...]
     meaningful_fields: Tuple[str, ...]
+    cache_version: str = "v1"
     field_aliases: Dict[str, Tuple[str, ...]] = field(default_factory=dict)
     fallback_headers: Optional[List[str]] = None
     quality_fields: Tuple[str, ...] = ()
@@ -246,6 +248,7 @@ _ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = {
         markdown_table_headers=LANGUAGE_MARKDOWN_TABLE_HEADERS,
         result_fields=_CONTENT_ANALYSIS_RESULT_FIELDS,
         meaningful_fields=_CONTENT_ANALYSIS_MEANINGFUL_FIELDS,
+        cache_version="v2",
         field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES,
         quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS,
     ),
@@ -255,6 +258,7 @@ _ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = {
         markdown_table_headers=TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS,
         result_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS,
         meaningful_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS,
+        cache_version="v1",
         fallback_headers=TAXONOMY_MARKDOWN_TABLE_HEADERS_EN,
     ),
 }
@@ -267,6 +271,21 @@ def _get_analysis_schema(analysis_kind: str) -> AnalysisSchema:
     return schema
+def _normalize_analysis_kinds(
+    analysis_kinds: Optional[List[str]] = None,
+) -> Tuple[str, ...]:
+    requested = _DEFAULT_ANALYSIS_KINDS if not analysis_kinds else tuple(analysis_kinds)
+    normalized: List[str] = []
+    seen = set()
+    for analysis_kind in requested:
+        schema = _get_analysis_schema(str(analysis_kind).strip())
+        if schema.name in seen:
+            continue
+        seen.add(schema.name)
+        normalized.append(schema.name)
+    return tuple(normalized)
+
+
 def split_multi_value_field(text: Optional[str]) -> List[str]:
     """将 LLM/业务中的多值字符串拆成短语列表（strip 后去空）。"""
     if text is None:
@@ -456,6 +475,7 @@ def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]:
 def build_index_content_fields(
     items: List[Dict[str, Any]],
     tenant_id: Optional[str] = None,
+    analysis_kinds: Optional[List[str]] = None,
 ) -> List[Dict[str, Any]]:
     """
     高层入口：生成与 ES mapping 对齐的内容理解字段。
@@ -464,6 +484,7 @@ def build_index_content_fields(
     - `id` 或 `spu_id`
     - `title`
     - 可选 `brief` / `description` / `image_url`
+    - 可选 `analysis_kinds`，默认同时执行 `content` 与 `taxonomy`
     返回项结构：
     - `id`
@@ -477,6 +498,7 @@ def build_index_content_fields(
     - `qanchors.{lang}` 为短语数组
     - `enriched_tags.{lang}` 为标签数组
     """
+    requested_analysis_kinds = _normalize_analysis_kinds(analysis_kinds)
     normalized_items = [_normalize_index_content_item(item) for item in items]
     if not normalized_items:
         return []
@@ -493,54 +515,57 @@ def build_index_content_fields(
     }
     for lang in _CORE_INDEX_LANGUAGES:
-        try:
-            rows = analyze_products(
-                products=normalized_items,
-                target_lang=lang,
-                batch_size=BATCH_SIZE,
-                tenant_id=tenant_id,
-            )
-        except Exception as e:
-            logger.warning("build_index_content_fields failed for lang=%s: %s", lang, e)
-            for item in normalized_items:
-                results_by_id[item["id"]].setdefault("error", str(e))
-            continue
-
-        for row in rows or []:
-            item_id = str(row.get("id") or "").strip()
-            if not item_id or item_id not in results_by_id:
-                continue
-            if row.get("error"):
-                results_by_id[item_id].setdefault("error", row["error"])
+        if "content" in requested_analysis_kinds:
+            try:
+                rows = analyze_products(
+                    products=normalized_items,
+                    target_lang=lang,
+                    batch_size=BATCH_SIZE,
+                    tenant_id=tenant_id,
+                    analysis_kind="content",
+                )
+            except Exception as e:
+                logger.warning("build_index_content_fields content enrichment failed for lang=%s: %s", lang, e)
+                for item in normalized_items:
+                    results_by_id[item["id"]].setdefault("error", str(e))
                 continue
-            _apply_index_content_row(results_by_id[item_id], row=row, lang=lang)
-
-        try:
-            taxonomy_rows = analyze_products(
-                products=normalized_items,
-                target_lang=lang,
-                batch_size=BATCH_SIZE,
-                tenant_id=tenant_id,
-                analysis_kind="taxonomy",
-            )
-        except Exception as e:
-            logger.warning(
-                "build_index_content_fields taxonomy enrichment failed for lang=%s: %s",
-                lang,
-                e,
-            )
-            for item in normalized_items:
-                results_by_id[item["id"]].setdefault("error", str(e))
-            continue
-        for row in taxonomy_rows or []:
-            item_id = str(row.get("id") or "").strip()
-            if not item_id or item_id not in results_by_id:
-                continue
-            if row.get("error"):
-                results_by_id[item_id].setdefault("error", row["error"])
+            for row in rows or []:
+                item_id = str(row.get("id") or "").strip()
+                if not item_id or item_id not in results_by_id:
+                    continue
+                if row.get("error"):
+                    results_by_id[item_id].setdefault("error", row["error"])
+                    continue
+                _apply_index_content_row(results_by_id[item_id], row=row, lang=lang)
+
+        if "taxonomy" in requested_analysis_kinds:
+            try:
+                taxonomy_rows = analyze_products(
+                    products=normalized_items,
+                    target_lang=lang,
+                    batch_size=BATCH_SIZE,
+                    tenant_id=tenant_id,
+                    analysis_kind="taxonomy",
+                )
+            except Exception as e:
+                logger.warning(
+                    "build_index_content_fields taxonomy enrichment failed for lang=%s: %s",
+                    lang,
+                    e,
+                )
+                for item in normalized_items:
+                    results_by_id[item["id"]].setdefault("error", str(e))
                 continue
-            _apply_index_taxonomy_row(results_by_id[item_id], row=row, lang=lang)
+
+            for row in taxonomy_rows or []:
+                item_id = str(row.get("id") or "").strip()
+                if not item_id or item_id not in results_by_id:
+                    continue
+                if row.get("error"):
+                    results_by_id[item_id].setdefault("error", row["error"])
+                    continue
+                _apply_index_taxonomy_row(results_by_id[item_id], row=row, lang=lang)
     return [results_by_id[item["id"]] for item in normalized_items]
@@ -613,9 +638,27 @@ def _make_analysis_cache_key(
     analysis_kind: str,
 ) -> str:
     """构造缓存 key，仅由分析类型、prompt 实际输入文本内容与目标语言决定。"""
+    schema = _get_analysis_schema(analysis_kind)
     prompt_input = _build_prompt_input_text(product)
     h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest()
-    return f"{ANCHOR_CACHE_PREFIX}:{analysis_kind}:{target_lang}:{prompt_input[:4]}{h}"
+    prompt_contract = {
+        "schema_name": schema.name,
+        "cache_version": schema.cache_version,
+        "system_message": SYSTEM_MESSAGE,
+        "user_instruction_template": USER_INSTRUCTION_TEMPLATE,
+        "shared_instruction": schema.shared_instruction,
+        "assistant_headers": schema.get_headers(target_lang),
+        "result_fields": schema.result_fields,
+        "meaningful_fields": schema.meaningful_fields,
+        "field_aliases": schema.field_aliases,
+    }
+    prompt_contract_hash = hashlib.md5(
+        json.dumps(prompt_contract, ensure_ascii=False, sort_keys=True).encode("utf-8")
+    ).hexdigest()[:12]
+    return (
+        f"{ANCHOR_CACHE_PREFIX}:{analysis_kind}:{prompt_contract_hash}:"
+        f"{target_lang}:{prompt_input[:4]}{h}"
+    )
 def _make_anchor_cache_key(
@@ -1,30 +0,0 @@
-因为需要组织整个doc，我需要将当前的java程序迁移过来，项目路径在 /home/tw/saas-server
-程序相对路径 包括但不限于 module-shoplazza/src/main/java/com/hsyl/saas/module/shoplazza/service/index/ProductIndexServiceImpl.java
-请仔细阅读java相关代码，提取相关逻辑，特别是 翻译的相关字段
-
-
-
-
-
-架构说明：
-
-java索引程序职责：
-
-负责增量、全量的触发，调度。
-
-包括但不限于：
-1、索引结构调整成按tenant_id的结构，并翻译对应的语言shoplazza_shop_config表对应的新增字段primary_language，translate_to_en，translate_to_zh
-2、每晚上商品同步时，判断当前店铺主语言是什么，存入primary_language
-3、同步店匠的类目shoplazza_product_category
-4、加入MQ处理店匠批量导入商品并发太高，服务器承载不了的问题
-
-
-本模块：
-负责 msyql 基础数据 → 索引结构的doc （包括缓存）
-
-翻译接口： curl -X POST http://43.166.252.75:6006/translate   -H "Content-Type: application/json"   -d '{"text":"儿童小男孩女孩开学 100 天衬衫短袖 搞笑图案字母印花庆祝上衣","target_lang":"en","source_lang":"auto"}'
-
-java的组织doc的逻辑都需要迁移过来。
-
-当前项目，是直接将doc写入ES，这个功能也保留，但是，也要提供一个接口，输入完整的字段信息
-
@@ -345,8 +345,13 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient):
 def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch):
     import indexer.product_enrich as process_products
-    def _fake_build_index_content_fields(items: List[Dict[str, str]], tenant_id: str | None = None):
+    def _fake_build_index_content_fields(
+        items: List[Dict[str, str]],
+        tenant_id: str | None = None,
+        analysis_kinds: List[str] | None = None,
+    ):
         assert tenant_id == "162"
+        assert analysis_kinds == ["content", "taxonomy"]
         return [
             {
                 "id": p["spu_id"],
@@ -358,6 +363,9 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
                 "enriched_attributes": [
                     {"name": "enriched_tags", "value": {"zh": ["tag1"], "en": ["tag1"]}},
                 ],
+                "enriched_taxonomy_attributes": [
+                    {"name": "Product Type", "value": {"zh": ["T恤"], "en": ["t-shirt"]}},
+                ],
             }
             for p in items
         ]
@@ -377,6 +385,7 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
     assert response.status_code == 200
     data = response.json()
     assert data["tenant_id"] == "162"
+    assert data["analysis_kinds"] == ["content", "taxonomy"]
     assert data["total"] == 2
     assert len(data["results"]) == 2
     assert data["results"][0]["spu_id"] == "1001"
@@ -388,6 +397,10 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch
         "name": "enriched_tags",
         "value": {"zh": ["tag1"], "en": ["tag1"]},
     }
+    assert data["results"][0]["enriched_taxonomy_attributes"][0] == {
+        "name": "Product Type",
+        "value": {"zh": ["T恤"], "en": ["t-shirt"]},
+    }
 def test_indexer_documents_contract(indexer_client: TestClient):
@@ -573,6 +573,40 @@ def test_anchor_cache_key_depends_on_product_input_not_identifiers():
     assert key_a != key_c
+def test_analysis_cache_key_isolated_by_analysis_kind():
+    product = {
+        "id": "1",
+        "title": "dress",
+        "brief": "soft cotton",
+        "description": "summer dress",
+    }
+
+    content_key = product_enrich._make_analysis_cache_key(product, "zh", "content")
+    taxonomy_key = product_enrich._make_analysis_cache_key(product, "zh", "taxonomy")
+
+    assert content_key != taxonomy_key
+
+
+def test_analysis_cache_key_changes_when_prompt_contract_changes():
+    product = {
+        "id": "1",
+        "title": "dress",
+        "brief": "soft cotton",
+        "description": "summer dress",
+    }
+
+    original_key = product_enrich._make_analysis_cache_key(product, "zh", "taxonomy")
+
+    with mock.patch.object(
+        product_enrich,
+        "USER_INSTRUCTION_TEMPLATE",
+        "Please return JSON only. Language: {language}",
+    ):
+        changed_key = product_enrich._make_analysis_cache_key(product, "zh", "taxonomy")
+
+    assert original_key != changed_key
+
+
 def test_build_prompt_input_text_appends_brief_and_description_for_short_title():
     product = {
         "title": "T恤",