From 2703b6ea8b5d2ecdc442f11ad26279cc2dce41e6 Mon Sep 17 00:00:00 2001 From: tangwang Date: Thu, 9 Apr 2026 13:53:36 +0800 Subject: [PATCH] refactor(indexer): 将 analysis_kinds 拆分为 enrichment_scopes + category_taxonomy_profile --- api/routes/indexer.py | 48 +++++++++++++++++++++++++++++++++++++----------- docs/issues/issue-2026-04-08-eval框架主指标ERR的问题以及bm25调参-done-0408.md | 26 ++++++++++++++++++++++++++ docs/搜索API对接指南-05-索引接口(Indexer).md | 19 ++++++++++++------- docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md | 2 +- indexer/product_enrich.py | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------- indexer/taxonomy.md | 48 +++++++++++++++++++++++++++++++++++++++++++++++- tests/ci/test_service_api_contracts.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- tests/test_process_products_batching.py | 4 ++++ tests/test_product_enrich_partial_mode.py | 31 +++++++++++++++++++++++++++---- 9 files changed, 368 insertions(+), 71 deletions(-) diff --git a/api/routes/indexer.py b/api/routes/indexer.py index f58a17b..a253593 100644 --- a/api/routes/indexer.py +++ b/api/routes/indexer.py @@ -88,20 +88,41 @@ class EnrichContentItem(BaseModel): class EnrichContentRequest(BaseModel): """ - 内容理解字段生成请求:根据商品标题批量生成 qanchors、enriched_attributes、tags、taxonomy attributes。 + 内容理解字段生成请求:根据商品标题批量生成通用增强字段与品类 taxonomy 字段。 供外部 indexer 在自行组织 doc 时调用,与翻译、向量化等微服务并列。 """ tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)") - analysis_kinds: List[Literal["content", "taxonomy"]] = Field( - default_factory=lambda: ["content", "taxonomy"], + enrichment_scopes: Optional[List[Literal["generic", "category_taxonomy"]]] = Field( + default=None, description=( - "要执行的分析族。" - "`content` 返回 
qanchors/enriched_tags/enriched_attributes;" - "`taxonomy` 返回 enriched_taxonomy_attributes。" + "要执行的增强范围。" + "`generic` 返回 qanchors/enriched_tags/enriched_attributes;" + "`category_taxonomy` 返回 enriched_taxonomy_attributes。" "默认两者都执行。" ), ) + category_taxonomy_profile: str = Field( + "apparel", + description=( + "品类 taxonomy profile。当前默认且已支持的是 `apparel`。" + "未来可扩展为 `electronics` 等。" + ), + ) + analysis_kinds: Optional[List[Literal["content", "taxonomy"]]] = Field( + default=None, + description="Deprecated alias of enrichment_scopes. `content` -> `generic`, `taxonomy` -> `category_taxonomy`.", + ) + + def resolved_enrichment_scopes(self) -> List[str]: + if self.enrichment_scopes: + return list(self.enrichment_scopes) + if self.analysis_kinds: + mapped = [] + for item in self.analysis_kinds: + mapped.append("generic" if item == "content" else "category_taxonomy") + return mapped + return ["generic", "category_taxonomy"] @router.post("/reindex") @@ -452,7 +473,8 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): def _run_enrich_content( tenant_id: str, items: List[Dict[str, str]], - analysis_kinds: Optional[List[str]] = None, + enrichment_scopes: Optional[List[str]] = None, + category_taxonomy_profile: str = "apparel", ) -> List[Dict[str, Any]]: """ 同步执行内容理解,返回与 ES mapping 对齐的字段结构。 @@ -463,7 +485,8 @@ def _run_enrich_content( results = build_index_content_fields( items=items, tenant_id=tenant_id, - analysis_kinds=analysis_kinds, + enrichment_scopes=enrichment_scopes, + category_taxonomy_profile=category_taxonomy_profile, ) return [ { @@ -481,7 +504,7 @@ def _run_enrich_content( @router.post("/enrich-content") async def enrich_content(request: EnrichContentRequest): """ - 内容理解字段生成接口:根据商品标题批量生成 qanchors、enriched_attributes、tags、taxonomy attributes。 + 内容理解字段生成接口:根据商品标题批量生成通用增强字段与品类 taxonomy 字段。 使用场景: - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 @@ -511,17 +534,20 @@ async def enrich_content(request: EnrichContentRequest): for it in request.items ] 
loop = asyncio.get_event_loop() + enrichment_scopes = request.resolved_enrichment_scopes() result = await loop.run_in_executor( None, lambda: _run_enrich_content( tenant_id=request.tenant_id, items=items_payload, - analysis_kinds=request.analysis_kinds, + enrichment_scopes=enrichment_scopes, + category_taxonomy_profile=request.category_taxonomy_profile, ), ) return { "tenant_id": request.tenant_id, - "analysis_kinds": request.analysis_kinds, + "enrichment_scopes": enrichment_scopes, + "category_taxonomy_profile": request.category_taxonomy_profile, "results": result, "total": len(result), } diff --git a/docs/issues/issue-2026-04-08-eval框架主指标ERR的问题以及bm25调参-done-0408.md b/docs/issues/issue-2026-04-08-eval框架主指标ERR的问题以及bm25调参-done-0408.md index da2945f..fec4cab 100644 --- a/docs/issues/issue-2026-04-08-eval框架主指标ERR的问题以及bm25调参-done-0408.md +++ b/docs/issues/issue-2026-04-08-eval框架主指标ERR的问题以及bm25调参-done-0408.md @@ -92,3 +92,29 @@ curl -s -u "$AUTH" -X GET "$ES/${INDEX}/_settings?filter_path=**.similarity&pret ```bash curl -s -u "$AUTH" -X GET "$ES/_cat/indices/search_products_tenant_*?h=index&v" ``` + + + +实验结果: +已完成四轮实验,并把 `search_products_tenant_163` 当前索引参数留在了最优组:`b=0.5, k1=1.0`。刚刚已校验生效。 + +对比结果如下: + +- 基线当前线上组 `b=0.10, k1=0.30`:`0.604555` + 报告:[batch_20260408T055948Z_00b6a8aa3d.md](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T055948Z_00b6a8aa3d.md) +- 复跑 `b=0.10, k1=0.30`:`0.604298` + 报告:[batch_20260408T064011Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064011Z_00b6a8aa3d.json) +- 实验 `b=0.20, k1=0.60`:`0.604274` + 报告:[batch_20260408T064123Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064123Z_00b6a8aa3d.json) +- 实验 `b=0.50, k1=1.0`:`0.606041` + 报告:[batch_20260408T064216Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064216Z_00b6a8aa3d.json) +- 实验 `b=0.10, k1=0.75`:`0.603438` + 
报告:[batch_20260408T064304Z_00b6a8aa3d.json](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260408T064304Z_00b6a8aa3d.json) +- 老基线 `b=0.0, k1=0.0`:`0.602598` + 报告:[batch_20260407T150946Z_00b6a8aa3d.md](/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260407T150946Z_00b6a8aa3d.md) + +结论是这四组里 `b=0.50, k1=1.0` 最好,相比你给的当前基线 `0.604555` 提升了 `+0.001486`,相比 `0/0` 提升了 `+0.003443`。`0.20/0.60` 基本和 `0.10/0.30` 持平,`0.10/0.75` 反而更差,说明这轮里单独加大 `k1` 不如同时把 `b` 和 `k1` 一起放宽。 + +补充一点:我只改了现有索引 `search_products_tenant_163` 的 index-level setting,没有改仓库里的默认映射。[search_products.json](/data/saas-search/mappings/search_products.json) 现在仍是 `0.1/0.3`。如果你要,我下一步可以直接帮你把映射默认值也改成 `0.5/1.0`,并顺手批量同步其他 `search_products_tenant_*` 索引。 + + diff --git a/docs/搜索API对接指南-05-索引接口(Indexer).md b/docs/搜索API对接指南-05-索引接口(Indexer).md index d5837f6..c87cf91 100644 --- a/docs/搜索API对接指南-05-索引接口(Indexer).md +++ b/docs/搜索API对接指南-05-索引接口(Indexer).md @@ -655,7 +655,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ ```json { "tenant_id": "170", - "analysis_kinds": ["content", "taxonomy"], + "enrichment_scopes": ["generic", "category_taxonomy"], + "category_taxonomy_profile": "apparel", "items": [ { "spu_id": "223167", @@ -676,7 +677,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | 参数 | 类型 | 必填 | 默认值 | 说明 | |------|------|------|--------|------| | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| -| `analysis_kinds` | array[string] | N | `["content", "taxonomy"]` | 选择要执行的分析族。`content` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`,`taxonomy` 生成 `enriched_taxonomy_attributes` | +| `enrichment_scopes` | array[string] | N | `["generic", "category_taxonomy"]` | 选择要执行的增强范围。`generic` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`,`category_taxonomy` 生成 `enriched_taxonomy_attributes` | +| `category_taxonomy_profile` | string | N | `apparel` | 品类 taxonomy profile。当前内置为服装大类 `apparel`,后续可扩展到其他大类 | | `items` | array | Y | - | 待分析列表;**单次最多 
50 条** | `items[]` 字段说明: @@ -691,8 +693,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ 缓存说明: -- 内容缓存按 **分析族拆分**,即 `content` 与 `taxonomy` 使用不同的缓存命名空间,互不污染、可独立演进。 -- 缓存键由 `analysis_kind + target_lang + prompt/schema 版本指纹 + prompt 输入文本 hash` 构成。 +- 内容缓存按 **增强范围 + taxonomy profile** 拆分;`generic` 与 `category_taxonomy:apparel` 等使用不同缓存命名空间,互不污染、可独立演进。 +- 缓存键由 `analysis_kind + target_lang + prompt/schema 版本指纹 + prompt 输入文本 hash` 构成;对 category taxonomy 来说,profile 会进入 schema 标识与版本指纹。 - 当前真正参与 prompt 输入的字段是:`title`、`brief`、`description`;这些字段任一变化,都会落到新的缓存 key。 - `prompt/schema 版本指纹` 会综合 system prompt、shared instruction、localized table headers、result fields、user instruction template 等信息生成;因此只要提示词或输出契约变化,旧缓存会自然失效。 - `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。 @@ -714,7 +716,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ ```json { "tenant_id": "170", - "analysis_kinds": ["content", "taxonomy"], + "enrichment_scopes": ["generic", "category_taxonomy"], + "category_taxonomy_profile": "apparel", "total": 2, "results": [ { @@ -755,7 +758,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | 字段 | 类型 | 说明 | |------|------|------| -| `analysis_kinds` | array | 实际执行的分析族列表 | +| `enrichment_scopes` | array | 实际执行的增强范围列表 | +| `category_taxonomy_profile` | string | 实际使用的品类 taxonomy profile | | `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes` | | `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 | | `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 | @@ -774,7 +778,8 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ -H "Content-Type: application/json" \ -d '{ "tenant_id": "163", - "analysis_kinds": ["content", "taxonomy"], + "enrichment_scopes": ["generic", "category_taxonomy"], + "category_taxonomy_profile": "apparel", "items": [ { "spu_id": "223167", diff --git 
a/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md b/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md index 3d5abde..5f0a029 100644 --- a/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md +++ b/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @@ -444,7 +444,7 @@ curl "http://localhost:6006/health" - **Base URL**: Indexer 服务地址,如 `http://localhost:6004` - **路径**: `POST /indexer/enrich-content` -- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `analysis_kinds` 选择执行 `content` / `taxonomy`;默认两者都执行。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 +- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `enrichment_scopes` 选择执行 `generic` / `category_taxonomy`,并通过 `category_taxonomy_profile` 选择对应大类的 taxonomy prompt/profile;默认执行 `generic + category_taxonomy(apparel)`。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 diff --git a/indexer/product_enrich.py b/indexer/product_enrich.py index 5e3c90d..6537e85 100644 --- a/indexer/product_enrich.py +++ b/indexer/product_enrich.py @@ -151,7 +151,8 @@ if _missing_prompt_langs: # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") _CORE_INDEX_LANGUAGES = ("zh", "en") -_DEFAULT_ANALYSIS_KINDS = ("content", "taxonomy") +_DEFAULT_ENRICHMENT_SCOPES = ("generic", "category_taxonomy") +_DEFAULT_CATEGORY_TAXONOMY_PROFILE = "apparel" _CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( ("tags", "enriched_tags"), ("target_audience", "target_audience"), @@ -187,7 +188,7 @@ _CONTENT_ANALYSIS_FIELD_ALIASES = { "tags": ("tags", "enriched_tags"), } _CONTENT_ANALYSIS_QUALITY_FIELDS = ("title", "category_path", "anchor_text") -_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( +_APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP = ( 
("product_type", "Product Type"), ("target_gender", "Target Gender"), ("age_group", "Age Group"), @@ -215,8 +216,8 @@ _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( ("occasion_end_use", "Occasion / End Use"), ("style_aesthetic", "Style Aesthetic"), ) -_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple( - field_name for field_name, _ in _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP +_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple( + field_name for field_name, _ in _APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP ) @@ -252,37 +253,66 @@ _ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = { field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES, quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS, ), - "taxonomy": AnalysisSchema( - name="taxonomy", +} + +_CATEGORY_TAXONOMY_PROFILE_SCHEMAS: Dict[str, AnalysisSchema] = { + "apparel": AnalysisSchema( + name="taxonomy:apparel", shared_instruction=TAXONOMY_SHARED_ANALYSIS_INSTRUCTION, markdown_table_headers=TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS, - result_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS, - meaningful_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS, + result_fields=_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS, + meaningful_fields=_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS, cache_version="v1", fallback_headers=TAXONOMY_MARKDOWN_TABLE_HEADERS_EN, ), } +_CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS: Dict[str, Tuple[Tuple[str, str], ...]] = { + "apparel": _APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP, +} + -def _get_analysis_schema(analysis_kind: str) -> AnalysisSchema: - schema = _ANALYSIS_SCHEMAS.get(analysis_kind) - if schema is None: - raise ValueError(f"Unsupported analysis_kind: {analysis_kind}") - return schema +def _normalize_category_taxonomy_profile(category_taxonomy_profile: Optional[str] = None) -> str: + profile = str(category_taxonomy_profile or _DEFAULT_CATEGORY_TAXONOMY_PROFILE).strip() + if profile not in _CATEGORY_TAXONOMY_PROFILE_SCHEMAS: + raise ValueError(f"Unsupported category_taxonomy_profile: {profile}") + return profile -def _normalize_analysis_kinds( - 
analysis_kinds: Optional[List[str]] = None, +def _get_analysis_schema( + analysis_kind: str, + *, + category_taxonomy_profile: Optional[str] = None, +) -> AnalysisSchema: + if analysis_kind == "content": + return _ANALYSIS_SCHEMAS["content"] + if analysis_kind == "taxonomy": + profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) + return _CATEGORY_TAXONOMY_PROFILE_SCHEMAS[profile] + raise ValueError(f"Unsupported analysis_kind: {analysis_kind}") + + +def _get_taxonomy_attribute_field_map( + category_taxonomy_profile: Optional[str] = None, +) -> Tuple[Tuple[str, str], ...]: + profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) + return _CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS[profile] + + +def _normalize_enrichment_scopes( + enrichment_scopes: Optional[List[str]] = None, ) -> Tuple[str, ...]: - requested = _DEFAULT_ANALYSIS_KINDS if not analysis_kinds else tuple(analysis_kinds) + requested = _DEFAULT_ENRICHMENT_SCOPES if not enrichment_scopes else tuple(enrichment_scopes) normalized: List[str] = [] seen = set() - for analysis_kind in requested: - schema = _get_analysis_schema(str(analysis_kind).strip()) - if schema.name in seen: + for enrichment_scope in requested: + scope = str(enrichment_scope).strip() + if scope not in {"generic", "category_taxonomy"}: + raise ValueError(f"Unsupported enrichment_scope: {scope}") + if scope in seen: continue - seen.add(schema.name) - normalized.append(schema.name) + seen.add(scope) + normalized.append(scope) return tuple(normalized) @@ -448,7 +478,13 @@ def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) -def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: +def _apply_index_taxonomy_row( + result: Dict[str, Any], + row: Dict[str, Any], + lang: str, + *, + category_taxonomy_profile: Optional[str] = None, +) -> None: if not row or 
row.get("error"): return @@ -456,8 +492,11 @@ def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: result["enriched_taxonomy_attributes"], row=row, lang=lang, - schema=_get_analysis_schema("taxonomy"), - field_map=_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP, + schema=_get_analysis_schema( + "taxonomy", + category_taxonomy_profile=category_taxonomy_profile, + ), + field_map=_get_taxonomy_attribute_field_map(category_taxonomy_profile), ) @@ -475,7 +514,8 @@ def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: def build_index_content_fields( items: List[Dict[str, Any]], tenant_id: Optional[str] = None, - analysis_kinds: Optional[List[str]] = None, + enrichment_scopes: Optional[List[str]] = None, + category_taxonomy_profile: Optional[str] = None, ) -> List[Dict[str, Any]]: """ 高层入口:生成与 ES mapping 对齐的内容理解字段。 @@ -484,7 +524,8 @@ def build_index_content_fields( - `id` 或 `spu_id` - `title` - 可选 `brief` / `description` / `image_url` - - 可选 `analysis_kinds`,默认同时执行 `content` 与 `taxonomy` + - 可选 `enrichment_scopes`,默认同时执行 `generic` 与 `category_taxonomy` + - 可选 `category_taxonomy_profile`,默认 `apparel` 返回项结构: - `id` @@ -498,7 +539,8 @@ def build_index_content_fields( - `qanchors.{lang}` 为短语数组 - `enriched_tags.{lang}` 为标签数组 """ - requested_analysis_kinds = _normalize_analysis_kinds(analysis_kinds) + requested_enrichment_scopes = _normalize_enrichment_scopes(enrichment_scopes) + normalized_taxonomy_profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) normalized_items = [_normalize_index_content_item(item) for item in items] if not normalized_items: return [] @@ -515,7 +557,7 @@ def build_index_content_fields( } for lang in _CORE_INDEX_LANGUAGES: - if "content" in requested_analysis_kinds: + if "generic" in requested_enrichment_scopes: try: rows = analyze_products( products=normalized_items, @@ -523,6 +565,7 @@ def build_index_content_fields( batch_size=BATCH_SIZE, tenant_id=tenant_id, analysis_kind="content", + 
category_taxonomy_profile=normalized_taxonomy_profile, ) except Exception as e: logger.warning("build_index_content_fields content enrichment failed for lang=%s: %s", lang, e) @@ -539,7 +582,7 @@ def build_index_content_fields( continue _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) - if "taxonomy" in requested_analysis_kinds: + if "category_taxonomy" in requested_enrichment_scopes: try: taxonomy_rows = analyze_products( products=normalized_items, @@ -547,6 +590,7 @@ def build_index_content_fields( batch_size=BATCH_SIZE, tenant_id=tenant_id, analysis_kind="taxonomy", + category_taxonomy_profile=normalized_taxonomy_profile, ) except Exception as e: logger.warning( @@ -565,7 +609,12 @@ def build_index_content_fields( if row.get("error"): results_by_id[item_id].setdefault("error", row["error"]) continue - _apply_index_taxonomy_row(results_by_id[item_id], row=row, lang=lang) + _apply_index_taxonomy_row( + results_by_id[item_id], + row=row, + lang=lang, + category_taxonomy_profile=normalized_taxonomy_profile, + ) return [results_by_id[item["id"]] for item in normalized_items] @@ -636,9 +685,13 @@ def _make_analysis_cache_key( product: Dict[str, Any], target_lang: str, analysis_kind: str, + category_taxonomy_profile: Optional[str] = None, ) -> str: """构造缓存 key,仅由分析类型、prompt 实际输入文本内容与目标语言决定。""" - schema = _get_analysis_schema(analysis_kind) + schema = _get_analysis_schema( + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) prompt_input = _build_prompt_input_text(product) h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest() prompt_contract = { @@ -672,12 +725,21 @@ def _get_cached_analysis_result( product: Dict[str, Any], target_lang: str, analysis_kind: str, + category_taxonomy_profile: Optional[str] = None, ) -> Optional[Dict[str, Any]]: if not _anchor_redis: return None - schema = _get_analysis_schema(analysis_kind) + schema = _get_analysis_schema( + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, 
+ ) try: - key = _make_analysis_cache_key(product, target_lang, analysis_kind) + key = _make_analysis_cache_key( + product, + target_lang, + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) raw = _anchor_redis.get(key) if not raw: return None @@ -707,10 +769,14 @@ def _set_cached_analysis_result( target_lang: str, result: Dict[str, Any], analysis_kind: str, + category_taxonomy_profile: Optional[str] = None, ) -> None: if not _anchor_redis: return - schema = _get_analysis_schema(analysis_kind) + schema = _get_analysis_schema( + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) try: normalized = _normalize_analysis_result( result, @@ -720,7 +786,12 @@ def _set_cached_analysis_result( ) if not _has_meaningful_analysis_content(normalized, schema): return - key = _make_analysis_cache_key(product, target_lang, analysis_kind) + key = _make_analysis_cache_key( + product, + target_lang, + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600 _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False)) except Exception as e: @@ -775,9 +846,13 @@ def create_prompt( products: List[Dict[str, str]], target_lang: str = "zh", analysis_kind: str = "content", + category_taxonomy_profile: Optional[str] = None, ) -> Tuple[Optional[str], Optional[str], Optional[str]]: """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。""" - schema = _get_analysis_schema(analysis_kind) + schema = _get_analysis_schema( + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) markdown_table_headers = schema.get_headers(target_lang) if not markdown_table_headers: logger.warning( @@ -978,9 +1053,13 @@ def call_llm( def parse_markdown_table( markdown_content: str, analysis_kind: str = "content", + category_taxonomy_profile: Optional[str] = None, ) -> List[Dict[str, str]]: """解析markdown表格内容""" - schema = _get_analysis_schema(analysis_kind) + schema = 
_get_analysis_schema( + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) lines = markdown_content.strip().split("\n") data = [] data_started = False @@ -1025,8 +1104,12 @@ def _log_parsed_result_quality( target_lang: str, batch_num: int, analysis_kind: str, + category_taxonomy_profile: Optional[str] = None, ) -> None: - schema = _get_analysis_schema(analysis_kind) + schema = _get_analysis_schema( + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) expected = len(batch_data) actual = len(parsed_results) if actual != expected: @@ -1071,9 +1154,13 @@ def process_batch( batch_num: int, target_lang: str = "zh", analysis_kind: str = "content", + category_taxonomy_profile: Optional[str] = None, ) -> List[Dict[str, Any]]: """处理一个批次的数据""" - schema = _get_analysis_schema(analysis_kind) + schema = _get_analysis_schema( + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) logger.info(f"\n{'#' * 80}") logger.info( "Processing Batch %s (%s items, kind=%s)", @@ -1087,6 +1174,7 @@ def process_batch( batch_data, target_lang=target_lang, analysis_kind=analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, ) # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM @@ -1119,13 +1207,18 @@ def process_batch( ) # 解析结果 - parsed_results = parse_markdown_table(raw_response, analysis_kind=analysis_kind) + parsed_results = parse_markdown_table( + raw_response, + analysis_kind=analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) _log_parsed_result_quality( batch_data, parsed_results, target_lang, batch_num, analysis_kind, + category_taxonomy_profile, ) logger.info(f"\nParsed Results ({len(parsed_results)} items):") @@ -1190,6 +1283,7 @@ def analyze_products( batch_size: Optional[int] = None, tenant_id: Optional[str] = None, analysis_kind: str = "content", + category_taxonomy_profile: Optional[str] = None, ) -> List[Dict[str, Any]]: """ 库调用入口:根据输入+语言,返回锚文本及各维度信息。 @@ -1205,7 +1299,10 @@ def 
analyze_products( if not products: return [] - _get_analysis_schema(analysis_kind) + _get_analysis_schema( + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products) uncached_items: List[Tuple[int, Dict[str, str]]] = [] @@ -1215,7 +1312,12 @@ def analyze_products( uncached_items.append((idx, product)) continue - cached = _get_cached_analysis_result(product, target_lang, analysis_kind) + cached = _get_cached_analysis_result( + product, + target_lang, + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) if cached: logger.info( f"[analyze_products] Cache hit for title='{title[:50]}...', " @@ -1255,6 +1357,7 @@ def analyze_products( batch_num=batch_num, target_lang=target_lang, analysis_kind=analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, ) for (original_idx, product), item in zip(batch_slice, batch_results): @@ -1266,7 +1369,13 @@ def analyze_products( # 不缓存错误结果,避免放大临时故障 continue try: - _set_cached_analysis_result(product, target_lang, item, analysis_kind) + _set_cached_analysis_result( + product, + target_lang, + item, + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) except Exception: # 已在内部记录 warning pass @@ -1293,6 +1402,7 @@ def analyze_products( batch_num=batch_num, target_lang=target_lang, analysis_kind=analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, ) # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的) @@ -1307,7 +1417,13 @@ def analyze_products( # 不缓存错误结果,避免放大临时故障 continue try: - _set_cached_analysis_result(product, target_lang, item, analysis_kind) + _set_cached_analysis_result( + product, + target_lang, + item, + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ) except Exception: # 已在内部记录 warning pass diff --git a/indexer/taxonomy.md b/indexer/taxonomy.md index 1da8cee..1abb4b1 100644 --- a/indexer/taxonomy.md +++ b/indexer/taxonomy.md @@ 
-1,5 +1,51 @@ -服装大类的taxonomy +# Cross-Border E-commerce Core Categories 大类 + +## 1. 3C +Phone accessories, computer peripherals, smart wearables, audio & video, smart home, gaming gear. 手机配件、电脑周边、智能穿戴、影音娱乐、智能家居、游戏设备。 + +## 2. Bags 包 +Handbags, backpacks, wallets, luggage, crossbody bags, tote bags. 手提包、双肩包、钱包、行李箱、斜挎包、托特包。 + +## 3. Pet Supplies 宠物用品 +Pet food, pet toys, pet care products, pet grooming, pet clothing, smart pet devices. 宠物食品、宠物玩具、宠物护理用品、宠物美容、宠物服装、智能宠物设备。 + +## 4. Electronics 电子产品 +Consumer electronics, home appliances, digital devices, cables & chargers, batteries, electronic components. 消费电子产品、家用电器、数码设备、线材充电器、电池、电子元器件。 + +## 5. Clothing 服装 +Women's wear, men's wear, kids' wear, underwear, outerwear, activewear. 女装、男装、童装、内衣、外套、运动服装。 + +## 6. Outdoor 户外用品 +Camping gear, hiking equipment, fishing supplies, outdoor clothing, travel accessories, survival tools. 露营装备、徒步用品、渔具、户外服装、旅行配件、求生工具。 + +## 7. Home Appliances 家电/电器 +Kitchen appliances, cleaning appliances, personal care appliances, heating & cooling, smart home devices. 厨房电器、清洁电器、个护电器、冷暖设备、智能家居设备。 + +## 8. Home & Living 家居 +Furniture, home textiles, lighting, kitchenware, storage, home decor. 家具、家纺、灯具、厨具、收纳、家居装饰。 + +## 9. Wigs 假发 + +## 10. Beauty & Cosmetics 美容美妆 +Skincare, makeup, nail care, beauty tools, hair care, fragrances. 护肤品、彩妆、美甲、美容工具、护发、香水。 + +## 11. Accessories 配饰 +Jewelry, watches, belts, scarves, hats, sunglasses, hair accessories. 珠宝、手表、腰带、围巾、帽子、太阳镜、发饰。 + +## 12. Toys 玩具 +Educational toys, plush toys, action figures, puzzles, outdoor toys, DIY toys. 益智玩具、毛绒玩具、可动人偶、拼图、户外玩具、DIY玩具。 + +## 13. Shoes 鞋子 +Sneakers, boots, sandals, heels, flats, sports shoes. 运动鞋、靴子、凉鞋、高跟鞋、平底鞋、球鞋。 + +## 14. Sports 运动产品 +Fitness equipment, sports gear, team sports, racquet sports, water sports, cycling. 健身器材、运动装备、团队运动、球拍运动、水上运动、骑行。 + +## 15. Others 其他 + +# 各个大类的taxonomy +## 1. Clothing & Apparel 服装 ### A. 
Product Classification diff --git a/tests/ci/test_service_api_contracts.py b/tests/ci/test_service_api_contracts.py index 486c5aa..113c442 100644 --- a/tests/ci/test_service_api_contracts.py +++ b/tests/ci/test_service_api_contracts.py @@ -348,10 +348,12 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch def _fake_build_index_content_fields( items: List[Dict[str, str]], tenant_id: str | None = None, - analysis_kinds: List[str] | None = None, + enrichment_scopes: List[str] | None = None, + category_taxonomy_profile: str = "apparel", ): assert tenant_id == "162" - assert analysis_kinds == ["content", "taxonomy"] + assert enrichment_scopes == ["generic", "category_taxonomy"] + assert category_taxonomy_profile == "apparel" return [ { "id": p["spu_id"], @@ -376,6 +378,8 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch "/indexer/enrich-content", json={ "tenant_id": "162", + "enrichment_scopes": ["generic", "category_taxonomy"], + "category_taxonomy_profile": "apparel", "items": [ {"spu_id": "1001", "title": "T-shirt"}, {"spu_id": "1002", "title": "Toy"}, @@ -385,7 +389,8 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch assert response.status_code == 200 data = response.json() assert data["tenant_id"] == "162" - assert data["analysis_kinds"] == ["content", "taxonomy"] + assert data["enrichment_scopes"] == ["generic", "category_taxonomy"] + assert data["category_taxonomy_profile"] == "apparel" assert data["total"] == 2 assert len(data["results"]) == 2 assert data["results"][0]["spu_id"] == "1001" @@ -403,6 +408,52 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch } +def test_indexer_enrich_content_contract_accepts_deprecated_analysis_kinds(indexer_client: TestClient, monkeypatch): + import indexer.product_enrich as process_products + + seen: Dict[str, Any] = {} + + def _fake_build_index_content_fields( + items: List[Dict[str, str]], + 
tenant_id: str | None = None, + enrichment_scopes: List[str] | None = None, + category_taxonomy_profile: str = "apparel", + ): + seen["tenant_id"] = tenant_id + seen["enrichment_scopes"] = enrichment_scopes + seen["category_taxonomy_profile"] = category_taxonomy_profile + return [ + { + "id": items[0]["spu_id"], + "qanchors": {}, + "enriched_tags": {}, + "enriched_attributes": [], + "enriched_taxonomy_attributes": [], + } + ] + + monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields) + + response = indexer_client.post( + "/indexer/enrich-content", + json={ + "tenant_id": "162", + "analysis_kinds": ["taxonomy"], + "items": [{"spu_id": "1001", "title": "T-shirt"}], + }, + ) + + assert response.status_code == 200 + data = response.json() + assert seen == { + "tenant_id": "162", + "enrichment_scopes": ["category_taxonomy"], + "category_taxonomy_profile": "apparel", + } + assert data["enrichment_scopes"] == ["category_taxonomy"] + assert data["category_taxonomy_profile"] == "apparel" + + def test_indexer_documents_contract(indexer_client: TestClient): """POST /indexer/documents: tenant_id + spu_ids, returns success/failed lists (no ES write).""" response = indexer_client.post( diff --git a/tests/test_process_products_batching.py b/tests/test_process_products_batching.py index d39f1d3..e72a0e0 100644 --- a/tests/test_process_products_batching.py +++ b/tests/test_process_products_batching.py @@ -18,8 +18,10 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): batch_num: int, target_lang: str = "zh", analysis_kind: str = "content", + category_taxonomy_profile=None, ): assert analysis_kind == "content" + assert category_taxonomy_profile is None seen_batch_sizes.append(len(batch_data)) return [ { @@ -64,8 +66,10 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): batch_num: int, target_lang: str = "zh", analysis_kind: str = "content", + category_taxonomy_profile=None, ): assert analysis_kind == "content" 
+ assert category_taxonomy_profile is None seen_batch_sizes.append(len(batch_data)) return [ { diff --git a/tests/test_product_enrich_partial_mode.py b/tests/test_product_enrich_partial_mode.py index 1c9f38b..446b182 100644 --- a/tests/test_product_enrich_partial_mode.py +++ b/tests/test_product_enrich_partial_mode.py @@ -295,16 +295,36 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): product.get("image_url", ""), ) - def fake_get_cached_analysis_result(product, target_lang, analysis_kind="content"): + def fake_get_cached_analysis_result( + product, + target_lang, + analysis_kind="content", + category_taxonomy_profile=None, + ): assert analysis_kind == "content" + assert category_taxonomy_profile is None return cache_store.get(_cache_key(product, target_lang)) - def fake_set_cached_analysis_result(product, target_lang, result, analysis_kind="content"): + def fake_set_cached_analysis_result( + product, + target_lang, + result, + analysis_kind="content", + category_taxonomy_profile=None, + ): assert analysis_kind == "content" + assert category_taxonomy_profile is None cache_store[_cache_key(product, target_lang)] = result - def fake_process_batch(batch_data, batch_num, target_lang="zh", analysis_kind="content"): + def fake_process_batch( + batch_data, + batch_num, + target_lang="zh", + analysis_kind="content", + category_taxonomy_profile=None, + ): assert analysis_kind == "content" + assert category_taxonomy_profile is None process_calls.append( { "batch_num": batch_num, @@ -400,7 +420,7 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity(): with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( product_enrich, "_get_cached_analysis_result", - wraps=lambda product, target_lang, analysis_kind="content": product_enrich._normalize_analysis_result( + wraps=lambda product, target_lang, analysis_kind="content", category_taxonomy_profile=None: product_enrich._normalize_analysis_result( 
cached_result, product=product, target_lang=target_lang, @@ -443,8 +463,10 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() batch_size=None, tenant_id=None, analysis_kind="content", + category_taxonomy_profile=None, ): if analysis_kind == "taxonomy": + assert category_taxonomy_profile == "apparel" return [ { "id": products[0]["id"], @@ -478,6 +500,7 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() "style_aesthetic": "", } ] + assert category_taxonomy_profile == "apparel" return [ { "id": products[0]["id"], -- libgit2 0.21.2