diff --git a/api/routes/indexer.py b/api/routes/indexer.py index a253593..1abe603 100644 --- a/api/routes/indexer.py +++ b/api/routes/indexer.py @@ -19,6 +19,11 @@ logger = logging.getLogger(__name__) router = APIRouter(prefix="/indexer", tags=["indexer"]) +SUPPORTED_CATEGORY_TAXONOMY_PROFILES = ( + "apparel, 3c, bags, pet_supplies, electronics, outdoor, " + "home_appliances, home_living, wigs, beauty, accessories, toys, shoes, sports, others" +) + class ReindexRequest(BaseModel): """全量重建索引请求""" @@ -105,8 +110,9 @@ class EnrichContentRequest(BaseModel): category_taxonomy_profile: str = Field( "apparel", description=( - "品类 taxonomy profile。当前默认且已支持的是 `apparel`。" - "未来可扩展为 `electronics` 等。" + "品类 taxonomy profile。默认 `apparel`。" + f"当前支持:{SUPPORTED_CATEGORY_TAXONOMY_PROFILES}。" + "其中除 `apparel` 外,其余 profile 的 taxonomy 输出仅返回 `en`。" ), ) analysis_kinds: Optional[List[Literal["content", "taxonomy"]]] = Field( diff --git a/docs/搜索API对接指南-05-索引接口(Indexer).md b/docs/搜索API对接指南-05-索引接口(Indexer).md index c87cf91..0b444c5 100644 --- a/docs/搜索API对接指南-05-索引接口(Indexer).md +++ b/docs/搜索API对接指南-05-索引接口(Indexer).md @@ -650,6 +650,28 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ - **端点**: `POST /indexer/enrich-content` - **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(通用语义属性)、**enriched_tags**(细分标签)、**enriched_taxonomy_attributes**(taxonomy 结构化属性),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。接口只暴露商品内容输入,语言选择、分析维度与最终字段结构统一由 `indexer.product_enrich` 内部决定;当前返回结果与 `search_products` mapping 保持一致。单次请求在线程池中执行,避免阻塞其他接口。 +当前支持的 `category_taxonomy_profile`: +- `apparel` +- `3c` +- `bags` +- `pet_supplies` +- `electronics` +- `outdoor` +- `home_appliances` +- `home_living` +- `wigs` +- `beauty` +- `accessories` +- `toys` +- `shoes` +- `sports` +- `others` + +说明: +- `apparel` 仍返回 `zh` + `en` 两种 taxonomy 值。 +- 其余 profile 的 `enriched_taxonomy_attributes.value` 只返回 `en`,以控制字段体积并保持结构简单。 +- Indexer 内部构建 ES 文档时,如果调用链没有显式指定 
profile,会优先根据商品的类目字段自动推断 taxonomy profile;外部调用 `/indexer/enrich-content` 时仍以请求中的 `category_taxonomy_profile` 为准。 + #### 请求参数 ```json @@ -678,7 +700,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ |------|------|------|--------|------| | `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| | `enrichment_scopes` | array[string] | N | `["generic", "category_taxonomy"]` | 选择要执行的增强范围。`generic` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`,`category_taxonomy` 生成 `enriched_taxonomy_attributes` | -| `category_taxonomy_profile` | string | N | `apparel` | 品类 taxonomy profile。当前内置为服装大类 `apparel`,后续可扩展到其他大类 | +| `category_taxonomy_profile` | string | N | `apparel` | 品类 taxonomy profile。支持:`apparel`、`3c`、`bags`、`pet_supplies`、`electronics`、`outdoor`、`home_appliances`、`home_living`、`wigs`、`beauty`、`accessories`、`toys`、`shoes`、`sports`、`others` | | `items` | array | Y | - | 待分析列表;**单次最多 50 条** | `items[]` 字段说明: @@ -704,7 +726,8 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ - 接口不接受语言控制参数。 - 返回哪些语言、返回哪些语义维度,统一由 `indexer.product_enrich` 内部逻辑决定。 -- 当前为了与 `search_products` mapping 对齐,返回结果只包含核心索引语言 `zh`、`en`。 +- 当前为了与 `search_products` mapping 对齐,通用增强字段只包含核心索引语言 `zh`、`en`。 +- taxonomy 字段中,`apparel` 返回 `zh`、`en`;其他 profile 仅返回 `en`。 批量请求建议: - **全量**:强烈建议 尽可能 **20 个 SPU/doc** 攒成一个批次后再请求一次。 @@ -764,7 +787,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 | | `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 | | `results[].enriched_attributes` | array | 与 ES `enriched_attributes` nested 字段同结构,每项为 `{ "name", "value": { "zh"?: "...", "en"?: "..." } }` | -| `results[].enriched_taxonomy_attributes` | array | 与 ES `enriched_taxonomy_attributes` nested 字段同结构,每项为 `{ "name", "value": { "zh"?: [...], "en"?: [...] 
} }` | +| `results[].enriched_taxonomy_attributes` | array | 与 ES `enriched_taxonomy_attributes` nested 字段同结构。`apparel` 每项通常为 `{ "name", "value": { "zh"?: [...], "en"?: [...] } }`;其他 profile 仅返回 `{ "name", "value": { "en": [...] } }` | | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 | **错误响应**: diff --git a/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md b/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md index 5f0a029..b3167e1 100644 --- a/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md +++ b/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @@ -444,7 +444,7 @@ curl "http://localhost:6006/health" - **Base URL**: Indexer 服务地址,如 `http://localhost:6004` - **路径**: `POST /indexer/enrich-content` -- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `enrichment_scopes` 选择执行 `generic` / `category_taxonomy`,并通过 `category_taxonomy_profile` 选择对应大类的 taxonomy prompt/profile;默认执行 `generic + category_taxonomy(apparel)`。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 +- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `enrichment_scopes` 选择执行 `generic` / `category_taxonomy`,并通过 `category_taxonomy_profile` 选择对应大类的 taxonomy prompt/profile;默认执行 `generic + category_taxonomy(apparel)`。当前支持的 taxonomy profile 包括 `apparel`、`3c`、`bags`、`pet_supplies`、`electronics`、`outdoor`、`home_appliances`、`home_living`、`wigs`、`beauty`、`accessories`、`toys`、`shoes`、`sports`、`others`。其中 `apparel` 的 taxonomy 输出为 `zh` + `en`,其余 profile 的 taxonomy 输出仅返回 `en`。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 diff --git a/indexer/document_transformer.py b/indexer/document_transformer.py index d9e56cb..e0fb19e 100644 --- a/indexer/document_transformer.py +++ b/indexer/document_transformer.py @@ -259,6 
+259,13 @@ class SPUDocumentTransformer: title = str(row.get("title") or "").strip() if not spu_id or not title: continue + category_path_obj = docs[i].get("category_path") or {} + resolved_category_path = "" + if isinstance(category_path_obj, dict): + resolved_category_path = next( + (str(value).strip() for value in category_path_obj.values() if str(value).strip()), + "", + ) id_to_idx[spu_id] = i items.append( { @@ -267,6 +274,9 @@ class SPUDocumentTransformer: "brief": str(row.get("brief") or "").strip(), "description": str(row.get("description") or "").strip(), "image_url": str(row.get("image_src") or "").strip(), + "category": str(row.get("category") or "").strip(), + "category_path": resolved_category_path, + "category1_name": str(docs[i].get("category1_name") or "").strip(), } ) if not items: @@ -677,6 +687,16 @@ class SPUDocumentTransformer: "brief": str(spu_row.get("brief") or "").strip(), "description": str(spu_row.get("description") or "").strip(), "image_url": str(spu_row.get("image_src") or "").strip(), + "category": str(spu_row.get("category") or "").strip(), + "category_path": next( + ( + str(value).strip() + for value in (doc.get("category_path") or {}).values() + if str(value).strip() + ), + "", + ), + "category1_name": str(doc.get("category1_name") or "").strip(), } ], tenant_id=str(tenant_id), diff --git a/indexer/product_enrich.py b/indexer/product_enrich.py index 6537e85..848287b 100644 --- a/indexer/product_enrich.py +++ b/indexer/product_enrich.py @@ -31,9 +31,7 @@ from indexer.product_enrich_prompts import ( USER_INSTRUCTION_TEMPLATE, LANGUAGE_MARKDOWN_TABLE_HEADERS, SHARED_ANALYSIS_INSTRUCTION, - TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS, - TAXONOMY_MARKDOWN_TABLE_HEADERS_EN, - TAXONOMY_SHARED_ANALYSIS_INSTRUCTION, + CATEGORY_TAXONOMY_PROFILES, ) # 配置 @@ -188,37 +186,6 @@ _CONTENT_ANALYSIS_FIELD_ALIASES = { "tags": ("tags", "enriched_tags"), } _CONTENT_ANALYSIS_QUALITY_FIELDS = ("title", "category_path", "anchor_text") 
-_APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP = ( - ("product_type", "Product Type"), - ("target_gender", "Target Gender"), - ("age_group", "Age Group"), - ("season", "Season"), - ("fit", "Fit"), - ("silhouette", "Silhouette"), - ("neckline", "Neckline"), - ("sleeve_length_type", "Sleeve Length Type"), - ("sleeve_style", "Sleeve Style"), - ("strap_type", "Strap Type"), - ("rise_waistline", "Rise / Waistline"), - ("leg_shape", "Leg Shape"), - ("skirt_shape", "Skirt Shape"), - ("length_type", "Length Type"), - ("closure_type", "Closure Type"), - ("design_details", "Design Details"), - ("fabric", "Fabric"), - ("material_composition", "Material Composition"), - ("fabric_properties", "Fabric Properties"), - ("clothing_features", "Clothing Features"), - ("functional_benefits", "Functional Benefits"), - ("color", "Color"), - ("color_family", "Color Family"), - ("print_pattern", "Print / Pattern"), - ("occasion_end_use", "Occasion / End Use"), - ("style_aesthetic", "Style Aesthetic"), -) -_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple( - field_name for field_name, _ in _APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP -) @dataclass(frozen=True) @@ -228,6 +195,7 @@ class AnalysisSchema: markdown_table_headers: Dict[str, List[str]] result_fields: Tuple[str, ...] meaningful_fields: Tuple[str, ...] + output_languages: Tuple[str, ...] 
= ("zh", "en") cache_version: str = "v1" field_aliases: Dict[str, Tuple[str, ...]] = field(default_factory=dict) fallback_headers: Optional[List[str]] = None @@ -249,36 +217,111 @@ _ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = { markdown_table_headers=LANGUAGE_MARKDOWN_TABLE_HEADERS, result_fields=_CONTENT_ANALYSIS_RESULT_FIELDS, meaningful_fields=_CONTENT_ANALYSIS_MEANINGFUL_FIELDS, + output_languages=_CORE_INDEX_LANGUAGES, cache_version="v2", field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES, quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS, ), } -_CATEGORY_TAXONOMY_PROFILE_SCHEMAS: Dict[str, AnalysisSchema] = { - "apparel": AnalysisSchema( - name="taxonomy:apparel", - shared_instruction=TAXONOMY_SHARED_ANALYSIS_INSTRUCTION, - markdown_table_headers=TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS, - result_fields=_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS, - meaningful_fields=_APPAREL_TAXONOMY_ANALYSIS_RESULT_FIELDS, +def _build_taxonomy_profile_schema(profile: str, config: Dict[str, Any]) -> AnalysisSchema: + result_fields = tuple(field["key"] for field in config["fields"]) + headers = config["markdown_table_headers"] + return AnalysisSchema( + name=f"taxonomy:{profile}", + shared_instruction=config["shared_instruction"], + markdown_table_headers=headers, + result_fields=result_fields, + meaningful_fields=result_fields, + output_languages=tuple(config["output_languages"]), cache_version="v1", - fallback_headers=TAXONOMY_MARKDOWN_TABLE_HEADERS_EN, - ), + fallback_headers=headers.get("en") if len(headers) > 1 else None, + ) + + +_CATEGORY_TAXONOMY_PROFILE_SCHEMAS: Dict[str, AnalysisSchema] = { + profile: _build_taxonomy_profile_schema(profile, config) + for profile, config in CATEGORY_TAXONOMY_PROFILES.items() } _CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS: Dict[str, Tuple[Tuple[str, str], ...]] = { - "apparel": _APPAREL_TAXONOMY_ATTRIBUTE_FIELD_MAP, + profile: tuple((field["key"], field["label"]) for field in config["fields"]) + for profile, config in 
CATEGORY_TAXONOMY_PROFILES.items() } +def get_supported_category_taxonomy_profiles() -> Tuple[str, ...]: + return tuple(_CATEGORY_TAXONOMY_PROFILE_SCHEMAS.keys()) + + +def _normalize_category_hint(text: Any) -> str: + value = str(text or "").strip().lower() + if not value: + return "" + value = value.replace("_", " ").replace(">", " ").replace("/", " ") + value = re.sub(r"\s+", " ", value) + return value + + +_CATEGORY_TAXONOMY_PROFILE_ALIAS_MATCHERS: Tuple[Tuple[str, str], ...] = tuple( + sorted( + ( + (_normalize_category_hint(alias), profile) + for profile, config in CATEGORY_TAXONOMY_PROFILES.items() + for alias in (profile, *tuple(config.get("aliases") or ())) + if _normalize_category_hint(alias) + ), + key=lambda item: len(item[0]), + reverse=True, + ) +) + + def _normalize_category_taxonomy_profile(category_taxonomy_profile: Optional[str] = None) -> str: profile = str(category_taxonomy_profile or _DEFAULT_CATEGORY_TAXONOMY_PROFILE).strip() if profile not in _CATEGORY_TAXONOMY_PROFILE_SCHEMAS: - raise ValueError(f"Unsupported category_taxonomy_profile: {profile}") + supported = ", ".join(get_supported_category_taxonomy_profiles()) + raise ValueError( + f"Unsupported category_taxonomy_profile: {profile}. 
Supported profiles: {supported}" + ) return profile +def detect_category_taxonomy_profile(item: Dict[str, Any]) -> Optional[str]: + """ + 根据商品已有类目信息猜测 taxonomy profile。 + 未命中时返回 None,由上层决定是否回退到默认 profile。 + """ + category_hints = ( + item.get("category_taxonomy_profile"), + item.get("category1_name"), + item.get("category_name_text"), + item.get("category"), + item.get("category_path"), + ) + for hint in category_hints: + normalized_hint = _normalize_category_hint(hint) + if not normalized_hint: + continue + for alias, profile in _CATEGORY_TAXONOMY_PROFILE_ALIAS_MATCHERS: + if alias and alias in normalized_hint: + return profile + return None + + +def _resolve_category_taxonomy_profile( + item: Dict[str, Any], + fallback_profile: Optional[str] = None, +) -> str: + explicit_profile = str(item.get("category_taxonomy_profile") or "").strip() + if explicit_profile: + return _normalize_category_taxonomy_profile(explicit_profile) + detected_profile = detect_category_taxonomy_profile(item) + if detected_profile: + return detected_profile + return _normalize_category_taxonomy_profile(fallback_profile) + + def _get_analysis_schema( analysis_kind: str, *, @@ -299,6 +342,17 @@ def _get_taxonomy_attribute_field_map( return _CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS[profile] +def _get_analysis_output_languages( + analysis_kind: str, + *, + category_taxonomy_profile: Optional[str] = None, +) -> Tuple[str, ...]: + return _get_analysis_schema( + analysis_kind, + category_taxonomy_profile=category_taxonomy_profile, + ).output_languages + + def _normalize_enrichment_scopes( enrichment_scopes: Optional[List[str]] = None, ) -> Tuple[str, ...]: @@ -508,6 +562,11 @@ def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: "brief": str(item.get("brief") or "").strip(), "description": str(item.get("description") or "").strip(), "image_url": str(item.get("image_url") or "").strip(), + "category": str(item.get("category") or "").strip(), + "category_path": 
str(item.get("category_path") or "").strip(), + "category_name_text": str(item.get("category_name_text") or "").strip(), + "category1_name": str(item.get("category1_name") or "").strip(), + "category_taxonomy_profile": str(item.get("category_taxonomy_profile") or "").strip(), } @@ -525,7 +584,8 @@ def build_index_content_fields( - `title` - 可选 `brief` / `description` / `image_url` - 可选 `enrichment_scopes`,默认同时执行 `generic` 与 `category_taxonomy` - - 可选 `category_taxonomy_profile`,默认 `apparel` + - 可选 `category_taxonomy_profile`;若不传,则优先根据 item 自带的类目字段推断,否则回退到默认 `apparel` + - 可选类目提示字段:`category` / `category_path` / `category_name_text` / `category1_name` 返回项结构: - `id` @@ -540,10 +600,21 @@ def build_index_content_fields( - `enriched_tags.{lang}` 为标签数组 """ requested_enrichment_scopes = _normalize_enrichment_scopes(enrichment_scopes) - normalized_taxonomy_profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) + fallback_taxonomy_profile = ( + _normalize_category_taxonomy_profile(category_taxonomy_profile) + if category_taxonomy_profile + else None + ) normalized_items = [_normalize_index_content_item(item) for item in items] if not normalized_items: return [] + taxonomy_profile_by_id = { + item["id"]: _resolve_category_taxonomy_profile( + item, + fallback_profile=fallback_taxonomy_profile, + ) + for item in normalized_items + } results_by_id: Dict[str, Dict[str, Any]] = { item["id"]: { @@ -556,7 +627,7 @@ def build_index_content_fields( for item in normalized_items } - for lang in _CORE_INDEX_LANGUAGES: + for lang in _get_analysis_output_languages("content"): if "generic" in requested_enrichment_scopes: try: rows = analyze_products( @@ -565,7 +636,7 @@ def build_index_content_fields( batch_size=BATCH_SIZE, tenant_id=tenant_id, analysis_kind="content", - category_taxonomy_profile=normalized_taxonomy_profile, + category_taxonomy_profile=fallback_taxonomy_profile, ) except Exception as e: logger.warning("build_index_content_fields content enrichment failed 
for lang=%s: %s", lang, e) @@ -582,39 +653,49 @@ def build_index_content_fields( continue _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) - if "category_taxonomy" in requested_enrichment_scopes: - try: - taxonomy_rows = analyze_products( - products=normalized_items, - target_lang=lang, - batch_size=BATCH_SIZE, - tenant_id=tenant_id, - analysis_kind="taxonomy", - category_taxonomy_profile=normalized_taxonomy_profile, - ) - except Exception as e: - logger.warning( - "build_index_content_fields taxonomy enrichment failed for lang=%s: %s", - lang, - e, - ) - for item in normalized_items: - results_by_id[item["id"]].setdefault("error", str(e)) - continue + if "category_taxonomy" in requested_enrichment_scopes: + items_by_profile: Dict[str, List[Dict[str, str]]] = {} + for item in normalized_items: + items_by_profile.setdefault(taxonomy_profile_by_id[item["id"]], []).append(item) - for row in taxonomy_rows or []: - item_id = str(row.get("id") or "").strip() - if not item_id or item_id not in results_by_id: - continue - if row.get("error"): - results_by_id[item_id].setdefault("error", row["error"]) + for taxonomy_profile, profile_items in items_by_profile.items(): + for lang in _get_analysis_output_languages( + "taxonomy", + category_taxonomy_profile=taxonomy_profile, + ): + try: + taxonomy_rows = analyze_products( + products=profile_items, + target_lang=lang, + batch_size=BATCH_SIZE, + tenant_id=tenant_id, + analysis_kind="taxonomy", + category_taxonomy_profile=taxonomy_profile, + ) + except Exception as e: + logger.warning( + "build_index_content_fields taxonomy enrichment failed for profile=%s lang=%s: %s", + taxonomy_profile, + lang, + e, + ) + for item in profile_items: + results_by_id[item["id"]].setdefault("error", str(e)) continue - _apply_index_taxonomy_row( - results_by_id[item_id], - row=row, - lang=lang, - category_taxonomy_profile=normalized_taxonomy_profile, - ) + + for row in taxonomy_rows or []: + item_id = str(row.get("id") or 
"").strip() + if not item_id or item_id not in results_by_id: + continue + if row.get("error"): + results_by_id[item_id].setdefault("error", row["error"]) + continue + _apply_index_taxonomy_row( + results_by_id[item_id], + row=row, + lang=lang, + category_taxonomy_profile=taxonomy_profile, + ) return [results_by_id[item["id"]] for item in normalized_items] diff --git a/indexer/product_enrich_prompts.py b/indexer/product_enrich_prompts.py index 704ad09..8c328cd 100644 --- a/indexer/product_enrich_prompts.py +++ b/indexer/product_enrich_prompts.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -from typing import Any, Dict +from typing import Any, Dict, Tuple SYSTEM_MESSAGE = ( "You are an e-commerce product annotator. " @@ -33,110 +33,362 @@ Input product list: USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation. Language: {language}""" -TAXONOMY_SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product text and fill the columns below using an apparel attribute taxonomy. +def _taxonomy_field( + key: str, + label: str, + description: str, + zh_label: str | None = None, +) -> Dict[str, str]: + return { + "key": key, + "label": label, + "description": description, + "zh_label": zh_label or label, + } -Output columns: -1. Product Type: concise ecommerce apparel category label, not a full marketing title -2. Target Gender: intended gender only if clearly implied -3. Age Group: only if clearly implied, e.g. adults, kids, teens, toddlers, babies -4. Season: season(s) or all-season suitability only if supported -5. Fit: body closeness, e.g. slim, regular, relaxed, oversized, fitted -6. Silhouette: overall garment shape, e.g. straight, A-line, boxy, tapered, bodycon, wide-leg -7. Neckline: neckline type when applicable, e.g. crew neck, V-neck, hooded, collared, square neck -8. 
Sleeve Length Type: sleeve length only, e.g. sleeveless, short sleeve, long sleeve, three-quarter sleeve -9. Sleeve Style: sleeve design only, e.g. puff sleeve, raglan sleeve, batwing sleeve, bell sleeve -10. Strap Type: strap design when applicable, e.g. spaghetti strap, wide strap, halter strap, adjustable strap -11. Rise / Waistline: waist placement when applicable, e.g. high rise, mid rise, low rise, empire waist -12. Leg Shape: for bottoms only, e.g. straight leg, wide leg, flare leg, tapered leg, skinny leg -13. Skirt Shape: for skirts only, e.g. A-line, pleated, pencil, mermaid -14. Length Type: design length only, not size, e.g. cropped, regular, longline, mini, midi, maxi, ankle length, full length -15. Closure Type: fastening method when applicable, e.g. zipper, button, drawstring, elastic waist, hook-and-loop -16. Design Details: construction or visual details, e.g. ruched, ruffled, pleated, cut-out, layered, distressed, split hem -17. Fabric: fabric type only, e.g. denim, knit, chiffon, jersey, fleece, cotton twill -18. Material Composition: fiber content or blend only if stated, e.g. cotton, polyester, spandex, linen blend, 95% cotton 5% elastane -19. Fabric Properties: inherent fabric traits, e.g. stretch, breathable, lightweight, soft-touch, water-resistant -20. Clothing Features: product features, e.g. lined, reversible, hooded, packable, padded, pocketed -21. Functional Benefits: wearer benefits, e.g. moisture-wicking, thermal insulation, UV protection, easy care, supportive compression -22. Color: specific color name when available -23. Color Family: normalized broad retail color group, e.g. black, white, blue, green, red, pink, beige, brown, gray -24. Print / Pattern: surface pattern when applicable, e.g. solid, striped, plaid, floral, graphic, animal print -25. Occasion / End Use: likely use occasion only if supported, e.g. office, casual wear, streetwear, lounge, workout, outdoor -26. Style Aesthetic: overall style only if supported, e.g. 
minimalist, streetwear, athleisure, smart casual, romantic, playful -Rules: -- Keep the same row order and row count as input. -- Infer only from the provided product text. -- Leave blank if not applicable or not reasonably supported. -- Use concise, standardized ecommerce wording. -- Do not combine different attribute dimensions in one field. -- If multiple values are needed, use the delimiter required by the localization setting. +def _build_taxonomy_shared_instruction(profile_label: str, fields: Tuple[Dict[str, str], ...]) -> str: + lines = [ + f"Analyze each input product text and fill the columns below using a {profile_label} attribute taxonomy.", + "", + "Output columns:", + ] + for idx, field in enumerate(fields, start=1): + lines.append(f"{idx}. {field['label']}: {field['description']}") + lines.extend( + [ + "", + "Rules:", + "- Keep the same row order and row count as input.", + "- Infer only from the provided product text.", + "- Leave blank if not applicable or not reasonably supported.", + "- Use concise, standardized ecommerce wording.", + "- Do not combine different attribute dimensions in one field.", + "- If multiple values are needed, use the delimiter required by the localization setting.", + "", + "Input product list:", + ] + ) + return "\n".join(lines) -Input product list: -""" -TAXONOMY_MARKDOWN_TABLE_HEADERS_EN = [ - "No.", - "Product Type", - "Target Gender", - "Age Group", - "Season", - "Fit", - "Silhouette", - "Neckline", - "Sleeve Length Type", - "Sleeve Style", - "Strap Type", - "Rise / Waistline", - "Leg Shape", - "Skirt Shape", - "Length Type", - "Closure Type", - "Design Details", - "Fabric", - "Material Composition", - "Fabric Properties", - "Clothing Features", - "Functional Benefits", - "Color", - "Color Family", - "Print / Pattern", - "Occasion / End Use", - "Style Aesthetic", -] +def _make_taxonomy_profile( + profile_label: str, + fields: Tuple[Dict[str, str], ...], + *, + aliases: Tuple[str, ...], + output_languages: Tuple[str, 
...] = ("en",), + zh_headers: Tuple[str, ...] = (), +) -> Dict[str, Any]: + headers = {"en": ["No.", *[field["label"] for field in fields]]} + if zh_headers: + headers["zh"] = ["序号", *zh_headers] + return { + "profile_label": profile_label, + "fields": fields, + "aliases": aliases, + "output_languages": output_languages, + "shared_instruction": _build_taxonomy_shared_instruction(profile_label, fields), + "markdown_table_headers": headers, + } -TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { - "en": TAXONOMY_MARKDOWN_TABLE_HEADERS_EN, - "zh": [ - "序号", - "品类", - "目标性别", - "年龄段", - "适用季节", - "版型", - "廓形", - "领型", - "袖长类型", - "袖型", - "肩带设计", - "腰型", - "裤型", - "裙型", - "长度类型", - "闭合方式", - "设计细节", - "面料", - "成分", - "面料特性", - "服装特征", - "功能", - "主颜色", - "色系", - "印花 / 图案", - "适用场景", - "风格", - ], + +APPAREL_TAXONOMY_FIELDS = ( + _taxonomy_field("product_type", "Product Type", "concise ecommerce apparel category label, not a full marketing title", "品类"), + _taxonomy_field("target_gender", "Target Gender", "intended gender only if clearly implied", "目标性别"), + _taxonomy_field("age_group", "Age Group", "only if clearly implied, e.g. adults, kids, teens, toddlers, babies", "年龄段"), + _taxonomy_field("season", "Season", "season(s) or all-season suitability only if supported", "适用季节"), + _taxonomy_field("fit", "Fit", "body closeness, e.g. slim, regular, relaxed, oversized, fitted", "版型"), + _taxonomy_field("silhouette", "Silhouette", "overall garment shape, e.g. straight, A-line, boxy, tapered, bodycon, wide-leg", "廓形"), + _taxonomy_field("neckline", "Neckline", "neckline type when applicable, e.g. crew neck, V-neck, hooded, collared, square neck", "领型"), + _taxonomy_field("sleeve_length_type", "Sleeve Length Type", "sleeve length only, e.g. sleeveless, short sleeve, long sleeve, three-quarter sleeve", "袖长类型"), + _taxonomy_field("sleeve_style", "Sleeve Style", "sleeve design only, e.g. 
puff sleeve, raglan sleeve, batwing sleeve, bell sleeve", "袖型"), + _taxonomy_field("strap_type", "Strap Type", "strap design when applicable, e.g. spaghetti strap, wide strap, halter strap, adjustable strap", "肩带设计"), + _taxonomy_field("rise_waistline", "Rise / Waistline", "waist placement when applicable, e.g. high rise, mid rise, low rise, empire waist", "腰型"), + _taxonomy_field("leg_shape", "Leg Shape", "for bottoms only, e.g. straight leg, wide leg, flare leg, tapered leg, skinny leg", "裤型"), + _taxonomy_field("skirt_shape", "Skirt Shape", "for skirts only, e.g. A-line, pleated, pencil, mermaid", "裙型"), + _taxonomy_field("length_type", "Length Type", "design length only, not size, e.g. cropped, regular, longline, mini, midi, maxi, ankle length, full length", "长度类型"), + _taxonomy_field("closure_type", "Closure Type", "fastening method when applicable, e.g. zipper, button, drawstring, elastic waist, hook-and-loop", "闭合方式"), + _taxonomy_field("design_details", "Design Details", "construction or visual details, e.g. ruched, ruffled, pleated, cut-out, layered, distressed, split hem", "设计细节"), + _taxonomy_field("fabric", "Fabric", "fabric type only, e.g. denim, knit, chiffon, jersey, fleece, cotton twill", "面料"), + _taxonomy_field("material_composition", "Material Composition", "fiber content or blend only if stated, e.g. cotton, polyester, spandex, linen blend, 95% cotton 5% elastane", "成分"), + _taxonomy_field("fabric_properties", "Fabric Properties", "inherent fabric traits, e.g. stretch, breathable, lightweight, soft-touch, water-resistant", "面料特性"), + _taxonomy_field("clothing_features", "Clothing Features", "product features, e.g. lined, reversible, hooded, packable, padded, pocketed", "服装特征"), + _taxonomy_field("functional_benefits", "Functional Benefits", "wearer benefits, e.g. 
moisture-wicking, thermal insulation, UV protection, easy care, supportive compression", "功能"), + _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), + _taxonomy_field("color_family", "Color Family", "normalized broad retail color group, e.g. black, white, blue, green, red, pink, beige, brown, gray", "色系"), + _taxonomy_field("print_pattern", "Print / Pattern", "surface pattern when applicable, e.g. solid, striped, plaid, floral, graphic, animal print", "印花 / 图案"), + _taxonomy_field("occasion_end_use", "Occasion / End Use", "likely use occasion only if supported, e.g. office, casual wear, streetwear, lounge, workout, outdoor", "适用场景"), + _taxonomy_field("style_aesthetic", "Style Aesthetic", "overall style only if supported, e.g. minimalist, streetwear, athleisure, smart casual, romantic, playful", "风格"), +) + +THREE_C_TAXONOMY_FIELDS = ( + _taxonomy_field("product_type", "Product Type", "concise 3C accessory or peripheral category label"), + _taxonomy_field("compatible_device", "Compatible Device / Model", "supported device family, series, model, or form factor when clearly stated"), + _taxonomy_field("connectivity", "Connectivity", "connection method such as wired, wireless, Bluetooth, Wi-Fi, NFC, or 2.4G"), + _taxonomy_field("interface_port_type", "Interface / Port Type", "relevant connector or port, e.g. USB-C, Lightning, HDMI, AUX, RJ45"), + _taxonomy_field("power_charging", "Power Source / Charging", "charging or power mode, e.g. 
battery powered, fast charging, rechargeable, plug-in"), + _taxonomy_field("key_features", "Key Features", "primary hardware features such as noise cancelling, foldable, magnetic, backlit, waterproof"), + _taxonomy_field("material_finish", "Material / Finish", "main material or exterior finish when supported"), + _taxonomy_field("color", "Color", "specific color name when available"), + _taxonomy_field("pack_size", "Pack Size", "unit count or bundle size when stated"), + _taxonomy_field("use_case", "Use Case", "intended usage such as travel, office, gaming, car, charging, streaming"), +) + +BAGS_TAXONOMY_FIELDS = ( + _taxonomy_field("product_type", "Product Type", "concise bag category such as backpack, tote bag, crossbody bag, luggage, or wallet"), + _taxonomy_field("target_gender", "Target Gender", "intended gender only if clearly implied"), + _taxonomy_field("carry_style", "Carry Style", "how the bag is worn or carried, e.g. handheld, shoulder, crossbody, backpack"), + _taxonomy_field("size_capacity", "Size / Capacity", "size tier or capacity when supported, e.g. 
mini, large capacity, 20L"), + _taxonomy_field("material", "Material", "main bag material such as leather, nylon, canvas, PU, straw"), + _taxonomy_field("closure_type", "Closure Type", "bag closure such as zipper, flap, buckle, drawstring, magnetic snap"), + _taxonomy_field("structure_compartments", "Structure / Compartments", "organizational structure such as multi-pocket, laptop sleeve, card slots, expandable"), + _taxonomy_field("strap_handle_type", "Strap / Handle Type", "strap or handle design such as chain strap, top handle, adjustable strap"), + _taxonomy_field("color", "Color", "specific color name when available"), + _taxonomy_field("occasion_end_use", "Occasion / End Use", "likely use such as commute, travel, evening, school, casual"), +) + +PET_SUPPLIES_TAXONOMY_FIELDS = ( + _taxonomy_field("product_type", "Product Type", "concise pet supplies category label"), + _taxonomy_field("pet_type", "Pet Type", "target pet such as dog, cat, bird, fish, hamster"), + _taxonomy_field("breed_size", "Breed Size", "pet size or breed size when stated, e.g. small breed, large dogs"), + _taxonomy_field("life_stage", "Life Stage", "pet age stage when supported, e.g. 
puppy, kitten, adult, senior"), + _taxonomy_field("material_ingredients", "Material / Ingredients", "main material or ingredient composition when supported"), + _taxonomy_field("flavor_scent", "Flavor / Scent", "flavor or scent when applicable"), + _taxonomy_field("key_features", "Key Features", "primary attributes such as interactive, leak-proof, orthopedic, washable, elevated"), + _taxonomy_field("functional_benefits", "Functional Benefits", "benefits such as dental care, calming, digestion support, joint support"), + _taxonomy_field("size_capacity", "Size / Capacity", "size, count, or net content when stated"), + _taxonomy_field("use_scenario", "Use Scenario", "usage such as feeding, training, grooming, travel, indoor play"), +) + +ELECTRONICS_TAXONOMY_FIELDS = ( + _taxonomy_field("product_type", "Product Type", "concise electronics device or component category label"), + _taxonomy_field("device_category", "Device Category / Compatibility", "supported platform, component class, or compatible device family when stated"), + _taxonomy_field("power_voltage", "Power / Voltage", "power, voltage, wattage, or battery spec when supported"), + _taxonomy_field("connectivity", "Connectivity", "connection method such as wired, Bluetooth, Wi-Fi, RF, or smart app control"), + _taxonomy_field("interface_port_type", "Interface / Port Type", "relevant port or interface such as USB-C, AC plug type, HDMI, SATA"), + _taxonomy_field("capacity_storage", "Capacity / Storage", "capacity or storage spec such as 256GB, 2TB, 5000mAh"), + _taxonomy_field("key_features", "Key Features", "main product features such as touch control, HD display, noise reduction, smart control"), + _taxonomy_field("material_finish", "Material / Finish", "main housing material or finish when supported"), + _taxonomy_field("color", "Color", "specific color name when available"), + _taxonomy_field("use_case", "Use Case", "intended use such as home entertainment, office, charging, security, repair"), +) + 
+OUTDOOR_TAXONOMY_FIELDS = (  # "outdoor" profile columns, same order as indexer/taxonomy.md; args: field key, English label, description of the expected value
+    _taxonomy_field("product_type", "Product Type", "concise outdoor gear category label"),
+    _taxonomy_field("activity_type", "Activity Type", "primary outdoor activity such as camping, hiking, fishing, climbing, travel"),
+    _taxonomy_field("season_weather", "Season / Weather", "season or weather suitability when supported"),
+    _taxonomy_field("material", "Material", "main material such as aluminum, ripstop nylon, stainless steel, EVA"),
+    _taxonomy_field("capacity_size", "Capacity / Size", "size, length, or capacity when stated"),
+    _taxonomy_field("protection_resistance", "Protection / Resistance", "resistance or protection such as waterproof, UV resistant, windproof"),
+    _taxonomy_field("key_features", "Key Features", "primary gear attributes such as foldable, lightweight, insulated, non-slip"),
+    _taxonomy_field("portability_packability", "Portability / Packability", "carry or storage trait such as collapsible, compact, ultralight, packable"),
+    _taxonomy_field("color", "Color", "specific color name when available"),
+    _taxonomy_field("use_scenario", "Use Scenario", "likely use setting such as campsite, trail, survival kit, beach, picnic"),
+)
+
+HOME_APPLIANCES_TAXONOMY_FIELDS = (  # "home_appliances" profile columns, same order as indexer/taxonomy.md
+    _taxonomy_field("product_type", "Product Type", "concise home appliance category label"),
+    _taxonomy_field("appliance_category", "Appliance Category", "functional class such as kitchen appliance, cleaning appliance, personal care appliance"),
+    _taxonomy_field("power_voltage", "Power / Voltage", "wattage, voltage, plug type, or power supply when supported"),
+    _taxonomy_field("capacity_coverage", "Capacity / Coverage", "capacity or coverage metric such as 1.5L, 20L, 40sqm"),
+    _taxonomy_field("control_method", "Control Method", "operation method such as touch, knob, remote, app control"),
+    _taxonomy_field("installation_type", "Installation Type", "setup style such as countertop, handheld, portable, wall-mounted, built-in"),
+    _taxonomy_field("key_features", "Key Features", "main product features such as timer, steam, HEPA filter, self-cleaning"),
+    _taxonomy_field("material_finish", "Material / Finish", "main material or exterior finish when supported"),
+    _taxonomy_field("color", "Color", "specific color name when available"),
+    _taxonomy_field("use_scenario", "Use Scenario", "intended use such as cooking, cleaning, grooming, cooling, air treatment"),
+)
+
+HOME_LIVING_TAXONOMY_FIELDS = (  # "home_living" profile columns, same order as indexer/taxonomy.md
+    _taxonomy_field("product_type", "Product Type", "concise home and living category label"),
+    _taxonomy_field("room_placement", "Room / Placement", "intended room or placement such as bedroom, kitchen, bathroom, desktop"),
+    _taxonomy_field("material", "Material", "main material such as wood, ceramic, cotton, glass, metal"),
+    _taxonomy_field("style", "Style", "home style such as modern, farmhouse, minimalist, boho, Nordic"),
+    _taxonomy_field("size_dimensions", "Size / Dimensions", "size or dimensions when stated"),
+    _taxonomy_field("color", "Color", "specific color name when available"),
+    _taxonomy_field("pattern_finish", "Pattern / Finish", "surface pattern or finish such as solid, marble, matte, ribbed"),
+    _taxonomy_field("key_features", "Key Features", "main product features such as stackable, washable, blackout, space-saving"),
+    _taxonomy_field("assembly_installation", "Assembly / Installation", "assembly or installation trait when supported"),
+    _taxonomy_field("use_scenario", "Use Scenario", "intended use such as storage, dining, decor, sleep, organization"),
+)
+
+WIGS_TAXONOMY_FIELDS = (  # "wigs" profile columns, same order as indexer/taxonomy.md
+    _taxonomy_field("product_type", "Product Type", "concise wig or hairpiece category label"),
+    _taxonomy_field("hair_material", "Hair Material", "hair material such as human hair, synthetic fiber, heat-resistant fiber"),
+    _taxonomy_field("hair_texture", "Hair Texture", "texture or curl pattern such as straight, body wave, curly, kinky"),
+    _taxonomy_field("hair_length", "Hair Length", "hair length when stated"),
+    _taxonomy_field("hair_color", "Hair Color", "specific hair color or blend when available"),
+    _taxonomy_field("cap_construction", "Cap Construction", "cap type such as full lace, lace front, glueless, U part"),
+    _taxonomy_field("lace_area_part_type", "Lace Area / Part Type", "lace size or part style such as 13x4 lace, middle part, T part"),
+    _taxonomy_field("density_volume", "Density / Volume", "hair density or fullness when supported"),
+    _taxonomy_field("style_bang_type", "Style / Bang Type", "style cue such as bob, pixie, layered, with bangs"),
+    _taxonomy_field("occasion_end_use", "Occasion / End Use", "intended use such as daily wear, cosplay, protective style, party"),
+)
+
+BEAUTY_TAXONOMY_FIELDS = (  # "beauty" profile columns, same order as indexer/taxonomy.md
+    _taxonomy_field("product_type", "Product Type", "concise beauty or cosmetics category label"),
+    _taxonomy_field("target_area", "Target Area", "target area such as face, lips, eyes, nails, hair, body"),
+    _taxonomy_field("skin_hair_type", "Skin Type / Hair Type", "suitable skin or hair type when supported"),
+    _taxonomy_field("finish_effect", "Finish / Effect", "cosmetic finish or effect such as matte, dewy, volumizing, brightening"),
+    _taxonomy_field("key_ingredients", "Key Ingredients", "notable ingredients when stated"),
+    _taxonomy_field("shade_color", "Shade / Color", "specific shade or color when available"),
+    _taxonomy_field("scent", "Scent", "fragrance or scent only when supported"),
+    _taxonomy_field("formulation", "Formulation", "product form such as cream, serum, powder, gel, stick"),
+    _taxonomy_field("functional_benefits", "Functional Benefits", "benefits such as hydration, anti-aging, long-wear, repair, sun protection"),
+    _taxonomy_field("use_scenario", "Use Scenario", "intended use such as daily routine, salon, travel, evening makeup"),
+)
+
+ACCESSORIES_TAXONOMY_FIELDS = (  # "accessories" profile columns, same order as indexer/taxonomy.md
+    _taxonomy_field("product_type", "Product Type", "concise accessory category label such as necklace, watch, belt, hat, or sunglasses"),
+    _taxonomy_field("target_gender", "Target Gender", "intended gender only if clearly implied"),
+    _taxonomy_field("material", "Material", "main material such as alloy, leather, stainless steel, acetate, fabric"),
+    _taxonomy_field("color", "Color", "specific color name when available"),
+    _taxonomy_field("pattern_finish", "Pattern / Finish", "surface treatment or style finish such as polished, textured, braided, rhinestone"),
+    _taxonomy_field("closure_fastening", "Closure / Fastening", "fastening method when applicable"),
+    _taxonomy_field("size_fit", "Size / Fit", "size or fit information such as adjustable, one size, 42mm"),
+    _taxonomy_field("style", "Style", "style cue such as minimalist, vintage, statement, sporty"),
+    _taxonomy_field("occasion_end_use", "Occasion / End Use", "likely use such as daily wear, formal, party, travel, sun protection"),
+    _taxonomy_field("set_pack_size", "Set / Pack Size", "set count or pack size when stated"),
+)
+
+TOYS_TAXONOMY_FIELDS = (  # "toys" profile columns, same order as indexer/taxonomy.md
+    _taxonomy_field("product_type", "Product Type", "concise toy category label"),
+    _taxonomy_field("age_group", "Age Group", "intended age group when clearly implied"),
+    _taxonomy_field("character_theme", "Character / Theme", "licensed character, theme, or play theme when supported"),
+    _taxonomy_field("material", "Material", "main toy material such as plush, plastic, wood, silicone"),
+    _taxonomy_field("power_source", "Power Source", "battery, rechargeable, wind-up, or non-powered when supported"),
+    _taxonomy_field("interactive_features", "Interactive Features", "interactive functions such as sound, lights, remote control, motion"),
+    _taxonomy_field("educational_play_value", "Educational / Play Value", "play value such as STEM, pretend play, sensory, puzzle solving"),
+    _taxonomy_field("piece_count_size", "Piece Count / Size", "piece count or size when stated"),
+    _taxonomy_field("color", "Color", "specific color name when available"),
+    _taxonomy_field("use_scenario", "Use Scenario", "intended use such as indoor play, bath time, party favor, outdoor play"),
+)
+
+SHOES_TAXONOMY_FIELDS = (  # "shoes" profile columns, same order as indexer/taxonomy.md
+    _taxonomy_field("product_type", "Product Type", "concise footwear category label"),
+    _taxonomy_field("target_gender", "Target Gender", "intended gender only if clearly implied"),
+    _taxonomy_field("age_group", "Age Group", "only if clearly implied"),
+    _taxonomy_field("closure_type", "Closure Type", "fastening method such as lace-up, slip-on, buckle, hook-and-loop"),
+    _taxonomy_field("toe_shape", "Toe Shape", "toe shape when applicable, e.g. round toe, pointed toe, open toe"),
+    _taxonomy_field("heel_sole_type", "Heel Height / Sole Type", "heel or sole profile such as flat, block heel, wedge, platform, thick sole"),
+    _taxonomy_field("upper_material", "Upper Material", "main upper material such as leather, knit, canvas, mesh"),
+    _taxonomy_field("lining_insole_material", "Lining / Insole Material", "lining or insole material when supported"),
+    _taxonomy_field("color", "Color", "specific color name when available"),
+    _taxonomy_field("occasion_end_use", "Occasion / End Use", "likely use such as running, casual, office, hiking, formal"),
+)
+
+SPORTS_TAXONOMY_FIELDS = (  # "sports" profile columns, same order as indexer/taxonomy.md
+    _taxonomy_field("product_type", "Product Type", "concise sports product category label"),
+    _taxonomy_field("sport_activity", "Sport / Activity", "primary sport or activity such as fitness, yoga, basketball, cycling, swimming"),
+    _taxonomy_field("skill_level", "Skill Level", "target user level when supported, e.g. beginner, training, professional"),
+    _taxonomy_field("material", "Material", "main material such as EVA, carbon fiber, neoprene, latex"),
+    _taxonomy_field("size_capacity", "Size / Capacity", "size, weight, resistance level, or capacity when stated"),
+    _taxonomy_field("protection_support", "Protection / Support", "support or protection function such as ankle support, shock absorption, impact protection"),
+    _taxonomy_field("key_features", "Key Features", "main features such as anti-slip, adjustable, foldable, quick-dry"),
+    _taxonomy_field("power_source", "Power Source", "battery, electric, or non-powered when applicable"),
+    _taxonomy_field("color", "Color", "specific color name when available"),
+    _taxonomy_field("use_scenario", "Use Scenario", "intended use such as gym, home workout, field training, competition"),
+)
+
+OTHERS_TAXONOMY_FIELDS = (  # "others" profile columns (registered under the label "general merchandise"), same order as indexer/taxonomy.md
+    _taxonomy_field("product_type", "Product Type", "concise product category label, not a full marketing title"),
+    _taxonomy_field("product_category", "Product Category", "broader retail grouping when the specific product type is narrow"),
+    _taxonomy_field("target_user", "Target User", "intended user, audience, or recipient when clearly implied"),
+    _taxonomy_field("material_ingredients", "Material / Ingredients", "main material or ingredients when supported"),
+    _taxonomy_field("key_features", "Key Features", "primary product attributes or standout features"),
+    _taxonomy_field("functional_benefits", "Functional Benefits", "practical benefits or performance advantages when supported"),
+    _taxonomy_field("size_capacity", "Size / Capacity", "size, count, weight, or capacity when stated"),
+    _taxonomy_field("color", "Color", "specific color name when available"),
+    _taxonomy_field("style_theme", "Style / Theme", "overall style, design theme, or visual direction when supported"),
+    _taxonomy_field("use_scenario", "Use Scenario", "likely use occasion or application setting when supported"),
+)
+
+CATEGORY_TAXONOMY_PROFILES: Dict[str, Dict[str, Any]] = {  # profile slug -> profile built by _make_taxonomy_profile(label, fields, aliases=...)
+    "apparel": _make_taxonomy_profile(
+        "apparel",
+        APPAREL_TAXONOMY_FIELDS,
+        aliases=("服装", "服饰", "apparel", "clothing", "fashion"),
+        output_languages=("zh", "en"),  # only this profile passes output_languages / zh_headers explicitly
+        zh_headers=tuple(field["zh_label"] for field in APPAREL_TAXONOMY_FIELDS),
+    ),
+    "3c": _make_taxonomy_profile(
+        "3C",
+        THREE_C_TAXONOMY_FIELDS,
+        aliases=("3c", "数码", "phone accessories", "computer peripherals", "smart wearables", "audio", "gaming gear"),
+    ),
+    "bags": _make_taxonomy_profile(
+        "bags",
+        BAGS_TAXONOMY_FIELDS,
+        aliases=("bags", "bag", "包", "箱包", "handbag", "backpack", "wallet", "luggage"),
+    ),
+    "pet_supplies": _make_taxonomy_profile(
+        "pet supplies",
+        PET_SUPPLIES_TAXONOMY_FIELDS,
+        aliases=("pet", "宠物", "pet supplies", "pet food", "pet toys", "pet care"),
+    ),
+    "electronics": _make_taxonomy_profile(
+        "electronics",
+        ELECTRONICS_TAXONOMY_FIELDS,
+        aliases=("electronics", "电子", "electronic components", "consumer electronics", "digital devices"),
+    ),
+    "outdoor": _make_taxonomy_profile(
+        "outdoor products",
+        OUTDOOR_TAXONOMY_FIELDS,
+        aliases=("outdoor", "户外", "camping", "hiking", "fishing", "travel accessories"),
+    ),
+    "home_appliances": _make_taxonomy_profile(
+        "home appliances",
+        HOME_APPLIANCES_TAXONOMY_FIELDS,
+        aliases=("home appliances", "家电", "电器", "kitchen appliances", "cleaning appliances", "smart home devices"),
+    ),
+    "home_living": _make_taxonomy_profile(
+        "home and living",
+        HOME_LIVING_TAXONOMY_FIELDS,
+        aliases=("home", "living", "家居", "家具", "家纺", "home decor", "kitchenware"),  # NOTE(review): "home"/"living" are broad; confirm alias matching cannot shadow "home appliances"
+    ),
+    "wigs": _make_taxonomy_profile(
+        "wigs",
+        WIGS_TAXONOMY_FIELDS,
+        aliases=("wig", "wigs", "假发", "hairpiece"),
+    ),
+    "beauty": _make_taxonomy_profile(
+        "beauty and cosmetics",
+        BEAUTY_TAXONOMY_FIELDS,
+        aliases=("beauty", "cosmetics", "美容", "美妆", "makeup", "skincare", "nail care"),
+    ),
+    "accessories": _make_taxonomy_profile(
+        "accessories",
+        ACCESSORIES_TAXONOMY_FIELDS,
+        aliases=("accessories", "配饰", "jewelry", "watches", "belts", "scarves", "hats", "sunglasses"),
+    ),
+    "toys": _make_taxonomy_profile(
+        "toys",
+        TOYS_TAXONOMY_FIELDS,
+        aliases=("toys", "toy", "玩具", "plush", "action figures", "puzzles", "educational toys"),
+    ),
+    "shoes": _make_taxonomy_profile(
+        "shoes",
+        SHOES_TAXONOMY_FIELDS,
+        aliases=("shoes", "shoe", "鞋", "sneakers", "boots", "sandals", "heels"),
+    ),
+    "sports": _make_taxonomy_profile(
+        "sports products",
+        SPORTS_TAXONOMY_FIELDS,
+        aliases=("sports", "sport", "运动", "fitness", "cycling", "team sports", "water sports"),
+    ),
+    "others": _make_taxonomy_profile(
+        "general merchandise",
+        OTHERS_TAXONOMY_FIELDS,
+        aliases=("others", "other", "其他", "general merchandise"),
+    ),
 }
+CATEGORY_TAXONOMY_PROFILE_NAMES = tuple(CATEGORY_TAXONOMY_PROFILES.keys())  # profile slugs in registry declaration order
+TAXONOMY_SHARED_ANALYSIS_INSTRUCTION = CATEGORY_TAXONOMY_PROFILES["apparel"]["shared_instruction"]  # this and the two names below alias values of the "apparel" profile
+TAXONOMY_MARKDOWN_TABLE_HEADERS_EN = CATEGORY_TAXONOMY_PROFILES["apparel"]["markdown_table_headers"]["en"]
+TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = CATEGORY_TAXONOMY_PROFILES["apparel"]["markdown_table_headers"]  # NOTE(review): value shape comes from _make_taxonomy_profile (not shown here); confirm it matches this annotation
+
 LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = {
     "en": [
         "No.",
diff --git a/indexer/taxonomy.md b/indexer/taxonomy.md
index 1abb4b1..7dfd351 100644
--- a/indexer/taxonomy.md
+++ b/indexer/taxonomy.md
@@ -171,3 +171,27 @@ Rules:
 Input product list:
 """
 ```
+
+## 2. 
Other taxonomy profiles + +说明: +- `apparel` 继续返回 `zh` + `en`。 +- 其他 profile 只返回 `en`,并且只定义英文列名。 +- 代码中的 profile slug 与下面保持一致。 + +| Profile | Core columns (`en`) | +| --- | --- | +| `3c` | Product Type, Compatible Device / Model, Connectivity, Interface / Port Type, Power Source / Charging, Key Features, Material / Finish, Color, Pack Size, Use Case | +| `bags` | Product Type, Target Gender, Carry Style, Size / Capacity, Material, Closure Type, Structure / Compartments, Strap / Handle Type, Color, Occasion / End Use | +| `pet_supplies` | Product Type, Pet Type, Breed Size, Life Stage, Material / Ingredients, Flavor / Scent, Key Features, Functional Benefits, Size / Capacity, Use Scenario | +| `electronics` | Product Type, Device Category / Compatibility, Power / Voltage, Connectivity, Interface / Port Type, Capacity / Storage, Key Features, Material / Finish, Color, Use Case | +| `outdoor` | Product Type, Activity Type, Season / Weather, Material, Capacity / Size, Protection / Resistance, Key Features, Portability / Packability, Color, Use Scenario | +| `home_appliances` | Product Type, Appliance Category, Power / Voltage, Capacity / Coverage, Control Method, Installation Type, Key Features, Material / Finish, Color, Use Scenario | +| `home_living` | Product Type, Room / Placement, Material, Style, Size / Dimensions, Color, Pattern / Finish, Key Features, Assembly / Installation, Use Scenario | +| `wigs` | Product Type, Hair Material, Hair Texture, Hair Length, Hair Color, Cap Construction, Lace Area / Part Type, Density / Volume, Style / Bang Type, Occasion / End Use | +| `beauty` | Product Type, Target Area, Skin Type / Hair Type, Finish / Effect, Key Ingredients, Shade / Color, Scent, Formulation, Functional Benefits, Use Scenario | +| `accessories` | Product Type, Target Gender, Material, Color, Pattern / Finish, Closure / Fastening, Size / Fit, Style, Occasion / End Use, Set / Pack Size | +| `toys` | Product Type, Age Group, Character / Theme, Material, Power 
Source, Interactive Features, Educational / Play Value, Piece Count / Size, Color, Use Scenario |
+| `shoes` | Product Type, Target Gender, Age Group, Closure Type, Toe Shape, Heel Height / Sole Type, Upper Material, Lining / Insole Material, Color, Occasion / End Use |
+| `sports` | Product Type, Sport / Activity, Skill Level, Material, Size / Capacity, Protection / Support, Key Features, Power Source, Color, Use Scenario |
+| `others` | Product Type, Product Category, Target User, Material / Ingredients, Key Features, Functional Benefits, Size / Capacity, Color, Style / Theme, Use Scenario |
diff --git a/tests/ci/test_service_api_contracts.py b/tests/ci/test_service_api_contracts.py
index 113c442..cfa0fc7 100644
--- a/tests/ci/test_service_api_contracts.py
+++ b/tests/ci/test_service_api_contracts.py
@@ -454,6 +454,52 @@ def test_indexer_enrich_content_contract_accepts_deprecated_analysis_kinds(index
     assert data["category_taxonomy_profile"] == "apparel"
 
 
+def test_indexer_enrich_content_contract_supports_non_apparel_taxonomy_profiles(indexer_client: TestClient, monkeypatch):
+    import indexer.product_enrich as process_products  # module whose build_index_content_fields is replaced below
+
+    def _fake_build_index_content_fields(  # stub: checks that the route forwards tenant, scopes and profile unchanged
+        items: List[Dict[str, str]],
+        tenant_id: str | None = None,
+        enrichment_scopes: List[str] | None = None,
+        category_taxonomy_profile: str = "apparel",
+    ):
+        assert tenant_id == "162"
+        assert enrichment_scopes == ["category_taxonomy"]
+        assert category_taxonomy_profile == "toys"  # the requested profile must arrive, not the "apparel" default
+        return [
+            {
+                "id": items[0]["spu_id"],
+                "qanchors": {},
+                "enriched_tags": {},
+                "enriched_attributes": [],
+                "enriched_taxonomy_attributes": [
+                    {"name": "Product Type", "value": {"en": ["doll set"]}},
+                    {"name": "Age Group", "value": {"en": ["kids"]}},
+                ],
+            }
+        ]
+
+    monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields)
+
+    response = indexer_client.post(
+        "/indexer/enrich-content",
+        json={
+            "tenant_id": "162",
+            "enrichment_scopes": ["category_taxonomy"],
+            "category_taxonomy_profile": "toys",
+            "items": [{"spu_id": "1001", "title": "Toy"}],
+        },
+    )
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["category_taxonomy_profile"] == "toys"  # the response echoes the requested profile
+    assert data["results"][0]["enriched_taxonomy_attributes"] == [  # en-only values from the stub are returned as-is
+        {"name": "Product Type", "value": {"en": ["doll set"]}},
+        {"name": "Age Group", "value": {"en": ["kids"]}},
+    ]
+
+
 def test_indexer_documents_contract(indexer_client: TestClient):
     """POST /indexer/documents: tenant_id + spu_ids, returns success/failed lists (no ES write)."""
     response = indexer_client.post(
diff --git a/tests/test_product_enrich_partial_mode.py b/tests/test_product_enrich_partial_mode.py
index 446b182..7405f90 100644
--- a/tests/test_product_enrich_partial_mode.py
+++ b/tests/test_product_enrich_partial_mode.py
@@ -500,7 +500,6 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output()
                     "style_aesthetic": "",
                 }
             ]
-        assert category_taxonomy_profile == "apparel"
         return [
             {
                 "id": products[0]["id"],
@@ -562,6 +561,120 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output()
     ]
 
 
+def test_detect_category_taxonomy_profile_matches_category_hints():  # category text under different item keys resolves to a profile slug
+    assert product_enrich.detect_category_taxonomy_profile({"category1_name": "玩具"}) == "toys"
+    assert product_enrich.detect_category_taxonomy_profile({"category": "Beauty & Cosmetics"}) == "beauty"
+    assert product_enrich.detect_category_taxonomy_profile({"category_path": "Home Appliances / Kitchen"}) == "home_appliances"
+
+
+def test_build_index_content_fields_routes_taxonomy_by_item_profile_and_non_apparel_returns_en_only():
+    seen_calls = []  # one (analysis_kind, target_lang, profile, product ids) tuple per analyze_products call
+
+    def fake_analyze_products(  # stand-in for product_enrich.analyze_products; records each call and returns canned rows
+        products,
+        target_lang="zh",
+        batch_size=None,
+        tenant_id=None,
+        analysis_kind="content",
+        category_taxonomy_profile=None,
+    ):
+        seen_calls.append((analysis_kind, target_lang, category_taxonomy_profile, tuple(p["id"] for p in products)))
+        if analysis_kind == "taxonomy":
+            if category_taxonomy_profile == "apparel":
+                return [
+                    {
+                        "id": products[0]["id"],
+                        "lang": target_lang,
+                        "title_input": products[0]["title"],
+                        "product_type": f"{target_lang}-dress",
+                        "target_gender": f"{target_lang}-women",
+                        "age_group": "",
+                        "season": "",
+                        "fit": "",
+                        "silhouette": "",
+                        "neckline": "",
+                        "sleeve_length_type": "",
+                        "sleeve_style": "",
+                        "strap_type": "",
+                        "rise_waistline": "",
+                        "leg_shape": "",
+                        "skirt_shape": "",
+                        "length_type": "",
+                        "closure_type": "",
+                        "design_details": "",
+                        "fabric": "",
+                        "material_composition": "",
+                        "fabric_properties": "",
+                        "clothing_features": "",
+                        "functional_benefits": "",
+                        "color": "",
+                        "color_family": "",
+                        "print_pattern": "",
+                        "occasion_end_use": "",
+                        "style_aesthetic": "",
+                    }
+                ]
+            assert category_taxonomy_profile == "toys"  # the only other profile used by this test
+            assert target_lang == "en"  # non-apparel taxonomy must only be requested in English
+            return [
+                {
+                    "id": products[0]["id"],
+                    "lang": "en",
+                    "title_input": products[0]["title"],
+                    "product_type": "doll set",
+                    "age_group": "kids",
+                    "character_theme": "",
+                    "material": "",
+                    "power_source": "",
+                    "interactive_features": "",
+                    "educational_play_value": "",
+                    "piece_count_size": "",
+                    "color": "",
+                    "use_scenario": "",
+                }
+            ]
+
+        return [  # any non-taxonomy analysis kind: one generic content row per product
+            {
+                "id": product["id"],
+                "lang": target_lang,
+                "title_input": product["title"],
+                "title": product["title"],
+                "category_path": "",
+                "tags": f"{target_lang}-tag",
+                "target_audience": "",
+                "usage_scene": "",
+                "season": "",
+                "key_attributes": "",
+                "material": "",
+                "features": "",
+                "anchor_text": f"{target_lang}-anchor",
+            }
+            for product in products
+        ]
+
+    with mock.patch.object(product_enrich, "analyze_products", side_effect=fake_analyze_products):
+        result = product_enrich.build_index_content_fields(
+            items=[
+                {"spu_id": "1", "title": "dress", "category_taxonomy_profile": "apparel"},
+                {"spu_id": "2", "title": "toy", "category_taxonomy_profile": "toys"},
+            ],
+            tenant_id="170",
+            category_taxonomy_profile="apparel",  # batch-level value; item 2 carries its own "toys" profile, which the assertions below expect to win
+        )
+
+    assert result[0]["enriched_taxonomy_attributes"] == [  # apparel item: zh and en values merged per attribute
+        {"name": "Product Type", "value": {"zh": ["zh-dress"], "en": ["en-dress"]}},
+        {"name": "Target Gender", "value": {"zh": ["zh-women"], "en": ["en-women"]}},
+    ]
+    assert result[1]["enriched_taxonomy_attributes"] == [  # toys item: en only, attributes with empty values omitted
+        {"name": "Product Type", "value": {"en": ["doll set"]}},
+        {"name": "Age Group", "value": {"en": ["kids"]}},
+    ]
+    assert ("taxonomy", "zh", "toys", ("2",)) not in seen_calls  # no zh taxonomy call was made for the toys item
+    assert ("taxonomy", "en", "toys", ("2",)) in seen_calls
+
+
 def test_anchor_cache_key_depends_on_product_input_not_identifiers():
     product_a = {
         "id": "1",
-- 
libgit2 0.21.2