Commit 36516857d07bd540ffdc097d5f5349da1048c184

Authored by tangwang
1 parent 78cdef1c

feat(product_enrich): 为产品富化模块增加 enriched_taxonomy_attributes 字段生成

- 新增分类法属性富化能力,遵循 enriched_attributes
  相同的字段结构和处理逻辑,仅提示词和解析维度不同
- 引入 AnalysisSchema
  数据类(frozen dataclass),使内容富化(content)与分类法富化(taxonomy)共享批处理、缓存、提示词构建、Markdown
解析及归一化流程
- 重构 product_enrich.py 中原有的富化管道,将通用逻辑抽取至
  _process_batch_for_schema、_parse_markdown_to_attributes
等函数,消除代码重复
- 在 product_enrich_prompts.py
  中添加分类法提示词模板(TAXONOMY_ANALYSIS_PROMPT)及 Markdown
表头定义(TAXONOMY_HEADERS)
- 修复 Markdown
  解析器在空单元格时的行为:原实现会跳过空单元格导致列错位,现改为保留空值,确保稀疏的分类法属性列正确对齐
- 更新 document_transformer.py 中 build_index_content_fields 函数,将
  enriched_taxonomy_attributes(中/英)写入最终索引文档
- 调整相关单元测试(test_product_enrich_partial_mode.py
  等)以覆盖新字段路径,测试通过(14 passed)

技术细节:
- AnalysisSchema 包含
  schema_name、prompt_template、headers、field_name_prefix 等元数据
-
缓存键区分内容/分类法:`{缓存前缀}:{analysis_kind}:{target_lang}:{prompt 输入文本哈希}`,避免两类分析互相污染缓存
- 分类法解析使用与 enriched_attributes
  相同的嵌套结构:`{"attribute_key": "value"}`,支持多行表格
- 批处理大小与重试逻辑保持与原有内容富化一致
indexer/document_transformer.py
@@ -242,6 +242,7 @@ class SPUDocumentTransformer: @@ -242,6 +242,7 @@ class SPUDocumentTransformer:
242 - qanchors.{lang} 242 - qanchors.{lang}
243 - enriched_tags.{lang} 243 - enriched_tags.{lang}
244 - enriched_attributes[].value.{lang} 244 - enriched_attributes[].value.{lang}
  245 + - enriched_taxonomy_attributes[].value.{lang}
245 246
246 设计目标: 247 设计目标:
247 - 尽可能攒批调用 LLM; 248 - 尽可能攒批调用 LLM;
@@ -296,6 +297,8 @@ class SPUDocumentTransformer: @@ -296,6 +297,8 @@ class SPUDocumentTransformer:
296 doc["enriched_tags"] = enrichment["enriched_tags"] 297 doc["enriched_tags"] = enrichment["enriched_tags"]
297 if enrichment.get("enriched_attributes"): 298 if enrichment.get("enriched_attributes"):
298 doc["enriched_attributes"] = enrichment["enriched_attributes"] 299 doc["enriched_attributes"] = enrichment["enriched_attributes"]
  300 + if enrichment.get("enriched_taxonomy_attributes"):
  301 + doc["enriched_taxonomy_attributes"] = enrichment["enriched_taxonomy_attributes"]
299 except Exception as e: 302 except Exception as e:
300 logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e) 303 logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e)
301 304
indexer/product_enrich.py
@@ -14,6 +14,7 @@ import time @@ -14,6 +14,7 @@ import time
14 import hashlib 14 import hashlib
15 import uuid 15 import uuid
16 import threading 16 import threading
  17 +from dataclasses import dataclass, field
17 from collections import OrderedDict 18 from collections import OrderedDict
18 from datetime import datetime 19 from datetime import datetime
19 from concurrent.futures import ThreadPoolExecutor 20 from concurrent.futures import ThreadPoolExecutor
@@ -30,6 +31,9 @@ from indexer.product_enrich_prompts import ( @@ -30,6 +31,9 @@ from indexer.product_enrich_prompts import (
30 USER_INSTRUCTION_TEMPLATE, 31 USER_INSTRUCTION_TEMPLATE,
31 LANGUAGE_MARKDOWN_TABLE_HEADERS, 32 LANGUAGE_MARKDOWN_TABLE_HEADERS,
32 SHARED_ANALYSIS_INSTRUCTION, 33 SHARED_ANALYSIS_INSTRUCTION,
  34 + TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS,
  35 + TAXONOMY_MARKDOWN_TABLE_HEADERS_EN,
  36 + TAXONOMY_SHARED_ANALYSIS_INSTRUCTION,
33 ) 37 )
34 38
35 # 配置 39 # 配置
@@ -147,7 +151,7 @@ if _missing_prompt_langs: @@ -147,7 +151,7 @@ if _missing_prompt_langs:
147 # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 151 # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白
148 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") 152 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+")
149 _CORE_INDEX_LANGUAGES = ("zh", "en") 153 _CORE_INDEX_LANGUAGES = ("zh", "en")
150 -_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( 154 +_CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP = (
151 ("tags", "enriched_tags"), 155 ("tags", "enriched_tags"),
152 ("target_audience", "target_audience"), 156 ("target_audience", "target_audience"),
153 ("usage_scene", "usage_scene"), 157 ("usage_scene", "usage_scene"),
@@ -156,7 +160,7 @@ _ANALYSIS_ATTRIBUTE_FIELD_MAP = ( @@ -156,7 +160,7 @@ _ANALYSIS_ATTRIBUTE_FIELD_MAP = (
156 ("material", "material"), 160 ("material", "material"),
157 ("features", "features"), 161 ("features", "features"),
158 ) 162 )
159 -_ANALYSIS_RESULT_FIELDS = ( 163 +_CONTENT_ANALYSIS_RESULT_FIELDS = (
160 "title", 164 "title",
161 "category_path", 165 "category_path",
162 "tags", 166 "tags",
@@ -168,7 +172,7 @@ _ANALYSIS_RESULT_FIELDS = ( @@ -168,7 +172,7 @@ _ANALYSIS_RESULT_FIELDS = (
168 "features", 172 "features",
169 "anchor_text", 173 "anchor_text",
170 ) 174 )
171 -_ANALYSIS_MEANINGFUL_FIELDS = ( 175 +_CONTENT_ANALYSIS_MEANINGFUL_FIELDS = (
172 "tags", 176 "tags",
173 "target_audience", 177 "target_audience",
174 "usage_scene", 178 "usage_scene",
@@ -178,9 +182,89 @@ _ANALYSIS_MEANINGFUL_FIELDS = ( @@ -178,9 +182,89 @@ _ANALYSIS_MEANINGFUL_FIELDS = (
178 "features", 182 "features",
179 "anchor_text", 183 "anchor_text",
180 ) 184 )
181 -_ANALYSIS_FIELD_ALIASES = { 185 +_CONTENT_ANALYSIS_FIELD_ALIASES = {
182 "tags": ("tags", "enriched_tags"), 186 "tags": ("tags", "enriched_tags"),
183 } 187 }
  188 +_CONTENT_ANALYSIS_QUALITY_FIELDS = ("title", "category_path", "anchor_text")
  189 +_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = (
  190 + ("product_type", "Product Type"),
  191 + ("target_gender", "Target Gender"),
  192 + ("age_group", "Age Group"),
  193 + ("season", "Season"),
  194 + ("fit", "Fit"),
  195 + ("silhouette", "Silhouette"),
  196 + ("neckline", "Neckline"),
  197 + ("sleeve_length_type", "Sleeve Length Type"),
  198 + ("sleeve_style", "Sleeve Style"),
  199 + ("strap_type", "Strap Type"),
  200 + ("rise_waistline", "Rise / Waistline"),
  201 + ("leg_shape", "Leg Shape"),
  202 + ("skirt_shape", "Skirt Shape"),
  203 + ("length_type", "Length Type"),
  204 + ("closure_type", "Closure Type"),
  205 + ("design_details", "Design Details"),
  206 + ("fabric", "Fabric"),
  207 + ("material_composition", "Material Composition"),
  208 + ("fabric_properties", "Fabric Properties"),
  209 + ("clothing_features", "Clothing Features"),
  210 + ("functional_benefits", "Functional Benefits"),
  211 + ("color", "Color"),
  212 + ("color_family", "Color Family"),
  213 + ("print_pattern", "Print / Pattern"),
  214 + ("occasion_end_use", "Occasion / End Use"),
  215 + ("style_aesthetic", "Style Aesthetic"),
  216 +)
  217 +_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple(
  218 + field_name for field_name, _ in _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP
  219 +)
  220 +
  221 +
  222 +@dataclass(frozen=True)
  223 +class AnalysisSchema:
  224 + name: str
  225 + shared_instruction: str
  226 + markdown_table_headers: Dict[str, List[str]]
  227 + result_fields: Tuple[str, ...]
  228 + meaningful_fields: Tuple[str, ...]
  229 + field_aliases: Dict[str, Tuple[str, ...]] = field(default_factory=dict)
  230 + fallback_headers: Optional[List[str]] = None
  231 + quality_fields: Tuple[str, ...] = ()
  232 +
  233 + def get_headers(self, target_lang: str) -> Optional[List[str]]:
  234 + headers = self.markdown_table_headers.get(target_lang)
  235 + if headers:
  236 + return headers
  237 + if self.fallback_headers:
  238 + return self.fallback_headers
  239 + return None
  240 +
  241 +
  242 +_ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = {
  243 + "content": AnalysisSchema(
  244 + name="content",
  245 + shared_instruction=SHARED_ANALYSIS_INSTRUCTION,
  246 + markdown_table_headers=LANGUAGE_MARKDOWN_TABLE_HEADERS,
  247 + result_fields=_CONTENT_ANALYSIS_RESULT_FIELDS,
  248 + meaningful_fields=_CONTENT_ANALYSIS_MEANINGFUL_FIELDS,
  249 + field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES,
  250 + quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS,
  251 + ),
  252 + "taxonomy": AnalysisSchema(
  253 + name="taxonomy",
  254 + shared_instruction=TAXONOMY_SHARED_ANALYSIS_INSTRUCTION,
  255 + markdown_table_headers=TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS,
  256 + result_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS,
  257 + meaningful_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS,
  258 + fallback_headers=TAXONOMY_MARKDOWN_TABLE_HEADERS_EN,
  259 + ),
  260 +}
  261 +
  262 +
  263 +def _get_analysis_schema(analysis_kind: str) -> AnalysisSchema:
  264 + schema = _ANALYSIS_SCHEMAS.get(analysis_kind)
  265 + if schema is None:
  266 + raise ValueError(f"Unsupported analysis_kind: {analysis_kind}")
  267 + return schema
184 268
185 269
186 def split_multi_value_field(text: Optional[str]) -> List[str]: 270 def split_multi_value_field(text: Optional[str]) -> List[str]:
@@ -235,12 +319,12 @@ def _get_product_id(product: Dict[str, Any]) -> str: @@ -235,12 +319,12 @@ def _get_product_id(product: Dict[str, Any]) -> str:
235 return str(product.get("id") or product.get("spu_id") or "").strip() 319 return str(product.get("id") or product.get("spu_id") or "").strip()
236 320
237 321
238 -def _get_analysis_field_aliases(field_name: str) -> Tuple[str, ...]:  
239 - return _ANALYSIS_FIELD_ALIASES.get(field_name, (field_name,)) 322 +def _get_analysis_field_aliases(field_name: str, schema: AnalysisSchema) -> Tuple[str, ...]:
  323 + return schema.field_aliases.get(field_name, (field_name,))
240 324
241 325
242 -def _get_analysis_field_value(row: Dict[str, Any], field_name: str) -> Any:  
243 - for alias in _get_analysis_field_aliases(field_name): 326 +def _get_analysis_field_value(row: Dict[str, Any], field_name: str, schema: AnalysisSchema) -> Any:
  327 + for alias in _get_analysis_field_aliases(field_name, schema):
244 if alias in row: 328 if alias in row:
245 return row.get(alias) 329 return row.get(alias)
246 return None 330 return None
@@ -261,6 +345,7 @@ def _has_meaningful_value(value: Any) -> bool: @@ -261,6 +345,7 @@ def _has_meaningful_value(value: Any) -> bool:
261 def _make_empty_analysis_result( 345 def _make_empty_analysis_result(
262 product: Dict[str, Any], 346 product: Dict[str, Any],
263 target_lang: str, 347 target_lang: str,
  348 + schema: AnalysisSchema,
264 error: Optional[str] = None, 349 error: Optional[str] = None,
265 ) -> Dict[str, Any]: 350 ) -> Dict[str, Any]:
266 result = { 351 result = {
@@ -268,7 +353,7 @@ def _make_empty_analysis_result( @@ -268,7 +353,7 @@ def _make_empty_analysis_result(
268 "lang": target_lang, 353 "lang": target_lang,
269 "title_input": str(product.get("title") or "").strip(), 354 "title_input": str(product.get("title") or "").strip(),
270 } 355 }
271 - for field in _ANALYSIS_RESULT_FIELDS: 356 + for field in schema.result_fields:
272 result[field] = "" 357 result[field] = ""
273 if error: 358 if error:
274 result["error"] = error 359 result["error"] = error
@@ -279,42 +364,59 @@ def _normalize_analysis_result( @@ -279,42 +364,59 @@ def _normalize_analysis_result(
279 result: Dict[str, Any], 364 result: Dict[str, Any],
280 product: Dict[str, Any], 365 product: Dict[str, Any],
281 target_lang: str, 366 target_lang: str,
  367 + schema: AnalysisSchema,
282 ) -> Dict[str, Any]: 368 ) -> Dict[str, Any]:
283 - normalized = _make_empty_analysis_result(product, target_lang) 369 + normalized = _make_empty_analysis_result(product, target_lang, schema)
284 if not isinstance(result, dict): 370 if not isinstance(result, dict):
285 return normalized 371 return normalized
286 372
287 normalized["lang"] = str(result.get("lang") or target_lang).strip() or target_lang 373 normalized["lang"] = str(result.get("lang") or target_lang).strip() or target_lang
288 - normalized["title"] = str(result.get("title") or "").strip()  
289 - normalized["category_path"] = str(result.get("category_path") or "").strip()  
290 normalized["title_input"] = str( 374 normalized["title_input"] = str(
291 product.get("title") or result.get("title_input") or "" 375 product.get("title") or result.get("title_input") or ""
292 ).strip() 376 ).strip()
293 377
294 - for field in _ANALYSIS_RESULT_FIELDS:  
295 - if field in {"title", "category_path"}:  
296 - continue  
297 - normalized[field] = str(_get_analysis_field_value(result, field) or "").strip() 378 + for field in schema.result_fields:
  379 + normalized[field] = str(_get_analysis_field_value(result, field, schema) or "").strip()
298 380
299 if result.get("error"): 381 if result.get("error"):
300 normalized["error"] = str(result.get("error")) 382 normalized["error"] = str(result.get("error"))
301 return normalized 383 return normalized
302 384
303 385
304 -def _has_meaningful_analysis_content(result: Dict[str, Any]) -> bool:  
305 - return any(_has_meaningful_value(result.get(field)) for field in _ANALYSIS_MEANINGFUL_FIELDS) 386 +def _has_meaningful_analysis_content(result: Dict[str, Any], schema: AnalysisSchema) -> bool:
  387 + return any(_has_meaningful_value(result.get(field)) for field in schema.meaningful_fields)
  388 +
  389 +
  390 +def _append_analysis_attributes(
  391 + target: List[Dict[str, Any]],
  392 + row: Dict[str, Any],
  393 + lang: str,
  394 + schema: AnalysisSchema,
  395 + field_map: Tuple[Tuple[str, str], ...],
  396 +) -> None:
  397 + for source_name, output_name in field_map:
  398 + raw = _get_analysis_field_value(row, source_name, schema)
  399 + if not raw:
  400 + continue
  401 + _append_named_lang_phrase_map(
  402 + target,
  403 + name=output_name,
  404 + lang=lang,
  405 + raw_value=raw,
  406 + )
306 407
307 408
308 def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: 409 def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None:
309 if not row or row.get("error"): 410 if not row or row.get("error"):
310 return 411 return
311 412
312 - anchor_text = str(_get_analysis_field_value(row, "anchor_text") or "").strip() 413 + content_schema = _get_analysis_schema("content")
  414 + anchor_text = str(_get_analysis_field_value(row, "anchor_text", content_schema) or "").strip()
313 if anchor_text: 415 if anchor_text:
314 _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text) 416 _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text)
315 417
316 - for source_name, output_name in _ANALYSIS_ATTRIBUTE_FIELD_MAP:  
317 - raw = _get_analysis_field_value(row, source_name) 418 + for source_name, output_name in _CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP:
  419 + raw = _get_analysis_field_value(row, source_name, content_schema)
318 if not raw: 420 if not raw:
319 continue 421 continue
320 _append_named_lang_phrase_map( 422 _append_named_lang_phrase_map(
@@ -327,6 +429,19 @@ def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: @@ -327,6 +429,19 @@ def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang:
327 _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) 429 _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw)
328 430
329 431
  432 +def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None:
  433 + if not row or row.get("error"):
  434 + return
  435 +
  436 + _append_analysis_attributes(
  437 + result["enriched_taxonomy_attributes"],
  438 + row=row,
  439 + lang=lang,
  440 + schema=_get_analysis_schema("taxonomy"),
  441 + field_map=_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP,
  442 + )
  443 +
  444 +
330 def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: 445 def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]:
331 item_id = _get_product_id(item) 446 item_id = _get_product_id(item)
332 return { 447 return {
@@ -355,6 +470,7 @@ def build_index_content_fields( @@ -355,6 +470,7 @@ def build_index_content_fields(
355 - `qanchors` 470 - `qanchors`
356 - `enriched_tags` 471 - `enriched_tags`
357 - `enriched_attributes` 472 - `enriched_attributes`
  473 + - `enriched_taxonomy_attributes`
358 - 可选 `error` 474 - 可选 `error`
359 475
360 其中: 476 其中:
@@ -371,6 +487,7 @@ def build_index_content_fields( @@ -371,6 +487,7 @@ def build_index_content_fields(
371 "qanchors": {}, 487 "qanchors": {},
372 "enriched_tags": {}, 488 "enriched_tags": {},
373 "enriched_attributes": [], 489 "enriched_attributes": [],
  490 + "enriched_taxonomy_attributes": [],
374 } 491 }
375 for item in normalized_items 492 for item in normalized_items
376 } 493 }
@@ -398,6 +515,33 @@ def build_index_content_fields( @@ -398,6 +515,33 @@ def build_index_content_fields(
398 continue 515 continue
399 _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) 516 _apply_index_content_row(results_by_id[item_id], row=row, lang=lang)
400 517
  518 + try:
  519 + taxonomy_rows = analyze_products(
  520 + products=normalized_items,
  521 + target_lang=lang,
  522 + batch_size=BATCH_SIZE,
  523 + tenant_id=tenant_id,
  524 + analysis_kind="taxonomy",
  525 + )
  526 + except Exception as e:
  527 + logger.warning(
  528 + "build_index_content_fields taxonomy enrichment failed for lang=%s: %s",
  529 + lang,
  530 + e,
  531 + )
  532 + for item in normalized_items:
  533 + results_by_id[item["id"]].setdefault("error", str(e))
  534 + continue
  535 +
  536 + for row in taxonomy_rows or []:
  537 + item_id = str(row.get("id") or "").strip()
  538 + if not item_id or item_id not in results_by_id:
  539 + continue
  540 + if row.get("error"):
  541 + results_by_id[item_id].setdefault("error", row["error"])
  542 + continue
  543 + _apply_index_taxonomy_row(results_by_id[item_id], row=row, lang=lang)
  544 +
401 return [results_by_id[item["id"]] for item in normalized_items] 545 return [results_by_id[item["id"]] for item in normalized_items]
402 546
403 547
@@ -463,52 +607,89 @@ def _build_prompt_input_text(product: Dict[str, Any]) -> str: @@ -463,52 +607,89 @@ def _build_prompt_input_text(product: Dict[str, Any]) -> str:
463 return _truncate_by_words(candidate, PROMPT_INPUT_MAX_WORDS) 607 return _truncate_by_words(candidate, PROMPT_INPUT_MAX_WORDS)
464 608
465 609
466 -def _make_anchor_cache_key( 610 +def _make_analysis_cache_key(
467 product: Dict[str, Any], 611 product: Dict[str, Any],
468 target_lang: str, 612 target_lang: str,
  613 + analysis_kind: str,
469 ) -> str: 614 ) -> str:
470 - """构造缓存 key,仅由 prompt 实际输入文本内容 + 目标语言决定。""" 615 + """构造缓存 key,仅由分析类型、prompt 实际输入文本内容与目标语言决定。"""
471 prompt_input = _build_prompt_input_text(product) 616 prompt_input = _build_prompt_input_text(product)
472 h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest() 617 h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest()
473 - return f"{ANCHOR_CACHE_PREFIX}:{target_lang}:{prompt_input[:4]}{h}" 618 + return f"{ANCHOR_CACHE_PREFIX}:{analysis_kind}:{target_lang}:{prompt_input[:4]}{h}"
474 619
475 620
476 -def _get_cached_anchor_result( 621 +def _make_anchor_cache_key(
477 product: Dict[str, Any], 622 product: Dict[str, Any],
478 target_lang: str, 623 target_lang: str,
  624 +) -> str:
  625 + return _make_analysis_cache_key(product, target_lang, analysis_kind="content")
  626 +
  627 +
  628 +def _get_cached_analysis_result(
  629 + product: Dict[str, Any],
  630 + target_lang: str,
  631 + analysis_kind: str,
479 ) -> Optional[Dict[str, Any]]: 632 ) -> Optional[Dict[str, Any]]:
480 if not _anchor_redis: 633 if not _anchor_redis:
481 return None 634 return None
  635 + schema = _get_analysis_schema(analysis_kind)
482 try: 636 try:
483 - key = _make_anchor_cache_key(product, target_lang) 637 + key = _make_analysis_cache_key(product, target_lang, analysis_kind)
484 raw = _anchor_redis.get(key) 638 raw = _anchor_redis.get(key)
485 if not raw: 639 if not raw:
486 return None 640 return None
487 - result = _normalize_analysis_result(json.loads(raw), product=product, target_lang=target_lang)  
488 - if not _has_meaningful_analysis_content(result): 641 + result = _normalize_analysis_result(
  642 + json.loads(raw),
  643 + product=product,
  644 + target_lang=target_lang,
  645 + schema=schema,
  646 + )
  647 + if not _has_meaningful_analysis_content(result, schema):
489 return None 648 return None
490 return result 649 return result
491 except Exception as e: 650 except Exception as e:
492 - logger.warning(f"Failed to get anchor cache: {e}") 651 + logger.warning("Failed to get %s analysis cache: %s", analysis_kind, e)
493 return None 652 return None
494 653
495 654
496 -def _set_cached_anchor_result( 655 +def _get_cached_anchor_result(
  656 + product: Dict[str, Any],
  657 + target_lang: str,
  658 +) -> Optional[Dict[str, Any]]:
  659 + return _get_cached_analysis_result(product, target_lang, analysis_kind="content")
  660 +
  661 +
  662 +def _set_cached_analysis_result(
497 product: Dict[str, Any], 663 product: Dict[str, Any],
498 target_lang: str, 664 target_lang: str,
499 result: Dict[str, Any], 665 result: Dict[str, Any],
  666 + analysis_kind: str,
500 ) -> None: 667 ) -> None:
501 if not _anchor_redis: 668 if not _anchor_redis:
502 return 669 return
  670 + schema = _get_analysis_schema(analysis_kind)
503 try: 671 try:
504 - normalized = _normalize_analysis_result(result, product=product, target_lang=target_lang)  
505 - if not _has_meaningful_analysis_content(normalized): 672 + normalized = _normalize_analysis_result(
  673 + result,
  674 + product=product,
  675 + target_lang=target_lang,
  676 + schema=schema,
  677 + )
  678 + if not _has_meaningful_analysis_content(normalized, schema):
506 return 679 return
507 - key = _make_anchor_cache_key(product, target_lang) 680 + key = _make_analysis_cache_key(product, target_lang, analysis_kind)
508 ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600 681 ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600
509 _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False)) 682 _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False))
510 except Exception as e: 683 except Exception as e:
511 - logger.warning(f"Failed to set anchor cache: {e}") 684 + logger.warning("Failed to set %s analysis cache: %s", analysis_kind, e)
  685 +
  686 +
  687 +def _set_cached_anchor_result(
  688 + product: Dict[str, Any],
  689 + target_lang: str,
  690 + result: Dict[str, Any],
  691 +) -> None:
  692 + _set_cached_analysis_result(product, target_lang, result, analysis_kind="content")
512 693
513 694
514 def _build_assistant_prefix(headers: List[str]) -> str: 695 def _build_assistant_prefix(headers: List[str]) -> str:
@@ -517,8 +698,8 @@ def _build_assistant_prefix(headers: List[str]) -> str: @@ -517,8 +698,8 @@ def _build_assistant_prefix(headers: List[str]) -> str:
517 return f"{header_line}\n{separator_line}\n" 698 return f"{header_line}\n{separator_line}\n"
518 699
519 700
520 -def _build_shared_context(products: List[Dict[str, str]]) -> str:  
521 - shared_context = SHARED_ANALYSIS_INSTRUCTION 701 +def _build_shared_context(products: List[Dict[str, str]], schema: AnalysisSchema) -> str:
  702 + shared_context = schema.shared_instruction
522 for idx, product in enumerate(products, 1): 703 for idx, product in enumerate(products, 1):
523 prompt_input = _build_prompt_input_text(product) 704 prompt_input = _build_prompt_input_text(product)
524 shared_context += f"{idx}. {prompt_input}\n" 705 shared_context += f"{idx}. {prompt_input}\n"
@@ -550,16 +731,19 @@ def reset_logged_shared_context_keys() -> None: @@ -550,16 +731,19 @@ def reset_logged_shared_context_keys() -> None:
550 def create_prompt( 731 def create_prompt(
551 products: List[Dict[str, str]], 732 products: List[Dict[str, str]],
552 target_lang: str = "zh", 733 target_lang: str = "zh",
553 -) -> Tuple[str, str, str]: 734 + analysis_kind: str = "content",
  735 +) -> Tuple[Optional[str], Optional[str], Optional[str]]:
554 """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。""" 736 """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。"""
555 - markdown_table_headers = LANGUAGE_MARKDOWN_TABLE_HEADERS.get(target_lang) 737 + schema = _get_analysis_schema(analysis_kind)
  738 + markdown_table_headers = schema.get_headers(target_lang)
556 if not markdown_table_headers: 739 if not markdown_table_headers:
557 logger.warning( 740 logger.warning(
558 - "Unsupported target_lang for markdown table headers: %s", 741 + "Unsupported target_lang for markdown table headers: kind=%s lang=%s",
  742 + analysis_kind,
559 target_lang, 743 target_lang,
560 ) 744 )
561 return None, None, None 745 return None, None, None
562 - shared_context = _build_shared_context(products) 746 + shared_context = _build_shared_context(products, schema)
563 language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang) 747 language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang)
564 user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip() 748 user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip()
565 assistant_prefix = _build_assistant_prefix(markdown_table_headers) 749 assistant_prefix = _build_assistant_prefix(markdown_table_headers)
@@ -592,6 +776,7 @@ def call_llm( @@ -592,6 +776,7 @@ def call_llm(
592 user_prompt: str, 776 user_prompt: str,
593 assistant_prefix: str, 777 assistant_prefix: str,
594 target_lang: str = "zh", 778 target_lang: str = "zh",
  779 + analysis_kind: str = "content",
595 ) -> Tuple[str, str]: 780 ) -> Tuple[str, str]:
596 """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。""" 781 """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。"""
597 headers = { 782 headers = {
@@ -631,8 +816,9 @@ def call_llm( @@ -631,8 +816,9 @@ def call_llm(
631 if _mark_shared_context_logged_once(shared_context_key): 816 if _mark_shared_context_logged_once(shared_context_key):
632 logger.info(f"\n{'=' * 80}") 817 logger.info(f"\n{'=' * 80}")
633 logger.info( 818 logger.info(
634 - "LLM Shared Context [model=%s, shared_key=%s, chars=%s] (logged once per process key)", 819 + "LLM Shared Context [model=%s, kind=%s, shared_key=%s, chars=%s] (logged once per process key)",
635 MODEL_NAME, 820 MODEL_NAME,
  821 + analysis_kind,
636 shared_context_key, 822 shared_context_key,
637 len(shared_context), 823 len(shared_context),
638 ) 824 )
@@ -641,8 +827,9 @@ def call_llm( @@ -641,8 +827,9 @@ def call_llm(
641 827
642 verbose_logger.info(f"\n{'=' * 80}") 828 verbose_logger.info(f"\n{'=' * 80}")
643 verbose_logger.info( 829 verbose_logger.info(
644 - "LLM Request [model=%s, lang=%s, shared_key=%s, tail_key=%s]:", 830 + "LLM Request [model=%s, kind=%s, lang=%s, shared_key=%s, tail_key=%s]:",
645 MODEL_NAME, 831 MODEL_NAME,
  832 + analysis_kind,
646 target_lang, 833 target_lang,
647 shared_context_key, 834 shared_context_key,
648 localized_tail_key, 835 localized_tail_key,
@@ -654,7 +841,8 @@ def call_llm( @@ -654,7 +841,8 @@ def call_llm(
654 verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}") 841 verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}")
655 842
656 logger.info( 843 logger.info(
657 - "\nLLM Request Variant [lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]", 844 + "\nLLM Request Variant [kind=%s, lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]",
  845 + analysis_kind,
658 target_lang, 846 target_lang,
659 shared_context_key, 847 shared_context_key,
660 localized_tail_key, 848 localized_tail_key,
@@ -685,8 +873,9 @@ def call_llm( @@ -685,8 +873,9 @@ def call_llm(
685 usage = result.get("usage") or {} 873 usage = result.get("usage") or {}
686 874
687 verbose_logger.info( 875 verbose_logger.info(
688 - "\nLLM Response [model=%s, lang=%s, shared_key=%s, tail_key=%s]:", 876 + "\nLLM Response [model=%s, kind=%s, lang=%s, shared_key=%s, tail_key=%s]:",
689 MODEL_NAME, 877 MODEL_NAME,
  878 + analysis_kind,
690 target_lang, 879 target_lang,
691 shared_context_key, 880 shared_context_key,
692 localized_tail_key, 881 localized_tail_key,
@@ -697,7 +886,8 @@ def call_llm( @@ -697,7 +886,8 @@ def call_llm(
697 full_markdown = _merge_partial_response(assistant_prefix, generated_content) 886 full_markdown = _merge_partial_response(assistant_prefix, generated_content)
698 887
699 logger.info( 888 logger.info(
700 - "\nLLM Response Summary [lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]", 889 + "\nLLM Response Summary [kind=%s, lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]",
  890 + analysis_kind,
701 target_lang, 891 target_lang,
702 shared_context_key, 892 shared_context_key,
703 localized_tail_key, 893 localized_tail_key,
@@ -742,8 +932,12 @@ def call_llm( @@ -742,8 +932,12 @@ def call_llm(
742 session.close() 932 session.close()
743 933
744 934
745 -def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: 935 +def parse_markdown_table(
  936 + markdown_content: str,
  937 + analysis_kind: str = "content",
  938 +) -> List[Dict[str, str]]:
746 """解析markdown表格内容""" 939 """解析markdown表格内容"""
  940 + schema = _get_analysis_schema(analysis_kind)
747 lines = markdown_content.strip().split("\n") 941 lines = markdown_content.strip().split("\n")
748 data = [] 942 data = []
749 data_started = False 943 data_started = False
@@ -768,22 +962,15 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: @@ -768,22 +962,15 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:
768 962
769 # 解析数据行 963 # 解析数据行
770 parts = [p.strip() for p in line.split("|")] 964 parts = [p.strip() for p in line.split("|")]
771 - parts = [p for p in parts if p] # 移除空字符串 965 + if parts and parts[0] == "":
  966 + parts = parts[1:]
  967 + if parts and parts[-1] == "":
  968 + parts = parts[:-1]
772 969
773 if len(parts) >= 2: 970 if len(parts) >= 2:
774 - row = {  
775 - "seq_no": parts[0],  
776 - "title": parts[1], # 商品标题(按目标语言)  
777 - "category_path": parts[2] if len(parts) > 2 else "", # 品类路径  
778 - "tags": parts[3] if len(parts) > 3 else "", # 细分标签  
779 - "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群  
780 - "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景  
781 - "season": parts[6] if len(parts) > 6 else "", # 适用季节  
782 - "key_attributes": parts[7] if len(parts) > 7 else "", # 关键属性  
783 - "material": parts[8] if len(parts) > 8 else "", # 材质说明  
784 - "features": parts[9] if len(parts) > 9 else "", # 功能特点  
785 - "anchor_text": parts[10] if len(parts) > 10 else "", # 锚文本  
786 - } 971 + row = {"seq_no": parts[0]}
  972 + for field_index, field_name in enumerate(schema.result_fields, start=1):
  973 + row[field_name] = parts[field_index] if len(parts) > field_index else ""
787 data.append(row) 974 data.append(row)
788 975
789 return data 976 return data
@@ -794,31 +981,45 @@ def _log_parsed_result_quality( @@ -794,31 +981,45 @@ def _log_parsed_result_quality(
794 parsed_results: List[Dict[str, str]], 981 parsed_results: List[Dict[str, str]],
795 target_lang: str, 982 target_lang: str,
796 batch_num: int, 983 batch_num: int,
  984 + analysis_kind: str,
797 ) -> None: 985 ) -> None:
  986 + schema = _get_analysis_schema(analysis_kind)
798 expected = len(batch_data) 987 expected = len(batch_data)
799 actual = len(parsed_results) 988 actual = len(parsed_results)
800 if actual != expected: 989 if actual != expected:
801 logger.warning( 990 logger.warning(
802 - "Parsed row count mismatch for batch=%s lang=%s: expected=%s actual=%s", 991 + "Parsed row count mismatch for kind=%s batch=%s lang=%s: expected=%s actual=%s",
  992 + analysis_kind,
803 batch_num, 993 batch_num,
804 target_lang, 994 target_lang,
805 expected, 995 expected,
806 actual, 996 actual,
807 ) 997 )
808 998
809 - missing_anchor = sum(1 for item in parsed_results if not str(item.get("anchor_text") or "").strip())  
810 - missing_category = sum(1 for item in parsed_results if not str(item.get("category_path") or "").strip())  
811 - missing_title = sum(1 for item in parsed_results if not str(item.get("title") or "").strip()) 999 + if not schema.quality_fields:
  1000 + logger.info(
  1001 + "Parsed Quality Summary [kind=%s, batch=%s, lang=%s]: rows=%s/%s",
  1002 + analysis_kind,
  1003 + batch_num,
  1004 + target_lang,
  1005 + actual,
  1006 + expected,
  1007 + )
  1008 + return
812 1009
  1010 + missing_summary = ", ".join(
  1011 + f"missing_{field}="
  1012 + f"{sum(1 for item in parsed_results if not str(item.get(field) or '').strip())}"
  1013 + for field in schema.quality_fields
  1014 + )
813 logger.info( 1015 logger.info(
814 - "Parsed Quality Summary [batch=%s, lang=%s]: rows=%s/%s, missing_title=%s, missing_category=%s, missing_anchor=%s", 1016 + "Parsed Quality Summary [kind=%s, batch=%s, lang=%s]: rows=%s/%s, %s",
  1017 + analysis_kind,
815 batch_num, 1018 batch_num,
816 target_lang, 1019 target_lang,
817 actual, 1020 actual,
818 expected, 1021 expected,
819 - missing_title,  
820 - missing_category,  
821 - missing_anchor, 1022 + missing_summary,
822 ) 1023 )
823 1024
824 1025
@@ -826,29 +1027,39 @@ def process_batch( @@ -826,29 +1027,39 @@ def process_batch(
826 batch_data: List[Dict[str, str]], 1027 batch_data: List[Dict[str, str]],
827 batch_num: int, 1028 batch_num: int,
828 target_lang: str = "zh", 1029 target_lang: str = "zh",
  1030 + analysis_kind: str = "content",
829 ) -> List[Dict[str, Any]]: 1031 ) -> List[Dict[str, Any]]:
830 """处理一个批次的数据""" 1032 """处理一个批次的数据"""
  1033 + schema = _get_analysis_schema(analysis_kind)
831 logger.info(f"\n{'#' * 80}") 1034 logger.info(f"\n{'#' * 80}")
832 - logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)") 1035 + logger.info(
  1036 + "Processing Batch %s (%s items, kind=%s)",
  1037 + batch_num,
  1038 + len(batch_data),
  1039 + analysis_kind,
  1040 + )
833 1041
834 # 创建提示词 1042 # 创建提示词
835 shared_context, user_prompt, assistant_prefix = create_prompt( 1043 shared_context, user_prompt, assistant_prefix = create_prompt(
836 batch_data, 1044 batch_data,
837 target_lang=target_lang, 1045 target_lang=target_lang,
  1046 + analysis_kind=analysis_kind,
838 ) 1047 )
839 1048
840 # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM 1049 # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM
841 if shared_context is None or user_prompt is None or assistant_prefix is None: 1050 if shared_context is None or user_prompt is None or assistant_prefix is None:
842 logger.error( 1051 logger.error(
843 - "Failed to create prompt for batch %s, target_lang=%s; " 1052 + "Failed to create prompt for batch %s, kind=%s, target_lang=%s; "
844 "marking entire batch as failed without calling LLM", 1053 "marking entire batch as failed without calling LLM",
845 batch_num, 1054 batch_num,
  1055 + analysis_kind,
846 target_lang, 1056 target_lang,
847 ) 1057 )
848 return [ 1058 return [
849 _make_empty_analysis_result( 1059 _make_empty_analysis_result(
850 item, 1060 item,
851 target_lang, 1061 target_lang,
  1062 + schema,
852 error=f"prompt_creation_failed: unsupported target_lang={target_lang}", 1063 error=f"prompt_creation_failed: unsupported target_lang={target_lang}",
853 ) 1064 )
854 for item in batch_data 1065 for item in batch_data
@@ -861,11 +1072,18 @@ def process_batch( @@ -861,11 +1072,18 @@ def process_batch(
861 user_prompt, 1072 user_prompt,
862 assistant_prefix, 1073 assistant_prefix,
863 target_lang=target_lang, 1074 target_lang=target_lang,
  1075 + analysis_kind=analysis_kind,
864 ) 1076 )
865 1077
866 # 解析结果 1078 # 解析结果
867 - parsed_results = parse_markdown_table(raw_response)  
868 - _log_parsed_result_quality(batch_data, parsed_results, target_lang, batch_num) 1079 + parsed_results = parse_markdown_table(raw_response, analysis_kind=analysis_kind)
  1080 + _log_parsed_result_quality(
  1081 + batch_data,
  1082 + parsed_results,
  1083 + target_lang,
  1084 + batch_num,
  1085 + analysis_kind,
  1086 + )
869 1087
870 logger.info(f"\nParsed Results ({len(parsed_results)} items):") 1088 logger.info(f"\nParsed Results ({len(parsed_results)} items):")
871 logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2)) 1089 logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2))
@@ -879,10 +1097,12 @@ def process_batch( @@ -879,10 +1097,12 @@ def process_batch(
879 parsed_item, 1097 parsed_item,
880 product=source_product, 1098 product=source_product,
881 target_lang=target_lang, 1099 target_lang=target_lang,
  1100 + schema=schema,
882 ) 1101 )
883 results_with_ids.append(result) 1102 results_with_ids.append(result)
884 logger.info( 1103 logger.info(
885 - "Mapped: seq=%s -> original_id=%s", 1104 + "Mapped: kind=%s seq=%s -> original_id=%s",
  1105 + analysis_kind,
886 parsed_item.get("seq_no"), 1106 parsed_item.get("seq_no"),
887 source_product.get("id"), 1107 source_product.get("id"),
888 ) 1108 )
@@ -890,6 +1110,7 @@ def process_batch( @@ -890,6 +1110,7 @@ def process_batch(
890 # 保存批次 JSON 日志到独立文件 1110 # 保存批次 JSON 日志到独立文件
891 batch_log = { 1111 batch_log = {
892 "batch_num": batch_num, 1112 "batch_num": batch_num,
  1113 + "analysis_kind": analysis_kind,
893 "timestamp": datetime.now().isoformat(), 1114 "timestamp": datetime.now().isoformat(),
894 "input_products": batch_data, 1115 "input_products": batch_data,
895 "raw_response": raw_response, 1116 "raw_response": raw_response,
@@ -900,7 +1121,10 @@ def process_batch( @@ -900,7 +1121,10 @@ def process_batch(
900 1121
901 # 并发写 batch json 日志时,保证文件名唯一避免覆盖 1122 # 并发写 batch json 日志时,保证文件名唯一避免覆盖
902 batch_call_id = uuid.uuid4().hex[:12] 1123 batch_call_id = uuid.uuid4().hex[:12]
903 - batch_log_file = LOG_DIR / f"batch_{batch_num:04d}_{timestamp}_{batch_call_id}.json" 1124 + batch_log_file = (
  1125 + LOG_DIR
  1126 + / f"batch_{analysis_kind}_{batch_num:04d}_{timestamp}_{batch_call_id}.json"
  1127 + )
904 with open(batch_log_file, "w", encoding="utf-8") as f: 1128 with open(batch_log_file, "w", encoding="utf-8") as f:
905 json.dump(batch_log, f, ensure_ascii=False, indent=2) 1129 json.dump(batch_log, f, ensure_ascii=False, indent=2)
906 1130
@@ -912,7 +1136,7 @@ def process_batch( @@ -912,7 +1136,7 @@ def process_batch(
912 logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True) 1136 logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True)
913 # 返回空结果,保持ID映射 1137 # 返回空结果,保持ID映射
914 return [ 1138 return [
915 - _make_empty_analysis_result(item, target_lang, error=str(e)) 1139 + _make_empty_analysis_result(item, target_lang, schema, error=str(e))
916 for item in batch_data 1140 for item in batch_data
917 ] 1141 ]
918 1142
@@ -922,6 +1146,7 @@ def analyze_products( @@ -922,6 +1146,7 @@ def analyze_products(
922 target_lang: str = "zh", 1146 target_lang: str = "zh",
923 batch_size: Optional[int] = None, 1147 batch_size: Optional[int] = None,
924 tenant_id: Optional[str] = None, 1148 tenant_id: Optional[str] = None,
  1149 + analysis_kind: str = "content",
925 ) -> List[Dict[str, Any]]: 1150 ) -> List[Dict[str, Any]]:
926 """ 1151 """
927 库调用入口:根据输入+语言,返回锚文本及各维度信息。 1152 库调用入口:根据输入+语言,返回锚文本及各维度信息。
@@ -937,6 +1162,7 @@ def analyze_products( @@ -937,6 +1162,7 @@ def analyze_products(
937 if not products: 1162 if not products:
938 return [] 1163 return []
939 1164
  1165 + _get_analysis_schema(analysis_kind)
940 results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products) 1166 results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products)
941 uncached_items: List[Tuple[int, Dict[str, str]]] = [] 1167 uncached_items: List[Tuple[int, Dict[str, str]]] = []
942 1168
@@ -946,11 +1172,11 @@ def analyze_products( @@ -946,11 +1172,11 @@ def analyze_products(
946 uncached_items.append((idx, product)) 1172 uncached_items.append((idx, product))
947 continue 1173 continue
948 1174
949 - cached = _get_cached_anchor_result(product, target_lang) 1175 + cached = _get_cached_analysis_result(product, target_lang, analysis_kind)
950 if cached: 1176 if cached:
951 logger.info( 1177 logger.info(
952 f"[analyze_products] Cache hit for title='{title[:50]}...', " 1178 f"[analyze_products] Cache hit for title='{title[:50]}...', "
953 - f"lang={target_lang}" 1179 + f"kind={analysis_kind}, lang={target_lang}"
954 ) 1180 )
955 results_by_index[idx] = cached 1181 results_by_index[idx] = cached
956 continue 1182 continue
@@ -979,9 +1205,14 @@ def analyze_products( @@ -979,9 +1205,14 @@ def analyze_products(
979 for batch_num, batch_slice, batch in batch_jobs: 1205 for batch_num, batch_slice, batch in batch_jobs:
980 logger.info( 1206 logger.info(
981 f"[analyze_products] Processing batch {batch_num}/{total_batches}, " 1207 f"[analyze_products] Processing batch {batch_num}/{total_batches}, "
982 - f"size={len(batch)}, target_lang={target_lang}" 1208 + f"size={len(batch)}, kind={analysis_kind}, target_lang={target_lang}"
  1209 + )
  1210 + batch_results = process_batch(
  1211 + batch,
  1212 + batch_num=batch_num,
  1213 + target_lang=target_lang,
  1214 + analysis_kind=analysis_kind,
983 ) 1215 )
984 - batch_results = process_batch(batch, batch_num=batch_num, target_lang=target_lang)  
985 1216
986 for (original_idx, product), item in zip(batch_slice, batch_results): 1217 for (original_idx, product), item in zip(batch_slice, batch_results):
987 results_by_index[original_idx] = item 1218 results_by_index[original_idx] = item
@@ -992,7 +1223,7 @@ def analyze_products( @@ -992,7 +1223,7 @@ def analyze_products(
992 # 不缓存错误结果,避免放大临时故障 1223 # 不缓存错误结果,避免放大临时故障
993 continue 1224 continue
994 try: 1225 try:
995 - _set_cached_anchor_result(product, target_lang, item) 1226 + _set_cached_analysis_result(product, target_lang, item, analysis_kind)
996 except Exception: 1227 except Exception:
997 # 已在内部记录 warning 1228 # 已在内部记录 warning
998 pass 1229 pass
@@ -1000,10 +1231,11 @@ def analyze_products( @@ -1000,10 +1231,11 @@ def analyze_products(
1000 max_workers = min(CONTENT_UNDERSTANDING_MAX_WORKERS, len(batch_jobs)) 1231 max_workers = min(CONTENT_UNDERSTANDING_MAX_WORKERS, len(batch_jobs))
1001 logger.info( 1232 logger.info(
1002 "[analyze_products] Using ThreadPoolExecutor for uncached batches: " 1233 "[analyze_products] Using ThreadPoolExecutor for uncached batches: "
1003 - "max_workers=%s, total_batches=%s, bs=%s, target_lang=%s", 1234 + "max_workers=%s, total_batches=%s, bs=%s, kind=%s, target_lang=%s",
1004 max_workers, 1235 max_workers,
1005 total_batches, 1236 total_batches,
1006 bs, 1237 bs,
  1238 + analysis_kind,
1007 target_lang, 1239 target_lang,
1008 ) 1240 )
1009 1241
@@ -1013,7 +1245,11 @@ def analyze_products( @@ -1013,7 +1245,11 @@ def analyze_products(
1013 future_by_batch_num: Dict[int, Any] = {} 1245 future_by_batch_num: Dict[int, Any] = {}
1014 for batch_num, _batch_slice, batch in batch_jobs: 1246 for batch_num, _batch_slice, batch in batch_jobs:
1015 future_by_batch_num[batch_num] = executor.submit( 1247 future_by_batch_num[batch_num] = executor.submit(
1016 - process_batch, batch, batch_num=batch_num, target_lang=target_lang 1248 + process_batch,
  1249 + batch,
  1250 + batch_num=batch_num,
  1251 + target_lang=target_lang,
  1252 + analysis_kind=analysis_kind,
1017 ) 1253 )
1018 1254
1019 # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的) 1255 # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的)
@@ -1028,7 +1264,7 @@ def analyze_products( @@ -1028,7 +1264,7 @@ def analyze_products(
1028 # 不缓存错误结果,避免放大临时故障 1264 # 不缓存错误结果,避免放大临时故障
1029 continue 1265 continue
1030 try: 1266 try:
1031 - _set_cached_anchor_result(product, target_lang, item) 1267 + _set_cached_analysis_result(product, target_lang, item, analysis_kind)
1032 except Exception: 1268 except Exception:
1033 # 已在内部记录 warning 1269 # 已在内部记录 warning
1034 pass 1270 pass
indexer/product_enrich_prompts.py
@@ -33,6 +33,110 @@ Input product list: @@ -33,6 +33,110 @@ Input product list:
33 USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation. 33 USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation.
34 Language: {language}""" 34 Language: {language}"""
35 35
  36 +TAXONOMY_SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product text and fill the columns below using an apparel attribute taxonomy.
  37 +
  38 +Output columns:
  39 +1. Product Type: concise ecommerce apparel category label, not a full marketing title
  40 +2. Target Gender: intended gender only if clearly implied
  41 +3. Age Group: only if clearly implied, e.g. adults, kids, teens, toddlers, babies
  42 +4. Season: season(s) or all-season suitability only if supported
  43 +5. Fit: body closeness, e.g. slim, regular, relaxed, oversized, fitted
  44 +6. Silhouette: overall garment shape, e.g. straight, A-line, boxy, tapered, bodycon, wide-leg
  45 +7. Neckline: neckline type when applicable, e.g. crew neck, V-neck, hooded, collared, square neck
  46 +8. Sleeve Length Type: sleeve length only, e.g. sleeveless, short sleeve, long sleeve, three-quarter sleeve
  47 +9. Sleeve Style: sleeve design only, e.g. puff sleeve, raglan sleeve, batwing sleeve, bell sleeve
  48 +10. Strap Type: strap design when applicable, e.g. spaghetti strap, wide strap, halter strap, adjustable strap
  49 +11. Rise / Waistline: waist placement when applicable, e.g. high rise, mid rise, low rise, empire waist
  50 +12. Leg Shape: for bottoms only, e.g. straight leg, wide leg, flare leg, tapered leg, skinny leg
  51 +13. Skirt Shape: for skirts only, e.g. A-line, pleated, pencil, mermaid
  52 +14. Length Type: design length only, not size, e.g. cropped, regular, longline, mini, midi, maxi, ankle length, full length
  53 +15. Closure Type: fastening method when applicable, e.g. zipper, button, drawstring, elastic waist, hook-and-loop
  54 +16. Design Details: construction or visual details, e.g. ruched, ruffled, pleated, cut-out, layered, distressed, split hem
  55 +17. Fabric: fabric type only, e.g. denim, knit, chiffon, jersey, fleece, cotton twill
  56 +18. Material Composition: fiber content or blend only if stated, e.g. cotton, polyester, spandex, linen blend, 95% cotton 5% elastane
  57 +19. Fabric Properties: inherent fabric traits, e.g. stretch, breathable, lightweight, soft-touch, water-resistant
  58 +20. Clothing Features: product features, e.g. lined, reversible, hooded, packable, padded, pocketed
  59 +21. Functional Benefits: wearer benefits, e.g. moisture-wicking, thermal insulation, UV protection, easy care, supportive compression
  60 +22. Color: specific color name when available
  61 +23. Color Family: normalized broad retail color group, e.g. black, white, blue, green, red, pink, beige, brown, gray
  62 +24. Print / Pattern: surface pattern when applicable, e.g. solid, striped, plaid, floral, graphic, animal print
  63 +25. Occasion / End Use: likely use occasion only if supported, e.g. office, casual wear, streetwear, lounge, workout, outdoor
  64 +26. Style Aesthetic: overall style only if supported, e.g. minimalist, streetwear, athleisure, smart casual, romantic, playful
  65 +
  66 +Rules:
  67 +- Keep the same row order and row count as input.
  68 +- Infer only from the provided product text.
  69 +- Leave blank if not applicable or not reasonably supported.
  70 +- Use concise, standardized ecommerce wording.
  71 +- Do not combine different attribute dimensions in one field.
  72 +- If multiple values are needed, use the delimiter required by the localization setting.
  73 +
  74 +Input product list:
  75 +"""
  76 +
  77 +TAXONOMY_MARKDOWN_TABLE_HEADERS_EN = [
  78 + "No.",
  79 + "Product Type",
  80 + "Target Gender",
  81 + "Age Group",
  82 + "Season",
  83 + "Fit",
  84 + "Silhouette",
  85 + "Neckline",
  86 + "Sleeve Length Type",
  87 + "Sleeve Style",
  88 + "Strap Type",
  89 + "Rise / Waistline",
  90 + "Leg Shape",
  91 + "Skirt Shape",
  92 + "Length Type",
  93 + "Closure Type",
  94 + "Design Details",
  95 + "Fabric",
  96 + "Material Composition",
  97 + "Fabric Properties",
  98 + "Clothing Features",
  99 + "Functional Benefits",
  100 + "Color",
  101 + "Color Family",
  102 + "Print / Pattern",
  103 + "Occasion / End Use",
  104 + "Style Aesthetic",
  105 +]
  106 +
  107 +TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = {
  108 + "en": TAXONOMY_MARKDOWN_TABLE_HEADERS_EN,
  109 + "zh": [
  110 + "序号",
  111 + "品类",
  112 + "目标性别",
  113 + "年龄段",
  114 + "适用季节",
  115 + "版型",
  116 + "廓形",
  117 + "领型",
  118 + "袖长类型",
  119 + "袖型",
  120 + "肩带设计",
  121 + "腰型",
  122 + "裤型",
  123 + "裙型",
  124 + "长度类型",
  125 + "闭合方式",
  126 + "设计细节",
  127 + "面料",
  128 + "成分",
  129 + "面料特性",
  130 + "服装特征",
  131 + "功能",
  132 + "主颜色",
  133 + "色系",
  134 + "印花 / 图案",
  135 + "适用场景",
  136 + "风格",
  137 + ],
  138 +}
  139 +
36 LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { 140 LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = {
37 "en": [ 141 "en": [
38 "No.", 142 "No.",
tests/test_llm_enrichment_batch_fill.py
@@ -19,10 +19,13 @@ def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch): @@ -19,10 +19,13 @@ def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch):
19 "zh": [f"zh-anchor-{item['id']}"], 19 "zh": [f"zh-anchor-{item['id']}"],
20 "en": [f"en-anchor-{item['id']}"], 20 "en": [f"en-anchor-{item['id']}"],
21 }, 21 },
22 - "tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]}, 22 + "enriched_tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]},
23 "enriched_attributes": [ 23 "enriched_attributes": [
24 {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}}, 24 {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}},
25 ], 25 ],
  26 + "enriched_taxonomy_attributes": [
  27 + {"name": "Product Type", "value": {"zh": ["连衣裙"], "en": ["dress"]}},
  28 + ],
26 } 29 }
27 for item in items 30 for item in items
28 ] 31 ]
@@ -54,6 +57,10 @@ def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch): @@ -54,6 +57,10 @@ def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch):
54 57
55 assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"] 58 assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"]
56 assert docs[0]["qanchors"]["en"] == ["en-anchor-0"] 59 assert docs[0]["qanchors"]["en"] == ["en-anchor-0"]
57 - assert docs[0]["tags"]["zh"] == ["t1", "t2"]  
58 - assert docs[0]["tags"]["en"] == ["t1", "t2"] 60 + assert docs[0]["enriched_tags"]["zh"] == ["t1", "t2"]
  61 + assert docs[0]["enriched_tags"]["en"] == ["t1", "t2"]
59 assert {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}} in docs[0]["enriched_attributes"] 62 assert {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}} in docs[0]["enriched_attributes"]
  63 + assert {
  64 + "name": "Product Type",
  65 + "value": {"zh": ["连衣裙"], "en": ["dress"]},
  66 + } in docs[0]["enriched_taxonomy_attributes"]
tests/test_process_products_batching.py
@@ -13,7 +13,13 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): @@ -13,7 +13,13 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch):
13 monkeypatch.setattr(process_products, "API_KEY", "fake-key") 13 monkeypatch.setattr(process_products, "API_KEY", "fake-key")
14 seen_batch_sizes: List[int] = [] 14 seen_batch_sizes: List[int] = []
15 15
16 - def _fake_process_batch(batch_data: List[Dict[str, str]], batch_num: int, target_lang: str = "zh"): 16 + def _fake_process_batch(
  17 + batch_data: List[Dict[str, str]],
  18 + batch_num: int,
  19 + target_lang: str = "zh",
  20 + analysis_kind: str = "content",
  21 + ):
  22 + assert analysis_kind == "content"
17 seen_batch_sizes.append(len(batch_data)) 23 seen_batch_sizes.append(len(batch_data))
18 return [ 24 return [
19 { 25 {
@@ -35,7 +41,7 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): @@ -35,7 +41,7 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch):
35 ] 41 ]
36 42
37 monkeypatch.setattr(process_products, "process_batch", _fake_process_batch) 43 monkeypatch.setattr(process_products, "process_batch", _fake_process_batch)
38 - monkeypatch.setattr(process_products, "_set_cached_anchor_result", lambda *args, **kwargs: None) 44 + monkeypatch.setattr(process_products, "_set_cached_analysis_result", lambda *args, **kwargs: None)
39 45
40 out = process_products.analyze_products( 46 out = process_products.analyze_products(
41 products=_mk_products(45), 47 products=_mk_products(45),
@@ -53,7 +59,13 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): @@ -53,7 +59,13 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch):
53 monkeypatch.setattr(process_products, "API_KEY", "fake-key") 59 monkeypatch.setattr(process_products, "API_KEY", "fake-key")
54 seen_batch_sizes: List[int] = [] 60 seen_batch_sizes: List[int] = []
55 61
56 - def _fake_process_batch(batch_data: List[Dict[str, str]], batch_num: int, target_lang: str = "zh"): 62 + def _fake_process_batch(
  63 + batch_data: List[Dict[str, str]],
  64 + batch_num: int,
  65 + target_lang: str = "zh",
  66 + analysis_kind: str = "content",
  67 + ):
  68 + assert analysis_kind == "content"
57 seen_batch_sizes.append(len(batch_data)) 69 seen_batch_sizes.append(len(batch_data))
58 return [ 70 return [
59 { 71 {
@@ -75,7 +87,7 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): @@ -75,7 +87,7 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch):
75 ] 87 ]
76 88
77 monkeypatch.setattr(process_products, "process_batch", _fake_process_batch) 89 monkeypatch.setattr(process_products, "process_batch", _fake_process_batch)
78 - monkeypatch.setattr(process_products, "_set_cached_anchor_result", lambda *args, **kwargs: None) 90 + monkeypatch.setattr(process_products, "_set_cached_analysis_result", lambda *args, **kwargs: None)
79 91
80 out = process_products.analyze_products( 92 out = process_products.analyze_products(
81 products=_mk_products(3), 93 products=_mk_products(3),
tests/test_product_enrich_partial_mode.py
@@ -74,6 +74,28 @@ def test_create_prompt_splits_shared_context_and_localized_tail(): @@ -74,6 +74,28 @@ def test_create_prompt_splits_shared_context_and_localized_tail():
74 assert prefix_en.startswith("| No. | Product title | Category path |") 74 assert prefix_en.startswith("| No. | Product title | Category path |")
75 75
76 76
  77 +def test_create_prompt_supports_taxonomy_analysis_kind():
  78 + products = [{"id": "1", "title": "linen dress"}]
  79 +
  80 + shared_zh, user_zh, prefix_zh = product_enrich.create_prompt(
  81 + products,
  82 + target_lang="zh",
  83 + analysis_kind="taxonomy",
  84 + )
  85 + shared_fr, user_fr, prefix_fr = product_enrich.create_prompt(
  86 + products,
  87 + target_lang="fr",
  88 + analysis_kind="taxonomy",
  89 + )
  90 +
  91 + assert "apparel attribute taxonomy" in shared_zh
  92 + assert "1. linen dress" in shared_zh
  93 + assert "Language: Chinese" in user_zh
  94 + assert "Language: French" in user_fr
  95 + assert prefix_zh.startswith("| 序号 | 品类 | 目标性别 |")
  96 + assert prefix_fr.startswith("| No. | Product Type | Target Gender |")
  97 +
  98 +
77 def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests(): 99 def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests():
78 payloads = [] 100 payloads = []
79 response_bodies = [ 101 response_bodies = [
@@ -228,6 +250,38 @@ def test_process_batch_reads_result_and_validates_expected_fields(): @@ -228,6 +250,38 @@ def test_process_batch_reads_result_and_validates_expected_fields():
228 assert row["anchor_text"] == "法式收腰连衣裙" 250 assert row["anchor_text"] == "法式收腰连衣裙"
229 251
230 252
  253 +def test_process_batch_reads_taxonomy_result_with_schema_specific_fields():
  254 + merged_markdown = """| 序号 | 品类 | 目标性别 | 年龄段 | 适用季节 | 版型 | 廓形 | 领型 | 袖长类型 | 袖型 | 肩带设计 | 腰型 | 裤型 | 裙型 | 长度类型 | 闭合方式 | 设计细节 | 面料 | 成分 | 面料特性 | 服装特征 | 功能 | 主颜色 | 色系 | 印花 / 图案 | 适用场景 | 风格 |
  255 +|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|
  256 +| 1 | 连衣裙 | 女 | 成人 | 春季,夏季 | 修身 | A字 | V领 | 无袖 | | 细肩带 | 高腰 | | A字裙 | 中长款 | 拉链 | 褶皱 | 梭织 | 聚酯纤维,氨纶 | 轻薄,透气 | 有内衬 | 易打理 | 酒红色 | 红色 | 纯色 | 约会,度假 | 浪漫 |
  257 +"""
  258 +
  259 + with mock.patch.object(
  260 + product_enrich,
  261 + "call_llm",
  262 + return_value=(merged_markdown, json.dumps({"choices": [{"message": {"content": "stub"}}]})),
  263 + ):
  264 + results = product_enrich.process_batch(
  265 + [{"id": "sku-1", "title": "dress"}],
  266 + batch_num=1,
  267 + target_lang="zh",
  268 + analysis_kind="taxonomy",
  269 + )
  270 +
  271 + assert len(results) == 1
  272 + row = results[0]
  273 + assert row["id"] == "sku-1"
  274 + assert row["lang"] == "zh"
  275 + assert row["title_input"] == "dress"
  276 + assert row["product_type"] == "连衣裙"
  277 + assert row["target_gender"] == "女"
  278 + assert row["age_group"] == "成人"
  279 + assert row["sleeve_length_type"] == "无袖"
  280 + assert row["material_composition"] == "聚酯纤维,氨纶"
  281 + assert row["occasion_end_use"] == "约会,度假"
  282 + assert row["style_aesthetic"] == "浪漫"
  283 +
  284 +
231 def test_analyze_products_uses_product_level_cache_across_batch_requests(): 285 def test_analyze_products_uses_product_level_cache_across_batch_requests():
232 cache_store = {} 286 cache_store = {}
233 process_calls = [] 287 process_calls = []
@@ -241,13 +295,16 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): @@ -241,13 +295,16 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests():
241 product.get("image_url", ""), 295 product.get("image_url", ""),
242 ) 296 )
243 297
244 - def fake_get_cached_anchor_result(product, target_lang): 298 + def fake_get_cached_analysis_result(product, target_lang, analysis_kind="content"):
  299 + assert analysis_kind == "content"
245 return cache_store.get(_cache_key(product, target_lang)) 300 return cache_store.get(_cache_key(product, target_lang))
246 301
247 - def fake_set_cached_anchor_result(product, target_lang, result): 302 + def fake_set_cached_analysis_result(product, target_lang, result, analysis_kind="content"):
  303 + assert analysis_kind == "content"
248 cache_store[_cache_key(product, target_lang)] = result 304 cache_store[_cache_key(product, target_lang)] = result
249 305
250 - def fake_process_batch(batch_data, batch_num, target_lang="zh"): 306 + def fake_process_batch(batch_data, batch_num, target_lang="zh", analysis_kind="content"):
  307 + assert analysis_kind == "content"
251 process_calls.append( 308 process_calls.append(
252 { 309 {
253 "batch_num": batch_num, 310 "batch_num": batch_num,
@@ -281,12 +338,12 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): @@ -281,12 +338,12 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests():
281 338
282 with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( 339 with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
283 product_enrich, 340 product_enrich,
284 - "_get_cached_anchor_result",  
285 - side_effect=fake_get_cached_anchor_result, 341 + "_get_cached_analysis_result",
  342 + side_effect=fake_get_cached_analysis_result,
286 ), mock.patch.object( 343 ), mock.patch.object(
287 product_enrich, 344 product_enrich,
288 - "_set_cached_anchor_result",  
289 - side_effect=fake_set_cached_anchor_result, 345 + "_set_cached_analysis_result",
  346 + side_effect=fake_set_cached_analysis_result,
290 ), mock.patch.object( 347 ), mock.patch.object(
291 product_enrich, 348 product_enrich,
292 "process_batch", 349 "process_batch",
@@ -342,11 +399,12 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity(): @@ -342,11 +399,12 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity():
342 399
343 with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( 400 with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
344 product_enrich, 401 product_enrich,
345 - "_get_cached_anchor_result",  
346 - wraps=lambda product, target_lang: product_enrich._normalize_analysis_result( 402 + "_get_cached_analysis_result",
  403 + wraps=lambda product, target_lang, analysis_kind="content": product_enrich._normalize_analysis_result(
347 cached_result, 404 cached_result,
348 product=product, 405 product=product,
349 target_lang=target_lang, 406 target_lang=target_lang,
  407 + schema=product_enrich._get_analysis_schema("content"),
350 ), 408 ),
351 ), mock.patch.object( 409 ), mock.patch.object(
352 product_enrich, 410 product_enrich,
@@ -379,7 +437,47 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity(): @@ -379,7 +437,47 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity():
379 437
380 438
381 def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output(): 439 def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output():
382 - def fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None): 440 + def fake_analyze_products(
  441 + products,
  442 + target_lang="zh",
  443 + batch_size=None,
  444 + tenant_id=None,
  445 + analysis_kind="content",
  446 + ):
  447 + if analysis_kind == "taxonomy":
  448 + return [
  449 + {
  450 + "id": products[0]["id"],
  451 + "lang": target_lang,
  452 + "title_input": products[0]["title"],
  453 + "product_type": f"{target_lang}-dress",
  454 + "target_gender": f"{target_lang}-women",
  455 + "age_group": "",
  456 + "season": f"{target_lang}-summer",
  457 + "fit": "",
  458 + "silhouette": "",
  459 + "neckline": "",
  460 + "sleeve_length_type": "",
  461 + "sleeve_style": "",
  462 + "strap_type": "",
  463 + "rise_waistline": "",
  464 + "leg_shape": "",
  465 + "skirt_shape": "",
  466 + "length_type": "",
  467 + "closure_type": "",
  468 + "design_details": "",
  469 + "fabric": "",
  470 + "material_composition": "",
  471 + "fabric_properties": "",
  472 + "clothing_features": "",
  473 + "functional_benefits": "",
  474 + "color": "",
  475 + "color_family": "",
  476 + "print_pattern": "",
  477 + "occasion_end_use": "",
  478 + "style_aesthetic": "",
  479 + }
  480 + ]
383 return [ 481 return [
384 { 482 {
385 "id": products[0]["id"], 483 "id": products[0]["id"],
@@ -423,6 +521,20 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() @@ -423,6 +521,20 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output()
423 }, 521 },
424 {"name": "target_audience", "value": {"zh": ["zh-audience"], "en": ["en-audience"]}}, 522 {"name": "target_audience", "value": {"zh": ["zh-audience"], "en": ["en-audience"]}},
425 ], 523 ],
  524 + "enriched_taxonomy_attributes": [
  525 + {
  526 + "name": "Product Type",
  527 + "value": {"zh": ["zh-dress"], "en": ["en-dress"]},
  528 + },
  529 + {
  530 + "name": "Target Gender",
  531 + "value": {"zh": ["zh-women"], "en": ["en-women"]},
  532 + },
  533 + {
  534 + "name": "Season",
  535 + "value": {"zh": ["zh-summer"], "en": ["en-summer"]},
  536 + },
  537 + ],
426 } 538 }
427 ] 539 ]
428 540