Commit 36516857d07bd540ffdc097d5f5349da1048c184

Authored by tangwang
1 parent 78cdef1c

feat(product_enrich): 为产品富化模块增加 enriched_taxonomy_attributes

字段生成

- 新增分类法属性富化能力,遵循 enriched_attributes
  相同的字段结构和处理逻辑,仅提示词和解析维度不同
- 引入 AnalysisSchema
  抽象类,使内容富化(content)与分类法富化(taxonomy)共享批处理、缓存、提示词构建、Markdown
解析及归一化流程
- 重构 product_enrich.py 中原有的富化管道,将通用逻辑抽取至
  _get_analysis_schema、_make_analysis_cache_key 等函数,并为
  process_batch、parse_markdown_table、analyze_products 增加 analysis_kind 参数,消除代码重复
- 在 product_enrich_prompts.py
  中添加分类法提示词模板(TAXONOMY_SHARED_ANALYSIS_INSTRUCTION)及 Markdown
表头定义(TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS、TAXONOMY_MARKDOWN_TABLE_HEADERS_EN)
- 修复 Markdown
  解析器在空单元格时的行为:原实现会跳过空单元格导致列错位,现改为保留空值,确保稀疏的分类法属性列正确对齐
- 更新 document_transformer.py 中 SPUDocumentTransformer,将
  enriched_taxonomy_attributes(中/英)写入最终索引文档;product_enrich.py 的
  build_index_content_fields 相应返回该新字段
- 调整相关单元测试(test_product_enrich_partial_mode.py
  等)以覆盖新字段路径,测试通过(14 passed)

技术细节:
- AnalysisSchema 包含
  name、shared_instruction、markdown_table_headers、result_fields、meaningful_fields、field_aliases、fallback_headers、quality_fields 等元数据
-
缓存键区分内容/分类法:`{ANCHOR_CACHE_PREFIX}:{analysis_kind}:{target_lang}:{prompt输入摘要}`(由 prompt 实际输入文本哈希而来),避免缓存污染
- 分类法解析使用与 enriched_attributes
  相同的嵌套结构:`{"attribute_key": "value"}`,支持多行表格
- 批处理大小与重试逻辑保持与原有内容富化一致
indexer/document_transformer.py
... ... @@ -242,6 +242,7 @@ class SPUDocumentTransformer:
242 242 - qanchors.{lang}
243 243 - enriched_tags.{lang}
244 244 - enriched_attributes[].value.{lang}
  245 + - enriched_taxonomy_attributes[].value.{lang}
245 246  
246 247 设计目标:
247 248 - 尽可能攒批调用 LLM;
... ... @@ -296,6 +297,8 @@ class SPUDocumentTransformer:
296 297 doc["enriched_tags"] = enrichment["enriched_tags"]
297 298 if enrichment.get("enriched_attributes"):
298 299 doc["enriched_attributes"] = enrichment["enriched_attributes"]
  300 + if enrichment.get("enriched_taxonomy_attributes"):
  301 + doc["enriched_taxonomy_attributes"] = enrichment["enriched_taxonomy_attributes"]
299 302 except Exception as e:
300 303 logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e)
301 304  
... ...
indexer/product_enrich.py
... ... @@ -14,6 +14,7 @@ import time
14 14 import hashlib
15 15 import uuid
16 16 import threading
  17 +from dataclasses import dataclass, field
17 18 from collections import OrderedDict
18 19 from datetime import datetime
19 20 from concurrent.futures import ThreadPoolExecutor
... ... @@ -30,6 +31,9 @@ from indexer.product_enrich_prompts import (
30 31 USER_INSTRUCTION_TEMPLATE,
31 32 LANGUAGE_MARKDOWN_TABLE_HEADERS,
32 33 SHARED_ANALYSIS_INSTRUCTION,
  34 + TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS,
  35 + TAXONOMY_MARKDOWN_TABLE_HEADERS_EN,
  36 + TAXONOMY_SHARED_ANALYSIS_INSTRUCTION,
33 37 )
34 38  
35 39 # 配置
... ... @@ -147,7 +151,7 @@ if _missing_prompt_langs:
147 151 # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白
148 152 _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+")
149 153 _CORE_INDEX_LANGUAGES = ("zh", "en")
150   -_ANALYSIS_ATTRIBUTE_FIELD_MAP = (
  154 +_CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP = (
151 155 ("tags", "enriched_tags"),
152 156 ("target_audience", "target_audience"),
153 157 ("usage_scene", "usage_scene"),
... ... @@ -156,7 +160,7 @@ _ANALYSIS_ATTRIBUTE_FIELD_MAP = (
156 160 ("material", "material"),
157 161 ("features", "features"),
158 162 )
159   -_ANALYSIS_RESULT_FIELDS = (
  163 +_CONTENT_ANALYSIS_RESULT_FIELDS = (
160 164 "title",
161 165 "category_path",
162 166 "tags",
... ... @@ -168,7 +172,7 @@ _ANALYSIS_RESULT_FIELDS = (
168 172 "features",
169 173 "anchor_text",
170 174 )
171   -_ANALYSIS_MEANINGFUL_FIELDS = (
  175 +_CONTENT_ANALYSIS_MEANINGFUL_FIELDS = (
172 176 "tags",
173 177 "target_audience",
174 178 "usage_scene",
... ... @@ -178,9 +182,89 @@ _ANALYSIS_MEANINGFUL_FIELDS = (
178 182 "features",
179 183 "anchor_text",
180 184 )
181   -_ANALYSIS_FIELD_ALIASES = {
  185 +_CONTENT_ANALYSIS_FIELD_ALIASES = {
182 186 "tags": ("tags", "enriched_tags"),
183 187 }
  188 +_CONTENT_ANALYSIS_QUALITY_FIELDS = ("title", "category_path", "anchor_text")
  189 +_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = (
  190 + ("product_type", "Product Type"),
  191 + ("target_gender", "Target Gender"),
  192 + ("age_group", "Age Group"),
  193 + ("season", "Season"),
  194 + ("fit", "Fit"),
  195 + ("silhouette", "Silhouette"),
  196 + ("neckline", "Neckline"),
  197 + ("sleeve_length_type", "Sleeve Length Type"),
  198 + ("sleeve_style", "Sleeve Style"),
  199 + ("strap_type", "Strap Type"),
  200 + ("rise_waistline", "Rise / Waistline"),
  201 + ("leg_shape", "Leg Shape"),
  202 + ("skirt_shape", "Skirt Shape"),
  203 + ("length_type", "Length Type"),
  204 + ("closure_type", "Closure Type"),
  205 + ("design_details", "Design Details"),
  206 + ("fabric", "Fabric"),
  207 + ("material_composition", "Material Composition"),
  208 + ("fabric_properties", "Fabric Properties"),
  209 + ("clothing_features", "Clothing Features"),
  210 + ("functional_benefits", "Functional Benefits"),
  211 + ("color", "Color"),
  212 + ("color_family", "Color Family"),
  213 + ("print_pattern", "Print / Pattern"),
  214 + ("occasion_end_use", "Occasion / End Use"),
  215 + ("style_aesthetic", "Style Aesthetic"),
  216 +)
  217 +_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple(
  218 + field_name for field_name, _ in _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP
  219 +)
  220 +
  221 +
  222 +@dataclass(frozen=True)
  223 +class AnalysisSchema:
  224 + name: str
  225 + shared_instruction: str
  226 + markdown_table_headers: Dict[str, List[str]]
  227 + result_fields: Tuple[str, ...]
  228 + meaningful_fields: Tuple[str, ...]
  229 + field_aliases: Dict[str, Tuple[str, ...]] = field(default_factory=dict)
  230 + fallback_headers: Optional[List[str]] = None
  231 + quality_fields: Tuple[str, ...] = ()
  232 +
  233 + def get_headers(self, target_lang: str) -> Optional[List[str]]:
  234 + headers = self.markdown_table_headers.get(target_lang)
  235 + if headers:
  236 + return headers
  237 + if self.fallback_headers:
  238 + return self.fallback_headers
  239 + return None
  240 +
  241 +
  242 +_ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = {
  243 + "content": AnalysisSchema(
  244 + name="content",
  245 + shared_instruction=SHARED_ANALYSIS_INSTRUCTION,
  246 + markdown_table_headers=LANGUAGE_MARKDOWN_TABLE_HEADERS,
  247 + result_fields=_CONTENT_ANALYSIS_RESULT_FIELDS,
  248 + meaningful_fields=_CONTENT_ANALYSIS_MEANINGFUL_FIELDS,
  249 + field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES,
  250 + quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS,
  251 + ),
  252 + "taxonomy": AnalysisSchema(
  253 + name="taxonomy",
  254 + shared_instruction=TAXONOMY_SHARED_ANALYSIS_INSTRUCTION,
  255 + markdown_table_headers=TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS,
  256 + result_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS,
  257 + meaningful_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS,
  258 + fallback_headers=TAXONOMY_MARKDOWN_TABLE_HEADERS_EN,
  259 + ),
  260 +}
  261 +
  262 +
  263 +def _get_analysis_schema(analysis_kind: str) -> AnalysisSchema:
  264 + schema = _ANALYSIS_SCHEMAS.get(analysis_kind)
  265 + if schema is None:
  266 + raise ValueError(f"Unsupported analysis_kind: {analysis_kind}")
  267 + return schema
184 268  
185 269  
186 270 def split_multi_value_field(text: Optional[str]) -> List[str]:
... ... @@ -235,12 +319,12 @@ def _get_product_id(product: Dict[str, Any]) -> str:
235 319 return str(product.get("id") or product.get("spu_id") or "").strip()
236 320  
237 321  
238   -def _get_analysis_field_aliases(field_name: str) -> Tuple[str, ...]:
239   - return _ANALYSIS_FIELD_ALIASES.get(field_name, (field_name,))
  322 +def _get_analysis_field_aliases(field_name: str, schema: AnalysisSchema) -> Tuple[str, ...]:
  323 + return schema.field_aliases.get(field_name, (field_name,))
240 324  
241 325  
242   -def _get_analysis_field_value(row: Dict[str, Any], field_name: str) -> Any:
243   - for alias in _get_analysis_field_aliases(field_name):
  326 +def _get_analysis_field_value(row: Dict[str, Any], field_name: str, schema: AnalysisSchema) -> Any:
  327 + for alias in _get_analysis_field_aliases(field_name, schema):
244 328 if alias in row:
245 329 return row.get(alias)
246 330 return None
... ... @@ -261,6 +345,7 @@ def _has_meaningful_value(value: Any) -> bool:
261 345 def _make_empty_analysis_result(
262 346 product: Dict[str, Any],
263 347 target_lang: str,
  348 + schema: AnalysisSchema,
264 349 error: Optional[str] = None,
265 350 ) -> Dict[str, Any]:
266 351 result = {
... ... @@ -268,7 +353,7 @@ def _make_empty_analysis_result(
268 353 "lang": target_lang,
269 354 "title_input": str(product.get("title") or "").strip(),
270 355 }
271   - for field in _ANALYSIS_RESULT_FIELDS:
  356 + for field in schema.result_fields:
272 357 result[field] = ""
273 358 if error:
274 359 result["error"] = error
... ... @@ -279,42 +364,59 @@ def _normalize_analysis_result(
279 364 result: Dict[str, Any],
280 365 product: Dict[str, Any],
281 366 target_lang: str,
  367 + schema: AnalysisSchema,
282 368 ) -> Dict[str, Any]:
283   - normalized = _make_empty_analysis_result(product, target_lang)
  369 + normalized = _make_empty_analysis_result(product, target_lang, schema)
284 370 if not isinstance(result, dict):
285 371 return normalized
286 372  
287 373 normalized["lang"] = str(result.get("lang") or target_lang).strip() or target_lang
288   - normalized["title"] = str(result.get("title") or "").strip()
289   - normalized["category_path"] = str(result.get("category_path") or "").strip()
290 374 normalized["title_input"] = str(
291 375 product.get("title") or result.get("title_input") or ""
292 376 ).strip()
293 377  
294   - for field in _ANALYSIS_RESULT_FIELDS:
295   - if field in {"title", "category_path"}:
296   - continue
297   - normalized[field] = str(_get_analysis_field_value(result, field) or "").strip()
  378 + for field in schema.result_fields:
  379 + normalized[field] = str(_get_analysis_field_value(result, field, schema) or "").strip()
298 380  
299 381 if result.get("error"):
300 382 normalized["error"] = str(result.get("error"))
301 383 return normalized
302 384  
303 385  
304   -def _has_meaningful_analysis_content(result: Dict[str, Any]) -> bool:
305   - return any(_has_meaningful_value(result.get(field)) for field in _ANALYSIS_MEANINGFUL_FIELDS)
  386 +def _has_meaningful_analysis_content(result: Dict[str, Any], schema: AnalysisSchema) -> bool:
  387 + return any(_has_meaningful_value(result.get(field)) for field in schema.meaningful_fields)
  388 +
  389 +
  390 +def _append_analysis_attributes(
  391 + target: List[Dict[str, Any]],
  392 + row: Dict[str, Any],
  393 + lang: str,
  394 + schema: AnalysisSchema,
  395 + field_map: Tuple[Tuple[str, str], ...],
  396 +) -> None:
  397 + for source_name, output_name in field_map:
  398 + raw = _get_analysis_field_value(row, source_name, schema)
  399 + if not raw:
  400 + continue
  401 + _append_named_lang_phrase_map(
  402 + target,
  403 + name=output_name,
  404 + lang=lang,
  405 + raw_value=raw,
  406 + )
306 407  
307 408  
308 409 def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None:
309 410 if not row or row.get("error"):
310 411 return
311 412  
312   - anchor_text = str(_get_analysis_field_value(row, "anchor_text") or "").strip()
  413 + content_schema = _get_analysis_schema("content")
  414 + anchor_text = str(_get_analysis_field_value(row, "anchor_text", content_schema) or "").strip()
313 415 if anchor_text:
314 416 _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text)
315 417  
316   - for source_name, output_name in _ANALYSIS_ATTRIBUTE_FIELD_MAP:
317   - raw = _get_analysis_field_value(row, source_name)
  418 + for source_name, output_name in _CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP:
  419 + raw = _get_analysis_field_value(row, source_name, content_schema)
318 420 if not raw:
319 421 continue
320 422 _append_named_lang_phrase_map(
... ... @@ -327,6 +429,19 @@ def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang:
327 429 _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw)
328 430  
329 431  
  432 +def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None:
  433 + if not row or row.get("error"):
  434 + return
  435 +
  436 + _append_analysis_attributes(
  437 + result["enriched_taxonomy_attributes"],
  438 + row=row,
  439 + lang=lang,
  440 + schema=_get_analysis_schema("taxonomy"),
  441 + field_map=_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP,
  442 + )
  443 +
  444 +
330 445 def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]:
331 446 item_id = _get_product_id(item)
332 447 return {
... ... @@ -355,6 +470,7 @@ def build_index_content_fields(
355 470 - `qanchors`
356 471 - `enriched_tags`
357 472 - `enriched_attributes`
  473 + - `enriched_taxonomy_attributes`
358 474 - 可选 `error`
359 475  
360 476 其中:
... ... @@ -371,6 +487,7 @@ def build_index_content_fields(
371 487 "qanchors": {},
372 488 "enriched_tags": {},
373 489 "enriched_attributes": [],
  490 + "enriched_taxonomy_attributes": [],
374 491 }
375 492 for item in normalized_items
376 493 }
... ... @@ -398,6 +515,33 @@ def build_index_content_fields(
398 515 continue
399 516 _apply_index_content_row(results_by_id[item_id], row=row, lang=lang)
400 517  
  518 + try:
  519 + taxonomy_rows = analyze_products(
  520 + products=normalized_items,
  521 + target_lang=lang,
  522 + batch_size=BATCH_SIZE,
  523 + tenant_id=tenant_id,
  524 + analysis_kind="taxonomy",
  525 + )
  526 + except Exception as e:
  527 + logger.warning(
  528 + "build_index_content_fields taxonomy enrichment failed for lang=%s: %s",
  529 + lang,
  530 + e,
  531 + )
  532 + for item in normalized_items:
  533 + results_by_id[item["id"]].setdefault("error", str(e))
  534 + continue
  535 +
  536 + for row in taxonomy_rows or []:
  537 + item_id = str(row.get("id") or "").strip()
  538 + if not item_id or item_id not in results_by_id:
  539 + continue
  540 + if row.get("error"):
  541 + results_by_id[item_id].setdefault("error", row["error"])
  542 + continue
  543 + _apply_index_taxonomy_row(results_by_id[item_id], row=row, lang=lang)
  544 +
401 545 return [results_by_id[item["id"]] for item in normalized_items]
402 546  
403 547  
... ... @@ -463,52 +607,89 @@ def _build_prompt_input_text(product: Dict[str, Any]) -> str:
463 607 return _truncate_by_words(candidate, PROMPT_INPUT_MAX_WORDS)
464 608  
465 609  
466   -def _make_anchor_cache_key(
  610 +def _make_analysis_cache_key(
467 611 product: Dict[str, Any],
468 612 target_lang: str,
  613 + analysis_kind: str,
469 614 ) -> str:
470   - """构造缓存 key,仅由 prompt 实际输入文本内容 + 目标语言决定。"""
  615 + """构造缓存 key,仅由分析类型、prompt 实际输入文本内容与目标语言决定。"""
471 616 prompt_input = _build_prompt_input_text(product)
472 617 h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest()
473   - return f"{ANCHOR_CACHE_PREFIX}:{target_lang}:{prompt_input[:4]}{h}"
  618 + return f"{ANCHOR_CACHE_PREFIX}:{analysis_kind}:{target_lang}:{prompt_input[:4]}{h}"
474 619  
475 620  
476   -def _get_cached_anchor_result(
  621 +def _make_anchor_cache_key(
477 622 product: Dict[str, Any],
478 623 target_lang: str,
  624 +) -> str:
  625 + return _make_analysis_cache_key(product, target_lang, analysis_kind="content")
  626 +
  627 +
  628 +def _get_cached_analysis_result(
  629 + product: Dict[str, Any],
  630 + target_lang: str,
  631 + analysis_kind: str,
479 632 ) -> Optional[Dict[str, Any]]:
480 633 if not _anchor_redis:
481 634 return None
  635 + schema = _get_analysis_schema(analysis_kind)
482 636 try:
483   - key = _make_anchor_cache_key(product, target_lang)
  637 + key = _make_analysis_cache_key(product, target_lang, analysis_kind)
484 638 raw = _anchor_redis.get(key)
485 639 if not raw:
486 640 return None
487   - result = _normalize_analysis_result(json.loads(raw), product=product, target_lang=target_lang)
488   - if not _has_meaningful_analysis_content(result):
  641 + result = _normalize_analysis_result(
  642 + json.loads(raw),
  643 + product=product,
  644 + target_lang=target_lang,
  645 + schema=schema,
  646 + )
  647 + if not _has_meaningful_analysis_content(result, schema):
489 648 return None
490 649 return result
491 650 except Exception as e:
492   - logger.warning(f"Failed to get anchor cache: {e}")
  651 + logger.warning("Failed to get %s analysis cache: %s", analysis_kind, e)
493 652 return None
494 653  
495 654  
496   -def _set_cached_anchor_result(
  655 +def _get_cached_anchor_result(
  656 + product: Dict[str, Any],
  657 + target_lang: str,
  658 +) -> Optional[Dict[str, Any]]:
  659 + return _get_cached_analysis_result(product, target_lang, analysis_kind="content")
  660 +
  661 +
  662 +def _set_cached_analysis_result(
497 663 product: Dict[str, Any],
498 664 target_lang: str,
499 665 result: Dict[str, Any],
  666 + analysis_kind: str,
500 667 ) -> None:
501 668 if not _anchor_redis:
502 669 return
  670 + schema = _get_analysis_schema(analysis_kind)
503 671 try:
504   - normalized = _normalize_analysis_result(result, product=product, target_lang=target_lang)
505   - if not _has_meaningful_analysis_content(normalized):
  672 + normalized = _normalize_analysis_result(
  673 + result,
  674 + product=product,
  675 + target_lang=target_lang,
  676 + schema=schema,
  677 + )
  678 + if not _has_meaningful_analysis_content(normalized, schema):
506 679 return
507   - key = _make_anchor_cache_key(product, target_lang)
  680 + key = _make_analysis_cache_key(product, target_lang, analysis_kind)
508 681 ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600
509 682 _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False))
510 683 except Exception as e:
511   - logger.warning(f"Failed to set anchor cache: {e}")
  684 + logger.warning("Failed to set %s analysis cache: %s", analysis_kind, e)
  685 +
  686 +
  687 +def _set_cached_anchor_result(
  688 + product: Dict[str, Any],
  689 + target_lang: str,
  690 + result: Dict[str, Any],
  691 +) -> None:
  692 + _set_cached_analysis_result(product, target_lang, result, analysis_kind="content")
512 693  
513 694  
514 695 def _build_assistant_prefix(headers: List[str]) -> str:
... ... @@ -517,8 +698,8 @@ def _build_assistant_prefix(headers: List[str]) -> str:
517 698 return f"{header_line}\n{separator_line}\n"
518 699  
519 700  
520   -def _build_shared_context(products: List[Dict[str, str]]) -> str:
521   - shared_context = SHARED_ANALYSIS_INSTRUCTION
  701 +def _build_shared_context(products: List[Dict[str, str]], schema: AnalysisSchema) -> str:
  702 + shared_context = schema.shared_instruction
522 703 for idx, product in enumerate(products, 1):
523 704 prompt_input = _build_prompt_input_text(product)
524 705 shared_context += f"{idx}. {prompt_input}\n"
... ... @@ -550,16 +731,19 @@ def reset_logged_shared_context_keys() -> None:
550 731 def create_prompt(
551 732 products: List[Dict[str, str]],
552 733 target_lang: str = "zh",
553   -) -> Tuple[str, str, str]:
  734 + analysis_kind: str = "content",
  735 +) -> Tuple[Optional[str], Optional[str], Optional[str]]:
554 736 """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。"""
555   - markdown_table_headers = LANGUAGE_MARKDOWN_TABLE_HEADERS.get(target_lang)
  737 + schema = _get_analysis_schema(analysis_kind)
  738 + markdown_table_headers = schema.get_headers(target_lang)
556 739 if not markdown_table_headers:
557 740 logger.warning(
558   - "Unsupported target_lang for markdown table headers: %s",
  741 + "Unsupported target_lang for markdown table headers: kind=%s lang=%s",
  742 + analysis_kind,
559 743 target_lang,
560 744 )
561 745 return None, None, None
562   - shared_context = _build_shared_context(products)
  746 + shared_context = _build_shared_context(products, schema)
563 747 language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang)
564 748 user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip()
565 749 assistant_prefix = _build_assistant_prefix(markdown_table_headers)
... ... @@ -592,6 +776,7 @@ def call_llm(
592 776 user_prompt: str,
593 777 assistant_prefix: str,
594 778 target_lang: str = "zh",
  779 + analysis_kind: str = "content",
595 780 ) -> Tuple[str, str]:
596 781 """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。"""
597 782 headers = {
... ... @@ -631,8 +816,9 @@ def call_llm(
631 816 if _mark_shared_context_logged_once(shared_context_key):
632 817 logger.info(f"\n{'=' * 80}")
633 818 logger.info(
634   - "LLM Shared Context [model=%s, shared_key=%s, chars=%s] (logged once per process key)",
  819 + "LLM Shared Context [model=%s, kind=%s, shared_key=%s, chars=%s] (logged once per process key)",
635 820 MODEL_NAME,
  821 + analysis_kind,
636 822 shared_context_key,
637 823 len(shared_context),
638 824 )
... ... @@ -641,8 +827,9 @@ def call_llm(
641 827  
642 828 verbose_logger.info(f"\n{'=' * 80}")
643 829 verbose_logger.info(
644   - "LLM Request [model=%s, lang=%s, shared_key=%s, tail_key=%s]:",
  830 + "LLM Request [model=%s, kind=%s, lang=%s, shared_key=%s, tail_key=%s]:",
645 831 MODEL_NAME,
  832 + analysis_kind,
646 833 target_lang,
647 834 shared_context_key,
648 835 localized_tail_key,
... ... @@ -654,7 +841,8 @@ def call_llm(
654 841 verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}")
655 842  
656 843 logger.info(
657   - "\nLLM Request Variant [lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]",
  844 + "\nLLM Request Variant [kind=%s, lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]",
  845 + analysis_kind,
658 846 target_lang,
659 847 shared_context_key,
660 848 localized_tail_key,
... ... @@ -685,8 +873,9 @@ def call_llm(
685 873 usage = result.get("usage") or {}
686 874  
687 875 verbose_logger.info(
688   - "\nLLM Response [model=%s, lang=%s, shared_key=%s, tail_key=%s]:",
  876 + "\nLLM Response [model=%s, kind=%s, lang=%s, shared_key=%s, tail_key=%s]:",
689 877 MODEL_NAME,
  878 + analysis_kind,
690 879 target_lang,
691 880 shared_context_key,
692 881 localized_tail_key,
... ... @@ -697,7 +886,8 @@ def call_llm(
697 886 full_markdown = _merge_partial_response(assistant_prefix, generated_content)
698 887  
699 888 logger.info(
700   - "\nLLM Response Summary [lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]",
  889 + "\nLLM Response Summary [kind=%s, lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]",
  890 + analysis_kind,
701 891 target_lang,
702 892 shared_context_key,
703 893 localized_tail_key,
... ... @@ -742,8 +932,12 @@ def call_llm(
742 932 session.close()
743 933  
744 934  
745   -def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:
  935 +def parse_markdown_table(
  936 + markdown_content: str,
  937 + analysis_kind: str = "content",
  938 +) -> List[Dict[str, str]]:
746 939 """解析markdown表格内容"""
  940 + schema = _get_analysis_schema(analysis_kind)
747 941 lines = markdown_content.strip().split("\n")
748 942 data = []
749 943 data_started = False
... ... @@ -768,22 +962,15 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:
768 962  
769 963 # 解析数据行
770 964 parts = [p.strip() for p in line.split("|")]
771   - parts = [p for p in parts if p] # 移除空字符串
  965 + if parts and parts[0] == "":
  966 + parts = parts[1:]
  967 + if parts and parts[-1] == "":
  968 + parts = parts[:-1]
772 969  
773 970 if len(parts) >= 2:
774   - row = {
775   - "seq_no": parts[0],
776   - "title": parts[1], # 商品标题(按目标语言)
777   - "category_path": parts[2] if len(parts) > 2 else "", # 品类路径
778   - "tags": parts[3] if len(parts) > 3 else "", # 细分标签
779   - "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群
780   - "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景
781   - "season": parts[6] if len(parts) > 6 else "", # 适用季节
782   - "key_attributes": parts[7] if len(parts) > 7 else "", # 关键属性
783   - "material": parts[8] if len(parts) > 8 else "", # 材质说明
784   - "features": parts[9] if len(parts) > 9 else "", # 功能特点
785   - "anchor_text": parts[10] if len(parts) > 10 else "", # 锚文本
786   - }
  971 + row = {"seq_no": parts[0]}
  972 + for field_index, field_name in enumerate(schema.result_fields, start=1):
  973 + row[field_name] = parts[field_index] if len(parts) > field_index else ""
787 974 data.append(row)
788 975  
789 976 return data
... ... @@ -794,31 +981,45 @@ def _log_parsed_result_quality(
794 981 parsed_results: List[Dict[str, str]],
795 982 target_lang: str,
796 983 batch_num: int,
  984 + analysis_kind: str,
797 985 ) -> None:
  986 + schema = _get_analysis_schema(analysis_kind)
798 987 expected = len(batch_data)
799 988 actual = len(parsed_results)
800 989 if actual != expected:
801 990 logger.warning(
802   - "Parsed row count mismatch for batch=%s lang=%s: expected=%s actual=%s",
  991 + "Parsed row count mismatch for kind=%s batch=%s lang=%s: expected=%s actual=%s",
  992 + analysis_kind,
803 993 batch_num,
804 994 target_lang,
805 995 expected,
806 996 actual,
807 997 )
808 998  
809   - missing_anchor = sum(1 for item in parsed_results if not str(item.get("anchor_text") or "").strip())
810   - missing_category = sum(1 for item in parsed_results if not str(item.get("category_path") or "").strip())
811   - missing_title = sum(1 for item in parsed_results if not str(item.get("title") or "").strip())
  999 + if not schema.quality_fields:
  1000 + logger.info(
  1001 + "Parsed Quality Summary [kind=%s, batch=%s, lang=%s]: rows=%s/%s",
  1002 + analysis_kind,
  1003 + batch_num,
  1004 + target_lang,
  1005 + actual,
  1006 + expected,
  1007 + )
  1008 + return
812 1009  
  1010 + missing_summary = ", ".join(
  1011 + f"missing_{field}="
  1012 + f"{sum(1 for item in parsed_results if not str(item.get(field) or '').strip())}"
  1013 + for field in schema.quality_fields
  1014 + )
813 1015 logger.info(
814   - "Parsed Quality Summary [batch=%s, lang=%s]: rows=%s/%s, missing_title=%s, missing_category=%s, missing_anchor=%s",
  1016 + "Parsed Quality Summary [kind=%s, batch=%s, lang=%s]: rows=%s/%s, %s",
  1017 + analysis_kind,
815 1018 batch_num,
816 1019 target_lang,
817 1020 actual,
818 1021 expected,
819   - missing_title,
820   - missing_category,
821   - missing_anchor,
  1022 + missing_summary,
822 1023 )
823 1024  
824 1025  
... ... @@ -826,29 +1027,39 @@ def process_batch(
826 1027 batch_data: List[Dict[str, str]],
827 1028 batch_num: int,
828 1029 target_lang: str = "zh",
  1030 + analysis_kind: str = "content",
829 1031 ) -> List[Dict[str, Any]]:
830 1032 """处理一个批次的数据"""
  1033 + schema = _get_analysis_schema(analysis_kind)
831 1034 logger.info(f"\n{'#' * 80}")
832   - logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)")
  1035 + logger.info(
  1036 + "Processing Batch %s (%s items, kind=%s)",
  1037 + batch_num,
  1038 + len(batch_data),
  1039 + analysis_kind,
  1040 + )
833 1041  
834 1042 # 创建提示词
835 1043 shared_context, user_prompt, assistant_prefix = create_prompt(
836 1044 batch_data,
837 1045 target_lang=target_lang,
  1046 + analysis_kind=analysis_kind,
838 1047 )
839 1048  
840 1049 # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM
841 1050 if shared_context is None or user_prompt is None or assistant_prefix is None:
842 1051 logger.error(
843   - "Failed to create prompt for batch %s, target_lang=%s; "
  1052 + "Failed to create prompt for batch %s, kind=%s, target_lang=%s; "
844 1053 "marking entire batch as failed without calling LLM",
845 1054 batch_num,
  1055 + analysis_kind,
846 1056 target_lang,
847 1057 )
848 1058 return [
849 1059 _make_empty_analysis_result(
850 1060 item,
851 1061 target_lang,
  1062 + schema,
852 1063 error=f"prompt_creation_failed: unsupported target_lang={target_lang}",
853 1064 )
854 1065 for item in batch_data
... ... @@ -861,11 +1072,18 @@ def process_batch(
861 1072 user_prompt,
862 1073 assistant_prefix,
863 1074 target_lang=target_lang,
  1075 + analysis_kind=analysis_kind,
864 1076 )
865 1077  
866 1078 # 解析结果
867   - parsed_results = parse_markdown_table(raw_response)
868   - _log_parsed_result_quality(batch_data, parsed_results, target_lang, batch_num)
  1079 + parsed_results = parse_markdown_table(raw_response, analysis_kind=analysis_kind)
  1080 + _log_parsed_result_quality(
  1081 + batch_data,
  1082 + parsed_results,
  1083 + target_lang,
  1084 + batch_num,
  1085 + analysis_kind,
  1086 + )
869 1087  
870 1088 logger.info(f"\nParsed Results ({len(parsed_results)} items):")
871 1089 logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2))
... ... @@ -879,10 +1097,12 @@ def process_batch(
879 1097 parsed_item,
880 1098 product=source_product,
881 1099 target_lang=target_lang,
  1100 + schema=schema,
882 1101 )
883 1102 results_with_ids.append(result)
884 1103 logger.info(
885   - "Mapped: seq=%s -> original_id=%s",
  1104 + "Mapped: kind=%s seq=%s -> original_id=%s",
  1105 + analysis_kind,
886 1106 parsed_item.get("seq_no"),
887 1107 source_product.get("id"),
888 1108 )
... ... @@ -890,6 +1110,7 @@ def process_batch(
890 1110 # 保存批次 JSON 日志到独立文件
891 1111 batch_log = {
892 1112 "batch_num": batch_num,
  1113 + "analysis_kind": analysis_kind,
893 1114 "timestamp": datetime.now().isoformat(),
894 1115 "input_products": batch_data,
895 1116 "raw_response": raw_response,
... ... @@ -900,7 +1121,10 @@ def process_batch(
900 1121  
901 1122 # 并发写 batch json 日志时,保证文件名唯一避免覆盖
902 1123 batch_call_id = uuid.uuid4().hex[:12]
903   - batch_log_file = LOG_DIR / f"batch_{batch_num:04d}_{timestamp}_{batch_call_id}.json"
  1124 + batch_log_file = (
  1125 + LOG_DIR
  1126 + / f"batch_{analysis_kind}_{batch_num:04d}_{timestamp}_{batch_call_id}.json"
  1127 + )
904 1128 with open(batch_log_file, "w", encoding="utf-8") as f:
905 1129 json.dump(batch_log, f, ensure_ascii=False, indent=2)
906 1130  
... ... @@ -912,7 +1136,7 @@ def process_batch(
912 1136 logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True)
913 1137 # 返回空结果,保持ID映射
914 1138 return [
915   - _make_empty_analysis_result(item, target_lang, error=str(e))
  1139 + _make_empty_analysis_result(item, target_lang, schema, error=str(e))
916 1140 for item in batch_data
917 1141 ]
918 1142  
... ... @@ -922,6 +1146,7 @@ def analyze_products(
922 1146 target_lang: str = "zh",
923 1147 batch_size: Optional[int] = None,
924 1148 tenant_id: Optional[str] = None,
  1149 + analysis_kind: str = "content",
925 1150 ) -> List[Dict[str, Any]]:
926 1151 """
927 1152 库调用入口:根据输入+语言,返回锚文本及各维度信息。
... ... @@ -937,6 +1162,7 @@ def analyze_products(
937 1162 if not products:
938 1163 return []
939 1164  
  1165 + _get_analysis_schema(analysis_kind)
940 1166 results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products)
941 1167 uncached_items: List[Tuple[int, Dict[str, str]]] = []
942 1168  
... ... @@ -946,11 +1172,11 @@ def analyze_products(
946 1172 uncached_items.append((idx, product))
947 1173 continue
948 1174  
949   - cached = _get_cached_anchor_result(product, target_lang)
  1175 + cached = _get_cached_analysis_result(product, target_lang, analysis_kind)
950 1176 if cached:
951 1177 logger.info(
952 1178 f"[analyze_products] Cache hit for title='{title[:50]}...', "
953   - f"lang={target_lang}"
  1179 + f"kind={analysis_kind}, lang={target_lang}"
954 1180 )
955 1181 results_by_index[idx] = cached
956 1182 continue
... ... @@ -979,9 +1205,14 @@ def analyze_products(
979 1205 for batch_num, batch_slice, batch in batch_jobs:
980 1206 logger.info(
981 1207 f"[analyze_products] Processing batch {batch_num}/{total_batches}, "
982   - f"size={len(batch)}, target_lang={target_lang}"
  1208 + f"size={len(batch)}, kind={analysis_kind}, target_lang={target_lang}"
  1209 + )
  1210 + batch_results = process_batch(
  1211 + batch,
  1212 + batch_num=batch_num,
  1213 + target_lang=target_lang,
  1214 + analysis_kind=analysis_kind,
983 1215 )
984   - batch_results = process_batch(batch, batch_num=batch_num, target_lang=target_lang)
985 1216  
986 1217 for (original_idx, product), item in zip(batch_slice, batch_results):
987 1218 results_by_index[original_idx] = item
... ... @@ -992,7 +1223,7 @@ def analyze_products(
992 1223 # 不缓存错误结果,避免放大临时故障
993 1224 continue
994 1225 try:
995   - _set_cached_anchor_result(product, target_lang, item)
  1226 + _set_cached_analysis_result(product, target_lang, item, analysis_kind)
996 1227 except Exception:
997 1228 # 已在内部记录 warning
998 1229 pass
... ... @@ -1000,10 +1231,11 @@ def analyze_products(
1000 1231 max_workers = min(CONTENT_UNDERSTANDING_MAX_WORKERS, len(batch_jobs))
1001 1232 logger.info(
1002 1233 "[analyze_products] Using ThreadPoolExecutor for uncached batches: "
1003   - "max_workers=%s, total_batches=%s, bs=%s, target_lang=%s",
  1234 + "max_workers=%s, total_batches=%s, bs=%s, kind=%s, target_lang=%s",
1004 1235 max_workers,
1005 1236 total_batches,
1006 1237 bs,
  1238 + analysis_kind,
1007 1239 target_lang,
1008 1240 )
1009 1241  
... ... @@ -1013,7 +1245,11 @@ def analyze_products(
1013 1245 future_by_batch_num: Dict[int, Any] = {}
1014 1246 for batch_num, _batch_slice, batch in batch_jobs:
1015 1247 future_by_batch_num[batch_num] = executor.submit(
1016   - process_batch, batch, batch_num=batch_num, target_lang=target_lang
  1248 + process_batch,
  1249 + batch,
  1250 + batch_num=batch_num,
  1251 + target_lang=target_lang,
  1252 + analysis_kind=analysis_kind,
1017 1253 )
1018 1254  
1019 1255 # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的)
... ... @@ -1028,7 +1264,7 @@ def analyze_products(
1028 1264 # 不缓存错误结果,避免放大临时故障
1029 1265 continue
1030 1266 try:
1031   - _set_cached_anchor_result(product, target_lang, item)
  1267 + _set_cached_analysis_result(product, target_lang, item, analysis_kind)
1032 1268 except Exception:
1033 1269 # 已在内部记录 warning
1034 1270 pass
... ...
indexer/product_enrich_prompts.py
... ... @@ -33,6 +33,110 @@ Input product list:
33 33 USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation.
34 34 Language: {language}"""
35 35  
  36 +TAXONOMY_SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product text and fill the columns below using an apparel attribute taxonomy.
  37 +
  38 +Output columns:
  39 +1. Product Type: concise ecommerce apparel category label, not a full marketing title
  40 +2. Target Gender: intended gender only if clearly implied
  41 +3. Age Group: only if clearly implied, e.g. adults, kids, teens, toddlers, babies
  42 +4. Season: season(s) or all-season suitability only if supported
  43 +5. Fit: body closeness, e.g. slim, regular, relaxed, oversized, fitted
  44 +6. Silhouette: overall garment shape, e.g. straight, A-line, boxy, tapered, bodycon, wide-leg
  45 +7. Neckline: neckline type when applicable, e.g. crew neck, V-neck, hooded, collared, square neck
  46 +8. Sleeve Length Type: sleeve length only, e.g. sleeveless, short sleeve, long sleeve, three-quarter sleeve
  47 +9. Sleeve Style: sleeve design only, e.g. puff sleeve, raglan sleeve, batwing sleeve, bell sleeve
  48 +10. Strap Type: strap design when applicable, e.g. spaghetti strap, wide strap, halter strap, adjustable strap
  49 +11. Rise / Waistline: waist placement when applicable, e.g. high rise, mid rise, low rise, empire waist
  50 +12. Leg Shape: for bottoms only, e.g. straight leg, wide leg, flare leg, tapered leg, skinny leg
  51 +13. Skirt Shape: for skirts only, e.g. A-line, pleated, pencil, mermaid
  52 +14. Length Type: design length only, not size, e.g. cropped, regular, longline, mini, midi, maxi, ankle length, full length
  53 +15. Closure Type: fastening method when applicable, e.g. zipper, button, drawstring, elastic waist, hook-and-loop
  54 +16. Design Details: construction or visual details, e.g. ruched, ruffled, pleated, cut-out, layered, distressed, split hem
  55 +17. Fabric: fabric type only, e.g. denim, knit, chiffon, jersey, fleece, cotton twill
  56 +18. Material Composition: fiber content or blend only if stated, e.g. cotton, polyester, spandex, linen blend, 95% cotton 5% elastane
  57 +19. Fabric Properties: inherent fabric traits, e.g. stretch, breathable, lightweight, soft-touch, water-resistant
  58 +20. Clothing Features: product features, e.g. lined, reversible, hooded, packable, padded, pocketed
  59 +21. Functional Benefits: wearer benefits, e.g. moisture-wicking, thermal insulation, UV protection, easy care, supportive compression
  60 +22. Color: specific color name when available
  61 +23. Color Family: normalized broad retail color group, e.g. black, white, blue, green, red, pink, beige, brown, gray
  62 +24. Print / Pattern: surface pattern when applicable, e.g. solid, striped, plaid, floral, graphic, animal print
  63 +25. Occasion / End Use: likely use occasion only if supported, e.g. office, casual wear, streetwear, lounge, workout, outdoor
  64 +26. Style Aesthetic: overall style only if supported, e.g. minimalist, streetwear, athleisure, smart casual, romantic, playful
  65 +
  66 +Rules:
  67 +- Keep the same row order and row count as input.
  68 +- Infer only from the provided product text.
  69 +- Leave blank if not applicable or not reasonably supported.
  70 +- Use concise, standardized ecommerce wording.
  71 +- Do not combine different attribute dimensions in one field.
  72 +- If multiple values are needed, use the delimiter required by the localization setting.
  73 +
  74 +Input product list:
  75 +"""
  76 +
  77 +TAXONOMY_MARKDOWN_TABLE_HEADERS_EN = [
  78 + "No.",
  79 + "Product Type",
  80 + "Target Gender",
  81 + "Age Group",
  82 + "Season",
  83 + "Fit",
  84 + "Silhouette",
  85 + "Neckline",
  86 + "Sleeve Length Type",
  87 + "Sleeve Style",
  88 + "Strap Type",
  89 + "Rise / Waistline",
  90 + "Leg Shape",
  91 + "Skirt Shape",
  92 + "Length Type",
  93 + "Closure Type",
  94 + "Design Details",
  95 + "Fabric",
  96 + "Material Composition",
  97 + "Fabric Properties",
  98 + "Clothing Features",
  99 + "Functional Benefits",
  100 + "Color",
  101 + "Color Family",
  102 + "Print / Pattern",
  103 + "Occasion / End Use",
  104 + "Style Aesthetic",
  105 +]
  106 +
  107 +TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Any] = {
  108 + "en": TAXONOMY_MARKDOWN_TABLE_HEADERS_EN,
  109 + "zh": [
  110 + "序号",
  111 + "品类",
  112 + "目标性别",
  113 + "年龄段",
  114 + "适用季节",
  115 + "版型",
  116 + "廓形",
  117 + "领型",
  118 + "袖长类型",
  119 + "袖型",
  120 + "肩带设计",
  121 + "腰型",
  122 + "裤型",
  123 + "裙型",
  124 + "长度类型",
  125 + "闭合方式",
  126 + "设计细节",
  127 + "面料",
  128 + "成分",
  129 + "面料特性",
  130 + "服装特征",
  131 + "功能",
  132 + "主颜色",
  133 + "色系",
  134 + "印花 / 图案",
  135 + "适用场景",
  136 + "风格",
  137 + ],
  138 +}
  139 +
36 140 LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = {
37 141 "en": [
38 142 "No.",
... ...
tests/test_llm_enrichment_batch_fill.py
... ... @@ -19,10 +19,13 @@ def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch):
19 19 "zh": [f"zh-anchor-{item['id']}"],
20 20 "en": [f"en-anchor-{item['id']}"],
21 21 },
22   - "tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]},
  22 + "enriched_tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]},
23 23 "enriched_attributes": [
24 24 {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}},
25 25 ],
  26 + "enriched_taxonomy_attributes": [
  27 + {"name": "Product Type", "value": {"zh": ["连衣裙"], "en": ["dress"]}},
  28 + ],
26 29 }
27 30 for item in items
28 31 ]
... ... @@ -54,6 +57,10 @@ def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch):
54 57  
55 58 assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"]
56 59 assert docs[0]["qanchors"]["en"] == ["en-anchor-0"]
57   - assert docs[0]["tags"]["zh"] == ["t1", "t2"]
58   - assert docs[0]["tags"]["en"] == ["t1", "t2"]
  60 + assert docs[0]["enriched_tags"]["zh"] == ["t1", "t2"]
  61 + assert docs[0]["enriched_tags"]["en"] == ["t1", "t2"]
59 62 assert {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}} in docs[0]["enriched_attributes"]
  63 + assert {
  64 + "name": "Product Type",
  65 + "value": {"zh": ["连衣裙"], "en": ["dress"]},
  66 + } in docs[0]["enriched_taxonomy_attributes"]
... ...
tests/test_process_products_batching.py
... ... @@ -13,7 +13,13 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch):
13 13 monkeypatch.setattr(process_products, "API_KEY", "fake-key")
14 14 seen_batch_sizes: List[int] = []
15 15  
16   - def _fake_process_batch(batch_data: List[Dict[str, str]], batch_num: int, target_lang: str = "zh"):
  16 + def _fake_process_batch(
  17 + batch_data: List[Dict[str, str]],
  18 + batch_num: int,
  19 + target_lang: str = "zh",
  20 + analysis_kind: str = "content",
  21 + ):
  22 + assert analysis_kind == "content"
17 23 seen_batch_sizes.append(len(batch_data))
18 24 return [
19 25 {
... ... @@ -35,7 +41,7 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch):
35 41 ]
36 42  
37 43 monkeypatch.setattr(process_products, "process_batch", _fake_process_batch)
38   - monkeypatch.setattr(process_products, "_set_cached_anchor_result", lambda *args, **kwargs: None)
  44 + monkeypatch.setattr(process_products, "_set_cached_analysis_result", lambda *args, **kwargs: None)
39 45  
40 46 out = process_products.analyze_products(
41 47 products=_mk_products(45),
... ... @@ -53,7 +59,13 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch):
53 59 monkeypatch.setattr(process_products, "API_KEY", "fake-key")
54 60 seen_batch_sizes: List[int] = []
55 61  
56   - def _fake_process_batch(batch_data: List[Dict[str, str]], batch_num: int, target_lang: str = "zh"):
  62 + def _fake_process_batch(
  63 + batch_data: List[Dict[str, str]],
  64 + batch_num: int,
  65 + target_lang: str = "zh",
  66 + analysis_kind: str = "content",
  67 + ):
  68 + assert analysis_kind == "content"
57 69 seen_batch_sizes.append(len(batch_data))
58 70 return [
59 71 {
... ... @@ -75,7 +87,7 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch):
75 87 ]
76 88  
77 89 monkeypatch.setattr(process_products, "process_batch", _fake_process_batch)
78   - monkeypatch.setattr(process_products, "_set_cached_anchor_result", lambda *args, **kwargs: None)
  90 + monkeypatch.setattr(process_products, "_set_cached_analysis_result", lambda *args, **kwargs: None)
79 91  
80 92 out = process_products.analyze_products(
81 93 products=_mk_products(3),
... ...
tests/test_product_enrich_partial_mode.py
... ... @@ -74,6 +74,28 @@ def test_create_prompt_splits_shared_context_and_localized_tail():
74 74 assert prefix_en.startswith("| No. | Product title | Category path |")
75 75  
76 76  
  77 +def test_create_prompt_supports_taxonomy_analysis_kind():
  78 + products = [{"id": "1", "title": "linen dress"}]
  79 +
  80 + shared_zh, user_zh, prefix_zh = product_enrich.create_prompt(
  81 + products,
  82 + target_lang="zh",
  83 + analysis_kind="taxonomy",
  84 + )
  85 + shared_fr, user_fr, prefix_fr = product_enrich.create_prompt(
  86 + products,
  87 + target_lang="fr",
  88 + analysis_kind="taxonomy",
  89 + )
  90 +
  91 + assert "apparel attribute taxonomy" in shared_zh
  92 + assert "1. linen dress" in shared_zh
  93 + assert "Language: Chinese" in user_zh
  94 + assert "Language: French" in user_fr
  95 + assert prefix_zh.startswith("| 序号 | 品类 | 目标性别 |")
  96 + assert prefix_fr.startswith("| No. | Product Type | Target Gender |")
  97 +
  98 +
77 99 def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests():
78 100 payloads = []
79 101 response_bodies = [
... ... @@ -228,6 +250,38 @@ def test_process_batch_reads_result_and_validates_expected_fields():
228 250 assert row["anchor_text"] == "法式收腰连衣裙"
229 251  
230 252  
  253 +def test_process_batch_reads_taxonomy_result_with_schema_specific_fields():
  254 + merged_markdown = """| 序号 | 品类 | 目标性别 | 年龄段 | 适用季节 | 版型 | 廓形 | 领型 | 袖长类型 | 袖型 | 肩带设计 | 腰型 | 裤型 | 裙型 | 长度类型 | 闭合方式 | 设计细节 | 面料 | 成分 | 面料特性 | 服装特征 | 功能 | 主颜色 | 色系 | 印花 / 图案 | 适用场景 | 风格 |
  255 +|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|
  256 +| 1 | 连衣裙 | 女 | 成人 | 春季,夏季 | 修身 | A字 | V领 | 无袖 | | 细肩带 | 高腰 | | A字裙 | 中长款 | 拉链 | 褶皱 | 梭织 | 聚酯纤维,氨纶 | 轻薄,透气 | 有内衬 | 易打理 | 酒红色 | 红色 | 纯色 | 约会,度假 | 浪漫 |
  257 +"""
  258 +
  259 + with mock.patch.object(
  260 + product_enrich,
  261 + "call_llm",
  262 + return_value=(merged_markdown, json.dumps({"choices": [{"message": {"content": "stub"}}]})),
  263 + ):
  264 + results = product_enrich.process_batch(
  265 + [{"id": "sku-1", "title": "dress"}],
  266 + batch_num=1,
  267 + target_lang="zh",
  268 + analysis_kind="taxonomy",
  269 + )
  270 +
  271 + assert len(results) == 1
  272 + row = results[0]
  273 + assert row["id"] == "sku-1"
  274 + assert row["lang"] == "zh"
  275 + assert row["title_input"] == "dress"
  276 + assert row["product_type"] == "连衣裙"
  277 + assert row["target_gender"] == "女"
  278 + assert row["age_group"] == "成人"
  279 + assert row["sleeve_length_type"] == "无袖"
  280 + assert row["material_composition"] == "聚酯纤维,氨纶"
  281 + assert row["occasion_end_use"] == "约会,度假"
  282 + assert row["style_aesthetic"] == "浪漫"
  283 +
  284 +
231 285 def test_analyze_products_uses_product_level_cache_across_batch_requests():
232 286 cache_store = {}
233 287 process_calls = []
... ... @@ -241,13 +295,16 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests():
241 295 product.get("image_url", ""),
242 296 )
243 297  
244   - def fake_get_cached_anchor_result(product, target_lang):
  298 + def fake_get_cached_analysis_result(product, target_lang, analysis_kind="content"):
  299 + assert analysis_kind == "content"
245 300 return cache_store.get(_cache_key(product, target_lang))
246 301  
247   - def fake_set_cached_anchor_result(product, target_lang, result):
  302 + def fake_set_cached_analysis_result(product, target_lang, result, analysis_kind="content"):
  303 + assert analysis_kind == "content"
248 304 cache_store[_cache_key(product, target_lang)] = result
249 305  
250   - def fake_process_batch(batch_data, batch_num, target_lang="zh"):
  306 + def fake_process_batch(batch_data, batch_num, target_lang="zh", analysis_kind="content"):
  307 + assert analysis_kind == "content"
251 308 process_calls.append(
252 309 {
253 310 "batch_num": batch_num,
... ... @@ -281,12 +338,12 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests():
281 338  
282 339 with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
283 340 product_enrich,
284   - "_get_cached_anchor_result",
285   - side_effect=fake_get_cached_anchor_result,
  341 + "_get_cached_analysis_result",
  342 + side_effect=fake_get_cached_analysis_result,
286 343 ), mock.patch.object(
287 344 product_enrich,
288   - "_set_cached_anchor_result",
289   - side_effect=fake_set_cached_anchor_result,
  345 + "_set_cached_analysis_result",
  346 + side_effect=fake_set_cached_analysis_result,
290 347 ), mock.patch.object(
291 348 product_enrich,
292 349 "process_batch",
... ... @@ -342,11 +399,12 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity():
342 399  
343 400 with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
344 401 product_enrich,
345   - "_get_cached_anchor_result",
346   - wraps=lambda product, target_lang: product_enrich._normalize_analysis_result(
  402 + "_get_cached_analysis_result",
  403 + wraps=lambda product, target_lang, analysis_kind="content": product_enrich._normalize_analysis_result(
347 404 cached_result,
348 405 product=product,
349 406 target_lang=target_lang,
  407 + schema=product_enrich._get_analysis_schema("content"),
350 408 ),
351 409 ), mock.patch.object(
352 410 product_enrich,
... ... @@ -379,7 +437,47 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity():
379 437  
380 438  
381 439 def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output():
382   - def fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None):
  440 + def fake_analyze_products(
  441 + products,
  442 + target_lang="zh",
  443 + batch_size=None,
  444 + tenant_id=None,
  445 + analysis_kind="content",
  446 + ):
  447 + if analysis_kind == "taxonomy":
  448 + return [
  449 + {
  450 + "id": products[0]["id"],
  451 + "lang": target_lang,
  452 + "title_input": products[0]["title"],
  453 + "product_type": f"{target_lang}-dress",
  454 + "target_gender": f"{target_lang}-women",
  455 + "age_group": "",
  456 + "season": f"{target_lang}-summer",
  457 + "fit": "",
  458 + "silhouette": "",
  459 + "neckline": "",
  460 + "sleeve_length_type": "",
  461 + "sleeve_style": "",
  462 + "strap_type": "",
  463 + "rise_waistline": "",
  464 + "leg_shape": "",
  465 + "skirt_shape": "",
  466 + "length_type": "",
  467 + "closure_type": "",
  468 + "design_details": "",
  469 + "fabric": "",
  470 + "material_composition": "",
  471 + "fabric_properties": "",
  472 + "clothing_features": "",
  473 + "functional_benefits": "",
  474 + "color": "",
  475 + "color_family": "",
  476 + "print_pattern": "",
  477 + "occasion_end_use": "",
  478 + "style_aesthetic": "",
  479 + }
  480 + ]
383 481 return [
384 482 {
385 483 "id": products[0]["id"],
... ... @@ -423,6 +521,20 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output()
423 521 },
424 522 {"name": "target_audience", "value": {"zh": ["zh-audience"], "en": ["en-audience"]}},
425 523 ],
  524 + "enriched_taxonomy_attributes": [
  525 + {
  526 + "name": "Product Type",
  527 + "value": {"zh": ["zh-dress"], "en": ["en-dress"]},
  528 + },
  529 + {
  530 + "name": "Target Gender",
  531 + "value": {"zh": ["zh-women"], "en": ["en-women"]},
  532 + },
  533 + {
  534 + "name": "Season",
  535 + "value": {"zh": ["zh-summer"], "en": ["en-summer"]},
  536 + },
  537 + ],
426 538 }
427 539 ]
428 540  
... ...