Commit 36516857d07bd540ffdc097d5f5349da1048c184
1 parent
78cdef1c
feat(product_enrich): 为产品富化模块增加 enriched_taxonomy_attributes
字段生成
- 新增分类法属性富化能力,遵循 enriched_attributes
相同的字段结构和处理逻辑,仅提示词和解析维度不同
- 引入 AnalysisSchema
抽象类,使内容富化(content)与分类法富化(taxonomy)共享批处理、缓存、提示词构建、Markdown
解析及归一化流程
- 重构 product_enrich.py 中原有的富化管道,将通用逻辑抽取至
_process_batch_for_schema、_parse_markdown_to_attributes
等函数,消除代码重复
- 在 product_enrich_prompts.py 中添加分类法提示词模板(TAXONOMY_SHARED_ANALYSIS_INSTRUCTION)及 Markdown 表头定义(TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS、TAXONOMY_MARKDOWN_TABLE_HEADERS_EN)
- 修复 Markdown
解析器在空单元格时的行为:原实现会跳过空单元格导致列错位,现改为保留空值,确保稀疏的分类法属性列正确对齐
- 更新 document_transformer.py 中 SPUDocumentTransformer 的富化写入逻辑,并在 product_enrich.py 的 build_index_content_fields 中生成对应结果,将 enriched_taxonomy_attributes(中/英)写入最终索引文档
- 调整相关单元测试(test_product_enrich_partial_mode.py
等)以覆盖新字段路径,测试通过(14 passed)
技术细节:
- AnalysisSchema 包含 name、shared_instruction、markdown_table_headers、result_fields、meaningful_fields、field_aliases、fallback_headers、quality_fields 等元数据
- 缓存键区分内容/分类法:`{ANCHOR_CACHE_PREFIX}:{analysis_kind}:{target_lang}:{输入文本哈希}`,避免两类分析结果互相污染缓存
- 分类法解析使用与 enriched_attributes
相同的嵌套结构:`{"attribute_key": "value"}`,支持多行表格
- 批处理大小与重试逻辑保持与原有内容富化一致
Showing 6 changed files with 574 additions and 100 deletions — Show diff stats
indexer/document_transformer.py
| @@ -242,6 +242,7 @@ class SPUDocumentTransformer: | @@ -242,6 +242,7 @@ class SPUDocumentTransformer: | ||
| 242 | - qanchors.{lang} | 242 | - qanchors.{lang} |
| 243 | - enriched_tags.{lang} | 243 | - enriched_tags.{lang} |
| 244 | - enriched_attributes[].value.{lang} | 244 | - enriched_attributes[].value.{lang} |
| 245 | + - enriched_taxonomy_attributes[].value.{lang} | ||
| 245 | 246 | ||
| 246 | 设计目标: | 247 | 设计目标: |
| 247 | - 尽可能攒批调用 LLM; | 248 | - 尽可能攒批调用 LLM; |
| @@ -296,6 +297,8 @@ class SPUDocumentTransformer: | @@ -296,6 +297,8 @@ class SPUDocumentTransformer: | ||
| 296 | doc["enriched_tags"] = enrichment["enriched_tags"] | 297 | doc["enriched_tags"] = enrichment["enriched_tags"] |
| 297 | if enrichment.get("enriched_attributes"): | 298 | if enrichment.get("enriched_attributes"): |
| 298 | doc["enriched_attributes"] = enrichment["enriched_attributes"] | 299 | doc["enriched_attributes"] = enrichment["enriched_attributes"] |
| 300 | + if enrichment.get("enriched_taxonomy_attributes"): | ||
| 301 | + doc["enriched_taxonomy_attributes"] = enrichment["enriched_taxonomy_attributes"] | ||
| 299 | except Exception as e: | 302 | except Exception as e: |
| 300 | logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e) | 303 | logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e) |
| 301 | 304 |
indexer/product_enrich.py
| @@ -14,6 +14,7 @@ import time | @@ -14,6 +14,7 @@ import time | ||
| 14 | import hashlib | 14 | import hashlib |
| 15 | import uuid | 15 | import uuid |
| 16 | import threading | 16 | import threading |
| 17 | +from dataclasses import dataclass, field | ||
| 17 | from collections import OrderedDict | 18 | from collections import OrderedDict |
| 18 | from datetime import datetime | 19 | from datetime import datetime |
| 19 | from concurrent.futures import ThreadPoolExecutor | 20 | from concurrent.futures import ThreadPoolExecutor |
| @@ -30,6 +31,9 @@ from indexer.product_enrich_prompts import ( | @@ -30,6 +31,9 @@ from indexer.product_enrich_prompts import ( | ||
| 30 | USER_INSTRUCTION_TEMPLATE, | 31 | USER_INSTRUCTION_TEMPLATE, |
| 31 | LANGUAGE_MARKDOWN_TABLE_HEADERS, | 32 | LANGUAGE_MARKDOWN_TABLE_HEADERS, |
| 32 | SHARED_ANALYSIS_INSTRUCTION, | 33 | SHARED_ANALYSIS_INSTRUCTION, |
| 34 | + TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS, | ||
| 35 | + TAXONOMY_MARKDOWN_TABLE_HEADERS_EN, | ||
| 36 | + TAXONOMY_SHARED_ANALYSIS_INSTRUCTION, | ||
| 33 | ) | 37 | ) |
| 34 | 38 | ||
| 35 | # 配置 | 39 | # 配置 |
| @@ -147,7 +151,7 @@ if _missing_prompt_langs: | @@ -147,7 +151,7 @@ if _missing_prompt_langs: | ||
| 147 | # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 | 151 | # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 |
| 148 | _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") | 152 | _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") |
| 149 | _CORE_INDEX_LANGUAGES = ("zh", "en") | 153 | _CORE_INDEX_LANGUAGES = ("zh", "en") |
| 150 | -_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( | 154 | +_CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( |
| 151 | ("tags", "enriched_tags"), | 155 | ("tags", "enriched_tags"), |
| 152 | ("target_audience", "target_audience"), | 156 | ("target_audience", "target_audience"), |
| 153 | ("usage_scene", "usage_scene"), | 157 | ("usage_scene", "usage_scene"), |
| @@ -156,7 +160,7 @@ _ANALYSIS_ATTRIBUTE_FIELD_MAP = ( | @@ -156,7 +160,7 @@ _ANALYSIS_ATTRIBUTE_FIELD_MAP = ( | ||
| 156 | ("material", "material"), | 160 | ("material", "material"), |
| 157 | ("features", "features"), | 161 | ("features", "features"), |
| 158 | ) | 162 | ) |
| 159 | -_ANALYSIS_RESULT_FIELDS = ( | 163 | +_CONTENT_ANALYSIS_RESULT_FIELDS = ( |
| 160 | "title", | 164 | "title", |
| 161 | "category_path", | 165 | "category_path", |
| 162 | "tags", | 166 | "tags", |
| @@ -168,7 +172,7 @@ _ANALYSIS_RESULT_FIELDS = ( | @@ -168,7 +172,7 @@ _ANALYSIS_RESULT_FIELDS = ( | ||
| 168 | "features", | 172 | "features", |
| 169 | "anchor_text", | 173 | "anchor_text", |
| 170 | ) | 174 | ) |
| 171 | -_ANALYSIS_MEANINGFUL_FIELDS = ( | 175 | +_CONTENT_ANALYSIS_MEANINGFUL_FIELDS = ( |
| 172 | "tags", | 176 | "tags", |
| 173 | "target_audience", | 177 | "target_audience", |
| 174 | "usage_scene", | 178 | "usage_scene", |
| @@ -178,9 +182,89 @@ _ANALYSIS_MEANINGFUL_FIELDS = ( | @@ -178,9 +182,89 @@ _ANALYSIS_MEANINGFUL_FIELDS = ( | ||
| 178 | "features", | 182 | "features", |
| 179 | "anchor_text", | 183 | "anchor_text", |
| 180 | ) | 184 | ) |
| 181 | -_ANALYSIS_FIELD_ALIASES = { | 185 | +_CONTENT_ANALYSIS_FIELD_ALIASES = { |
| 182 | "tags": ("tags", "enriched_tags"), | 186 | "tags": ("tags", "enriched_tags"), |
| 183 | } | 187 | } |
| 188 | +_CONTENT_ANALYSIS_QUALITY_FIELDS = ("title", "category_path", "anchor_text") | ||
| 189 | +_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( | ||
| 190 | + ("product_type", "Product Type"), | ||
| 191 | + ("target_gender", "Target Gender"), | ||
| 192 | + ("age_group", "Age Group"), | ||
| 193 | + ("season", "Season"), | ||
| 194 | + ("fit", "Fit"), | ||
| 195 | + ("silhouette", "Silhouette"), | ||
| 196 | + ("neckline", "Neckline"), | ||
| 197 | + ("sleeve_length_type", "Sleeve Length Type"), | ||
| 198 | + ("sleeve_style", "Sleeve Style"), | ||
| 199 | + ("strap_type", "Strap Type"), | ||
| 200 | + ("rise_waistline", "Rise / Waistline"), | ||
| 201 | + ("leg_shape", "Leg Shape"), | ||
| 202 | + ("skirt_shape", "Skirt Shape"), | ||
| 203 | + ("length_type", "Length Type"), | ||
| 204 | + ("closure_type", "Closure Type"), | ||
| 205 | + ("design_details", "Design Details"), | ||
| 206 | + ("fabric", "Fabric"), | ||
| 207 | + ("material_composition", "Material Composition"), | ||
| 208 | + ("fabric_properties", "Fabric Properties"), | ||
| 209 | + ("clothing_features", "Clothing Features"), | ||
| 210 | + ("functional_benefits", "Functional Benefits"), | ||
| 211 | + ("color", "Color"), | ||
| 212 | + ("color_family", "Color Family"), | ||
| 213 | + ("print_pattern", "Print / Pattern"), | ||
| 214 | + ("occasion_end_use", "Occasion / End Use"), | ||
| 215 | + ("style_aesthetic", "Style Aesthetic"), | ||
| 216 | +) | ||
| 217 | +_TAXONOMY_ANALYSIS_RESULT_FIELDS = tuple( | ||
| 218 | + field_name for field_name, _ in _TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP | ||
| 219 | +) | ||
| 220 | + | ||
| 221 | + | ||
| 222 | +@dataclass(frozen=True) | ||
| 223 | +class AnalysisSchema: | ||
| 224 | + name: str | ||
| 225 | + shared_instruction: str | ||
| 226 | + markdown_table_headers: Dict[str, List[str]] | ||
| 227 | + result_fields: Tuple[str, ...] | ||
| 228 | + meaningful_fields: Tuple[str, ...] | ||
| 229 | + field_aliases: Dict[str, Tuple[str, ...]] = field(default_factory=dict) | ||
| 230 | + fallback_headers: Optional[List[str]] = None | ||
| 231 | + quality_fields: Tuple[str, ...] = () | ||
| 232 | + | ||
| 233 | + def get_headers(self, target_lang: str) -> Optional[List[str]]: | ||
| 234 | + headers = self.markdown_table_headers.get(target_lang) | ||
| 235 | + if headers: | ||
| 236 | + return headers | ||
| 237 | + if self.fallback_headers: | ||
| 238 | + return self.fallback_headers | ||
| 239 | + return None | ||
| 240 | + | ||
| 241 | + | ||
| 242 | +_ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = { | ||
| 243 | + "content": AnalysisSchema( | ||
| 244 | + name="content", | ||
| 245 | + shared_instruction=SHARED_ANALYSIS_INSTRUCTION, | ||
| 246 | + markdown_table_headers=LANGUAGE_MARKDOWN_TABLE_HEADERS, | ||
| 247 | + result_fields=_CONTENT_ANALYSIS_RESULT_FIELDS, | ||
| 248 | + meaningful_fields=_CONTENT_ANALYSIS_MEANINGFUL_FIELDS, | ||
| 249 | + field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES, | ||
| 250 | + quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS, | ||
| 251 | + ), | ||
| 252 | + "taxonomy": AnalysisSchema( | ||
| 253 | + name="taxonomy", | ||
| 254 | + shared_instruction=TAXONOMY_SHARED_ANALYSIS_INSTRUCTION, | ||
| 255 | + markdown_table_headers=TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS, | ||
| 256 | + result_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS, | ||
| 257 | + meaningful_fields=_TAXONOMY_ANALYSIS_RESULT_FIELDS, | ||
| 258 | + fallback_headers=TAXONOMY_MARKDOWN_TABLE_HEADERS_EN, | ||
| 259 | + ), | ||
| 260 | +} | ||
| 261 | + | ||
| 262 | + | ||
| 263 | +def _get_analysis_schema(analysis_kind: str) -> AnalysisSchema: | ||
| 264 | + schema = _ANALYSIS_SCHEMAS.get(analysis_kind) | ||
| 265 | + if schema is None: | ||
| 266 | + raise ValueError(f"Unsupported analysis_kind: {analysis_kind}") | ||
| 267 | + return schema | ||
| 184 | 268 | ||
| 185 | 269 | ||
| 186 | def split_multi_value_field(text: Optional[str]) -> List[str]: | 270 | def split_multi_value_field(text: Optional[str]) -> List[str]: |
| @@ -235,12 +319,12 @@ def _get_product_id(product: Dict[str, Any]) -> str: | @@ -235,12 +319,12 @@ def _get_product_id(product: Dict[str, Any]) -> str: | ||
| 235 | return str(product.get("id") or product.get("spu_id") or "").strip() | 319 | return str(product.get("id") or product.get("spu_id") or "").strip() |
| 236 | 320 | ||
| 237 | 321 | ||
| 238 | -def _get_analysis_field_aliases(field_name: str) -> Tuple[str, ...]: | ||
| 239 | - return _ANALYSIS_FIELD_ALIASES.get(field_name, (field_name,)) | 322 | +def _get_analysis_field_aliases(field_name: str, schema: AnalysisSchema) -> Tuple[str, ...]: |
| 323 | + return schema.field_aliases.get(field_name, (field_name,)) | ||
| 240 | 324 | ||
| 241 | 325 | ||
| 242 | -def _get_analysis_field_value(row: Dict[str, Any], field_name: str) -> Any: | ||
| 243 | - for alias in _get_analysis_field_aliases(field_name): | 326 | +def _get_analysis_field_value(row: Dict[str, Any], field_name: str, schema: AnalysisSchema) -> Any: |
| 327 | + for alias in _get_analysis_field_aliases(field_name, schema): | ||
| 244 | if alias in row: | 328 | if alias in row: |
| 245 | return row.get(alias) | 329 | return row.get(alias) |
| 246 | return None | 330 | return None |
| @@ -261,6 +345,7 @@ def _has_meaningful_value(value: Any) -> bool: | @@ -261,6 +345,7 @@ def _has_meaningful_value(value: Any) -> bool: | ||
| 261 | def _make_empty_analysis_result( | 345 | def _make_empty_analysis_result( |
| 262 | product: Dict[str, Any], | 346 | product: Dict[str, Any], |
| 263 | target_lang: str, | 347 | target_lang: str, |
| 348 | + schema: AnalysisSchema, | ||
| 264 | error: Optional[str] = None, | 349 | error: Optional[str] = None, |
| 265 | ) -> Dict[str, Any]: | 350 | ) -> Dict[str, Any]: |
| 266 | result = { | 351 | result = { |
| @@ -268,7 +353,7 @@ def _make_empty_analysis_result( | @@ -268,7 +353,7 @@ def _make_empty_analysis_result( | ||
| 268 | "lang": target_lang, | 353 | "lang": target_lang, |
| 269 | "title_input": str(product.get("title") or "").strip(), | 354 | "title_input": str(product.get("title") or "").strip(), |
| 270 | } | 355 | } |
| 271 | - for field in _ANALYSIS_RESULT_FIELDS: | 356 | + for field in schema.result_fields: |
| 272 | result[field] = "" | 357 | result[field] = "" |
| 273 | if error: | 358 | if error: |
| 274 | result["error"] = error | 359 | result["error"] = error |
| @@ -279,42 +364,59 @@ def _normalize_analysis_result( | @@ -279,42 +364,59 @@ def _normalize_analysis_result( | ||
| 279 | result: Dict[str, Any], | 364 | result: Dict[str, Any], |
| 280 | product: Dict[str, Any], | 365 | product: Dict[str, Any], |
| 281 | target_lang: str, | 366 | target_lang: str, |
| 367 | + schema: AnalysisSchema, | ||
| 282 | ) -> Dict[str, Any]: | 368 | ) -> Dict[str, Any]: |
| 283 | - normalized = _make_empty_analysis_result(product, target_lang) | 369 | + normalized = _make_empty_analysis_result(product, target_lang, schema) |
| 284 | if not isinstance(result, dict): | 370 | if not isinstance(result, dict): |
| 285 | return normalized | 371 | return normalized |
| 286 | 372 | ||
| 287 | normalized["lang"] = str(result.get("lang") or target_lang).strip() or target_lang | 373 | normalized["lang"] = str(result.get("lang") or target_lang).strip() or target_lang |
| 288 | - normalized["title"] = str(result.get("title") or "").strip() | ||
| 289 | - normalized["category_path"] = str(result.get("category_path") or "").strip() | ||
| 290 | normalized["title_input"] = str( | 374 | normalized["title_input"] = str( |
| 291 | product.get("title") or result.get("title_input") or "" | 375 | product.get("title") or result.get("title_input") or "" |
| 292 | ).strip() | 376 | ).strip() |
| 293 | 377 | ||
| 294 | - for field in _ANALYSIS_RESULT_FIELDS: | ||
| 295 | - if field in {"title", "category_path"}: | ||
| 296 | - continue | ||
| 297 | - normalized[field] = str(_get_analysis_field_value(result, field) or "").strip() | 378 | + for field in schema.result_fields: |
| 379 | + normalized[field] = str(_get_analysis_field_value(result, field, schema) or "").strip() | ||
| 298 | 380 | ||
| 299 | if result.get("error"): | 381 | if result.get("error"): |
| 300 | normalized["error"] = str(result.get("error")) | 382 | normalized["error"] = str(result.get("error")) |
| 301 | return normalized | 383 | return normalized |
| 302 | 384 | ||
| 303 | 385 | ||
| 304 | -def _has_meaningful_analysis_content(result: Dict[str, Any]) -> bool: | ||
| 305 | - return any(_has_meaningful_value(result.get(field)) for field in _ANALYSIS_MEANINGFUL_FIELDS) | 386 | +def _has_meaningful_analysis_content(result: Dict[str, Any], schema: AnalysisSchema) -> bool: |
| 387 | + return any(_has_meaningful_value(result.get(field)) for field in schema.meaningful_fields) | ||
| 388 | + | ||
| 389 | + | ||
| 390 | +def _append_analysis_attributes( | ||
| 391 | + target: List[Dict[str, Any]], | ||
| 392 | + row: Dict[str, Any], | ||
| 393 | + lang: str, | ||
| 394 | + schema: AnalysisSchema, | ||
| 395 | + field_map: Tuple[Tuple[str, str], ...], | ||
| 396 | +) -> None: | ||
| 397 | + for source_name, output_name in field_map: | ||
| 398 | + raw = _get_analysis_field_value(row, source_name, schema) | ||
| 399 | + if not raw: | ||
| 400 | + continue | ||
| 401 | + _append_named_lang_phrase_map( | ||
| 402 | + target, | ||
| 403 | + name=output_name, | ||
| 404 | + lang=lang, | ||
| 405 | + raw_value=raw, | ||
| 406 | + ) | ||
| 306 | 407 | ||
| 307 | 408 | ||
| 308 | def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: | 409 | def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: |
| 309 | if not row or row.get("error"): | 410 | if not row or row.get("error"): |
| 310 | return | 411 | return |
| 311 | 412 | ||
| 312 | - anchor_text = str(_get_analysis_field_value(row, "anchor_text") or "").strip() | 413 | + content_schema = _get_analysis_schema("content") |
| 414 | + anchor_text = str(_get_analysis_field_value(row, "anchor_text", content_schema) or "").strip() | ||
| 313 | if anchor_text: | 415 | if anchor_text: |
| 314 | _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text) | 416 | _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text) |
| 315 | 417 | ||
| 316 | - for source_name, output_name in _ANALYSIS_ATTRIBUTE_FIELD_MAP: | ||
| 317 | - raw = _get_analysis_field_value(row, source_name) | 418 | + for source_name, output_name in _CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP: |
| 419 | + raw = _get_analysis_field_value(row, source_name, content_schema) | ||
| 318 | if not raw: | 420 | if not raw: |
| 319 | continue | 421 | continue |
| 320 | _append_named_lang_phrase_map( | 422 | _append_named_lang_phrase_map( |
| @@ -327,6 +429,19 @@ def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: | @@ -327,6 +429,19 @@ def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: | ||
| 327 | _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) | 429 | _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) |
| 328 | 430 | ||
| 329 | 431 | ||
| 432 | +def _apply_index_taxonomy_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: | ||
| 433 | + if not row or row.get("error"): | ||
| 434 | + return | ||
| 435 | + | ||
| 436 | + _append_analysis_attributes( | ||
| 437 | + result["enriched_taxonomy_attributes"], | ||
| 438 | + row=row, | ||
| 439 | + lang=lang, | ||
| 440 | + schema=_get_analysis_schema("taxonomy"), | ||
| 441 | + field_map=_TAXONOMY_ANALYSIS_ATTRIBUTE_FIELD_MAP, | ||
| 442 | + ) | ||
| 443 | + | ||
| 444 | + | ||
| 330 | def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: | 445 | def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: |
| 331 | item_id = _get_product_id(item) | 446 | item_id = _get_product_id(item) |
| 332 | return { | 447 | return { |
| @@ -355,6 +470,7 @@ def build_index_content_fields( | @@ -355,6 +470,7 @@ def build_index_content_fields( | ||
| 355 | - `qanchors` | 470 | - `qanchors` |
| 356 | - `enriched_tags` | 471 | - `enriched_tags` |
| 357 | - `enriched_attributes` | 472 | - `enriched_attributes` |
| 473 | + - `enriched_taxonomy_attributes` | ||
| 358 | - 可选 `error` | 474 | - 可选 `error` |
| 359 | 475 | ||
| 360 | 其中: | 476 | 其中: |
| @@ -371,6 +487,7 @@ def build_index_content_fields( | @@ -371,6 +487,7 @@ def build_index_content_fields( | ||
| 371 | "qanchors": {}, | 487 | "qanchors": {}, |
| 372 | "enriched_tags": {}, | 488 | "enriched_tags": {}, |
| 373 | "enriched_attributes": [], | 489 | "enriched_attributes": [], |
| 490 | + "enriched_taxonomy_attributes": [], | ||
| 374 | } | 491 | } |
| 375 | for item in normalized_items | 492 | for item in normalized_items |
| 376 | } | 493 | } |
| @@ -398,6 +515,33 @@ def build_index_content_fields( | @@ -398,6 +515,33 @@ def build_index_content_fields( | ||
| 398 | continue | 515 | continue |
| 399 | _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) | 516 | _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) |
| 400 | 517 | ||
| 518 | + try: | ||
| 519 | + taxonomy_rows = analyze_products( | ||
| 520 | + products=normalized_items, | ||
| 521 | + target_lang=lang, | ||
| 522 | + batch_size=BATCH_SIZE, | ||
| 523 | + tenant_id=tenant_id, | ||
| 524 | + analysis_kind="taxonomy", | ||
| 525 | + ) | ||
| 526 | + except Exception as e: | ||
| 527 | + logger.warning( | ||
| 528 | + "build_index_content_fields taxonomy enrichment failed for lang=%s: %s", | ||
| 529 | + lang, | ||
| 530 | + e, | ||
| 531 | + ) | ||
| 532 | + for item in normalized_items: | ||
| 533 | + results_by_id[item["id"]].setdefault("error", str(e)) | ||
| 534 | + continue | ||
| 535 | + | ||
| 536 | + for row in taxonomy_rows or []: | ||
| 537 | + item_id = str(row.get("id") or "").strip() | ||
| 538 | + if not item_id or item_id not in results_by_id: | ||
| 539 | + continue | ||
| 540 | + if row.get("error"): | ||
| 541 | + results_by_id[item_id].setdefault("error", row["error"]) | ||
| 542 | + continue | ||
| 543 | + _apply_index_taxonomy_row(results_by_id[item_id], row=row, lang=lang) | ||
| 544 | + | ||
| 401 | return [results_by_id[item["id"]] for item in normalized_items] | 545 | return [results_by_id[item["id"]] for item in normalized_items] |
| 402 | 546 | ||
| 403 | 547 | ||
| @@ -463,52 +607,89 @@ def _build_prompt_input_text(product: Dict[str, Any]) -> str: | @@ -463,52 +607,89 @@ def _build_prompt_input_text(product: Dict[str, Any]) -> str: | ||
| 463 | return _truncate_by_words(candidate, PROMPT_INPUT_MAX_WORDS) | 607 | return _truncate_by_words(candidate, PROMPT_INPUT_MAX_WORDS) |
| 464 | 608 | ||
| 465 | 609 | ||
| 466 | -def _make_anchor_cache_key( | 610 | +def _make_analysis_cache_key( |
| 467 | product: Dict[str, Any], | 611 | product: Dict[str, Any], |
| 468 | target_lang: str, | 612 | target_lang: str, |
| 613 | + analysis_kind: str, | ||
| 469 | ) -> str: | 614 | ) -> str: |
| 470 | - """构造缓存 key,仅由 prompt 实际输入文本内容 + 目标语言决定。""" | 615 | + """构造缓存 key,仅由分析类型、prompt 实际输入文本内容与目标语言决定。""" |
| 471 | prompt_input = _build_prompt_input_text(product) | 616 | prompt_input = _build_prompt_input_text(product) |
| 472 | h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest() | 617 | h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest() |
| 473 | - return f"{ANCHOR_CACHE_PREFIX}:{target_lang}:{prompt_input[:4]}{h}" | 618 | + return f"{ANCHOR_CACHE_PREFIX}:{analysis_kind}:{target_lang}:{prompt_input[:4]}{h}" |
| 474 | 619 | ||
| 475 | 620 | ||
| 476 | -def _get_cached_anchor_result( | 621 | +def _make_anchor_cache_key( |
| 477 | product: Dict[str, Any], | 622 | product: Dict[str, Any], |
| 478 | target_lang: str, | 623 | target_lang: str, |
| 624 | +) -> str: | ||
| 625 | + return _make_analysis_cache_key(product, target_lang, analysis_kind="content") | ||
| 626 | + | ||
| 627 | + | ||
| 628 | +def _get_cached_analysis_result( | ||
| 629 | + product: Dict[str, Any], | ||
| 630 | + target_lang: str, | ||
| 631 | + analysis_kind: str, | ||
| 479 | ) -> Optional[Dict[str, Any]]: | 632 | ) -> Optional[Dict[str, Any]]: |
| 480 | if not _anchor_redis: | 633 | if not _anchor_redis: |
| 481 | return None | 634 | return None |
| 635 | + schema = _get_analysis_schema(analysis_kind) | ||
| 482 | try: | 636 | try: |
| 483 | - key = _make_anchor_cache_key(product, target_lang) | 637 | + key = _make_analysis_cache_key(product, target_lang, analysis_kind) |
| 484 | raw = _anchor_redis.get(key) | 638 | raw = _anchor_redis.get(key) |
| 485 | if not raw: | 639 | if not raw: |
| 486 | return None | 640 | return None |
| 487 | - result = _normalize_analysis_result(json.loads(raw), product=product, target_lang=target_lang) | ||
| 488 | - if not _has_meaningful_analysis_content(result): | 641 | + result = _normalize_analysis_result( |
| 642 | + json.loads(raw), | ||
| 643 | + product=product, | ||
| 644 | + target_lang=target_lang, | ||
| 645 | + schema=schema, | ||
| 646 | + ) | ||
| 647 | + if not _has_meaningful_analysis_content(result, schema): | ||
| 489 | return None | 648 | return None |
| 490 | return result | 649 | return result |
| 491 | except Exception as e: | 650 | except Exception as e: |
| 492 | - logger.warning(f"Failed to get anchor cache: {e}") | 651 | + logger.warning("Failed to get %s analysis cache: %s", analysis_kind, e) |
| 493 | return None | 652 | return None |
| 494 | 653 | ||
| 495 | 654 | ||
| 496 | -def _set_cached_anchor_result( | 655 | +def _get_cached_anchor_result( |
| 656 | + product: Dict[str, Any], | ||
| 657 | + target_lang: str, | ||
| 658 | +) -> Optional[Dict[str, Any]]: | ||
| 659 | + return _get_cached_analysis_result(product, target_lang, analysis_kind="content") | ||
| 660 | + | ||
| 661 | + | ||
| 662 | +def _set_cached_analysis_result( | ||
| 497 | product: Dict[str, Any], | 663 | product: Dict[str, Any], |
| 498 | target_lang: str, | 664 | target_lang: str, |
| 499 | result: Dict[str, Any], | 665 | result: Dict[str, Any], |
| 666 | + analysis_kind: str, | ||
| 500 | ) -> None: | 667 | ) -> None: |
| 501 | if not _anchor_redis: | 668 | if not _anchor_redis: |
| 502 | return | 669 | return |
| 670 | + schema = _get_analysis_schema(analysis_kind) | ||
| 503 | try: | 671 | try: |
| 504 | - normalized = _normalize_analysis_result(result, product=product, target_lang=target_lang) | ||
| 505 | - if not _has_meaningful_analysis_content(normalized): | 672 | + normalized = _normalize_analysis_result( |
| 673 | + result, | ||
| 674 | + product=product, | ||
| 675 | + target_lang=target_lang, | ||
| 676 | + schema=schema, | ||
| 677 | + ) | ||
| 678 | + if not _has_meaningful_analysis_content(normalized, schema): | ||
| 506 | return | 679 | return |
| 507 | - key = _make_anchor_cache_key(product, target_lang) | 680 | + key = _make_analysis_cache_key(product, target_lang, analysis_kind) |
| 508 | ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600 | 681 | ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600 |
| 509 | _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False)) | 682 | _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False)) |
| 510 | except Exception as e: | 683 | except Exception as e: |
| 511 | - logger.warning(f"Failed to set anchor cache: {e}") | 684 | + logger.warning("Failed to set %s analysis cache: %s", analysis_kind, e) |
| 685 | + | ||
| 686 | + | ||
| 687 | +def _set_cached_anchor_result( | ||
| 688 | + product: Dict[str, Any], | ||
| 689 | + target_lang: str, | ||
| 690 | + result: Dict[str, Any], | ||
| 691 | +) -> None: | ||
| 692 | + _set_cached_analysis_result(product, target_lang, result, analysis_kind="content") | ||
| 512 | 693 | ||
| 513 | 694 | ||
| 514 | def _build_assistant_prefix(headers: List[str]) -> str: | 695 | def _build_assistant_prefix(headers: List[str]) -> str: |
| @@ -517,8 +698,8 @@ def _build_assistant_prefix(headers: List[str]) -> str: | @@ -517,8 +698,8 @@ def _build_assistant_prefix(headers: List[str]) -> str: | ||
| 517 | return f"{header_line}\n{separator_line}\n" | 698 | return f"{header_line}\n{separator_line}\n" |
| 518 | 699 | ||
| 519 | 700 | ||
| 520 | -def _build_shared_context(products: List[Dict[str, str]]) -> str: | ||
| 521 | - shared_context = SHARED_ANALYSIS_INSTRUCTION | 701 | +def _build_shared_context(products: List[Dict[str, str]], schema: AnalysisSchema) -> str: |
| 702 | + shared_context = schema.shared_instruction | ||
| 522 | for idx, product in enumerate(products, 1): | 703 | for idx, product in enumerate(products, 1): |
| 523 | prompt_input = _build_prompt_input_text(product) | 704 | prompt_input = _build_prompt_input_text(product) |
| 524 | shared_context += f"{idx}. {prompt_input}\n" | 705 | shared_context += f"{idx}. {prompt_input}\n" |
| @@ -550,16 +731,19 @@ def reset_logged_shared_context_keys() -> None: | @@ -550,16 +731,19 @@ def reset_logged_shared_context_keys() -> None: | ||
| 550 | def create_prompt( | 731 | def create_prompt( |
| 551 | products: List[Dict[str, str]], | 732 | products: List[Dict[str, str]], |
| 552 | target_lang: str = "zh", | 733 | target_lang: str = "zh", |
| 553 | -) -> Tuple[str, str, str]: | 734 | + analysis_kind: str = "content", |
| 735 | +) -> Tuple[Optional[str], Optional[str], Optional[str]]: | ||
| 554 | """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。""" | 736 | """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。""" |
| 555 | - markdown_table_headers = LANGUAGE_MARKDOWN_TABLE_HEADERS.get(target_lang) | 737 | + schema = _get_analysis_schema(analysis_kind) |
| 738 | + markdown_table_headers = schema.get_headers(target_lang) | ||
| 556 | if not markdown_table_headers: | 739 | if not markdown_table_headers: |
| 557 | logger.warning( | 740 | logger.warning( |
| 558 | - "Unsupported target_lang for markdown table headers: %s", | 741 | + "Unsupported target_lang for markdown table headers: kind=%s lang=%s", |
| 742 | + analysis_kind, | ||
| 559 | target_lang, | 743 | target_lang, |
| 560 | ) | 744 | ) |
| 561 | return None, None, None | 745 | return None, None, None |
| 562 | - shared_context = _build_shared_context(products) | 746 | + shared_context = _build_shared_context(products, schema) |
| 563 | language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang) | 747 | language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang) |
| 564 | user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip() | 748 | user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip() |
| 565 | assistant_prefix = _build_assistant_prefix(markdown_table_headers) | 749 | assistant_prefix = _build_assistant_prefix(markdown_table_headers) |
| @@ -592,6 +776,7 @@ def call_llm( | @@ -592,6 +776,7 @@ def call_llm( | ||
| 592 | user_prompt: str, | 776 | user_prompt: str, |
| 593 | assistant_prefix: str, | 777 | assistant_prefix: str, |
| 594 | target_lang: str = "zh", | 778 | target_lang: str = "zh", |
| 779 | + analysis_kind: str = "content", | ||
| 595 | ) -> Tuple[str, str]: | 780 | ) -> Tuple[str, str]: |
| 596 | """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。""" | 781 | """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。""" |
| 597 | headers = { | 782 | headers = { |
| @@ -631,8 +816,9 @@ def call_llm( | @@ -631,8 +816,9 @@ def call_llm( | ||
| 631 | if _mark_shared_context_logged_once(shared_context_key): | 816 | if _mark_shared_context_logged_once(shared_context_key): |
| 632 | logger.info(f"\n{'=' * 80}") | 817 | logger.info(f"\n{'=' * 80}") |
| 633 | logger.info( | 818 | logger.info( |
| 634 | - "LLM Shared Context [model=%s, shared_key=%s, chars=%s] (logged once per process key)", | 819 | + "LLM Shared Context [model=%s, kind=%s, shared_key=%s, chars=%s] (logged once per process key)", |
| 635 | MODEL_NAME, | 820 | MODEL_NAME, |
| 821 | + analysis_kind, | ||
| 636 | shared_context_key, | 822 | shared_context_key, |
| 637 | len(shared_context), | 823 | len(shared_context), |
| 638 | ) | 824 | ) |
| @@ -641,8 +827,9 @@ def call_llm( | @@ -641,8 +827,9 @@ def call_llm( | ||
| 641 | 827 | ||
| 642 | verbose_logger.info(f"\n{'=' * 80}") | 828 | verbose_logger.info(f"\n{'=' * 80}") |
| 643 | verbose_logger.info( | 829 | verbose_logger.info( |
| 644 | - "LLM Request [model=%s, lang=%s, shared_key=%s, tail_key=%s]:", | 830 | + "LLM Request [model=%s, kind=%s, lang=%s, shared_key=%s, tail_key=%s]:", |
| 645 | MODEL_NAME, | 831 | MODEL_NAME, |
| 832 | + analysis_kind, | ||
| 646 | target_lang, | 833 | target_lang, |
| 647 | shared_context_key, | 834 | shared_context_key, |
| 648 | localized_tail_key, | 835 | localized_tail_key, |
| @@ -654,7 +841,8 @@ def call_llm( | @@ -654,7 +841,8 @@ def call_llm( | ||
| 654 | verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}") | 841 | verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}") |
| 655 | 842 | ||
| 656 | logger.info( | 843 | logger.info( |
| 657 | - "\nLLM Request Variant [lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]", | 844 | + "\nLLM Request Variant [kind=%s, lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]", |
| 845 | + analysis_kind, | ||
| 658 | target_lang, | 846 | target_lang, |
| 659 | shared_context_key, | 847 | shared_context_key, |
| 660 | localized_tail_key, | 848 | localized_tail_key, |
| @@ -685,8 +873,9 @@ def call_llm( | @@ -685,8 +873,9 @@ def call_llm( | ||
| 685 | usage = result.get("usage") or {} | 873 | usage = result.get("usage") or {} |
| 686 | 874 | ||
| 687 | verbose_logger.info( | 875 | verbose_logger.info( |
| 688 | - "\nLLM Response [model=%s, lang=%s, shared_key=%s, tail_key=%s]:", | 876 | + "\nLLM Response [model=%s, kind=%s, lang=%s, shared_key=%s, tail_key=%s]:", |
| 689 | MODEL_NAME, | 877 | MODEL_NAME, |
| 878 | + analysis_kind, | ||
| 690 | target_lang, | 879 | target_lang, |
| 691 | shared_context_key, | 880 | shared_context_key, |
| 692 | localized_tail_key, | 881 | localized_tail_key, |
| @@ -697,7 +886,8 @@ def call_llm( | @@ -697,7 +886,8 @@ def call_llm( | ||
| 697 | full_markdown = _merge_partial_response(assistant_prefix, generated_content) | 886 | full_markdown = _merge_partial_response(assistant_prefix, generated_content) |
| 698 | 887 | ||
| 699 | logger.info( | 888 | logger.info( |
| 700 | - "\nLLM Response Summary [lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]", | 889 | + "\nLLM Response Summary [kind=%s, lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]", |
| 890 | + analysis_kind, | ||
| 701 | target_lang, | 891 | target_lang, |
| 702 | shared_context_key, | 892 | shared_context_key, |
| 703 | localized_tail_key, | 893 | localized_tail_key, |
| @@ -742,8 +932,12 @@ def call_llm( | @@ -742,8 +932,12 @@ def call_llm( | ||
| 742 | session.close() | 932 | session.close() |
| 743 | 933 | ||
| 744 | 934 | ||
| 745 | -def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: | 935 | +def parse_markdown_table( |
| 936 | + markdown_content: str, | ||
| 937 | + analysis_kind: str = "content", | ||
| 938 | +) -> List[Dict[str, str]]: | ||
| 746 | """解析markdown表格内容""" | 939 | """解析markdown表格内容""" |
| 940 | + schema = _get_analysis_schema(analysis_kind) | ||
| 747 | lines = markdown_content.strip().split("\n") | 941 | lines = markdown_content.strip().split("\n") |
| 748 | data = [] | 942 | data = [] |
| 749 | data_started = False | 943 | data_started = False |
| @@ -768,22 +962,15 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: | @@ -768,22 +962,15 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: | ||
| 768 | 962 | ||
| 769 | # 解析数据行 | 963 | # 解析数据行 |
| 770 | parts = [p.strip() for p in line.split("|")] | 964 | parts = [p.strip() for p in line.split("|")] |
| 771 | - parts = [p for p in parts if p] # 移除空字符串 | 965 | + if parts and parts[0] == "": |
| 966 | + parts = parts[1:] | ||
| 967 | + if parts and parts[-1] == "": | ||
| 968 | + parts = parts[:-1] | ||
| 772 | 969 | ||
| 773 | if len(parts) >= 2: | 970 | if len(parts) >= 2: |
| 774 | - row = { | ||
| 775 | - "seq_no": parts[0], | ||
| 776 | - "title": parts[1], # 商品标题(按目标语言) | ||
| 777 | - "category_path": parts[2] if len(parts) > 2 else "", # 品类路径 | ||
| 778 | - "tags": parts[3] if len(parts) > 3 else "", # 细分标签 | ||
| 779 | - "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群 | ||
| 780 | - "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景 | ||
| 781 | - "season": parts[6] if len(parts) > 6 else "", # 适用季节 | ||
| 782 | - "key_attributes": parts[7] if len(parts) > 7 else "", # 关键属性 | ||
| 783 | - "material": parts[8] if len(parts) > 8 else "", # 材质说明 | ||
| 784 | - "features": parts[9] if len(parts) > 9 else "", # 功能特点 | ||
| 785 | - "anchor_text": parts[10] if len(parts) > 10 else "", # 锚文本 | ||
| 786 | - } | 971 | + row = {"seq_no": parts[0]} |
| 972 | + for field_index, field_name in enumerate(schema.result_fields, start=1): | ||
| 973 | + row[field_name] = parts[field_index] if len(parts) > field_index else "" | ||
| 787 | data.append(row) | 974 | data.append(row) |
| 788 | 975 | ||
| 789 | return data | 976 | return data |
| @@ -794,31 +981,45 @@ def _log_parsed_result_quality( | @@ -794,31 +981,45 @@ def _log_parsed_result_quality( | ||
| 794 | parsed_results: List[Dict[str, str]], | 981 | parsed_results: List[Dict[str, str]], |
| 795 | target_lang: str, | 982 | target_lang: str, |
| 796 | batch_num: int, | 983 | batch_num: int, |
| 984 | + analysis_kind: str, | ||
| 797 | ) -> None: | 985 | ) -> None: |
| 986 | + schema = _get_analysis_schema(analysis_kind) | ||
| 798 | expected = len(batch_data) | 987 | expected = len(batch_data) |
| 799 | actual = len(parsed_results) | 988 | actual = len(parsed_results) |
| 800 | if actual != expected: | 989 | if actual != expected: |
| 801 | logger.warning( | 990 | logger.warning( |
| 802 | - "Parsed row count mismatch for batch=%s lang=%s: expected=%s actual=%s", | 991 | + "Parsed row count mismatch for kind=%s batch=%s lang=%s: expected=%s actual=%s", |
| 992 | + analysis_kind, | ||
| 803 | batch_num, | 993 | batch_num, |
| 804 | target_lang, | 994 | target_lang, |
| 805 | expected, | 995 | expected, |
| 806 | actual, | 996 | actual, |
| 807 | ) | 997 | ) |
| 808 | 998 | ||
| 809 | - missing_anchor = sum(1 for item in parsed_results if not str(item.get("anchor_text") or "").strip()) | ||
| 810 | - missing_category = sum(1 for item in parsed_results if not str(item.get("category_path") or "").strip()) | ||
| 811 | - missing_title = sum(1 for item in parsed_results if not str(item.get("title") or "").strip()) | 999 | + if not schema.quality_fields: |
| 1000 | + logger.info( | ||
| 1001 | + "Parsed Quality Summary [kind=%s, batch=%s, lang=%s]: rows=%s/%s", | ||
| 1002 | + analysis_kind, | ||
| 1003 | + batch_num, | ||
| 1004 | + target_lang, | ||
| 1005 | + actual, | ||
| 1006 | + expected, | ||
| 1007 | + ) | ||
| 1008 | + return | ||
| 812 | 1009 | ||
| 1010 | + missing_summary = ", ".join( | ||
| 1011 | + f"missing_{field}=" | ||
| 1012 | + f"{sum(1 for item in parsed_results if not str(item.get(field) or '').strip())}" | ||
| 1013 | + for field in schema.quality_fields | ||
| 1014 | + ) | ||
| 813 | logger.info( | 1015 | logger.info( |
| 814 | - "Parsed Quality Summary [batch=%s, lang=%s]: rows=%s/%s, missing_title=%s, missing_category=%s, missing_anchor=%s", | 1016 | + "Parsed Quality Summary [kind=%s, batch=%s, lang=%s]: rows=%s/%s, %s", |
| 1017 | + analysis_kind, | ||
| 815 | batch_num, | 1018 | batch_num, |
| 816 | target_lang, | 1019 | target_lang, |
| 817 | actual, | 1020 | actual, |
| 818 | expected, | 1021 | expected, |
| 819 | - missing_title, | ||
| 820 | - missing_category, | ||
| 821 | - missing_anchor, | 1022 | + missing_summary, |
| 822 | ) | 1023 | ) |
| 823 | 1024 | ||
| 824 | 1025 | ||
| @@ -826,29 +1027,39 @@ def process_batch( | @@ -826,29 +1027,39 @@ def process_batch( | ||
| 826 | batch_data: List[Dict[str, str]], | 1027 | batch_data: List[Dict[str, str]], |
| 827 | batch_num: int, | 1028 | batch_num: int, |
| 828 | target_lang: str = "zh", | 1029 | target_lang: str = "zh", |
| 1030 | + analysis_kind: str = "content", | ||
| 829 | ) -> List[Dict[str, Any]]: | 1031 | ) -> List[Dict[str, Any]]: |
| 830 | """处理一个批次的数据""" | 1032 | """处理一个批次的数据""" |
| 1033 | + schema = _get_analysis_schema(analysis_kind) | ||
| 831 | logger.info(f"\n{'#' * 80}") | 1034 | logger.info(f"\n{'#' * 80}") |
| 832 | - logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)") | 1035 | + logger.info( |
| 1036 | + "Processing Batch %s (%s items, kind=%s)", | ||
| 1037 | + batch_num, | ||
| 1038 | + len(batch_data), | ||
| 1039 | + analysis_kind, | ||
| 1040 | + ) | ||
| 833 | 1041 | ||
| 834 | # 创建提示词 | 1042 | # 创建提示词 |
| 835 | shared_context, user_prompt, assistant_prefix = create_prompt( | 1043 | shared_context, user_prompt, assistant_prefix = create_prompt( |
| 836 | batch_data, | 1044 | batch_data, |
| 837 | target_lang=target_lang, | 1045 | target_lang=target_lang, |
| 1046 | + analysis_kind=analysis_kind, | ||
| 838 | ) | 1047 | ) |
| 839 | 1048 | ||
| 840 | # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM | 1049 | # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM |
| 841 | if shared_context is None or user_prompt is None or assistant_prefix is None: | 1050 | if shared_context is None or user_prompt is None or assistant_prefix is None: |
| 842 | logger.error( | 1051 | logger.error( |
| 843 | - "Failed to create prompt for batch %s, target_lang=%s; " | 1052 | + "Failed to create prompt for batch %s, kind=%s, target_lang=%s; " |
| 844 | "marking entire batch as failed without calling LLM", | 1053 | "marking entire batch as failed without calling LLM", |
| 845 | batch_num, | 1054 | batch_num, |
| 1055 | + analysis_kind, | ||
| 846 | target_lang, | 1056 | target_lang, |
| 847 | ) | 1057 | ) |
| 848 | return [ | 1058 | return [ |
| 849 | _make_empty_analysis_result( | 1059 | _make_empty_analysis_result( |
| 850 | item, | 1060 | item, |
| 851 | target_lang, | 1061 | target_lang, |
| 1062 | + schema, | ||
| 852 | error=f"prompt_creation_failed: unsupported target_lang={target_lang}", | 1063 | error=f"prompt_creation_failed: unsupported target_lang={target_lang}", |
| 853 | ) | 1064 | ) |
| 854 | for item in batch_data | 1065 | for item in batch_data |
| @@ -861,11 +1072,18 @@ def process_batch( | @@ -861,11 +1072,18 @@ def process_batch( | ||
| 861 | user_prompt, | 1072 | user_prompt, |
| 862 | assistant_prefix, | 1073 | assistant_prefix, |
| 863 | target_lang=target_lang, | 1074 | target_lang=target_lang, |
| 1075 | + analysis_kind=analysis_kind, | ||
| 864 | ) | 1076 | ) |
| 865 | 1077 | ||
| 866 | # 解析结果 | 1078 | # 解析结果 |
| 867 | - parsed_results = parse_markdown_table(raw_response) | ||
| 868 | - _log_parsed_result_quality(batch_data, parsed_results, target_lang, batch_num) | 1079 | + parsed_results = parse_markdown_table(raw_response, analysis_kind=analysis_kind) |
| 1080 | + _log_parsed_result_quality( | ||
| 1081 | + batch_data, | ||
| 1082 | + parsed_results, | ||
| 1083 | + target_lang, | ||
| 1084 | + batch_num, | ||
| 1085 | + analysis_kind, | ||
| 1086 | + ) | ||
| 869 | 1087 | ||
| 870 | logger.info(f"\nParsed Results ({len(parsed_results)} items):") | 1088 | logger.info(f"\nParsed Results ({len(parsed_results)} items):") |
| 871 | logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2)) | 1089 | logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2)) |
| @@ -879,10 +1097,12 @@ def process_batch( | @@ -879,10 +1097,12 @@ def process_batch( | ||
| 879 | parsed_item, | 1097 | parsed_item, |
| 880 | product=source_product, | 1098 | product=source_product, |
| 881 | target_lang=target_lang, | 1099 | target_lang=target_lang, |
| 1100 | + schema=schema, | ||
| 882 | ) | 1101 | ) |
| 883 | results_with_ids.append(result) | 1102 | results_with_ids.append(result) |
| 884 | logger.info( | 1103 | logger.info( |
| 885 | - "Mapped: seq=%s -> original_id=%s", | 1104 | + "Mapped: kind=%s seq=%s -> original_id=%s", |
| 1105 | + analysis_kind, | ||
| 886 | parsed_item.get("seq_no"), | 1106 | parsed_item.get("seq_no"), |
| 887 | source_product.get("id"), | 1107 | source_product.get("id"), |
| 888 | ) | 1108 | ) |
| @@ -890,6 +1110,7 @@ def process_batch( | @@ -890,6 +1110,7 @@ def process_batch( | ||
| 890 | # 保存批次 JSON 日志到独立文件 | 1110 | # 保存批次 JSON 日志到独立文件 |
| 891 | batch_log = { | 1111 | batch_log = { |
| 892 | "batch_num": batch_num, | 1112 | "batch_num": batch_num, |
| 1113 | + "analysis_kind": analysis_kind, | ||
| 893 | "timestamp": datetime.now().isoformat(), | 1114 | "timestamp": datetime.now().isoformat(), |
| 894 | "input_products": batch_data, | 1115 | "input_products": batch_data, |
| 895 | "raw_response": raw_response, | 1116 | "raw_response": raw_response, |
| @@ -900,7 +1121,10 @@ def process_batch( | @@ -900,7 +1121,10 @@ def process_batch( | ||
| 900 | 1121 | ||
| 901 | # 并发写 batch json 日志时,保证文件名唯一避免覆盖 | 1122 | # 并发写 batch json 日志时,保证文件名唯一避免覆盖 |
| 902 | batch_call_id = uuid.uuid4().hex[:12] | 1123 | batch_call_id = uuid.uuid4().hex[:12] |
| 903 | - batch_log_file = LOG_DIR / f"batch_{batch_num:04d}_{timestamp}_{batch_call_id}.json" | 1124 | + batch_log_file = ( |
| 1125 | + LOG_DIR | ||
| 1126 | + / f"batch_{analysis_kind}_{batch_num:04d}_{timestamp}_{batch_call_id}.json" | ||
| 1127 | + ) | ||
| 904 | with open(batch_log_file, "w", encoding="utf-8") as f: | 1128 | with open(batch_log_file, "w", encoding="utf-8") as f: |
| 905 | json.dump(batch_log, f, ensure_ascii=False, indent=2) | 1129 | json.dump(batch_log, f, ensure_ascii=False, indent=2) |
| 906 | 1130 | ||
| @@ -912,7 +1136,7 @@ def process_batch( | @@ -912,7 +1136,7 @@ def process_batch( | ||
| 912 | logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True) | 1136 | logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True) |
| 913 | # 返回空结果,保持ID映射 | 1137 | # 返回空结果,保持ID映射 |
| 914 | return [ | 1138 | return [ |
| 915 | - _make_empty_analysis_result(item, target_lang, error=str(e)) | 1139 | + _make_empty_analysis_result(item, target_lang, schema, error=str(e)) |
| 916 | for item in batch_data | 1140 | for item in batch_data |
| 917 | ] | 1141 | ] |
| 918 | 1142 | ||
| @@ -922,6 +1146,7 @@ def analyze_products( | @@ -922,6 +1146,7 @@ def analyze_products( | ||
| 922 | target_lang: str = "zh", | 1146 | target_lang: str = "zh", |
| 923 | batch_size: Optional[int] = None, | 1147 | batch_size: Optional[int] = None, |
| 924 | tenant_id: Optional[str] = None, | 1148 | tenant_id: Optional[str] = None, |
| 1149 | + analysis_kind: str = "content", | ||
| 925 | ) -> List[Dict[str, Any]]: | 1150 | ) -> List[Dict[str, Any]]: |
| 926 | """ | 1151 | """ |
| 927 | 库调用入口:根据输入+语言,返回锚文本及各维度信息。 | 1152 | 库调用入口:根据输入+语言,返回锚文本及各维度信息。 |
| @@ -937,6 +1162,7 @@ def analyze_products( | @@ -937,6 +1162,7 @@ def analyze_products( | ||
| 937 | if not products: | 1162 | if not products: |
| 938 | return [] | 1163 | return [] |
| 939 | 1164 | ||
| 1165 | + _get_analysis_schema(analysis_kind) | ||
| 940 | results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products) | 1166 | results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products) |
| 941 | uncached_items: List[Tuple[int, Dict[str, str]]] = [] | 1167 | uncached_items: List[Tuple[int, Dict[str, str]]] = [] |
| 942 | 1168 | ||
| @@ -946,11 +1172,11 @@ def analyze_products( | @@ -946,11 +1172,11 @@ def analyze_products( | ||
| 946 | uncached_items.append((idx, product)) | 1172 | uncached_items.append((idx, product)) |
| 947 | continue | 1173 | continue |
| 948 | 1174 | ||
| 949 | - cached = _get_cached_anchor_result(product, target_lang) | 1175 | + cached = _get_cached_analysis_result(product, target_lang, analysis_kind) |
| 950 | if cached: | 1176 | if cached: |
| 951 | logger.info( | 1177 | logger.info( |
| 952 | f"[analyze_products] Cache hit for title='{title[:50]}...', " | 1178 | f"[analyze_products] Cache hit for title='{title[:50]}...', " |
| 953 | - f"lang={target_lang}" | 1179 | + f"kind={analysis_kind}, lang={target_lang}" |
| 954 | ) | 1180 | ) |
| 955 | results_by_index[idx] = cached | 1181 | results_by_index[idx] = cached |
| 956 | continue | 1182 | continue |
| @@ -979,9 +1205,14 @@ def analyze_products( | @@ -979,9 +1205,14 @@ def analyze_products( | ||
| 979 | for batch_num, batch_slice, batch in batch_jobs: | 1205 | for batch_num, batch_slice, batch in batch_jobs: |
| 980 | logger.info( | 1206 | logger.info( |
| 981 | f"[analyze_products] Processing batch {batch_num}/{total_batches}, " | 1207 | f"[analyze_products] Processing batch {batch_num}/{total_batches}, " |
| 982 | - f"size={len(batch)}, target_lang={target_lang}" | 1208 | + f"size={len(batch)}, kind={analysis_kind}, target_lang={target_lang}" |
| 1209 | + ) | ||
| 1210 | + batch_results = process_batch( | ||
| 1211 | + batch, | ||
| 1212 | + batch_num=batch_num, | ||
| 1213 | + target_lang=target_lang, | ||
| 1214 | + analysis_kind=analysis_kind, | ||
| 983 | ) | 1215 | ) |
| 984 | - batch_results = process_batch(batch, batch_num=batch_num, target_lang=target_lang) | ||
| 985 | 1216 | ||
| 986 | for (original_idx, product), item in zip(batch_slice, batch_results): | 1217 | for (original_idx, product), item in zip(batch_slice, batch_results): |
| 987 | results_by_index[original_idx] = item | 1218 | results_by_index[original_idx] = item |
| @@ -992,7 +1223,7 @@ def analyze_products( | @@ -992,7 +1223,7 @@ def analyze_products( | ||
| 992 | # 不缓存错误结果,避免放大临时故障 | 1223 | # 不缓存错误结果,避免放大临时故障 |
| 993 | continue | 1224 | continue |
| 994 | try: | 1225 | try: |
| 995 | - _set_cached_anchor_result(product, target_lang, item) | 1226 | + _set_cached_analysis_result(product, target_lang, item, analysis_kind) |
| 996 | except Exception: | 1227 | except Exception: |
| 997 | # 已在内部记录 warning | 1228 | # 已在内部记录 warning |
| 998 | pass | 1229 | pass |
| @@ -1000,10 +1231,11 @@ def analyze_products( | @@ -1000,10 +1231,11 @@ def analyze_products( | ||
| 1000 | max_workers = min(CONTENT_UNDERSTANDING_MAX_WORKERS, len(batch_jobs)) | 1231 | max_workers = min(CONTENT_UNDERSTANDING_MAX_WORKERS, len(batch_jobs)) |
| 1001 | logger.info( | 1232 | logger.info( |
| 1002 | "[analyze_products] Using ThreadPoolExecutor for uncached batches: " | 1233 | "[analyze_products] Using ThreadPoolExecutor for uncached batches: " |
| 1003 | - "max_workers=%s, total_batches=%s, bs=%s, target_lang=%s", | 1234 | + "max_workers=%s, total_batches=%s, bs=%s, kind=%s, target_lang=%s", |
| 1004 | max_workers, | 1235 | max_workers, |
| 1005 | total_batches, | 1236 | total_batches, |
| 1006 | bs, | 1237 | bs, |
| 1238 | + analysis_kind, | ||
| 1007 | target_lang, | 1239 | target_lang, |
| 1008 | ) | 1240 | ) |
| 1009 | 1241 | ||
| @@ -1013,7 +1245,11 @@ def analyze_products( | @@ -1013,7 +1245,11 @@ def analyze_products( | ||
| 1013 | future_by_batch_num: Dict[int, Any] = {} | 1245 | future_by_batch_num: Dict[int, Any] = {} |
| 1014 | for batch_num, _batch_slice, batch in batch_jobs: | 1246 | for batch_num, _batch_slice, batch in batch_jobs: |
| 1015 | future_by_batch_num[batch_num] = executor.submit( | 1247 | future_by_batch_num[batch_num] = executor.submit( |
| 1016 | - process_batch, batch, batch_num=batch_num, target_lang=target_lang | 1248 | + process_batch, |
| 1249 | + batch, | ||
| 1250 | + batch_num=batch_num, | ||
| 1251 | + target_lang=target_lang, | ||
| 1252 | + analysis_kind=analysis_kind, | ||
| 1017 | ) | 1253 | ) |
| 1018 | 1254 | ||
| 1019 | # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的) | 1255 | # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的) |
| @@ -1028,7 +1264,7 @@ def analyze_products( | @@ -1028,7 +1264,7 @@ def analyze_products( | ||
| 1028 | # 不缓存错误结果,避免放大临时故障 | 1264 | # 不缓存错误结果,避免放大临时故障 |
| 1029 | continue | 1265 | continue |
| 1030 | try: | 1266 | try: |
| 1031 | - _set_cached_anchor_result(product, target_lang, item) | 1267 | + _set_cached_analysis_result(product, target_lang, item, analysis_kind) |
| 1032 | except Exception: | 1268 | except Exception: |
| 1033 | # 已在内部记录 warning | 1269 | # 已在内部记录 warning |
| 1034 | pass | 1270 | pass |
indexer/product_enrich_prompts.py
| @@ -33,6 +33,110 @@ Input product list: | @@ -33,6 +33,110 @@ Input product list: | ||
| 33 | USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation. | 33 | USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation. |
| 34 | Language: {language}""" | 34 | Language: {language}""" |
| 35 | 35 | ||
| 36 | +TAXONOMY_SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product text and fill the columns below using an apparel attribute taxonomy. | ||
| 37 | + | ||
| 38 | +Output columns: | ||
| 39 | +1. Product Type: concise ecommerce apparel category label, not a full marketing title | ||
| 40 | +2. Target Gender: intended gender only if clearly implied | ||
| 41 | +3. Age Group: only if clearly implied, e.g. adults, kids, teens, toddlers, babies | ||
| 42 | +4. Season: season(s) or all-season suitability only if supported | ||
| 43 | +5. Fit: body closeness, e.g. slim, regular, relaxed, oversized, fitted | ||
| 44 | +6. Silhouette: overall garment shape, e.g. straight, A-line, boxy, tapered, bodycon, wide-leg | ||
| 45 | +7. Neckline: neckline type when applicable, e.g. crew neck, V-neck, hooded, collared, square neck | ||
| 46 | +8. Sleeve Length Type: sleeve length only, e.g. sleeveless, short sleeve, long sleeve, three-quarter sleeve | ||
| 47 | +9. Sleeve Style: sleeve design only, e.g. puff sleeve, raglan sleeve, batwing sleeve, bell sleeve | ||
| 48 | +10. Strap Type: strap design when applicable, e.g. spaghetti strap, wide strap, halter strap, adjustable strap | ||
| 49 | +11. Rise / Waistline: waist placement when applicable, e.g. high rise, mid rise, low rise, empire waist | ||
| 50 | +12. Leg Shape: for bottoms only, e.g. straight leg, wide leg, flare leg, tapered leg, skinny leg | ||
| 51 | +13. Skirt Shape: for skirts only, e.g. A-line, pleated, pencil, mermaid | ||
| 52 | +14. Length Type: design length only, not size, e.g. cropped, regular, longline, mini, midi, maxi, ankle length, full length | ||
| 53 | +15. Closure Type: fastening method when applicable, e.g. zipper, button, drawstring, elastic waist, hook-and-loop | ||
| 54 | +16. Design Details: construction or visual details, e.g. ruched, ruffled, pleated, cut-out, layered, distressed, split hem | ||
| 55 | +17. Fabric: fabric type only, e.g. denim, knit, chiffon, jersey, fleece, cotton twill | ||
| 56 | +18. Material Composition: fiber content or blend only if stated, e.g. cotton, polyester, spandex, linen blend, 95% cotton 5% elastane | ||
| 57 | +19. Fabric Properties: inherent fabric traits, e.g. stretch, breathable, lightweight, soft-touch, water-resistant | ||
| 58 | +20. Clothing Features: product features, e.g. lined, reversible, hooded, packable, padded, pocketed | ||
| 59 | +21. Functional Benefits: wearer benefits, e.g. moisture-wicking, thermal insulation, UV protection, easy care, supportive compression | ||
| 60 | +22. Color: specific color name when available | ||
| 61 | +23. Color Family: normalized broad retail color group, e.g. black, white, blue, green, red, pink, beige, brown, gray | ||
| 62 | +24. Print / Pattern: surface pattern when applicable, e.g. solid, striped, plaid, floral, graphic, animal print | ||
| 63 | +25. Occasion / End Use: likely use occasion only if supported, e.g. office, casual wear, streetwear, lounge, workout, outdoor | ||
| 64 | +26. Style Aesthetic: overall style only if supported, e.g. minimalist, streetwear, athleisure, smart casual, romantic, playful | ||
| 65 | + | ||
| 66 | +Rules: | ||
| 67 | +- Keep the same row order and row count as input. | ||
| 68 | +- Infer only from the provided product text. | ||
| 69 | +- Leave blank if not applicable or not reasonably supported. | ||
| 70 | +- Use concise, standardized ecommerce wording. | ||
| 71 | +- Do not combine different attribute dimensions in one field. | ||
| 72 | +- If multiple values are needed, use the delimiter required by the localization setting. | ||
| 73 | + | ||
| 74 | +Input product list: | ||
| 75 | +""" | ||
| 76 | + | ||
| 77 | +TAXONOMY_MARKDOWN_TABLE_HEADERS_EN = [ | ||
| 78 | + "No.", | ||
| 79 | + "Product Type", | ||
| 80 | + "Target Gender", | ||
| 81 | + "Age Group", | ||
| 82 | + "Season", | ||
| 83 | + "Fit", | ||
| 84 | + "Silhouette", | ||
| 85 | + "Neckline", | ||
| 86 | + "Sleeve Length Type", | ||
| 87 | + "Sleeve Style", | ||
| 88 | + "Strap Type", | ||
| 89 | + "Rise / Waistline", | ||
| 90 | + "Leg Shape", | ||
| 91 | + "Skirt Shape", | ||
| 92 | + "Length Type", | ||
| 93 | + "Closure Type", | ||
| 94 | + "Design Details", | ||
| 95 | + "Fabric", | ||
| 96 | + "Material Composition", | ||
| 97 | + "Fabric Properties", | ||
| 98 | + "Clothing Features", | ||
| 99 | + "Functional Benefits", | ||
| 100 | + "Color", | ||
| 101 | + "Color Family", | ||
| 102 | + "Print / Pattern", | ||
| 103 | + "Occasion / End Use", | ||
| 104 | + "Style Aesthetic", | ||
| 105 | +] | ||
| 106 | + | ||
| 107 | +TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { | ||
| 108 | + "en": TAXONOMY_MARKDOWN_TABLE_HEADERS_EN, | ||
| 109 | + "zh": [ | ||
| 110 | + "序号", | ||
| 111 | + "品类", | ||
| 112 | + "目标性别", | ||
| 113 | + "年龄段", | ||
| 114 | + "适用季节", | ||
| 115 | + "版型", | ||
| 116 | + "廓形", | ||
| 117 | + "领型", | ||
| 118 | + "袖长类型", | ||
| 119 | + "袖型", | ||
| 120 | + "肩带设计", | ||
| 121 | + "腰型", | ||
| 122 | + "裤型", | ||
| 123 | + "裙型", | ||
| 124 | + "长度类型", | ||
| 125 | + "闭合方式", | ||
| 126 | + "设计细节", | ||
| 127 | + "面料", | ||
| 128 | + "成分", | ||
| 129 | + "面料特性", | ||
| 130 | + "服装特征", | ||
| 131 | + "功能", | ||
| 132 | + "主颜色", | ||
| 133 | + "色系", | ||
| 134 | + "印花 / 图案", | ||
| 135 | + "适用场景", | ||
| 136 | + "风格", | ||
| 137 | + ], | ||
| 138 | +} | ||
| 139 | + | ||
| 36 | LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { | 140 | LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { |
| 37 | "en": [ | 141 | "en": [ |
| 38 | "No.", | 142 | "No.", |
tests/test_llm_enrichment_batch_fill.py
| @@ -19,10 +19,13 @@ def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch): | @@ -19,10 +19,13 @@ def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch): | ||
| 19 | "zh": [f"zh-anchor-{item['id']}"], | 19 | "zh": [f"zh-anchor-{item['id']}"], |
| 20 | "en": [f"en-anchor-{item['id']}"], | 20 | "en": [f"en-anchor-{item['id']}"], |
| 21 | }, | 21 | }, |
| 22 | - "tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]}, | 22 | + "enriched_tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]}, |
| 23 | "enriched_attributes": [ | 23 | "enriched_attributes": [ |
| 24 | {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}}, | 24 | {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}}, |
| 25 | ], | 25 | ], |
| 26 | + "enriched_taxonomy_attributes": [ | ||
| 27 | + {"name": "Product Type", "value": {"zh": ["连衣裙"], "en": ["dress"]}}, | ||
| 28 | + ], | ||
| 26 | } | 29 | } |
| 27 | for item in items | 30 | for item in items |
| 28 | ] | 31 | ] |
| @@ -54,6 +57,10 @@ def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch): | @@ -54,6 +57,10 @@ def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch): | ||
| 54 | 57 | ||
| 55 | assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"] | 58 | assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"] |
| 56 | assert docs[0]["qanchors"]["en"] == ["en-anchor-0"] | 59 | assert docs[0]["qanchors"]["en"] == ["en-anchor-0"] |
| 57 | - assert docs[0]["tags"]["zh"] == ["t1", "t2"] | ||
| 58 | - assert docs[0]["tags"]["en"] == ["t1", "t2"] | 60 | + assert docs[0]["enriched_tags"]["zh"] == ["t1", "t2"] |
| 61 | + assert docs[0]["enriched_tags"]["en"] == ["t1", "t2"] | ||
| 59 | assert {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}} in docs[0]["enriched_attributes"] | 62 | assert {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}} in docs[0]["enriched_attributes"] |
| 63 | + assert { | ||
| 64 | + "name": "Product Type", | ||
| 65 | + "value": {"zh": ["连衣裙"], "en": ["dress"]}, | ||
| 66 | + } in docs[0]["enriched_taxonomy_attributes"] |
tests/test_process_products_batching.py
| @@ -13,7 +13,13 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): | @@ -13,7 +13,13 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): | ||
| 13 | monkeypatch.setattr(process_products, "API_KEY", "fake-key") | 13 | monkeypatch.setattr(process_products, "API_KEY", "fake-key") |
| 14 | seen_batch_sizes: List[int] = [] | 14 | seen_batch_sizes: List[int] = [] |
| 15 | 15 | ||
| 16 | - def _fake_process_batch(batch_data: List[Dict[str, str]], batch_num: int, target_lang: str = "zh"): | 16 | + def _fake_process_batch( |
| 17 | + batch_data: List[Dict[str, str]], | ||
| 18 | + batch_num: int, | ||
| 19 | + target_lang: str = "zh", | ||
| 20 | + analysis_kind: str = "content", | ||
| 21 | + ): | ||
| 22 | + assert analysis_kind == "content" | ||
| 17 | seen_batch_sizes.append(len(batch_data)) | 23 | seen_batch_sizes.append(len(batch_data)) |
| 18 | return [ | 24 | return [ |
| 19 | { | 25 | { |
| @@ -35,7 +41,7 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): | @@ -35,7 +41,7 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): | ||
| 35 | ] | 41 | ] |
| 36 | 42 | ||
| 37 | monkeypatch.setattr(process_products, "process_batch", _fake_process_batch) | 43 | monkeypatch.setattr(process_products, "process_batch", _fake_process_batch) |
| 38 | - monkeypatch.setattr(process_products, "_set_cached_anchor_result", lambda *args, **kwargs: None) | 44 | + monkeypatch.setattr(process_products, "_set_cached_analysis_result", lambda *args, **kwargs: None) |
| 39 | 45 | ||
| 40 | out = process_products.analyze_products( | 46 | out = process_products.analyze_products( |
| 41 | products=_mk_products(45), | 47 | products=_mk_products(45), |
| @@ -53,7 +59,13 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): | @@ -53,7 +59,13 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): | ||
| 53 | monkeypatch.setattr(process_products, "API_KEY", "fake-key") | 59 | monkeypatch.setattr(process_products, "API_KEY", "fake-key") |
| 54 | seen_batch_sizes: List[int] = [] | 60 | seen_batch_sizes: List[int] = [] |
| 55 | 61 | ||
| 56 | - def _fake_process_batch(batch_data: List[Dict[str, str]], batch_num: int, target_lang: str = "zh"): | 62 | + def _fake_process_batch( |
| 63 | + batch_data: List[Dict[str, str]], | ||
| 64 | + batch_num: int, | ||
| 65 | + target_lang: str = "zh", | ||
| 66 | + analysis_kind: str = "content", | ||
| 67 | + ): | ||
| 68 | + assert analysis_kind == "content" | ||
| 57 | seen_batch_sizes.append(len(batch_data)) | 69 | seen_batch_sizes.append(len(batch_data)) |
| 58 | return [ | 70 | return [ |
| 59 | { | 71 | { |
| @@ -75,7 +87,7 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): | @@ -75,7 +87,7 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): | ||
| 75 | ] | 87 | ] |
| 76 | 88 | ||
| 77 | monkeypatch.setattr(process_products, "process_batch", _fake_process_batch) | 89 | monkeypatch.setattr(process_products, "process_batch", _fake_process_batch) |
| 78 | - monkeypatch.setattr(process_products, "_set_cached_anchor_result", lambda *args, **kwargs: None) | 90 | + monkeypatch.setattr(process_products, "_set_cached_analysis_result", lambda *args, **kwargs: None) |
| 79 | 91 | ||
| 80 | out = process_products.analyze_products( | 92 | out = process_products.analyze_products( |
| 81 | products=_mk_products(3), | 93 | products=_mk_products(3), |
tests/test_product_enrich_partial_mode.py
| @@ -74,6 +74,28 @@ def test_create_prompt_splits_shared_context_and_localized_tail(): | @@ -74,6 +74,28 @@ def test_create_prompt_splits_shared_context_and_localized_tail(): | ||
| 74 | assert prefix_en.startswith("| No. | Product title | Category path |") | 74 | assert prefix_en.startswith("| No. | Product title | Category path |") |
| 75 | 75 | ||
| 76 | 76 | ||
| 77 | +def test_create_prompt_supports_taxonomy_analysis_kind(): | ||
| 78 | + products = [{"id": "1", "title": "linen dress"}] | ||
| 79 | + | ||
| 80 | + shared_zh, user_zh, prefix_zh = product_enrich.create_prompt( | ||
| 81 | + products, | ||
| 82 | + target_lang="zh", | ||
| 83 | + analysis_kind="taxonomy", | ||
| 84 | + ) | ||
| 85 | + shared_fr, user_fr, prefix_fr = product_enrich.create_prompt( | ||
| 86 | + products, | ||
| 87 | + target_lang="fr", | ||
| 88 | + analysis_kind="taxonomy", | ||
| 89 | + ) | ||
| 90 | + | ||
| 91 | + assert "apparel attribute taxonomy" in shared_zh | ||
| 92 | + assert "1. linen dress" in shared_zh | ||
| 93 | + assert "Language: Chinese" in user_zh | ||
| 94 | + assert "Language: French" in user_fr | ||
| 95 | + assert prefix_zh.startswith("| 序号 | 品类 | 目标性别 |") | ||
| 96 | + assert prefix_fr.startswith("| No. | Product Type | Target Gender |") | ||
| 97 | + | ||
| 98 | + | ||
| 77 | def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests(): | 99 | def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests(): |
| 78 | payloads = [] | 100 | payloads = [] |
| 79 | response_bodies = [ | 101 | response_bodies = [ |
| @@ -228,6 +250,38 @@ def test_process_batch_reads_result_and_validates_expected_fields(): | @@ -228,6 +250,38 @@ def test_process_batch_reads_result_and_validates_expected_fields(): | ||
| 228 | assert row["anchor_text"] == "法式收腰连衣裙" | 250 | assert row["anchor_text"] == "法式收腰连衣裙" |
| 229 | 251 | ||
| 230 | 252 | ||
def test_process_batch_reads_taxonomy_result_with_schema_specific_fields():
    """process_batch parses a taxonomy markdown table into schema-specific fields.

    Stubs call_llm with a single-row taxonomy table (including empty cells for
    sparse columns) and checks the parsed row exposes the taxonomy field names.
    """
    merged_markdown = """| 序号 | 品类 | 目标性别 | 年龄段 | 适用季节 | 版型 | 廓形 | 领型 | 袖长类型 | 袖型 | 肩带设计 | 腰型 | 裤型 | 裙型 | 长度类型 | 闭合方式 | 设计细节 | 面料 | 成分 | 面料特性 | 服装特征 | 功能 | 主颜色 | 色系 | 印花 / 图案 | 适用场景 | 风格 |
|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|
| 1 | 连衣裙 | 女 | 成人 | 春季,夏季 | 修身 | A字 | V领 | 无袖 | | 细肩带 | 高腰 | | A字裙 | 中长款 | 拉链 | 褶皱 | 梭织 | 聚酯纤维,氨纶 | 轻薄,透气 | 有内衬 | 易打理 | 酒红色 | 红色 | 纯色 | 约会,度假 | 浪漫 |
"""

    stub_raw_response = json.dumps({"choices": [{"message": {"content": "stub"}}]})
    with mock.patch.object(
        product_enrich,
        "call_llm",
        return_value=(merged_markdown, stub_raw_response),
    ):
        results = product_enrich.process_batch(
            [{"id": "sku-1", "title": "dress"}],
            batch_num=1,
            target_lang="zh",
            analysis_kind="taxonomy",
        )

    assert len(results) == 1
    row = results[0]
    # Table-driven check of the parsed taxonomy columns, including sparse ones.
    expected = {
        "id": "sku-1",
        "lang": "zh",
        "title_input": "dress",
        "product_type": "连衣裙",
        "target_gender": "女",
        "age_group": "成人",
        "sleeve_length_type": "无袖",
        "material_composition": "聚酯纤维,氨纶",
        "occasion_end_use": "约会,度假",
        "style_aesthetic": "浪漫",
    }
    for field, value in expected.items():
        assert row[field] == value
| 283 | + | ||
| 284 | + | ||
| 231 | def test_analyze_products_uses_product_level_cache_across_batch_requests(): | 285 | def test_analyze_products_uses_product_level_cache_across_batch_requests(): |
| 232 | cache_store = {} | 286 | cache_store = {} |
| 233 | process_calls = [] | 287 | process_calls = [] |
| @@ -241,13 +295,16 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): | @@ -241,13 +295,16 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): | ||
| 241 | product.get("image_url", ""), | 295 | product.get("image_url", ""), |
| 242 | ) | 296 | ) |
| 243 | 297 | ||
| 244 | - def fake_get_cached_anchor_result(product, target_lang): | 298 | + def fake_get_cached_analysis_result(product, target_lang, analysis_kind="content"): |
| 299 | + assert analysis_kind == "content" | ||
| 245 | return cache_store.get(_cache_key(product, target_lang)) | 300 | return cache_store.get(_cache_key(product, target_lang)) |
| 246 | 301 | ||
| 247 | - def fake_set_cached_anchor_result(product, target_lang, result): | 302 | + def fake_set_cached_analysis_result(product, target_lang, result, analysis_kind="content"): |
| 303 | + assert analysis_kind == "content" | ||
| 248 | cache_store[_cache_key(product, target_lang)] = result | 304 | cache_store[_cache_key(product, target_lang)] = result |
| 249 | 305 | ||
| 250 | - def fake_process_batch(batch_data, batch_num, target_lang="zh"): | 306 | + def fake_process_batch(batch_data, batch_num, target_lang="zh", analysis_kind="content"): |
| 307 | + assert analysis_kind == "content" | ||
| 251 | process_calls.append( | 308 | process_calls.append( |
| 252 | { | 309 | { |
| 253 | "batch_num": batch_num, | 310 | "batch_num": batch_num, |
| @@ -281,12 +338,12 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): | @@ -281,12 +338,12 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): | ||
| 281 | 338 | ||
| 282 | with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( | 339 | with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( |
| 283 | product_enrich, | 340 | product_enrich, |
| 284 | - "_get_cached_anchor_result", | ||
| 285 | - side_effect=fake_get_cached_anchor_result, | 341 | + "_get_cached_analysis_result", |
| 342 | + side_effect=fake_get_cached_analysis_result, | ||
| 286 | ), mock.patch.object( | 343 | ), mock.patch.object( |
| 287 | product_enrich, | 344 | product_enrich, |
| 288 | - "_set_cached_anchor_result", | ||
| 289 | - side_effect=fake_set_cached_anchor_result, | 345 | + "_set_cached_analysis_result", |
| 346 | + side_effect=fake_set_cached_analysis_result, | ||
| 290 | ), mock.patch.object( | 347 | ), mock.patch.object( |
| 291 | product_enrich, | 348 | product_enrich, |
| 292 | "process_batch", | 349 | "process_batch", |
| @@ -342,11 +399,12 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity(): | @@ -342,11 +399,12 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity(): | ||
| 342 | 399 | ||
| 343 | with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( | 400 | with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( |
| 344 | product_enrich, | 401 | product_enrich, |
| 345 | - "_get_cached_anchor_result", | ||
| 346 | - wraps=lambda product, target_lang: product_enrich._normalize_analysis_result( | 402 | + "_get_cached_analysis_result", |
| 403 | + wraps=lambda product, target_lang, analysis_kind="content": product_enrich._normalize_analysis_result( | ||
| 347 | cached_result, | 404 | cached_result, |
| 348 | product=product, | 405 | product=product, |
| 349 | target_lang=target_lang, | 406 | target_lang=target_lang, |
| 407 | + schema=product_enrich._get_analysis_schema("content"), | ||
| 350 | ), | 408 | ), |
| 351 | ), mock.patch.object( | 409 | ), mock.patch.object( |
| 352 | product_enrich, | 410 | product_enrich, |
| @@ -379,7 +437,47 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity(): | @@ -379,7 +437,47 @@ def test_analyze_products_reuses_cached_content_with_current_product_identity(): | ||
| 379 | 437 | ||
| 380 | 438 | ||
| 381 | def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output(): | 439 | def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output(): |
| 382 | - def fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None): | 440 | + def fake_analyze_products( |
| 441 | + products, | ||
| 442 | + target_lang="zh", | ||
| 443 | + batch_size=None, | ||
| 444 | + tenant_id=None, | ||
| 445 | + analysis_kind="content", | ||
| 446 | + ): | ||
| 447 | + if analysis_kind == "taxonomy": | ||
| 448 | + return [ | ||
| 449 | + { | ||
| 450 | + "id": products[0]["id"], | ||
| 451 | + "lang": target_lang, | ||
| 452 | + "title_input": products[0]["title"], | ||
| 453 | + "product_type": f"{target_lang}-dress", | ||
| 454 | + "target_gender": f"{target_lang}-women", | ||
| 455 | + "age_group": "", | ||
| 456 | + "season": f"{target_lang}-summer", | ||
| 457 | + "fit": "", | ||
| 458 | + "silhouette": "", | ||
| 459 | + "neckline": "", | ||
| 460 | + "sleeve_length_type": "", | ||
| 461 | + "sleeve_style": "", | ||
| 462 | + "strap_type": "", | ||
| 463 | + "rise_waistline": "", | ||
| 464 | + "leg_shape": "", | ||
| 465 | + "skirt_shape": "", | ||
| 466 | + "length_type": "", | ||
| 467 | + "closure_type": "", | ||
| 468 | + "design_details": "", | ||
| 469 | + "fabric": "", | ||
| 470 | + "material_composition": "", | ||
| 471 | + "fabric_properties": "", | ||
| 472 | + "clothing_features": "", | ||
| 473 | + "functional_benefits": "", | ||
| 474 | + "color": "", | ||
| 475 | + "color_family": "", | ||
| 476 | + "print_pattern": "", | ||
| 477 | + "occasion_end_use": "", | ||
| 478 | + "style_aesthetic": "", | ||
| 479 | + } | ||
| 480 | + ] | ||
| 383 | return [ | 481 | return [ |
| 384 | { | 482 | { |
| 385 | "id": products[0]["id"], | 483 | "id": products[0]["id"], |
| @@ -423,6 +521,20 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() | @@ -423,6 +521,20 @@ def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output() | ||
| 423 | }, | 521 | }, |
| 424 | {"name": "target_audience", "value": {"zh": ["zh-audience"], "en": ["en-audience"]}}, | 522 | {"name": "target_audience", "value": {"zh": ["zh-audience"], "en": ["en-audience"]}}, |
| 425 | ], | 523 | ], |
| 524 | + "enriched_taxonomy_attributes": [ | ||
| 525 | + { | ||
| 526 | + "name": "Product Type", | ||
| 527 | + "value": {"zh": ["zh-dress"], "en": ["en-dress"]}, | ||
| 528 | + }, | ||
| 529 | + { | ||
| 530 | + "name": "Target Gender", | ||
| 531 | + "value": {"zh": ["zh-women"], "en": ["en-women"]}, | ||
| 532 | + }, | ||
| 533 | + { | ||
| 534 | + "name": "Season", | ||
| 535 | + "value": {"zh": ["zh-summer"], "en": ["en-summer"]}, | ||
| 536 | + }, | ||
| 537 | + ], | ||
| 426 | } | 538 | } |
| 427 | ] | 539 | ] |
| 428 | 540 |