Commit 90de78aa7883f27b473ea2c3de7b48da325efdad
1 parent
a3734f13
enrich 接口修复:因接口迭代后与旧缓存结构不兼容导致的 bug
修复集中在 `indexer/product_enrich.py`,而不是再补一层判断。 根因有两个:缓存 key 按内容复用,但缓存值里还带着旧商品的 `id/title_input`;同时内部分析结果在历史上混用了 `tags` 和 `enriched_tags`。这样一旦命中旧缓存,`build_index_content_fields()` 会因为 `id` 对不上把结果丢掉,最后对外就变成全空。 现在的处理是: - 内部分析结果统一用 `tags` 作为 LLM/缓存层字段。 - 对外只在 `build_index_content_fields()` 封装时映射成 `enriched_tags`,`enriched_attributes` 里也统一产出 `name="enriched_tags"`。 - 读取缓存时会先做归一化:把旧缓存里的 `enriched_tags` 兼容成内部 `tags`,并把命中的缓存结果重绑到当前请求商品的 `id/title_input`。 - 写缓存时也统一写成归一化后的内部结构,并且空内容不再写入缓存。
Showing
2 changed files
with
242 additions
and
61 deletions
Show diff stats
indexer/product_enrich.py
| ... | ... | @@ -147,15 +147,40 @@ if _missing_prompt_langs: |
| 147 | 147 | # 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白 |
| 148 | 148 | _MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") |
| 149 | 149 | _CORE_INDEX_LANGUAGES = ("zh", "en") |
| 150 | -_ENRICHED_ATTRIBUTE_DIMENSIONS = ( | |
| 151 | - "enriched_tags", | |
| 150 | +_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( | |
| 151 | + ("tags", "enriched_tags"), | |
| 152 | + ("target_audience", "target_audience"), | |
| 153 | + ("usage_scene", "usage_scene"), | |
| 154 | + ("season", "season"), | |
| 155 | + ("key_attributes", "key_attributes"), | |
| 156 | + ("material", "material"), | |
| 157 | + ("features", "features"), | |
| 158 | +) | |
| 159 | +_ANALYSIS_RESULT_FIELDS = ( | |
| 160 | + "title", | |
| 161 | + "category_path", | |
| 162 | + "tags", | |
| 163 | + "target_audience", | |
| 164 | + "usage_scene", | |
| 165 | + "season", | |
| 166 | + "key_attributes", | |
| 167 | + "material", | |
| 168 | + "features", | |
| 169 | + "anchor_text", | |
| 170 | +) | |
| 171 | +_ANALYSIS_MEANINGFUL_FIELDS = ( | |
| 172 | + "tags", | |
| 152 | 173 | "target_audience", |
| 153 | 174 | "usage_scene", |
| 154 | 175 | "season", |
| 155 | 176 | "key_attributes", |
| 156 | 177 | "material", |
| 157 | 178 | "features", |
| 179 | + "anchor_text", | |
| 158 | 180 | ) |
| 181 | +_ANALYSIS_FIELD_ALIASES = { | |
| 182 | + "tags": ("tags", "enriched_tags"), | |
| 183 | +} | |
| 159 | 184 | |
| 160 | 185 | |
| 161 | 186 | def split_multi_value_field(text: Optional[str]) -> List[str]: |
| ... | ... | @@ -195,25 +220,104 @@ def _append_enriched_attribute( |
| 195 | 220 | target.append({"name": name, "value": {lang: value}}) |
| 196 | 221 | |
| 197 | 222 | |
| 223 | +def _get_product_id(product: Dict[str, Any]) -> str: | |
| 224 | + return str(product.get("id") or product.get("spu_id") or "").strip() | |
| 225 | + | |
| 226 | + | |
| 227 | +def _get_analysis_field_aliases(field_name: str) -> Tuple[str, ...]: | |
| 228 | + return _ANALYSIS_FIELD_ALIASES.get(field_name, (field_name,)) | |
| 229 | + | |
| 230 | + | |
| 231 | +def _get_analysis_field_value(row: Dict[str, Any], field_name: str) -> Any: | |
| 232 | + for alias in _get_analysis_field_aliases(field_name): | |
| 233 | + if alias in row: | |
| 234 | + return row.get(alias) | |
| 235 | + return None | |
| 236 | + | |
| 237 | + | |
| 238 | +def _has_meaningful_value(value: Any) -> bool: | |
| 239 | + if value is None: | |
| 240 | + return False | |
| 241 | + if isinstance(value, str): | |
| 242 | + return bool(value.strip()) | |
| 243 | + if isinstance(value, dict): | |
| 244 | + return any(_has_meaningful_value(v) for v in value.values()) | |
| 245 | + if isinstance(value, list): | |
| 246 | + return any(_has_meaningful_value(v) for v in value) | |
| 247 | + return bool(value) | |
| 248 | + | |
| 249 | + | |
| 250 | +def _make_empty_analysis_result( | |
| 251 | + product: Dict[str, Any], | |
| 252 | + target_lang: str, | |
| 253 | + error: Optional[str] = None, | |
| 254 | +) -> Dict[str, Any]: | |
| 255 | + result = { | |
| 256 | + "id": _get_product_id(product), | |
| 257 | + "lang": target_lang, | |
| 258 | + "title_input": str(product.get("title") or "").strip(), | |
| 259 | + } | |
| 260 | + for field in _ANALYSIS_RESULT_FIELDS: | |
| 261 | + result[field] = "" | |
| 262 | + if error: | |
| 263 | + result["error"] = error | |
| 264 | + return result | |
| 265 | + | |
| 266 | + | |
| 267 | +def _normalize_analysis_result( | |
| 268 | + result: Dict[str, Any], | |
| 269 | + product: Dict[str, Any], | |
| 270 | + target_lang: str, | |
| 271 | +) -> Dict[str, Any]: | |
| 272 | + normalized = _make_empty_analysis_result(product, target_lang) | |
| 273 | + if not isinstance(result, dict): | |
| 274 | + return normalized | |
| 275 | + | |
| 276 | + normalized["lang"] = str(result.get("lang") or target_lang).strip() or target_lang | |
| 277 | + normalized["title"] = str(result.get("title") or "").strip() | |
| 278 | + normalized["category_path"] = str(result.get("category_path") or "").strip() | |
| 279 | + normalized["title_input"] = str( | |
| 280 | + product.get("title") or result.get("title_input") or "" | |
| 281 | + ).strip() | |
| 282 | + | |
| 283 | + for field in _ANALYSIS_RESULT_FIELDS: | |
| 284 | + if field in {"title", "category_path"}: | |
| 285 | + continue | |
| 286 | + normalized[field] = str(_get_analysis_field_value(result, field) or "").strip() | |
| 287 | + | |
| 288 | + if result.get("error"): | |
| 289 | + normalized["error"] = str(result.get("error")) | |
| 290 | + return normalized | |
| 291 | + | |
| 292 | + | |
| 293 | +def _has_meaningful_analysis_content(result: Dict[str, Any]) -> bool: | |
| 294 | + return any(_has_meaningful_value(result.get(field)) for field in _ANALYSIS_MEANINGFUL_FIELDS) | |
| 295 | + | |
| 296 | + | |
| 198 | 297 | def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: |
| 199 | 298 | if not row or row.get("error"): |
| 200 | 299 | return |
| 201 | 300 | |
| 202 | - anchor_text = str(row.get("anchor_text") or "").strip() | |
| 301 | + anchor_text = str(_get_analysis_field_value(row, "anchor_text") or "").strip() | |
| 203 | 302 | if anchor_text: |
| 204 | 303 | _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text) |
| 205 | 304 | |
| 206 | - for name in _ENRICHED_ATTRIBUTE_DIMENSIONS: | |
| 207 | - raw = row.get(name) | |
| 305 | + for source_name, output_name in _ANALYSIS_ATTRIBUTE_FIELD_MAP: | |
| 306 | + raw = _get_analysis_field_value(row, source_name) | |
| 208 | 307 | if not raw: |
| 209 | 308 | continue |
| 210 | - _append_enriched_attribute(result["enriched_attributes"], name=name, lang=lang, raw_value=raw) | |
| 211 | - if name == "enriched_tags": | |
| 309 | + _append_enriched_attribute( | |
| 310 | + result["enriched_attributes"], | |
| 311 | + name=output_name, | |
| 312 | + lang=lang, | |
| 313 | + raw_value=raw, | |
| 314 | + ) | |
| 315 | + if output_name == "enriched_tags": | |
| 212 | 316 | _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) |
| 213 | 317 | |
| 214 | 318 | |
| 215 | 319 | def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: |
| 216 | - item_id = str(item.get("id") or item.get("spu_id") or "").strip() | |
| 320 | + item_id = _get_product_id(item) | |
| 217 | 321 | return { |
| 218 | 322 | "id": item_id, |
| 219 | 323 | "title": str(item.get("title") or "").strip(), |
| ... | ... | @@ -369,7 +473,10 @@ def _get_cached_anchor_result( |
| 369 | 473 | raw = _anchor_redis.get(key) |
| 370 | 474 | if not raw: |
| 371 | 475 | return None |
| 372 | - return json.loads(raw) | |
| 476 | + result = _normalize_analysis_result(json.loads(raw), product=product, target_lang=target_lang) | |
| 477 | + if not _has_meaningful_analysis_content(result): | |
| 478 | + return None | |
| 479 | + return result | |
| 373 | 480 | except Exception as e: |
| 374 | 481 | logger.warning(f"Failed to get anchor cache: {e}") |
| 375 | 482 | return None |
| ... | ... | @@ -383,9 +490,12 @@ def _set_cached_anchor_result( |
| 383 | 490 | if not _anchor_redis: |
| 384 | 491 | return |
| 385 | 492 | try: |
| 493 | + normalized = _normalize_analysis_result(result, product=product, target_lang=target_lang) | |
| 494 | + if not _has_meaningful_analysis_content(normalized): | |
| 495 | + return | |
| 386 | 496 | key = _make_anchor_cache_key(product, target_lang) |
| 387 | 497 | ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600 |
| 388 | - _anchor_redis.setex(key, ttl, json.dumps(result, ensure_ascii=False)) | |
| 498 | + _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False)) | |
| 389 | 499 | except Exception as e: |
| 390 | 500 | logger.warning(f"Failed to set anchor cache: {e}") |
| 391 | 501 | |
| ... | ... | @@ -654,7 +764,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: |
| 654 | 764 | "seq_no": parts[0], |
| 655 | 765 | "title": parts[1], # 商品标题(按目标语言) |
| 656 | 766 | "category_path": parts[2] if len(parts) > 2 else "", # 品类路径 |
| 657 | - "enriched_tags": parts[3] if len(parts) > 3 else "", # 细分标签 | |
| 767 | + "tags": parts[3] if len(parts) > 3 else "", # 细分标签 | |
| 658 | 768 | "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群 |
| 659 | 769 | "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景 |
| 660 | 770 | "season": parts[6] if len(parts) > 6 else "", # 适用季节 |
| ... | ... | @@ -705,7 +815,7 @@ def process_batch( |
| 705 | 815 | batch_data: List[Dict[str, str]], |
| 706 | 816 | batch_num: int, |
| 707 | 817 | target_lang: str = "zh", |
| 708 | -) -> List[Dict[str, str]]: | |
| 818 | +) -> List[Dict[str, Any]]: | |
| 709 | 819 | """处理一个批次的数据""" |
| 710 | 820 | logger.info(f"\n{'#' * 80}") |
| 711 | 821 | logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)") |
| ... | ... | @@ -725,22 +835,11 @@ def process_batch( |
| 725 | 835 | target_lang, |
| 726 | 836 | ) |
| 727 | 837 | return [ |
| 728 | - { | |
| 729 | - "id": item["id"], | |
| 730 | - "lang": target_lang, | |
| 731 | - "title_input": item.get("title", ""), | |
| 732 | - "title": "", | |
| 733 | - "category_path": "", | |
| 734 | - "enriched_tags": "", | |
| 735 | - "target_audience": "", | |
| 736 | - "usage_scene": "", | |
| 737 | - "season": "", | |
| 738 | - "key_attributes": "", | |
| 739 | - "material": "", | |
| 740 | - "features": "", | |
| 741 | - "anchor_text": "", | |
| 742 | - "error": f"prompt_creation_failed: unsupported target_lang={target_lang}", | |
| 743 | - } | |
| 838 | + _make_empty_analysis_result( | |
| 839 | + item, | |
| 840 | + target_lang, | |
| 841 | + error=f"prompt_creation_failed: unsupported target_lang={target_lang}", | |
| 842 | + ) | |
| 744 | 843 | for item in batch_data |
| 745 | 844 | ] |
| 746 | 845 | |
| ... | ... | @@ -764,24 +863,18 @@ def process_batch( |
| 764 | 863 | results_with_ids = [] |
| 765 | 864 | for i, parsed_item in enumerate(parsed_results): |
| 766 | 865 | if i < len(batch_data): |
| 767 | - original_id = batch_data[i]["id"] | |
| 768 | - result = { | |
| 769 | - "id": original_id, | |
| 770 | - "lang": target_lang, | |
| 771 | - "title_input": batch_data[i]["title"], # 原始输入标题 | |
| 772 | - "title": parsed_item.get("title", ""), # 模型生成的标题 | |
| 773 | - "category_path": parsed_item.get("category_path", ""), # 品类路径 | |
| 774 | - "enriched_tags": parsed_item.get("enriched_tags", ""), # 细分标签 | |
| 775 | - "target_audience": parsed_item.get("target_audience", ""), # 适用人群 | |
| 776 | - "usage_scene": parsed_item.get("usage_scene", ""), # 使用场景 | |
| 777 | - "season": parsed_item.get("season", ""), # 适用季节 | |
| 778 | - "key_attributes": parsed_item.get("key_attributes", ""), # 关键属性 | |
| 779 | - "material": parsed_item.get("material", ""), # 材质说明 | |
| 780 | - "features": parsed_item.get("features", ""), # 功能特点 | |
| 781 | - "anchor_text": parsed_item.get("anchor_text", ""), # 锚文本 | |
| 782 | - } | |
| 866 | + source_product = batch_data[i] | |
| 867 | + result = _normalize_analysis_result( | |
| 868 | + parsed_item, | |
| 869 | + product=source_product, | |
| 870 | + target_lang=target_lang, | |
| 871 | + ) | |
| 783 | 872 | results_with_ids.append(result) |
| 784 | - logger.info(f"Mapped: seq={parsed_item['seq_no']} -> original_id={original_id}") | |
| 873 | + logger.info( | |
| 874 | + "Mapped: seq=%s -> original_id=%s", | |
| 875 | + parsed_item.get("seq_no"), | |
| 876 | + source_product.get("id"), | |
| 877 | + ) | |
| 785 | 878 | |
| 786 | 879 | # 保存批次 JSON 日志到独立文件 |
| 787 | 880 | batch_log = { |
| ... | ... | @@ -808,22 +901,7 @@ def process_batch( |
| 808 | 901 | logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True) |
| 809 | 902 | # 返回空结果,保持ID映射 |
| 810 | 903 | return [ |
| 811 | - { | |
| 812 | - "id": item["id"], | |
| 813 | - "lang": target_lang, | |
| 814 | - "title_input": item["title"], | |
| 815 | - "title": "", | |
| 816 | - "category_path": "", | |
| 817 | - "enriched_tags": "", | |
| 818 | - "target_audience": "", | |
| 819 | - "usage_scene": "", | |
| 820 | - "season": "", | |
| 821 | - "key_attributes": "", | |
| 822 | - "material": "", | |
| 823 | - "features": "", | |
| 824 | - "anchor_text": "", | |
| 825 | - "error": str(e), | |
| 826 | - } | |
| 904 | + _make_empty_analysis_result(item, target_lang, error=str(e)) | |
| 827 | 905 | for item in batch_data |
| 828 | 906 | ] |
| 829 | 907 | ... | ... |
tests/test_product_enrich_partial_mode.py
| ... | ... | @@ -322,6 +322,109 @@ def test_analyze_products_uses_product_level_cache_across_batch_requests(): |
| 322 | 322 | assert third[1]["anchor_text"] == "anchor:shirt" |
| 323 | 323 | |
| 324 | 324 | |
| 325 | +def test_analyze_products_reuses_cached_content_with_current_product_identity(): | |
| 326 | + cached_result = { | |
| 327 | + "id": "1165", | |
| 328 | + "lang": "zh", | |
| 329 | + "title_input": "old-title", | |
| 330 | + "title": "法式连衣裙", | |
| 331 | + "category_path": "女装>连衣裙", | |
| 332 | + "enriched_tags": "法式,收腰", | |
| 333 | + "target_audience": "年轻女性", | |
| 334 | + "usage_scene": "通勤,约会", | |
| 335 | + "season": "春季,夏季", | |
| 336 | + "key_attributes": "中长款", | |
| 337 | + "material": "聚酯纤维", | |
| 338 | + "features": "透气", | |
| 339 | + "anchor_text": "法式收腰连衣裙", | |
| 340 | + } | |
| 341 | + products = [{"id": "69960", "title": "dress"}] | |
| 342 | + | |
| 343 | + with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( | |
| 344 | + product_enrich, | |
| 345 | + "_get_cached_anchor_result", | |
| 346 | + wraps=lambda product, target_lang: product_enrich._normalize_analysis_result( | |
| 347 | + cached_result, | |
| 348 | + product=product, | |
| 349 | + target_lang=target_lang, | |
| 350 | + ), | |
| 351 | + ), mock.patch.object( | |
| 352 | + product_enrich, | |
| 353 | + "process_batch", | |
| 354 | + side_effect=AssertionError("process_batch should not be called on cache hit"), | |
| 355 | + ): | |
| 356 | + result = product_enrich.analyze_products( | |
| 357 | + products, | |
| 358 | + target_lang="zh", | |
| 359 | + tenant_id="170", | |
| 360 | + ) | |
| 361 | + | |
| 362 | + assert result == [ | |
| 363 | + { | |
| 364 | + "id": "69960", | |
| 365 | + "lang": "zh", | |
| 366 | + "title_input": "dress", | |
| 367 | + "title": "法式连衣裙", | |
| 368 | + "category_path": "女装>连衣裙", | |
| 369 | + "tags": "法式,收腰", | |
| 370 | + "target_audience": "年轻女性", | |
| 371 | + "usage_scene": "通勤,约会", | |
| 372 | + "season": "春季,夏季", | |
| 373 | + "key_attributes": "中长款", | |
| 374 | + "material": "聚酯纤维", | |
| 375 | + "features": "透气", | |
| 376 | + "anchor_text": "法式收腰连衣裙", | |
| 377 | + } | |
| 378 | + ] | |
| 379 | + | |
| 380 | + | |
| 381 | +def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output(): | |
| 382 | + def fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None): | |
| 383 | + return [ | |
| 384 | + { | |
| 385 | + "id": products[0]["id"], | |
| 386 | + "lang": target_lang, | |
| 387 | + "title_input": products[0]["title"], | |
| 388 | + "title": products[0]["title"], | |
| 389 | + "category_path": "玩具>滑行玩具", | |
| 390 | + "tags": f"{target_lang}-tag1,{target_lang}-tag2", | |
| 391 | + "target_audience": f"{target_lang}-audience", | |
| 392 | + "usage_scene": "", | |
| 393 | + "season": "", | |
| 394 | + "key_attributes": "", | |
| 395 | + "material": "", | |
| 396 | + "features": "", | |
| 397 | + "anchor_text": f"{target_lang}-anchor", | |
| 398 | + } | |
| 399 | + ] | |
| 400 | + | |
| 401 | + with mock.patch.object( | |
| 402 | + product_enrich, | |
| 403 | + "analyze_products", | |
| 404 | + side_effect=fake_analyze_products, | |
| 405 | + ): | |
| 406 | + result = product_enrich.build_index_content_fields( | |
| 407 | + items=[{"spu_id": "69960", "title": "dress"}], | |
| 408 | + tenant_id="170", | |
| 409 | + ) | |
| 410 | + | |
| 411 | + assert result == [ | |
| 412 | + { | |
| 413 | + "id": "69960", | |
| 414 | + "qanchors": {"zh": ["zh-anchor"], "en": ["en-anchor"]}, | |
| 415 | + "enriched_tags": {"zh": ["zh-tag1", "zh-tag2"], "en": ["en-tag1", "en-tag2"]}, | |
| 416 | + "enriched_attributes": [ | |
| 417 | + {"name": "enriched_tags", "value": {"zh": "zh-tag1"}}, | |
| 418 | + {"name": "enriched_tags", "value": {"zh": "zh-tag2"}}, | |
| 419 | + {"name": "target_audience", "value": {"zh": "zh-audience"}}, | |
| 420 | + {"name": "enriched_tags", "value": {"en": "en-tag1"}}, | |
| 421 | + {"name": "enriched_tags", "value": {"en": "en-tag2"}}, | |
| 422 | + {"name": "target_audience", "value": {"en": "en-audience"}}, | |
| 423 | + ], | |
| 424 | + } | |
| 425 | + ] | |
| 426 | + | |
| 427 | + | |
| 325 | 428 | def test_anchor_cache_key_depends_on_product_input_not_identifiers(): |
| 326 | 429 | product_a = { |
| 327 | 430 | "id": "1", | ... | ... |