From 76e1f08890671791eb5c646c9f1e1c47c9f1e9dd Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 18 Mar 2026 09:22:37 +0800 Subject: [PATCH] 1. 减少一列sell points。有时候大模型输出会将这最后两列混淆,因此干脆去掉一个 2. 优化缓存,缓存粒度为商品级,每次只对batch中未cache的重新计算;key使用每个商品输入的hash --- indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md | 1 - indexer/product_enrich.py | 55 ++++++++++++++++++++++++++++++------------------------- indexer/product_enrich_prompts.py | 40 +--------------------------------------- tests/ci/test_service_api_contracts.py | 1 - tests/test_process_products_batching.py | 2 -- tests/test_product_enrich_partial_mode.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 119 insertions(+), 72 deletions(-) diff --git a/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md b/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md index 9b4f5fb..d58159b 100644 --- a/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md +++ b/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md @@ -135,7 +135,6 @@ SUPPORTED_LANGS = set(LANG_LABELS.keys()) "key_attributes": "<逗号分隔的关键属性>", "material": "<逗号分隔的材质说明>", "features": "<逗号分隔的功能特点>", - "selling_points": "<一句话卖点>", "anchor_text": "<逗号分隔的锚文本短语>", # 若发生错误,还会附带: # "error": "<异常信息>" diff --git a/indexer/product_enrich.py b/indexer/product_enrich.py index 1e215cb..445cfeb 100644 --- a/indexer/product_enrich.py +++ b/indexer/product_enrich.py @@ -425,8 +425,7 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: "key_attributes": parts[7] if len(parts) > 7 else "", # 关键属性 "material": parts[8] if len(parts) > 8 else "", # 材质说明 "features": parts[9] if len(parts) > 9 else "", # 功能特点 - "selling_points": parts[10] if len(parts) > 10 else "", # 商品卖点 - "anchor_text": parts[11] if len(parts) > 11 else "", # 锚文本 + "anchor_text": parts[10] if len(parts) > 10 else "", # 锚文本 } data.append(row) @@ -503,7 +502,6 @@ def process_batch( "key_attributes": "", "material": "", "features": "", - "selling_points": "", "anchor_text": "", "error": f"prompt_creation_failed: unsupported target_lang={target_lang}", } @@ -544,7 +542,6 @@ def process_batch( "key_attributes": parsed_item.get("key_attributes", ""), # 关键属性 "material": parsed_item.get("material", ""), # 材质说明 "features": parsed_item.get("features", ""), # 功能特点 - "selling_points": parsed_item.get("selling_points", ""), # 商品卖点 "anchor_text": parsed_item.get("anchor_text", ""), # 锚文本 } results_with_ids.append(result) @@ -586,7 +583,6 @@ def process_batch( "key_attributes": "", "material": "", "features": "", - "selling_points": "", "anchor_text": "", "error": str(e), } @@ -614,39 +610,48 @@ def analyze_products( if not products: return [] - # 简单路径:索引阶段通常 batch_size=1,这里优先做单条缓存命中 - if len(products) == 1: - p = products[0] - title = str(p.get("title") or "").strip() - if title: - cached = _get_cached_anchor_result(title, target_lang, tenant_id=tenant_id) - if cached: - logger.info( - f"[analyze_products] Cache hit for title='{title[:50]}...', " - f"lang={target_lang}, tenant_id={tenant_id or 'global'}" - ) - return [cached] + results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products) + uncached_items: List[Tuple[int, Dict[str, str]]] = [] + + for idx, product in enumerate(products): + title = str(product.get("title") or "").strip() + if not title: + uncached_items.append((idx, product)) + continue + + cached = _get_cached_anchor_result(title, target_lang, tenant_id=tenant_id) + if cached: + logger.info( + f"[analyze_products] Cache hit for title='{title[:50]}...', " + f"lang={target_lang}, tenant_id={tenant_id or 'global'}" + ) + results_by_index[idx] = cached + continue + + uncached_items.append((idx, product)) + + if not uncached_items: + return [item for item in results_by_index if item is not None] # call_llm 一次处理上限固定为 BATCH_SIZE(默认 20): # - 尽可能攒批处理; # - 即便调用方传入更大的 batch_size,也会自动按上限拆批。 req_bs = BATCH_SIZE if batch_size is None else int(batch_size) bs = max(1, min(req_bs, BATCH_SIZE)) - all_results: List[Dict[str, Any]] = [] - total_batches = (len(products) + bs - 1) // bs + total_batches = (len(uncached_items) + bs - 1) // bs - for i in range(0, len(products), bs): + for i in range(0, len(uncached_items), bs): batch_num = i // bs + 1 - batch = products[i : i + bs] + batch_slice = uncached_items[i : i + bs] + batch = [item for _, item in batch_slice] logger.info( f"[analyze_products] Processing batch {batch_num}/{total_batches}, " f"size={len(batch)}, target_lang={target_lang}" ) batch_results = process_batch(batch, batch_num=batch_num, target_lang=target_lang) - all_results.extend(batch_results) - # 写入缓存 - for item in batch_results: + for (original_idx, _), item in zip(batch_slice, batch_results): + results_by_index[original_idx] = item title_input = str(item.get("title_input") or "").strip() if not title_input: continue @@ -659,4 +664,4 @@ def analyze_products( # 已在内部记录 warning pass - return all_results + return [item for item in results_by_index if item is not None] diff --git a/indexer/product_enrich_prompts.py b/indexer/product_enrich_prompts.py index 7950573..44ae3c4 100644 --- a/indexer/product_enrich_prompts.py +++ b/indexer/product_enrich_prompts.py @@ -19,8 +19,7 @@ SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product title and fill these 7. Key attributes 8. Material description 9. Functional features -10. Selling point: one concise core selling phrase -11. Anchor text: a set of search-oriented words or phrases covering category, attributes, scenes, and demand +10. Anchor text: a search-focused set of keywords, selling points, and phrases covering categories, attributes, usage scenarios, and user intent Rules: - Keep the input order and row count exactly the same. @@ -46,7 +45,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Key attributes", "Material", "Features", - "Selling point", "Anchor text" ], "zh": [ @@ -60,7 +58,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "关键属性", "材质说明", "功能特点", - "商品卖点", "锚文本" ], "zh_tw": [ @@ -74,7 +71,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "關鍵屬性", "材質說明", "功能特點", - "商品賣點", "錨文本" ], "ru": [ @@ -88,7 +84,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Ключевые атрибуты", "Материал", "Особенности", - "Преимущество товара", "Анкорный текст" ], "ja": [ @@ -102,7 +97,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "主要属性", "素材", "機能特徴", - "商品の訴求点", "アンカーテキスト" ], "ko": [ @@ -116,7 +110,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "핵심 속성", "소재", "기능 특징", - "상품 포인트", "앵커 텍스트" ], "es": [ @@ -130,7 +123,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Atributos clave", "Material", "Caracteristicas", - "Punto de venta", "Texto ancla" ], "fr": [ @@ -144,7 +136,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Attributs cles", "Matiere", "Caracteristiques", - "Argument de vente", "Texte d'ancrage" ], "pt": [ @@ -158,7 +149,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Atributos principais", "Material", "Caracteristicas", - "Ponto de venda", "Texto ancora" ], "de": [ @@ -172,7 +162,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Wichtige Attribute", "Material", "Funktionen", - "Verkaufsargument", "Ankertext" ], "it": [ @@ -186,7 +175,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Attributi chiave", "Materiale", "Caratteristiche", - "Punto di forza", "Testo ancora" ], "th": [ @@ -200,7 +188,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "คุณสมบัติสำคัญ", "วัสดุ", "คุณสมบัติการใช้งาน", - "จุดขายสินค้า", "แองเคอร์เท็กซ์" ], "vi": [ @@ -214,7 +201,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Thuoc tinh chinh", "Chat lieu", "Tinh nang", - "Diem ban hang", "Van ban neo" ], "id": [ @@ -228,7 +214,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Atribut utama", "Bahan", "Fitur", - "Nilai jual", "Teks jangkar" ], "ms": [ @@ -242,7 +227,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Atribut utama", "Bahan", "Ciri-ciri", - "Nilai jual", "Teks sauh" ], "ar": [ @@ -256,7 +240,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "السمات الرئيسية", "المادة", "الميزات", - "نقطة البيع", "نص الربط" ], "hi": [ @@ -270,7 +253,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "मुख्य गुण", "सामग्री", "विशेषताएं", - "बिक्री बिंदु", "एंकर टेक्स्ट" ], "he": [ @@ -284,7 +266,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "מאפיינים מרכזיים", "חומר", "תכונות", - "נקודת מכירה", "טקסט עוגן" ], "my": [ @@ -298,7 +279,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "အဓိကဂုဏ်သတ္တိများ", "ပစ္စည်း", "လုပ်ဆောင်ချက်များ", - "အရောင်းထူးခြားချက်", "အန်ကာစာသား" ], "ta": [ @@ -312,7 +292,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "முக்கிய பண்புகள்", "பொருள்", "அம்சங்கள்", - "விற்பனை அம்சம்", "ஆங்கர் உரை" ], "ur": [ @@ -326,7 +305,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "کلیدی خصوصیات", "مواد", "فیچرز", - "فروختی نقطہ", "اینکر ٹیکسٹ" ], "bn": [ @@ -340,7 +318,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "মূল বৈশিষ্ট্য", "উপাদান", "ফিচার", - "বিক্রয় পয়েন্ট", "অ্যাঙ্কর টেক্সট" ], "pl": [ @@ -354,7 +331,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Kluczowe atrybuty", "Material", "Cechy", - "Atut sprzedazowy", "Tekst kotwicy" ], "nl": [ @@ -368,7 +344,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Belangrijke kenmerken", "Materiaal", "Functies", - "Verkooppunt", "Ankertekst" ], "ro": [ @@ -382,7 +357,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Atribute cheie", "Material", "Caracteristici", - "Punct de vanzare", "Text ancora" ], "tr": [ @@ -396,7 +370,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Temel ozellikler", "Malzeme", "Ozellikler", - "Satis noktasi", "Capa metni" ], "km": [ @@ -410,7 +383,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "លក្ខណៈសម្បត្តិសំខាន់", "សម្ភារៈ", "មុខងារ", - "ចំណុចលក់", "អត្ថបទអង់ក័រ" ], "lo": [ @@ -424,7 +396,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "ຄຸນລັກສະນະສຳຄັນ", "ວັດສະດຸ", "ຄຸນສົມບັດ", - "ຈຸດຂາຍ", "ຂໍ້ຄວາມອັງເຄີ" ], "yue": [ @@ -438,7 +409,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "關鍵屬性", "材質說明", "功能特點", - "商品賣點", "錨文本" ], "cs": [ @@ -452,7 +422,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Klicove atributy", "Material", "Vlastnosti", - "Prodejni argument", "Kotvici text" ], "el": [ @@ -466,7 +435,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Βασικά χαρακτηριστικά", "Υλικό", "Λειτουργίες", - "Σημείο πώλησης", "Κείμενο άγκυρας" ], "sv": [ @@ -480,7 +448,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Viktiga attribut", "Material", "Funktioner", - "Saljpunkt", "Ankartext" ], "hu": [ @@ -494,7 +461,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Fo jellemzok", "Anyag", "Funkciok", - "Ertekesitesi elony", "Horgonyszoveg" ], "da": [ @@ -508,7 +474,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Nogleattributter", "Materiale", "Funktioner", - "Salgsargument", "Ankertekst" ], "fi": [ @@ -522,7 +487,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Keskeiset ominaisuudet", "Materiaali", "Ominaisuudet", - "Myyntivaltti", "Ankkuriteksti" ], "uk": [ @@ -536,7 +500,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Ключові атрибути", "Матеріал", "Особливості", - "Продаюча перевага", "Анкорний текст" ], "bg": [ @@ -550,7 +513,6 @@ LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { "Ключови атрибути", "Материал", "Характеристики", - "Търговско предимство", "Анкор текст" ] } \ No newline at end of file diff --git a/tests/ci/test_service_api_contracts.py b/tests/ci/test_service_api_contracts.py index 98be804..144fe92 100644 --- a/tests/ci/test_service_api_contracts.py +++ b/tests/ci/test_service_api_contracts.py @@ -366,7 +366,6 @@ def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch "key_attributes": "", "material": "", "features": "", - "selling_points": "", "anchor_text": f"{target_lang}-anchor-{p['id']}", } for p in products diff --git a/tests/test_process_products_batching.py b/tests/test_process_products_batching.py index a02f8da..d8d7ef4 100644 --- a/tests/test_process_products_batching.py +++ b/tests/test_process_products_batching.py @@ -29,7 +29,6 @@ def test_analyze_products_caps_batch_size_to_20(monkeypatch): "key_attributes": "", "material": "", "features": "", - "selling_points": "", "anchor_text": "", } for item in batch_data @@ -69,7 +68,6 @@ def test_analyze_products_uses_min_batch_size_1(monkeypatch): "key_attributes": "", "material": "", "features": "", - "selling_points": "", "anchor_text": "", } for item in batch_data diff --git a/tests/test_product_enrich_partial_mode.py b/tests/test_product_enrich_partial_mode.py index 55f04c6..f7dbb3b 100644 --- a/tests/test_product_enrich_partial_mode.py +++ b/tests/test_product_enrich_partial_mode.py @@ -195,9 +195,9 @@ def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests(): def test_process_batch_reads_result_and_validates_expected_fields(): - merged_markdown = """| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 | -|----|----|----|----|----|----|----|----|----|----|----|----| -| 1 | 法式连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | 通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | 修身显瘦 | 法式收腰连衣裙 | + merged_markdown = """| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 锚文本 | +|----|----|----|----|----|----|----|----|----|----|----| +| 1 | 法式连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | 通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | 法式收腰连衣裙 | """ with mock.patch.object( @@ -225,5 +225,89 @@ def test_process_batch_reads_result_and_validates_expected_fields(): assert row["key_attributes"] == "中长款" assert row["material"] == "聚酯纤维" assert row["features"] == "透气" - assert row["selling_points"] == "修身显瘦" assert row["anchor_text"] == "法式收腰连衣裙" + + +def test_analyze_products_uses_product_level_cache_across_batch_requests(): + cache_store = {} + process_calls = [] + + def fake_get_cached_anchor_result(title, target_lang, tenant_id=None): + return cache_store.get((tenant_id, target_lang, title)) + + def fake_set_cached_anchor_result(title, target_lang, result, tenant_id=None): + cache_store[(tenant_id, target_lang, title)] = result + + def fake_process_batch(batch_data, batch_num, target_lang="zh"): + process_calls.append( + { + "batch_num": batch_num, + "target_lang": target_lang, + "titles": [item["title"] for item in batch_data], + } + ) + return [ + { + "id": item["id"], + "lang": target_lang, + "title_input": item["title"], + "title": f"normalized:{item['title']}", + "category_path": "cat", + "tags": "tags", + "target_audience": "audience", + "usage_scene": "scene", + "season": "season", + "key_attributes": "attrs", + "material": "material", + "features": "features", + "anchor_text": f"anchor:{item['title']}", + } + for item in batch_data + ] + + products = [ + {"id": "1", "title": "dress"}, + {"id": "2", "title": "shirt"}, + ] + + with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( + product_enrich, + "_get_cached_anchor_result", + side_effect=fake_get_cached_anchor_result, + ), mock.patch.object( + product_enrich, + "_set_cached_anchor_result", + side_effect=fake_set_cached_anchor_result, + ), mock.patch.object( + product_enrich, + "process_batch", + side_effect=fake_process_batch, + ): + first = product_enrich.analyze_products( + [products[0]], + target_lang="zh", + tenant_id="170", + ) + second = product_enrich.analyze_products( + products, + target_lang="zh", + tenant_id="170", + ) + third = product_enrich.analyze_products( + products, + target_lang="zh", + tenant_id="170", + ) + + assert [row["title_input"] for row in first] == ["dress"] + assert [row["title_input"] for row in second] == ["dress", "shirt"] + assert [row["title_input"] for row in third] == ["dress", "shirt"] + + assert process_calls == [ + {"batch_num": 1, "target_lang": "zh", "titles": ["dress"]}, + {"batch_num": 1, "target_lang": "zh", "titles": ["shirt"]}, + ] + assert second[0]["anchor_text"] == "anchor:dress" + assert second[1]["anchor_text"] == "anchor:shirt" + assert third[0]["anchor_text"] == "anchor:dress" + assert third[1]["anchor_text"] == "anchor:shirt" -- libgit2 0.21.2