diff --git a/config/config_loader.py b/config/config_loader.py index e64c198..5a88b99 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -301,7 +301,12 @@ class ConfigLoader: # Parse tenant config tenant_config_data = config_data.get("tenant_config", {}) - + + # Parse extensible services/provider registry + services_data = config_data.get("services", {}) or {} + if not isinstance(services_data, dict): + raise ConfigurationError("services must be a dictionary if provided") + return SearchConfig( field_boosts=field_boosts, indexes=indexes, diff --git a/indexer/product_enrich.py b/indexer/product_enrich.py index b046a56..1e215cb 100644 --- a/indexer/product_enrich.py +++ b/indexer/product_enrich.py @@ -11,6 +11,7 @@ import json import logging import time import hashlib +from collections import OrderedDict from datetime import datetime from typing import List, Dict, Tuple, Any, Optional @@ -20,6 +21,12 @@ from pathlib import Path from config.env_config import REDIS_CONFIG from config.tenant_config_loader import SOURCE_LANG_CODE_MAP +from indexer.product_enrich_prompts import ( + SYSTEM_MESSAGE, + USER_INSTRUCTION_TEMPLATE, + LANGUAGE_MARKDOWN_TABLE_HEADERS, + SHARED_ANALYSIS_INSTRUCTION, +) # 配置 BATCH_SIZE = 20 @@ -32,6 +39,7 @@ API_KEY = os.environ.get("DASHSCOPE_API_KEY") MAX_RETRIES = 3 RETRY_DELAY = 5 # 秒 REQUEST_TIMEOUT = 180 # 秒 +LOGGED_SHARED_CONTEXT_CACHE_SIZE = 256 # 日志路径 OUTPUT_DIR = Path("output_logs") @@ -42,6 +50,7 @@ LOG_DIR.mkdir(parents=True, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") log_file = LOG_DIR / f"product_enrich_{timestamp}.log" verbose_log_file = LOG_DIR / "product_enrich_verbose.log" +_logged_shared_context_keys: "OrderedDict[str, None]" = OrderedDict() # 主日志 logger:执行流程、批次信息等 logger = logging.getLogger("product_enrich") @@ -96,16 +105,11 @@ except Exception as e: logger.warning(f"Failed to initialize Redis for anchors cache: {e}") _anchor_redis = None -# 中文版本提示词(请勿删除): -# "你是一名电商平台的商品标注员,你的工作是对输入的每个商品进行理解、分析和标注," -# "并按要求格式返回 Markdown 表格。所有输出内容必须为中文。" - -SYSTEM_MESSAGES = ( - "You are a product annotator for an e-commerce platform. " - "For each input product, you must understand, analyze and label it, " - "and return a Markdown table strictly following the requested format. " - "All output must be in English." -) +_missing_prompt_langs = sorted(set(SOURCE_LANG_CODE_MAP) - set(LANGUAGE_MARKDOWN_TABLE_HEADERS)) +if _missing_prompt_langs: + raise RuntimeError( + f"Missing product_enrich prompt config for languages: {_missing_prompt_langs}" + ) def _make_anchor_cache_key( @@ -153,108 +157,109 @@ def _set_cached_anchor_result( logger.warning(f"Failed to set anchor cache: {e}") -def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str: - """根据目标语言创建 LLM 提示词和表头说明。 +def _build_assistant_prefix(headers: List[str]) -> str: + header_line = "| " + " | ".join(headers) + " |" + separator_line = "|" + "----|" * len(headers) + return f"{header_line}\n{separator_line}\n" - 约定: - - 提示词始终使用英文; - - 当 target_lang == "en" 时,直接要求用英文分析并输出英文表头; - - 当 target_lang 为其他语言时,视作“多轮对话”的后续轮次: - * 默认上一轮已经用英文完成了分析; - * 当前轮只需要在保持结构和含义不变的前提下,将整张表格翻译为目标语言, - 包含表头与所有单元格内容。 - """ - lang_name = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang) - -# 中文版本提示词(请勿删除) -# prompt = """请对输入的每条商品标题,分析并提取以下信息: - -# 1. 商品标题:将输入商品名称翻译为自然、完整的中文商品标题 -# 2. 品类路径:从大类到细分品类,用">"分隔(例如:服装>女装>裤子>工装裤) -# 3. 细分标签:商品的风格、特点、功能等(例如:碎花,收腰,法式) -# 4. 适用人群:性别/年龄段等(例如:年轻女性) -# 5. 使用场景 -# 6. 适用季节 -# 7. 关键属性 -# 8. 材质说明 -# 9. 功能特点 -# 10. 商品卖点:分析和提取一句话核心卖点,用于推荐理由 -# 11. 锚文本:生成一组能够代表该商品、并可能被用户用于搜索的词语或短语。这些词语应覆盖用户需求的各个维度,如品类、细分标签、功能特性、需求场景等等。 - -# 输入商品列表: - -# """ -# prompt_tail = """ -# 请严格按照以下markdown表格格式返回,每列内部的多值内容都用逗号分隔,不要添加任何其他说明: - -# | 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 | -# |----|----|----|----|----|----|----|----|----|----|----|----| -# """ - - prompt = """Please analyze each input product title and extract the following information: - -1. Product title: a natural English product name derived from the input title -2. Category path: from broad to fine-grained category, separated by ">" (e.g. Clothing>Women>Dresses>Work Dress) -3. Fine-grained tags: style / features / attributes (e.g. floral, waist-cinching, French style) -4. Target audience: gender / age group, etc. (e.g. young women) -5. Usage scene -6. Applicable season -7. Key attributes -8. Material description -9. Functional features -10. Selling point: one concise key selling sentence for recommendation -11. Anchor text: a set of words or phrases that could be used by users as search queries for this product, covering category, fine-grained tags, functional attributes, usage scenes, etc. - -Input product list: - -""" +def _build_shared_context(products: List[Dict[str, str]]) -> str: + shared_context = SHARED_ANALYSIS_INSTRUCTION for idx, product in enumerate(products, 1): - prompt += f'{idx}. {product["title"]}\n' + shared_context += f'{idx}. {product["title"]}\n' + return shared_context - if target_lang == "en": - # 英文首轮:直接要求英文表头 + 英文内容 - prompt += """ -Please strictly return a Markdown table in the following format. For any column that can contain multiple values, separate values with commas. Do not add any other explanations: -| No. | Product title | Category path | Fine-grained tags | Target audience | Usage scene | Season | Key attributes | Material | Features | Selling point | Anchor text | -|----|----|----|----|----|----|----|----|----|----|----|----| -""" - else: - # 非英文语言:视作“下一轮对话”,只做翻译,要求表头与内容全部用目标语言 - prompt += f""" -Now we will output the same table in {lang_name}. - -IMPORTANT: -- Assume you have already generated the full table in English in a previous round. -- In this round, you must output exactly the same table structure and content, - but fully translated into {lang_name}, including ALL column headers and ALL cell values. -- Do NOT change the meaning, fields, or the number/order of rows and columns. -- Keep valid Markdown table syntax. - -Please return ONLY the Markdown table in {lang_name}, without any extra explanations. -""" +def _hash_text(text: str) -> str: + return hashlib.md5((text or "").encode("utf-8")).hexdigest()[:12] + + +def _mark_shared_context_logged_once(shared_context_key: str) -> bool: + if shared_context_key in _logged_shared_context_keys: + _logged_shared_context_keys.move_to_end(shared_context_key) + return False + + _logged_shared_context_keys[shared_context_key] = None + if len(_logged_shared_context_keys) > LOGGED_SHARED_CONTEXT_CACHE_SIZE: + _logged_shared_context_keys.popitem(last=False) + return True - return prompt +def reset_logged_shared_context_keys() -> None: + """测试辅助:清理已记录的共享 prompt key。""" + _logged_shared_context_keys.clear() -def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: - """调用大模型API(带重试机制),按目标语言选择系统提示词。""" + +def create_prompt( + products: List[Dict[str, str]], + target_lang: str = "zh", +) -> Tuple[str, str, str]: + """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。""" + markdown_table_headers = LANGUAGE_MARKDOWN_TABLE_HEADERS.get(target_lang) + if not markdown_table_headers: + logger.warning( + "Unsupported target_lang for markdown table headers: %s", + target_lang, + ) + return None, None, None + shared_context = _build_shared_context(products) + language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang) + user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip() + assistant_prefix = _build_assistant_prefix(markdown_table_headers) + return shared_context, user_prompt, assistant_prefix + + +def _merge_partial_response(assistant_prefix: str, generated_content: str) -> str: + """将 Partial Mode 的 assistant 前缀与补全文本拼成完整 markdown。""" + generated = (generated_content or "").lstrip() + prefix_lines = [line.strip() for line in assistant_prefix.strip().splitlines()] + generated_lines = generated.splitlines() + + if generated_lines: + first_line = generated_lines[0].strip() + if prefix_lines and first_line == prefix_lines[0]: + generated_lines = generated_lines[1:] + if generated_lines and len(prefix_lines) > 1 and generated_lines[0].strip() == prefix_lines[1]: + generated_lines = generated_lines[1:] + elif len(prefix_lines) > 1 and first_line == prefix_lines[1]: + generated_lines = generated_lines[1:] + + suffix = "\n".join(generated_lines).lstrip("\n") + if suffix: + return f"{assistant_prefix}{suffix}" + return assistant_prefix + + +def call_llm( + shared_context: str, + user_prompt: str, + assistant_prefix: str, + target_lang: str = "zh", +) -> Tuple[str, str]: + """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。""" headers = { "Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json", } + shared_context_key = _hash_text(shared_context) + localized_tail_key = _hash_text(f"{target_lang}\n{user_prompt}\n{assistant_prefix}") + combined_user_prompt = f"{shared_context.rstrip()}\n\n{user_prompt.strip()}" payload = { "model": MODEL_NAME, "messages": [ { "role": "system", - "content": SYSTEM_MESSAGES, + "content": SYSTEM_MESSAGE, }, { "role": "user", - "content": prompt, + "content": combined_user_prompt, + }, + { + "role": "assistant", + "content": assistant_prefix, + "partial": True, }, ], "temperature": 0.3, @@ -266,16 +271,41 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: "payload": payload, } - # 主日志 + 详尽日志:LLM Request - logger.info(f"\n{'=' * 80}") - logger.info(f"LLM Request (Model: {MODEL_NAME}):") - logger.info(json.dumps(request_data, ensure_ascii=False, indent=2)) - logger.info(f"\nPrompt:\n{prompt}") + if _mark_shared_context_logged_once(shared_context_key): + logger.info(f"\n{'=' * 80}") + logger.info( + "LLM Shared Context [model=%s, shared_key=%s, chars=%s] (logged once per process key)", + MODEL_NAME, + shared_context_key, + len(shared_context), + ) + logger.info("\nSystem Message:\n%s", SYSTEM_MESSAGE) + logger.info("\nShared Context:\n%s", shared_context) verbose_logger.info(f"\n{'=' * 80}") - verbose_logger.info(f"LLM Request (Model: {MODEL_NAME}):") + verbose_logger.info( + "LLM Request [model=%s, lang=%s, shared_key=%s, tail_key=%s]:", + MODEL_NAME, + target_lang, + shared_context_key, + localized_tail_key, + ) verbose_logger.info(json.dumps(request_data, ensure_ascii=False, indent=2)) - verbose_logger.info(f"\nPrompt:\n{prompt}") + verbose_logger.info(f"\nCombined User Prompt:\n{combined_user_prompt}") + verbose_logger.info(f"\nShared Context:\n{shared_context}") + verbose_logger.info(f"\nLocalized Requirement:\n{user_prompt}") + verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}") + + logger.info( + "\nLLM Request Variant [lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]", + target_lang, + shared_context_key, + localized_tail_key, + len(user_prompt), + len(assistant_prefix), + ) + logger.info("\nLocalized Requirement:\n%s", user_prompt) + logger.info("\nAssistant Prefix:\n%s", assistant_prefix) # 创建session,禁用代理 session = requests.Session() @@ -295,19 +325,37 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: response.raise_for_status() result = response.json() + usage = result.get("usage") or {} + + verbose_logger.info( + "\nLLM Response [model=%s, lang=%s, shared_key=%s, tail_key=%s]:", + MODEL_NAME, + target_lang, + shared_context_key, + localized_tail_key, + ) + verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2)) - # 主日志 + 详尽日志:LLM Response - logger.info(f"\nLLM Response:") - logger.info(json.dumps(result, ensure_ascii=False, indent=2)) + generated_content = result["choices"][0]["message"]["content"] + full_markdown = _merge_partial_response(assistant_prefix, generated_content) - verbose_logger.info(f"\nLLM Response:") - verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2)) + logger.info( + "\nLLM Response Summary [lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]", + target_lang, + shared_context_key, + localized_tail_key, + len(generated_content or ""), + usage.get("completion_tokens"), + usage.get("prompt_tokens"), + usage.get("total_tokens"), + ) + logger.info("\nGenerated Content:\n%s", generated_content) + logger.info("\nMerged Markdown:\n%s", full_markdown) - content = result["choices"][0]["message"]["content"] - logger.info(f"\nExtracted Content:\n{content}") - verbose_logger.info(f"\nExtracted Content:\n{content}") + verbose_logger.info(f"\nGenerated Content:\n{generated_content}") + verbose_logger.info(f"\nMerged Markdown:\n{full_markdown}") - return content, json.dumps(result, ensure_ascii=False) + return full_markdown, json.dumps(result, ensure_ascii=False) except requests.exceptions.ProxyError as e: logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Proxy error - {str(e)}") @@ -385,6 +433,39 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: return data +def _log_parsed_result_quality( + batch_data: List[Dict[str, str]], + parsed_results: List[Dict[str, str]], + target_lang: str, + batch_num: int, +) -> None: + expected = len(batch_data) + actual = len(parsed_results) + if actual != expected: + logger.warning( + "Parsed row count mismatch for batch=%s lang=%s: expected=%s actual=%s", + batch_num, + target_lang, + expected, + actual, + ) + + missing_anchor = sum(1 for item in parsed_results if not str(item.get("anchor_text") or "").strip()) + missing_category = sum(1 for item in parsed_results if not str(item.get("category_path") or "").strip()) + missing_title = sum(1 for item in parsed_results if not str(item.get("title") or "").strip()) + + logger.info( + "Parsed Quality Summary [batch=%s, lang=%s]: rows=%s/%s, missing_title=%s, missing_category=%s, missing_anchor=%s", + batch_num, + target_lang, + actual, + expected, + missing_title, + missing_category, + missing_anchor, + ) + + def process_batch( batch_data: List[Dict[str, str]], batch_num: int, @@ -395,14 +476,52 @@ def process_batch( logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)") # 创建提示词 - prompt = create_prompt(batch_data, target_lang=target_lang) + shared_context, user_prompt, assistant_prefix = create_prompt( + batch_data, + target_lang=target_lang, + ) + + # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM + if shared_context is None or user_prompt is None or assistant_prefix is None: + logger.error( + "Failed to create prompt for batch %s, target_lang=%s; " + "marking entire batch as failed without calling LLM", + batch_num, + target_lang, + ) + return [ + { + "id": item["id"], + "lang": target_lang, + "title_input": item.get("title", ""), + "title": "", + "category_path": "", + "tags": "", + "target_audience": "", + "usage_scene": "", + "season": "", + "key_attributes": "", + "material": "", + "features": "", + "selling_points": "", + "anchor_text": "", + "error": f"prompt_creation_failed: unsupported target_lang={target_lang}", + } + for item in batch_data + ] # 调用LLM try: - raw_response, full_response_json = call_llm(prompt, target_lang=target_lang) + raw_response, full_response_json = call_llm( + shared_context, + user_prompt, + assistant_prefix, + target_lang=target_lang, + ) # 解析结果 parsed_results = parse_markdown_table(raw_response) + _log_parsed_result_quality(batch_data, parsed_results, target_lang, batch_num) logger.info(f"\nParsed Results ({len(parsed_results)} items):") logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2)) @@ -541,4 +660,3 @@ def analyze_products( pass return all_results - diff --git a/indexer/product_enrich_prompts.py b/indexer/product_enrich_prompts.py new file mode 100644 index 0000000..7950573 --- /dev/null +++ b/indexer/product_enrich_prompts.py @@ -0,0 +1,556 @@ +#!/usr/bin/env python3 + +from typing import Any, Dict + +SYSTEM_MESSAGE = ( + "You are an e-commerce product annotator. " + "Continue the provided assistant Markdown table prefix. " + "Do not repeat or modify the prefix, and do not add explanations outside the table." +) + +SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product title and fill these columns: + +1. Product title: a natural localized product name derived from the input title +2. Category path: broad to fine-grained category, separated by ">" +3. Fine-grained tags: style, features, functions, or notable attributes +4. Target audience: gender, age group, or suitable users +5. Usage scene +6. Applicable season +7. Key attributes +8. Material description +9. Functional features +10. Selling point: one concise core selling phrase +11. Anchor text: a set of search-oriented words or phrases covering category, attributes, scenes, and demand + +Rules: +- Keep the input order and row count exactly the same. +- Infer from the title only; if uncertain, prefer concise and broadly correct ecommerce wording. +- Keep category paths concise and use ">" as the separator. +- For columns with multiple values, the localized output requirement will define the delimiter. + +Input product list: +""" + +USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation. +Language: {language}""" + +LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { + "en": [ + "No.", + "Product title", + "Category path", + "Fine-grained tags", + "Target audience", + "Usage scene", + "Season", + "Key attributes", + "Material", + "Features", + "Selling point", + "Anchor text" + ], + "zh": [ + "序号", + "商品标题", + "品类路径", + "细分标签", + "适用人群", + "使用场景", + "适用季节", + "关键属性", + "材质说明", + "功能特点", + "商品卖点", + "锚文本" + ], + "zh_tw": [ + "序號", + "商品標題", + "品類路徑", + "細分標籤", + "適用人群", + "使用場景", + "適用季節", + "關鍵屬性", + "材質說明", + "功能特點", + "商品賣點", + "錨文本" + ], + "ru": [ + "№", + "Название товара", + "Путь категории", + "Детализированные теги", + "Целевая аудитория", + "Сценарий использования", + "Сезон", + "Ключевые атрибуты", + "Материал", + "Особенности", + "Преимущество товара", + "Анкорный текст" + ], + "ja": [ + "番号", + "商品タイトル", + "カテゴリパス", + "詳細タグ", + "対象ユーザー", + "利用シーン", + "季節", + "主要属性", + "素材", + "機能特徴", + "商品の訴求点", + "アンカーテキスト" + ], + "ko": [ + "번호", + "상품 제목", + "카테고리 경로", + "세부 태그", + "대상 고객", + "사용 장면", + "계절", + "핵심 속성", + "소재", + "기능 특징", + "상품 포인트", + "앵커 텍스트" + ], + "es": [ + "N.º", + "Titulo del producto", + "Ruta de categoria", + "Etiquetas detalladas", + "Publico objetivo", + "Escenario de uso", + "Temporada", + "Atributos clave", + "Material", + "Caracteristicas", + "Punto de venta", + "Texto ancla" + ], + "fr": [ + "N°", + "Titre du produit", + "Chemin de categorie", + "Etiquettes detaillees", + "Public cible", + "Scenario d'utilisation", + "Saison", + "Attributs cles", + "Matiere", + "Caracteristiques", + "Argument de vente", + "Texte d'ancrage" + ], + "pt": [ + "Nº", + "Titulo do produto", + "Caminho da categoria", + "Tags detalhadas", + "Publico-alvo", + "Cenario de uso", + "Estacao", + "Atributos principais", + "Material", + "Caracteristicas", + "Ponto de venda", + "Texto ancora" + ], + "de": [ + "Nr.", + "Produkttitel", + "Kategoriepfad", + "Detaillierte Tags", + "Zielgruppe", + "Nutzungsszenario", + "Saison", + "Wichtige Attribute", + "Material", + "Funktionen", + "Verkaufsargument", + "Ankertext" + ], + "it": [ + "N.", + "Titolo del prodotto", + "Percorso categoria", + "Tag dettagliati", + "Pubblico target", + "Scenario d'uso", + "Stagione", + "Attributi chiave", + "Materiale", + "Caratteristiche", + "Punto di forza", + "Testo ancora" + ], + "th": [ + "ลำดับ", + "ชื่อสินค้า", + "เส้นทางหมวดหมู่", + "แท็กย่อย", + "กลุ่มเป้าหมาย", + "สถานการณ์การใช้งาน", + "ฤดูกาล", + "คุณสมบัติสำคัญ", + "วัสดุ", + "คุณสมบัติการใช้งาน", + "จุดขายสินค้า", + "แองเคอร์เท็กซ์" + ], + "vi": [ + "STT", + "Tieu de san pham", + "Duong dan danh muc", + "The chi tiet", + "Doi tuong phu hop", + "Boi canh su dung", + "Mua phu hop", + "Thuoc tinh chinh", + "Chat lieu", + "Tinh nang", + "Diem ban hang", + "Van ban neo" + ], + "id": [ + "No.", + "Judul produk", + "Jalur kategori", + "Tag terperinci", + "Target pengguna", + "Skenario penggunaan", + "Musim", + "Atribut utama", + "Bahan", + "Fitur", + "Nilai jual", + "Teks jangkar" + ], + "ms": [ + "No.", + "Tajuk produk", + "Laluan kategori", + "Tag terperinci", + "Sasaran pengguna", + "Senario penggunaan", + "Musim", + "Atribut utama", + "Bahan", + "Ciri-ciri", + "Nilai jual", + "Teks sauh" + ], + "ar": [ + "الرقم", + "عنوان المنتج", + "مسار الفئة", + "الوسوم التفصيلية", + "الفئة المستهدفة", + "سيناريو الاستخدام", + "الموسم", + "السمات الرئيسية", + "المادة", + "الميزات", + "نقطة البيع", + "نص الربط" + ], + "hi": [ + "क्रमांक", + "उत्पाद शीर्षक", + "श्रेणी पथ", + "विस्तृत टैग", + "लक्षित उपभोक्ता", + "उपयोग परिदृश्य", + "मौसम", + "मुख्य गुण", + "सामग्री", + "विशेषताएं", + "बिक्री बिंदु", + "एंकर टेक्स्ट" + ], + "he": [ + "מס׳", + "כותרת המוצר", + "נתיב קטגוריה", + "תגיות מפורטות", + "קהל יעד", + "תרחיש שימוש", + "עונה", + "מאפיינים מרכזיים", + "חומר", + "תכונות", + "נקודת מכירה", + "טקסט עוגן" + ], + "my": [ + "အမှတ်စဉ်", + "ကုန်ပစ္စည်းခေါင်းစဉ်", + "အမျိုးအစားလမ်းကြောင်း", + "အသေးစိတ်တဂ်များ", + "ပစ်မှတ်အသုံးပြုသူ", + "အသုံးပြုမှုအခြေအနေ", + "ရာသီ", + "အဓိကဂုဏ်သတ္တိများ", + "ပစ္စည်း", + "လုပ်ဆောင်ချက်များ", + "အရောင်းထူးခြားချက်", + "အန်ကာစာသား" + ], + "ta": [ + "எண்", + "தயாரிப்பு தலைப்பு", + "வகை பாதை", + "விரிவான குறிச்சொற்கள்", + "இலக்கு பயனர்கள்", + "பயன்பாட்டு நிலை", + "பருவம்", + "முக்கிய பண்புகள்", + "பொருள்", + "அம்சங்கள்", + "விற்பனை அம்சம்", + "ஆங்கர் உரை" + ], + "ur": [ + "نمبر", + "پروڈکٹ عنوان", + "زمرہ راستہ", + "تفصیلی ٹیگز", + "ہدف صارفین", + "استعمال کا منظر", + "موسم", + "کلیدی خصوصیات", + "مواد", + "فیچرز", + "فروختی نقطہ", + "اینکر ٹیکسٹ" + ], + "bn": [ + "ক্রম", + "পণ্যের শিরোনাম", + "শ্রেণি পথ", + "বিস্তারিত ট্যাগ", + "লক্ষ্য ব্যবহারকারী", + "ব্যবহারের দৃশ্য", + "মৌসুম", + "মূল বৈশিষ্ট্য", + "উপাদান", + "ফিচার", + "বিক্রয় পয়েন্ট", + "অ্যাঙ্কর টেক্সট" + ], + "pl": [ + "Nr", + "Tytul produktu", + "Sciezka kategorii", + "Szczegolowe tagi", + "Grupa docelowa", + "Scenariusz uzycia", + "Sezon", + "Kluczowe atrybuty", + "Material", + "Cechy", + "Atut sprzedazowy", + "Tekst kotwicy" + ], + "nl": [ + "Nr.", + "Producttitel", + "Categoriepad", + "Gedetailleerde tags", + "Doelgroep", + "Gebruikscontext", + "Seizoen", + "Belangrijke kenmerken", + "Materiaal", + "Functies", + "Verkooppunt", + "Ankertekst" + ], + "ro": [ + "Nr.", + "Titlul produsului", + "Calea categoriei", + "Etichete detaliate", + "Public tinta", + "Scenariu de utilizare", + "Sezon", + "Atribute cheie", + "Material", + "Caracteristici", + "Punct de vanzare", + "Text ancora" + ], + "tr": [ + "No.", + "Urun basligi", + "Kategori yolu", + "Ayrintili etiketler", + "Hedef kitle", + "Kullanim senaryosu", + "Sezon", + "Temel ozellikler", + "Malzeme", + "Ozellikler", + "Satis noktasi", + "Capa metni" + ], + "km": [ + "ល.រ", + "ចំណងជើងផលិតផល", + "ផ្លូវប្រភេទ", + "ស្លាកលម្អិត", + "ក្រុមអ្នកប្រើគោលដៅ", + "សេណារីយ៉ូប្រើប្រាស់", + "រដូវកាល", + "លក្ខណៈសម្បត្តិសំខាន់", + "សម្ភារៈ", + "មុខងារ", + "ចំណុចលក់", + "អត្ថបទអង់ក័រ" + ], + "lo": [ + "ລຳດັບ", + "ຊື່ສິນຄ້າ", + "ເສັ້ນທາງໝວດໝູ່", + "ແທັກລະອຽດ", + "ກຸ່ມເປົ້າໝາຍ", + "ສະຖານະການໃຊ້ງານ", + "ລະດູການ", + "ຄຸນລັກສະນະສຳຄັນ", + "ວັດສະດຸ", + "ຄຸນສົມບັດ", + "ຈຸດຂາຍ", + "ຂໍ້ຄວາມອັງເຄີ" + ], + "yue": [ + "序號", + "商品標題", + "品類路徑", + "細分類標籤", + "適用人群", + "使用場景", + "適用季節", + "關鍵屬性", + "材質說明", + "功能特點", + "商品賣點", + "錨文本" + ], + "cs": [ + "C.", + "Nazev produktu", + "Cesta kategorie", + "Podrobne stitky", + "Cilova skupina", + "Scenar pouziti", + "Sezona", + "Klicove atributy", + "Material", + "Vlastnosti", + "Prodejni argument", + "Kotvici text" + ], + "el": [ + "Α/Α", + "Τίτλος προϊόντος", + "Διαδρομή κατηγορίας", + "Αναλυτικές ετικέτες", + "Κοινό-στόχος", + "Σενάριο χρήσης", + "Εποχή", + "Βασικά χαρακτηριστικά", + "Υλικό", + "Λειτουργίες", + "Σημείο πώλησης", + "Κείμενο άγκυρας" + ], + "sv": [ + "Nr", + "Produkttitel", + "Kategorisokvag", + "Detaljerade taggar", + "Malgrupp", + "Anvandningsscenario", + "Sasong", + "Viktiga attribut", + "Material", + "Funktioner", + "Saljpunkt", + "Ankartext" + ], + "hu": [ + "Sorszam", + "Termekcim", + "Kategoriavonal", + "Reszletes cimkek", + "Celcsoport", + "Hasznalati helyzet", + "Evszak", + "Fo jellemzok", + "Anyag", + "Funkciok", + "Ertekesitesi elony", + "Horgonyszoveg" + ], + "da": [ + "Nr.", + "Produkttitel", + "Kategoristi", + "Detaljerede tags", + "Malgruppe", + "Brugsscenarie", + "Saeson", + "Nogleattributter", + "Materiale", + "Funktioner", + "Salgsargument", + "Ankertekst" + ], + "fi": [ + "Nro", + "Tuotteen nimi", + "Kategoriapolku", + "Yksityiskohtaiset tunnisteet", + "Kohdeyleiso", + "Kayttotilanne", + "Kausi", + "Keskeiset ominaisuudet", + "Materiaali", + "Ominaisuudet", + "Myyntivaltti", + "Ankkuriteksti" + ], + "uk": [ + "№", + "Назва товару", + "Шлях категорії", + "Детальні теги", + "Цільова аудиторія", + "Сценарій використання", + "Сезон", + "Ключові атрибути", + "Матеріал", + "Особливості", + "Продаюча перевага", + "Анкорний текст" + ], + "bg": [ + "№", + "Заглавие на продукта", + "Път на категорията", + "Подробни тагове", + "Целева аудитория", + "Сценарий на употреба", + "Сезон", + "Ключови атрибути", + "Материал", + "Характеристики", + "Търговско предимство", + "Анкор текст" + ] +} \ No newline at end of file diff --git a/tests/test_product_enrich_partial_mode.py b/tests/test_product_enrich_partial_mode.py new file mode 100644 index 0000000..55f04c6 --- /dev/null +++ b/tests/test_product_enrich_partial_mode.py @@ -0,0 +1,229 @@ +from __future__ import annotations + +import importlib.util +import io +import json +import logging +import sys +import types +from pathlib import Path +from unittest import mock + + +def _load_product_enrich_module(): + if "dotenv" not in sys.modules: + fake_dotenv = types.ModuleType("dotenv") + fake_dotenv.load_dotenv = lambda *args, **kwargs: None + sys.modules["dotenv"] = fake_dotenv + + if "redis" not in sys.modules: + fake_redis = types.ModuleType("redis") + + class _FakeRedisClient: + def __init__(self, *args, **kwargs): + pass + + def ping(self): + return True + + fake_redis.Redis = _FakeRedisClient + sys.modules["redis"] = fake_redis + + repo_root = Path(__file__).resolve().parents[1] + if str(repo_root) not in sys.path: + sys.path.insert(0, str(repo_root)) + + module_path = repo_root / "indexer" / "product_enrich.py" + spec = importlib.util.spec_from_file_location("product_enrich_under_test", module_path) + module = importlib.util.module_from_spec(spec) + assert spec and spec.loader + spec.loader.exec_module(module) + return module + + +product_enrich = _load_product_enrich_module() + + +def _attach_stream(logger_obj: logging.Logger): + stream = io.StringIO() + handler = logging.StreamHandler(stream) + handler.setFormatter(logging.Formatter("%(message)s")) + logger_obj.addHandler(handler) + return stream, handler + + +def test_create_prompt_splits_shared_context_and_localized_tail(): + products = [ + {"id": "1", "title": "dress"}, + {"id": "2", "title": "linen shirt"}, + ] + + shared_zh, user_zh, prefix_zh = product_enrich.create_prompt(products, target_lang="zh") + shared_en, user_en, prefix_en = product_enrich.create_prompt(products, target_lang="en") + + assert shared_zh == shared_en + assert "Analyze each input product title" in shared_zh + assert "1. dress" in shared_zh + assert "2. linen shirt" in shared_zh + assert "Product list" not in user_zh + assert "Product list" not in user_en + assert "specified language" in user_zh + assert "Language: Chinese" in user_zh + assert "Language: English" in user_en + assert prefix_zh.startswith("| 序号 | 商品标题 | 品类路径 |") + assert prefix_en.startswith("| No. | Product title | Category path |") + + +def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests(): + payloads = [] + response_bodies = [ + { + "choices": [ + { + "message": { + "content": ( + "| 1 | 连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | " + "通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | " + "修身显瘦 | 法式收腰连衣裙 |\n" + ) + } + } + ], + "usage": {"prompt_tokens": 120, "completion_tokens": 45, "total_tokens": 165}, + }, + { + "choices": [ + { + "message": { + "content": ( + "| 1 | Dress | Women>Dress | French,Waisted | Young women | " + "Commute,Date | Spring,Summer | Midi | Polyester | Breathable | " + "Slim fit | French waisted dress |\n" + ) + } + } + ], + "usage": {"prompt_tokens": 118, "completion_tokens": 43, "total_tokens": 161}, + }, + ] + + class _FakeResponse: + def __init__(self, body): + self.body = body + + def raise_for_status(self): + return None + + def json(self): + return self.body + + class _FakeSession: + trust_env = True + + def post(self, url, headers=None, json=None, timeout=None, proxies=None): + del url, headers, timeout, proxies + payloads.append(json) + return _FakeResponse(response_bodies[len(payloads) - 1]) + + def close(self): + return None + + product_enrich.reset_logged_shared_context_keys() + main_stream, main_handler = _attach_stream(product_enrich.logger) + verbose_stream, verbose_handler = _attach_stream(product_enrich.verbose_logger) + + try: + with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( + product_enrich.requests, + "Session", + lambda: _FakeSession(), + ): + zh_shared, zh_user, zh_prefix = product_enrich.create_prompt( + [{"id": "1", "title": "dress"}], + target_lang="zh", + ) + en_shared, en_user, en_prefix = product_enrich.create_prompt( + [{"id": "1", "title": "dress"}], + target_lang="en", + ) + + zh_markdown, zh_raw = product_enrich.call_llm( + zh_shared, + zh_user, + zh_prefix, + target_lang="zh", + ) + en_markdown, en_raw = product_enrich.call_llm( + en_shared, + en_user, + en_prefix, + target_lang="en", + ) + finally: + product_enrich.logger.removeHandler(main_handler) + product_enrich.verbose_logger.removeHandler(verbose_handler) + + assert zh_shared == en_shared + assert len(payloads) == 2 + assert len(payloads[0]["messages"]) == 3 + assert payloads[0]["messages"][1]["role"] == "user" + assert "1. dress" in payloads[0]["messages"][1]["content"] + assert "Language: Chinese" in payloads[0]["messages"][1]["content"] + assert "Language: English" in payloads[1]["messages"][1]["content"] + assert payloads[0]["messages"][-1]["partial"] is True + assert payloads[1]["messages"][-1]["partial"] is True + + main_log = main_stream.getvalue() + verbose_log = verbose_stream.getvalue() + + assert main_log.count("LLM Shared Context") == 1 + assert main_log.count("LLM Request Variant") == 2 + assert "Localized Requirement" in main_log + assert "Shared Context" in main_log + + assert verbose_log.count("LLM Request [model=") == 2 + assert verbose_log.count("LLM Response [model=") == 2 + assert '"partial": true' in verbose_log + assert "Combined User Prompt" in verbose_log + assert "French waisted dress" in verbose_log + assert "法式收腰连衣裙" in verbose_log + + assert zh_markdown.startswith(zh_prefix) + assert en_markdown.startswith(en_prefix) + assert json.loads(zh_raw)["usage"]["total_tokens"] == 165 + assert json.loads(en_raw)["usage"]["total_tokens"] == 161 + + +def test_process_batch_reads_result_and_validates_expected_fields(): + merged_markdown = """| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 | +|----|----|----|----|----|----|----|----|----|----|----|----| +| 1 | 法式连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | 通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | 修身显瘦 | 法式收腰连衣裙 | +""" + + with mock.patch.object( + product_enrich, + "call_llm", + return_value=(merged_markdown, json.dumps({"choices": [{"message": {"content": "stub"}}]})), + ): + results = product_enrich.process_batch( + [{"id": "sku-1", "title": "dress"}], + batch_num=1, + target_lang="zh", + ) + + assert len(results) == 1 + row = results[0] + assert row["id"] == "sku-1" + assert row["lang"] == "zh" + assert row["title_input"] == "dress" + assert row["title"] == "法式连衣裙" + assert row["category_path"] == "女装>连衣裙" + assert row["tags"] == "法式,收腰" + assert row["target_audience"] == "年轻女性" + assert row["usage_scene"] == "通勤,约会" + assert row["season"] == "春季,夏季" + assert row["key_attributes"] == "中长款" + assert row["material"] == "聚酯纤维" + assert row["features"] == "透气" + assert row["selling_points"] == "修身显瘦" + assert row["anchor_text"] == "法式收腰连衣裙" -- libgit2 0.21.2