Commit a73a751f2d90d4544a7146eea38b8b68a736c98b

Authored by tangwang
1 parent e56fbdc1

enrich

config/config_loader.py
... ... @@ -301,7 +301,12 @@ class ConfigLoader:
301 301  
302 302 # Parse tenant config
303 303 tenant_config_data = config_data.get("tenant_config", {})
304   -
  304 +
  305 + # Parse extensible services/provider registry
  306 + services_data = config_data.get("services", {}) or {}
  307 + if not isinstance(services_data, dict):
  308 + raise ConfigurationError("services must be a dictionary if provided")
  309 +
305 310 return SearchConfig(
306 311 field_boosts=field_boosts,
307 312 indexes=indexes,
... ...
indexer/product_enrich.py
... ... @@ -11,6 +11,7 @@ import json
11 11 import logging
12 12 import time
13 13 import hashlib
  14 +from collections import OrderedDict
14 15 from datetime import datetime
15 16 from typing import List, Dict, Tuple, Any, Optional
16 17  
... ... @@ -20,6 +21,12 @@ from pathlib import Path
20 21  
21 22 from config.env_config import REDIS_CONFIG
22 23 from config.tenant_config_loader import SOURCE_LANG_CODE_MAP
  24 +from indexer.product_enrich_prompts import (
  25 + SYSTEM_MESSAGE,
  26 + USER_INSTRUCTION_TEMPLATE,
  27 + LANGUAGE_MARKDOWN_TABLE_HEADERS,
  28 + SHARED_ANALYSIS_INSTRUCTION,
  29 +)
23 30  
24 31 # 配置
25 32 BATCH_SIZE = 20
... ... @@ -32,6 +39,7 @@ API_KEY = os.environ.get("DASHSCOPE_API_KEY")
32 39 MAX_RETRIES = 3
33 40 RETRY_DELAY = 5 # 秒
34 41 REQUEST_TIMEOUT = 180 # 秒
  42 +LOGGED_SHARED_CONTEXT_CACHE_SIZE = 256
35 43  
36 44 # 日志路径
37 45 OUTPUT_DIR = Path("output_logs")
... ... @@ -42,6 +50,7 @@ LOG_DIR.mkdir(parents=True, exist_ok=True)
42 50 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
43 51 log_file = LOG_DIR / f"product_enrich_{timestamp}.log"
44 52 verbose_log_file = LOG_DIR / "product_enrich_verbose.log"
  53 +_logged_shared_context_keys: "OrderedDict[str, None]" = OrderedDict()
45 54  
46 55 # 主日志 logger:执行流程、批次信息等
47 56 logger = logging.getLogger("product_enrich")
... ... @@ -96,16 +105,11 @@ except Exception as e:
96 105 logger.warning(f"Failed to initialize Redis for anchors cache: {e}")
97 106 _anchor_redis = None
98 107  
99   -# 中文版本提示词(请勿删除):
100   -# "你是一名电商平台的商品标注员,你的工作是对输入的每个商品进行理解、分析和标注,"
101   -# "并按要求格式返回 Markdown 表格。所有输出内容必须为中文。"
102   -
103   -SYSTEM_MESSAGES = (
104   - "You are a product annotator for an e-commerce platform. "
105   - "For each input product, you must understand, analyze and label it, "
106   - "and return a Markdown table strictly following the requested format. "
107   - "All output must be in English."
108   -)
  108 +_missing_prompt_langs = sorted(set(SOURCE_LANG_CODE_MAP) - set(LANGUAGE_MARKDOWN_TABLE_HEADERS))
  109 +if _missing_prompt_langs:
  110 + raise RuntimeError(
  111 + f"Missing product_enrich prompt config for languages: {_missing_prompt_langs}"
  112 + )
109 113  
110 114  
111 115 def _make_anchor_cache_key(
... ... @@ -153,108 +157,109 @@ def _set_cached_anchor_result(
153 157 logger.warning(f"Failed to set anchor cache: {e}")
154 158  
155 159  
156   -def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str:
157   - """根据目标语言创建 LLM 提示词和表头说明。
  160 +def _build_assistant_prefix(headers: List[str]) -> str:
  161 + header_line = "| " + " | ".join(headers) + " |"
  162 + separator_line = "|" + "----|" * len(headers)
  163 + return f"{header_line}\n{separator_line}\n"
158 164  
159   - 约定:
160   - - 提示词始终使用英文;
161   - - 当 target_lang == "en" 时,直接要求用英文分析并输出英文表头;
162   - - 当 target_lang 为其他语言时,视作“多轮对话”的后续轮次:
163   - * 默认上一轮已经用英文完成了分析;
164   - * 当前轮只需要在保持结构和含义不变的前提下,将整张表格翻译为目标语言,
165   - 包含表头与所有单元格内容。
166   - """
167   - lang_name = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang)
168   -
169   -# 中文版本提示词(请勿删除)
170   -# prompt = """请对输入的每条商品标题,分析并提取以下信息:
171   -
172   -# 1. 商品标题:将输入商品名称翻译为自然、完整的中文商品标题
173   -# 2. 品类路径:从大类到细分品类,用">"分隔(例如:服装>女装>裤子>工装裤)
174   -# 3. 细分标签:商品的风格、特点、功能等(例如:碎花,收腰,法式)
175   -# 4. 适用人群:性别/年龄段等(例如:年轻女性)
176   -# 5. 使用场景
177   -# 6. 适用季节
178   -# 7. 关键属性
179   -# 8. 材质说明
180   -# 9. 功能特点
181   -# 10. 商品卖点:分析和提取一句话核心卖点,用于推荐理由
182   -# 11. 锚文本:生成一组能够代表该商品、并可能被用户用于搜索的词语或短语。这些词语应覆盖用户需求的各个维度,如品类、细分标签、功能特性、需求场景等等。
183   -
184   -# 输入商品列表:
185   -
186   -# """
187   -# prompt_tail = """
188   -# 请严格按照以下markdown表格格式返回,每列内部的多值内容都用逗号分隔,不要添加任何其他说明:
189   -
190   -# | 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 |
191   -# |----|----|----|----|----|----|----|----|----|----|----|----|
192   -# """
193   -
194   - prompt = """Please analyze each input product title and extract the following information:
195   -
196   -1. Product title: a natural English product name derived from the input title
197   -2. Category path: from broad to fine-grained category, separated by ">" (e.g. Clothing>Women>Dresses>Work Dress)
198   -3. Fine-grained tags: style / features / attributes (e.g. floral, waist-cinching, French style)
199   -4. Target audience: gender / age group, etc. (e.g. young women)
200   -5. Usage scene
201   -6. Applicable season
202   -7. Key attributes
203   -8. Material description
204   -9. Functional features
205   -10. Selling point: one concise key selling sentence for recommendation
206   -11. Anchor text: a set of words or phrases that could be used by users as search queries for this product, covering category, fine-grained tags, functional attributes, usage scenes, etc.
207   -
208   -Input product list:
209   -
210   -"""
211 165  
  166 +def _build_shared_context(products: List[Dict[str, str]]) -> str:
  167 + shared_context = SHARED_ANALYSIS_INSTRUCTION
212 168 for idx, product in enumerate(products, 1):
213   - prompt += f'{idx}. {product["title"]}\n'
  169 + shared_context += f'{idx}. {product["title"]}\n'
  170 + return shared_context
214 171  
215   - if target_lang == "en":
216   - # 英文首轮:直接要求英文表头 + 英文内容
217   - prompt += """
218   -Please strictly return a Markdown table in the following format. For any column that can contain multiple values, separate values with commas. Do not add any other explanations:
219 172  
220   -| No. | Product title | Category path | Fine-grained tags | Target audience | Usage scene | Season | Key attributes | Material | Features | Selling point | Anchor text |
221   -|----|----|----|----|----|----|----|----|----|----|----|----|
222   -"""
223   - else:
224   - # 非英文语言:视作“下一轮对话”,只做翻译,要求表头与内容全部用目标语言
225   - prompt += f"""
226   -Now we will output the same table in {lang_name}.
227   -
228   -IMPORTANT:
229   -- Assume you have already generated the full table in English in a previous round.
230   -- In this round, you must output exactly the same table structure and content,
231   - but fully translated into {lang_name}, including ALL column headers and ALL cell values.
232   -- Do NOT change the meaning, fields, or the number/order of rows and columns.
233   -- Keep valid Markdown table syntax.
234   -
235   -Please return ONLY the Markdown table in {lang_name}, without any extra explanations.
236   -"""
  173 +def _hash_text(text: str) -> str:
  174 + return hashlib.md5((text or "").encode("utf-8")).hexdigest()[:12]
  175 +
  176 +
  177 +def _mark_shared_context_logged_once(shared_context_key: str) -> bool:
  178 + if shared_context_key in _logged_shared_context_keys:
  179 + _logged_shared_context_keys.move_to_end(shared_context_key)
  180 + return False
  181 +
  182 + _logged_shared_context_keys[shared_context_key] = None
  183 + if len(_logged_shared_context_keys) > LOGGED_SHARED_CONTEXT_CACHE_SIZE:
  184 + _logged_shared_context_keys.popitem(last=False)
  185 + return True
237 186  
238   - return prompt
239 187  
  188 +def reset_logged_shared_context_keys() -> None:
  189 + """测试辅助:清理已记录的共享 prompt key。"""
  190 + _logged_shared_context_keys.clear()
240 191  
241   -def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]:
242   - """调用大模型API(带重试机制),按目标语言选择系统提示词。"""
  192 +
  193 +def create_prompt(
  194 + products: List[Dict[str, str]],
  195 + target_lang: str = "zh",
  196 +) -> Tuple[Optional[str], Optional[str], Optional[str]]:
  197 + """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。"""
  198 + markdown_table_headers = LANGUAGE_MARKDOWN_TABLE_HEADERS.get(target_lang)
  199 + if not markdown_table_headers:
  200 + logger.warning(
  201 + "Unsupported target_lang for markdown table headers: %s",
  202 + target_lang,
  203 + )
  204 + return None, None, None
  205 + shared_context = _build_shared_context(products)
  206 + language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang)
  207 + user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip()
  208 + assistant_prefix = _build_assistant_prefix(markdown_table_headers)
  209 + return shared_context, user_prompt, assistant_prefix
  210 +
  211 +
  212 +def _merge_partial_response(assistant_prefix: str, generated_content: str) -> str:
  213 + """将 Partial Mode 的 assistant 前缀与补全文本拼成完整 markdown。"""
  214 + generated = (generated_content or "").lstrip()
  215 + prefix_lines = [line.strip() for line in assistant_prefix.strip().splitlines()]
  216 + generated_lines = generated.splitlines()
  217 +
  218 + if generated_lines:
  219 + first_line = generated_lines[0].strip()
  220 + if prefix_lines and first_line == prefix_lines[0]:
  221 + generated_lines = generated_lines[1:]
  222 + if generated_lines and len(prefix_lines) > 1 and generated_lines[0].strip() == prefix_lines[1]:
  223 + generated_lines = generated_lines[1:]
  224 + elif len(prefix_lines) > 1 and first_line == prefix_lines[1]:
  225 + generated_lines = generated_lines[1:]
  226 +
  227 + suffix = "\n".join(generated_lines).lstrip("\n")
  228 + if suffix:
  229 + return f"{assistant_prefix}{suffix}"
  230 + return assistant_prefix
  231 +
  232 +
  233 +def call_llm(
  234 + shared_context: str,
  235 + user_prompt: str,
  236 + assistant_prefix: str,
  237 + target_lang: str = "zh",
  238 +) -> Tuple[str, str]:
  239 + """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。"""
243 240 headers = {
244 241 "Authorization": f"Bearer {API_KEY}",
245 242 "Content-Type": "application/json",
246 243 }
  244 + shared_context_key = _hash_text(shared_context)
  245 + localized_tail_key = _hash_text(f"{target_lang}\n{user_prompt}\n{assistant_prefix}")
  246 + combined_user_prompt = f"{shared_context.rstrip()}\n\n{user_prompt.strip()}"
247 247  
248 248 payload = {
249 249 "model": MODEL_NAME,
250 250 "messages": [
251 251 {
252 252 "role": "system",
253   - "content": SYSTEM_MESSAGES,
  253 + "content": SYSTEM_MESSAGE,
254 254 },
255 255 {
256 256 "role": "user",
257   - "content": prompt,
  257 + "content": combined_user_prompt,
  258 + },
  259 + {
  260 + "role": "assistant",
  261 + "content": assistant_prefix,
  262 + "partial": True,
258 263 },
259 264 ],
260 265 "temperature": 0.3,
... ... @@ -266,16 +271,41 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]:
266 271 "payload": payload,
267 272 }
268 273  
269   - # 主日志 + 详尽日志:LLM Request
270   - logger.info(f"\n{'=' * 80}")
271   - logger.info(f"LLM Request (Model: {MODEL_NAME}):")
272   - logger.info(json.dumps(request_data, ensure_ascii=False, indent=2))
273   - logger.info(f"\nPrompt:\n{prompt}")
  274 + if _mark_shared_context_logged_once(shared_context_key):
  275 + logger.info(f"\n{'=' * 80}")
  276 + logger.info(
  277 + "LLM Shared Context [model=%s, shared_key=%s, chars=%s] (logged once per process key)",
  278 + MODEL_NAME,
  279 + shared_context_key,
  280 + len(shared_context),
  281 + )
  282 + logger.info("\nSystem Message:\n%s", SYSTEM_MESSAGE)
  283 + logger.info("\nShared Context:\n%s", shared_context)
274 284  
275 285 verbose_logger.info(f"\n{'=' * 80}")
276   - verbose_logger.info(f"LLM Request (Model: {MODEL_NAME}):")
  286 + verbose_logger.info(
  287 + "LLM Request [model=%s, lang=%s, shared_key=%s, tail_key=%s]:",
  288 + MODEL_NAME,
  289 + target_lang,
  290 + shared_context_key,
  291 + localized_tail_key,
  292 + )
277 293 verbose_logger.info(json.dumps(request_data, ensure_ascii=False, indent=2))
278   - verbose_logger.info(f"\nPrompt:\n{prompt}")
  294 + verbose_logger.info(f"\nCombined User Prompt:\n{combined_user_prompt}")
  295 + verbose_logger.info(f"\nShared Context:\n{shared_context}")
  296 + verbose_logger.info(f"\nLocalized Requirement:\n{user_prompt}")
  297 + verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}")
  298 +
  299 + logger.info(
  300 + "\nLLM Request Variant [lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]",
  301 + target_lang,
  302 + shared_context_key,
  303 + localized_tail_key,
  304 + len(user_prompt),
  305 + len(assistant_prefix),
  306 + )
  307 + logger.info("\nLocalized Requirement:\n%s", user_prompt)
  308 + logger.info("\nAssistant Prefix:\n%s", assistant_prefix)
279 309  
280 310 # 创建session,禁用代理
281 311 session = requests.Session()
... ... @@ -295,19 +325,37 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]:
295 325  
296 326 response.raise_for_status()
297 327 result = response.json()
  328 + usage = result.get("usage") or {}
  329 +
  330 + verbose_logger.info(
  331 + "\nLLM Response [model=%s, lang=%s, shared_key=%s, tail_key=%s]:",
  332 + MODEL_NAME,
  333 + target_lang,
  334 + shared_context_key,
  335 + localized_tail_key,
  336 + )
  337 + verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2))
298 338  
299   - # 主日志 + 详尽日志:LLM Response
300   - logger.info(f"\nLLM Response:")
301   - logger.info(json.dumps(result, ensure_ascii=False, indent=2))
  339 + generated_content = result["choices"][0]["message"]["content"]
  340 + full_markdown = _merge_partial_response(assistant_prefix, generated_content)
302 341  
303   - verbose_logger.info(f"\nLLM Response:")
304   - verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2))
  342 + logger.info(
  343 + "\nLLM Response Summary [lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]",
  344 + target_lang,
  345 + shared_context_key,
  346 + localized_tail_key,
  347 + len(generated_content or ""),
  348 + usage.get("completion_tokens"),
  349 + usage.get("prompt_tokens"),
  350 + usage.get("total_tokens"),
  351 + )
  352 + logger.info("\nGenerated Content:\n%s", generated_content)
  353 + logger.info("\nMerged Markdown:\n%s", full_markdown)
305 354  
306   - content = result["choices"][0]["message"]["content"]
307   - logger.info(f"\nExtracted Content:\n{content}")
308   - verbose_logger.info(f"\nExtracted Content:\n{content}")
  355 + verbose_logger.info(f"\nGenerated Content:\n{generated_content}")
  356 + verbose_logger.info(f"\nMerged Markdown:\n{full_markdown}")
309 357  
310   - return content, json.dumps(result, ensure_ascii=False)
  358 + return full_markdown, json.dumps(result, ensure_ascii=False)
311 359  
312 360 except requests.exceptions.ProxyError as e:
313 361 logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Proxy error - {str(e)}")
... ... @@ -385,6 +433,39 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:
385 433 return data
386 434  
387 435  
  436 +def _log_parsed_result_quality(
  437 + batch_data: List[Dict[str, str]],
  438 + parsed_results: List[Dict[str, str]],
  439 + target_lang: str,
  440 + batch_num: int,
  441 +) -> None:
  442 + expected = len(batch_data)
  443 + actual = len(parsed_results)
  444 + if actual != expected:
  445 + logger.warning(
  446 + "Parsed row count mismatch for batch=%s lang=%s: expected=%s actual=%s",
  447 + batch_num,
  448 + target_lang,
  449 + expected,
  450 + actual,
  451 + )
  452 +
  453 + missing_anchor = sum(1 for item in parsed_results if not str(item.get("anchor_text") or "").strip())
  454 + missing_category = sum(1 for item in parsed_results if not str(item.get("category_path") or "").strip())
  455 + missing_title = sum(1 for item in parsed_results if not str(item.get("title") or "").strip())
  456 +
  457 + logger.info(
  458 + "Parsed Quality Summary [batch=%s, lang=%s]: rows=%s/%s, missing_title=%s, missing_category=%s, missing_anchor=%s",
  459 + batch_num,
  460 + target_lang,
  461 + actual,
  462 + expected,
  463 + missing_title,
  464 + missing_category,
  465 + missing_anchor,
  466 + )
  467 +
  468 +
388 469 def process_batch(
389 470 batch_data: List[Dict[str, str]],
390 471 batch_num: int,
... ... @@ -395,14 +476,52 @@ def process_batch(
395 476 logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)")
396 477  
397 478 # 创建提示词
398   - prompt = create_prompt(batch_data, target_lang=target_lang)
  479 + shared_context, user_prompt, assistant_prefix = create_prompt(
  480 + batch_data,
  481 + target_lang=target_lang,
  482 + )
  483 +
  484 + # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM
  485 + if shared_context is None or user_prompt is None or assistant_prefix is None:
  486 + logger.error(
  487 + "Failed to create prompt for batch %s, target_lang=%s; "
  488 + "marking entire batch as failed without calling LLM",
  489 + batch_num,
  490 + target_lang,
  491 + )
  492 + return [
  493 + {
  494 + "id": item["id"],
  495 + "lang": target_lang,
  496 + "title_input": item.get("title", ""),
  497 + "title": "",
  498 + "category_path": "",
  499 + "tags": "",
  500 + "target_audience": "",
  501 + "usage_scene": "",
  502 + "season": "",
  503 + "key_attributes": "",
  504 + "material": "",
  505 + "features": "",
  506 + "selling_points": "",
  507 + "anchor_text": "",
  508 + "error": f"prompt_creation_failed: unsupported target_lang={target_lang}",
  509 + }
  510 + for item in batch_data
  511 + ]
399 512  
400 513 # 调用LLM
401 514 try:
402   - raw_response, full_response_json = call_llm(prompt, target_lang=target_lang)
  515 + raw_response, full_response_json = call_llm(
  516 + shared_context,
  517 + user_prompt,
  518 + assistant_prefix,
  519 + target_lang=target_lang,
  520 + )
403 521  
404 522 # 解析结果
405 523 parsed_results = parse_markdown_table(raw_response)
  524 + _log_parsed_result_quality(batch_data, parsed_results, target_lang, batch_num)
406 525  
407 526 logger.info(f"\nParsed Results ({len(parsed_results)} items):")
408 527 logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2))
... ... @@ -541,4 +660,3 @@ def analyze_products(
541 660 pass
542 661  
543 662 return all_results
544   -
... ...
indexer/product_enrich_prompts.py 0 → 100644
... ... @@ -0,0 +1,556 @@
  1 +#!/usr/bin/env python3
  2 +
  3 +from typing import Any, Dict
  4 +
  5 +SYSTEM_MESSAGE = (
  6 + "You are an e-commerce product annotator. "
  7 + "Continue the provided assistant Markdown table prefix. "
  8 + "Do not repeat or modify the prefix, and do not add explanations outside the table."
  9 +)
  10 +
  11 +SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product title and fill these columns:
  12 +
  13 +1. Product title: a natural localized product name derived from the input title
  14 +2. Category path: broad to fine-grained category, separated by ">"
  15 +3. Fine-grained tags: style, features, functions, or notable attributes
  16 +4. Target audience: gender, age group, or suitable users
  17 +5. Usage scene
  18 +6. Applicable season
  19 +7. Key attributes
  20 +8. Material description
  21 +9. Functional features
  22 +10. Selling point: one concise core selling phrase
  23 +11. Anchor text: a set of search-oriented words or phrases covering category, attributes, scenes, and demand
  24 +
  25 +Rules:
  26 +- Keep the input order and row count exactly the same.
  27 +- Infer from the title only; if uncertain, prefer concise and broadly correct ecommerce wording.
  28 +- Keep category paths concise and use ">" as the separator.
  29 +- For columns with multiple values, the localized output requirement will define the delimiter.
  30 +
  31 +Input product list:
  32 +"""
  33 +
  34 +USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation.
  35 +Language: {language}"""
  36 +
  37 +LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, list] = {
  38 + "en": [
  39 + "No.",
  40 + "Product title",
  41 + "Category path",
  42 + "Fine-grained tags",
  43 + "Target audience",
  44 + "Usage scene",
  45 + "Season",
  46 + "Key attributes",
  47 + "Material",
  48 + "Features",
  49 + "Selling point",
  50 + "Anchor text"
  51 + ],
  52 + "zh": [
  53 + "序号",
  54 + "商品标题",
  55 + "品类路径",
  56 + "细分标签",
  57 + "适用人群",
  58 + "使用场景",
  59 + "适用季节",
  60 + "关键属性",
  61 + "材质说明",
  62 + "功能特点",
  63 + "商品卖点",
  64 + "锚文本"
  65 + ],
  66 + "zh_tw": [
  67 + "序號",
  68 + "商品標題",
  69 + "品類路徑",
  70 + "細分標籤",
  71 + "適用人群",
  72 + "使用場景",
  73 + "適用季節",
  74 + "關鍵屬性",
  75 + "材質說明",
  76 + "功能特點",
  77 + "商品賣點",
  78 + "錨文本"
  79 + ],
  80 + "ru": [
  81 + "№",
  82 + "Название товара",
  83 + "Путь категории",
  84 + "Детализированные теги",
  85 + "Целевая аудитория",
  86 + "Сценарий использования",
  87 + "Сезон",
  88 + "Ключевые атрибуты",
  89 + "Материал",
  90 + "Особенности",
  91 + "Преимущество товара",
  92 + "Анкорный текст"
  93 + ],
  94 + "ja": [
  95 + "番号",
  96 + "商品タイトル",
  97 + "カテゴリパス",
  98 + "詳細タグ",
  99 + "対象ユーザー",
  100 + "利用シーン",
  101 + "季節",
  102 + "主要属性",
  103 + "素材",
  104 + "機能特徴",
  105 + "商品の訴求点",
  106 + "アンカーテキスト"
  107 + ],
  108 + "ko": [
  109 + "번호",
  110 + "상품 제목",
  111 + "카테고리 경로",
  112 + "세부 태그",
  113 + "대상 고객",
  114 + "사용 장면",
  115 + "계절",
  116 + "핵심 속성",
  117 + "소재",
  118 + "기능 특징",
  119 + "상품 포인트",
  120 + "앵커 텍스트"
  121 + ],
  122 + "es": [
  123 + "N.º",
  124 + "Titulo del producto",
  125 + "Ruta de categoria",
  126 + "Etiquetas detalladas",
  127 + "Publico objetivo",
  128 + "Escenario de uso",
  129 + "Temporada",
  130 + "Atributos clave",
  131 + "Material",
  132 + "Caracteristicas",
  133 + "Punto de venta",
  134 + "Texto ancla"
  135 + ],
  136 + "fr": [
  137 + "N°",
  138 + "Titre du produit",
  139 + "Chemin de categorie",
  140 + "Etiquettes detaillees",
  141 + "Public cible",
  142 + "Scenario d'utilisation",
  143 + "Saison",
  144 + "Attributs cles",
  145 + "Matiere",
  146 + "Caracteristiques",
  147 + "Argument de vente",
  148 + "Texte d'ancrage"
  149 + ],
  150 + "pt": [
  151 + "Nº",
  152 + "Titulo do produto",
  153 + "Caminho da categoria",
  154 + "Tags detalhadas",
  155 + "Publico-alvo",
  156 + "Cenario de uso",
  157 + "Estacao",
  158 + "Atributos principais",
  159 + "Material",
  160 + "Caracteristicas",
  161 + "Ponto de venda",
  162 + "Texto ancora"
  163 + ],
  164 + "de": [
  165 + "Nr.",
  166 + "Produkttitel",
  167 + "Kategoriepfad",
  168 + "Detaillierte Tags",
  169 + "Zielgruppe",
  170 + "Nutzungsszenario",
  171 + "Saison",
  172 + "Wichtige Attribute",
  173 + "Material",
  174 + "Funktionen",
  175 + "Verkaufsargument",
  176 + "Ankertext"
  177 + ],
  178 + "it": [
  179 + "N.",
  180 + "Titolo del prodotto",
  181 + "Percorso categoria",
  182 + "Tag dettagliati",
  183 + "Pubblico target",
  184 + "Scenario d'uso",
  185 + "Stagione",
  186 + "Attributi chiave",
  187 + "Materiale",
  188 + "Caratteristiche",
  189 + "Punto di forza",
  190 + "Testo ancora"
  191 + ],
  192 + "th": [
  193 + "ลำดับ",
  194 + "ชื่อสินค้า",
  195 + "เส้นทางหมวดหมู่",
  196 + "แท็กย่อย",
  197 + "กลุ่มเป้าหมาย",
  198 + "สถานการณ์การใช้งาน",
  199 + "ฤดูกาล",
  200 + "คุณสมบัติสำคัญ",
  201 + "วัสดุ",
  202 + "คุณสมบัติการใช้งาน",
  203 + "จุดขายสินค้า",
  204 + "แองเคอร์เท็กซ์"
  205 + ],
  206 + "vi": [
  207 + "STT",
  208 + "Tieu de san pham",
  209 + "Duong dan danh muc",
  210 + "The chi tiet",
  211 + "Doi tuong phu hop",
  212 + "Boi canh su dung",
  213 + "Mua phu hop",
  214 + "Thuoc tinh chinh",
  215 + "Chat lieu",
  216 + "Tinh nang",
  217 + "Diem ban hang",
  218 + "Van ban neo"
  219 + ],
  220 + "id": [
  221 + "No.",
  222 + "Judul produk",
  223 + "Jalur kategori",
  224 + "Tag terperinci",
  225 + "Target pengguna",
  226 + "Skenario penggunaan",
  227 + "Musim",
  228 + "Atribut utama",
  229 + "Bahan",
  230 + "Fitur",
  231 + "Nilai jual",
  232 + "Teks jangkar"
  233 + ],
  234 + "ms": [
  235 + "No.",
  236 + "Tajuk produk",
  237 + "Laluan kategori",
  238 + "Tag terperinci",
  239 + "Sasaran pengguna",
  240 + "Senario penggunaan",
  241 + "Musim",
  242 + "Atribut utama",
  243 + "Bahan",
  244 + "Ciri-ciri",
  245 + "Nilai jual",
  246 + "Teks sauh"
  247 + ],
  248 + "ar": [
  249 + "الرقم",
  250 + "عنوان المنتج",
  251 + "مسار الفئة",
  252 + "الوسوم التفصيلية",
  253 + "الفئة المستهدفة",
  254 + "سيناريو الاستخدام",
  255 + "الموسم",
  256 + "السمات الرئيسية",
  257 + "المادة",
  258 + "الميزات",
  259 + "نقطة البيع",
  260 + "نص الربط"
  261 + ],
  262 + "hi": [
  263 + "क्रमांक",
  264 + "उत्पाद शीर्षक",
  265 + "श्रेणी पथ",
  266 + "विस्तृत टैग",
  267 + "लक्षित उपभोक्ता",
  268 + "उपयोग परिदृश्य",
  269 + "मौसम",
  270 + "मुख्य गुण",
  271 + "सामग्री",
  272 + "विशेषताएं",
  273 + "बिक्री बिंदु",
  274 + "एंकर टेक्स्ट"
  275 + ],
  276 + "he": [
  277 + "מס׳",
  278 + "כותרת המוצר",
  279 + "נתיב קטגוריה",
  280 + "תגיות מפורטות",
  281 + "קהל יעד",
  282 + "תרחיש שימוש",
  283 + "עונה",
  284 + "מאפיינים מרכזיים",
  285 + "חומר",
  286 + "תכונות",
  287 + "נקודת מכירה",
  288 + "טקסט עוגן"
  289 + ],
  290 + "my": [
  291 + "အမှတ်စဉ်",
  292 + "ကုန်ပစ္စည်းခေါင်းစဉ်",
  293 + "အမျိုးအစားလမ်းကြောင်း",
  294 + "အသေးစိတ်တဂ်များ",
  295 + "ပစ်မှတ်အသုံးပြုသူ",
  296 + "အသုံးပြုမှုအခြေအနေ",
  297 + "ရာသီ",
  298 + "အဓိကဂုဏ်သတ္တိများ",
  299 + "ပစ္စည်း",
  300 + "လုပ်ဆောင်ချက်များ",
  301 + "အရောင်းထူးခြားချက်",
  302 + "အန်ကာစာသား"
  303 + ],
  304 + "ta": [
  305 + "எண்",
  306 + "தயாரிப்பு தலைப்பு",
  307 + "வகை பாதை",
  308 + "விரிவான குறிச்சொற்கள்",
  309 + "இலக்கு பயனர்கள்",
  310 + "பயன்பாட்டு நிலை",
  311 + "பருவம்",
  312 + "முக்கிய பண்புகள்",
  313 + "பொருள்",
  314 + "அம்சங்கள்",
  315 + "விற்பனை அம்சம்",
  316 + "ஆங்கர் உரை"
  317 + ],
  318 + "ur": [
  319 + "نمبر",
  320 + "پروڈکٹ عنوان",
  321 + "زمرہ راستہ",
  322 + "تفصیلی ٹیگز",
  323 + "ہدف صارفین",
  324 + "استعمال کا منظر",
  325 + "موسم",
  326 + "کلیدی خصوصیات",
  327 + "مواد",
  328 + "فیچرز",
  329 + "فروختی نقطہ",
  330 + "اینکر ٹیکسٹ"
  331 + ],
  332 + "bn": [
  333 + "ক্রম",
  334 + "পণ্যের শিরোনাম",
  335 + "শ্রেণি পথ",
  336 + "বিস্তারিত ট্যাগ",
  337 + "লক্ষ্য ব্যবহারকারী",
  338 + "ব্যবহারের দৃশ্য",
  339 + "মৌসুম",
  340 + "মূল বৈশিষ্ট্য",
  341 + "উপাদান",
  342 + "ফিচার",
  343 + "বিক্রয় পয়েন্ট",
  344 + "অ্যাঙ্কর টেক্সট"
  345 + ],
  346 + "pl": [
  347 + "Nr",
  348 + "Tytul produktu",
  349 + "Sciezka kategorii",
  350 + "Szczegolowe tagi",
  351 + "Grupa docelowa",
  352 + "Scenariusz uzycia",
  353 + "Sezon",
  354 + "Kluczowe atrybuty",
  355 + "Material",
  356 + "Cechy",
  357 + "Atut sprzedazowy",
  358 + "Tekst kotwicy"
  359 + ],
  360 + "nl": [
  361 + "Nr.",
  362 + "Producttitel",
  363 + "Categoriepad",
  364 + "Gedetailleerde tags",
  365 + "Doelgroep",
  366 + "Gebruikscontext",
  367 + "Seizoen",
  368 + "Belangrijke kenmerken",
  369 + "Materiaal",
  370 + "Functies",
  371 + "Verkooppunt",
  372 + "Ankertekst"
  373 + ],
  374 + "ro": [
  375 + "Nr.",
  376 + "Titlul produsului",
  377 + "Calea categoriei",
  378 + "Etichete detaliate",
  379 + "Public tinta",
  380 + "Scenariu de utilizare",
  381 + "Sezon",
  382 + "Atribute cheie",
  383 + "Material",
  384 + "Caracteristici",
  385 + "Punct de vanzare",
  386 + "Text ancora"
  387 + ],
  388 + "tr": [
  389 + "No.",
  390 + "Urun basligi",
  391 + "Kategori yolu",
  392 + "Ayrintili etiketler",
  393 + "Hedef kitle",
  394 + "Kullanim senaryosu",
  395 + "Sezon",
  396 + "Temel ozellikler",
  397 + "Malzeme",
  398 + "Ozellikler",
  399 + "Satis noktasi",
  400 + "Capa metni"
  401 + ],
  402 + "km": [
  403 + "ល.រ",
  404 + "ចំណងជើងផលិតផល",
  405 + "ផ្លូវប្រភេទ",
  406 + "ស្លាកលម្អិត",
  407 + "ក្រុមអ្នកប្រើគោលដៅ",
  408 + "សេណារីយ៉ូប្រើប្រាស់",
  409 + "រដូវកាល",
  410 + "លក្ខណៈសម្បត្តិសំខាន់",
  411 + "សម្ភារៈ",
  412 + "មុខងារ",
  413 + "ចំណុចលក់",
  414 + "អត្ថបទអង់ក័រ"
  415 + ],
  416 + "lo": [
  417 + "ລຳດັບ",
  418 + "ຊື່ສິນຄ້າ",
  419 + "ເສັ້ນທາງໝວດໝູ່",
  420 + "ແທັກລະອຽດ",
  421 + "ກຸ່ມເປົ້າໝາຍ",
  422 + "ສະຖານະການໃຊ້ງານ",
  423 + "ລະດູການ",
  424 + "ຄຸນລັກສະນະສຳຄັນ",
  425 + "ວັດສະດຸ",
  426 + "ຄຸນສົມບັດ",
  427 + "ຈຸດຂາຍ",
  428 + "ຂໍ້ຄວາມອັງເຄີ"
  429 + ],
  430 + "yue": [
  431 + "序號",
  432 + "商品標題",
  433 + "品類路徑",
  434 + "細分類標籤",
  435 + "適用人群",
  436 + "使用場景",
  437 + "適用季節",
  438 + "關鍵屬性",
  439 + "材質說明",
  440 + "功能特點",
  441 + "商品賣點",
  442 + "錨文本"
  443 + ],
  444 + "cs": [
  445 + "C.",
  446 + "Nazev produktu",
  447 + "Cesta kategorie",
  448 + "Podrobne stitky",
  449 + "Cilova skupina",
  450 + "Scenar pouziti",
  451 + "Sezona",
  452 + "Klicove atributy",
  453 + "Material",
  454 + "Vlastnosti",
  455 + "Prodejni argument",
  456 + "Kotvici text"
  457 + ],
  458 + "el": [
  459 + "Α/Α",
  460 + "Τίτλος προϊόντος",
  461 + "Διαδρομή κατηγορίας",
  462 + "Αναλυτικές ετικέτες",
  463 + "Κοινό-στόχος",
  464 + "Σενάριο χρήσης",
  465 + "Εποχή",
  466 + "Βασικά χαρακτηριστικά",
  467 + "Υλικό",
  468 + "Λειτουργίες",
  469 + "Σημείο πώλησης",
  470 + "Κείμενο άγκυρας"
  471 + ],
  472 + "sv": [
  473 + "Nr",
  474 + "Produkttitel",
  475 + "Kategorisokvag",
  476 + "Detaljerade taggar",
  477 + "Malgrupp",
  478 + "Anvandningsscenario",
  479 + "Sasong",
  480 + "Viktiga attribut",
  481 + "Material",
  482 + "Funktioner",
  483 + "Saljpunkt",
  484 + "Ankartext"
  485 + ],
  486 + "hu": [
  487 + "Sorszam",
  488 + "Termekcim",
  489 + "Kategoriavonal",
  490 + "Reszletes cimkek",
  491 + "Celcsoport",
  492 + "Hasznalati helyzet",
  493 + "Evszak",
  494 + "Fo jellemzok",
  495 + "Anyag",
  496 + "Funkciok",
  497 + "Ertekesitesi elony",
  498 + "Horgonyszoveg"
  499 + ],
  500 + "da": [
  501 + "Nr.",
  502 + "Produkttitel",
  503 + "Kategoristi",
  504 + "Detaljerede tags",
  505 + "Malgruppe",
  506 + "Brugsscenarie",
  507 + "Saeson",
  508 + "Nogleattributter",
  509 + "Materiale",
  510 + "Funktioner",
  511 + "Salgsargument",
  512 + "Ankertekst"
  513 + ],
  514 + "fi": [
  515 + "Nro",
  516 + "Tuotteen nimi",
  517 + "Kategoriapolku",
  518 + "Yksityiskohtaiset tunnisteet",
  519 + "Kohdeyleiso",
  520 + "Kayttotilanne",
  521 + "Kausi",
  522 + "Keskeiset ominaisuudet",
  523 + "Materiaali",
  524 + "Ominaisuudet",
  525 + "Myyntivaltti",
  526 + "Ankkuriteksti"
  527 + ],
  528 + "uk": [
  529 + "№",
  530 + "Назва товару",
  531 + "Шлях категорії",
  532 + "Детальні теги",
  533 + "Цільова аудиторія",
  534 + "Сценарій використання",
  535 + "Сезон",
  536 + "Ключові атрибути",
  537 + "Матеріал",
  538 + "Особливості",
  539 + "Продаюча перевага",
  540 + "Анкорний текст"
  541 + ],
  542 + "bg": [
  543 + "№",
  544 + "Заглавие на продукта",
  545 + "Път на категорията",
  546 + "Подробни тагове",
  547 + "Целева аудитория",
  548 + "Сценарий на употреба",
  549 + "Сезон",
  550 + "Ключови атрибути",
  551 + "Материал",
  552 + "Характеристики",
  553 + "Търговско предимство",
  554 + "Анкор текст"
  555 + ]
  556 +}
0 557 \ No newline at end of file
... ...
tests/test_product_enrich_partial_mode.py 0 → 100644
... ... @@ -0,0 +1,229 @@
  1 +from __future__ import annotations
  2 +
  3 +import importlib.util
  4 +import io
  5 +import json
  6 +import logging
  7 +import sys
  8 +import types
  9 +from pathlib import Path
  10 +from unittest import mock
  11 +
  12 +
def _load_product_enrich_module():
    """Import ``indexer/product_enrich.py`` with third-party deps stubbed out.

    Installs fake ``dotenv`` and ``redis`` modules into ``sys.modules`` (only if
    real ones are absent) so the module under test can be imported without those
    packages installed, then loads it from its file path under a private module
    name so it does not collide with a real ``indexer.product_enrich`` import.

    Returns:
        The freshly executed module object.
    """
    if "dotenv" not in sys.modules:
        fake_dotenv = types.ModuleType("dotenv")
        # load_dotenv is the only attribute product_enrich uses at import time.
        fake_dotenv.load_dotenv = lambda *args, **kwargs: None
        sys.modules["dotenv"] = fake_dotenv

    if "redis" not in sys.modules:
        fake_redis = types.ModuleType("redis")

        class _FakeRedisClient:
            def __init__(self, *args, **kwargs):
                pass

            def ping(self):
                # Pretend the connection check succeeds.
                return True

        fake_redis.Redis = _FakeRedisClient
        sys.modules["redis"] = fake_redis

    # Make repo-root absolute imports (config.*, indexer.*) resolvable.
    repo_root = Path(__file__).resolve().parents[1]
    if str(repo_root) not in sys.path:
        sys.path.insert(0, str(repo_root))

    module_path = repo_root / "indexer" / "product_enrich.py"
    spec = importlib.util.spec_from_file_location("product_enrich_under_test", module_path)
    # Validate the spec BEFORE building the module: module_from_spec raises a
    # confusing TypeError on a None spec, so fail with a clear assertion first.
    assert spec is not None and spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
  42 +
  43 +
# Import the module under test exactly once; every test below shares this instance.
product_enrich = _load_product_enrich_module()
  45 +
  46 +
  47 +def _attach_stream(logger_obj: logging.Logger):
  48 + stream = io.StringIO()
  49 + handler = logging.StreamHandler(stream)
  50 + handler.setFormatter(logging.Formatter("%(message)s"))
  51 + logger_obj.addHandler(handler)
  52 + return stream, handler
  53 +
  54 +
def test_create_prompt_splits_shared_context_and_localized_tail():
    """The shared context is language-independent; only the tail and table prefix vary."""
    sample_products = [
        {"id": "1", "title": "dress"},
        {"id": "2", "title": "linen shirt"},
    ]

    prompts = {
        lang: product_enrich.create_prompt(sample_products, target_lang=lang)
        for lang in ("zh", "en")
    }
    shared_zh, user_zh, prefix_zh = prompts["zh"]
    shared_en, user_en, prefix_en = prompts["en"]

    # Same shared context for every language, and it carries the product list.
    assert shared_zh == shared_en
    for fragment in ("Analyze each input product title", "1. dress", "2. linen shirt"):
        assert fragment in shared_zh

    # The localized tail must not repeat the product list, only the language ask.
    for localized_tail in (user_zh, user_en):
        assert "Product list" not in localized_tail
    assert "specified language" in user_zh
    assert "Language: Chinese" in user_zh
    assert "Language: English" in user_en

    # The assistant prefix is the localized markdown table header.
    assert prefix_zh.startswith("| 序号 | 商品标题 | 品类路径 |")
    assert prefix_en.startswith("| No. | Product title | Category path |")
  75 +
  76 +
def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests():
    """call_llm must log the shared context only once across language variants,
    while the verbose log records every full request and response.

    Two calls (zh, en) share one context; the main log should contain one
    "LLM Shared Context" entry and two "LLM Request Variant" entries, and the
    verbose log should carry both complete payloads and bodies.
    """
    payloads = []  # every JSON body POSTed by call_llm, in call order
    # Canned API bodies, served in order: first call gets zh, second gets en.
    response_bodies = [
        {
            "choices": [
                {
                    "message": {
                        "content": (
                            "| 1 | 连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | "
                            "通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | "
                            "修身显瘦 | 法式收腰连衣裙 |\n"
                        )
                    }
                }
            ],
            "usage": {"prompt_tokens": 120, "completion_tokens": 45, "total_tokens": 165},
        },
        {
            "choices": [
                {
                    "message": {
                        "content": (
                            "| 1 | Dress | Women>Dress | French,Waisted | Young women | "
                            "Commute,Date | Spring,Summer | Midi | Polyester | Breathable | "
                            "Slim fit | French waisted dress |\n"
                        )
                    }
                }
            ],
            "usage": {"prompt_tokens": 118, "completion_tokens": 43, "total_tokens": 161},
        },
    ]

    class _FakeResponse:
        """Minimal stand-in for requests.Response: raise_for_status + json."""

        def __init__(self, body):
            self.body = body

        def raise_for_status(self):
            return None

        def json(self):
            return self.body

    class _FakeSession:
        """Captures each POST payload and serves the canned bodies in order."""

        trust_env = True

        # NOTE: the `json` parameter deliberately shadows the json module here
        # to mirror the requests.Session.post keyword API.
        def post(self, url, headers=None, json=None, timeout=None, proxies=None):
            del url, headers, timeout, proxies
            payloads.append(json)
            return _FakeResponse(response_bodies[len(payloads) - 1])

        def close(self):
            return None

    # Clear the shared-context dedupe cache so the "logged once" count below
    # is deterministic regardless of earlier tests.
    product_enrich.reset_logged_shared_context_keys()
    main_stream, main_handler = _attach_stream(product_enrich.logger)
    verbose_stream, verbose_handler = _attach_stream(product_enrich.verbose_logger)

    try:
        with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
            product_enrich.requests,
            "Session",
            lambda: _FakeSession(),
        ):
            zh_shared, zh_user, zh_prefix = product_enrich.create_prompt(
                [{"id": "1", "title": "dress"}],
                target_lang="zh",
            )
            en_shared, en_user, en_prefix = product_enrich.create_prompt(
                [{"id": "1", "title": "dress"}],
                target_lang="en",
            )

            zh_markdown, zh_raw = product_enrich.call_llm(
                zh_shared,
                zh_user,
                zh_prefix,
                target_lang="zh",
            )
            en_markdown, en_raw = product_enrich.call_llm(
                en_shared,
                en_user,
                en_prefix,
                target_lang="en",
            )
    finally:
        # Always detach the capture handlers so other tests are unaffected.
        product_enrich.logger.removeHandler(main_handler)
        product_enrich.verbose_logger.removeHandler(verbose_handler)

    assert zh_shared == en_shared
    assert len(payloads) == 2
    # Request shape: system + combined user prompt + assistant prefix message.
    assert len(payloads[0]["messages"]) == 3
    assert payloads[0]["messages"][1]["role"] == "user"
    assert "1. dress" in payloads[0]["messages"][1]["content"]
    assert "Language: Chinese" in payloads[0]["messages"][1]["content"]
    assert "Language: English" in payloads[1]["messages"][1]["content"]
    # Last message carries partial=True — presumably DashScope's
    # prefix-completion ("partial mode") flag; confirm against API docs.
    assert payloads[0]["messages"][-1]["partial"] is True
    assert payloads[1]["messages"][-1]["partial"] is True

    main_log = main_stream.getvalue()
    verbose_log = verbose_stream.getvalue()

    # Shared context logged once; one variant entry per language.
    assert main_log.count("LLM Shared Context") == 1
    assert main_log.count("LLM Request Variant") == 2
    assert "Localized Requirement" in main_log
    assert "Shared Context" in main_log

    # Verbose log has one full request and one full response per call.
    assert verbose_log.count("LLM Request [model=") == 2
    assert verbose_log.count("LLM Response [model=") == 2
    assert '"partial": true' in verbose_log
    assert "Combined User Prompt" in verbose_log
    assert "French waisted dress" in verbose_log
    assert "法式收腰连衣裙" in verbose_log

    # Returned markdown is the assistant prefix plus the model's continuation.
    assert zh_markdown.startswith(zh_prefix)
    assert en_markdown.startswith(en_prefix)
    assert json.loads(zh_raw)["usage"]["total_tokens"] == 165
    assert json.loads(en_raw)["usage"]["total_tokens"] == 161
  195 +
  196 +
def test_process_batch_reads_result_and_validates_expected_fields():
    """process_batch should map each markdown row back onto the source product."""
    merged_markdown = """| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 |
|----|----|----|----|----|----|----|----|----|----|----|----|
| 1 | 法式连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | 通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | 修身显瘦 | 法式收腰连衣裙 |
"""

    stub_raw = json.dumps({"choices": [{"message": {"content": "stub"}}]})
    with mock.patch.object(
        product_enrich,
        "call_llm",
        return_value=(merged_markdown, stub_raw),
    ):
        results = product_enrich.process_batch(
            [{"id": "sku-1", "title": "dress"}],
            batch_num=1,
            target_lang="zh",
        )

    assert len(results) == 1
    row = results[0]
    # Every table column must land in the matching result field.
    expected_fields = {
        "id": "sku-1",
        "lang": "zh",
        "title_input": "dress",
        "title": "法式连衣裙",
        "category_path": "女装>连衣裙",
        "tags": "法式,收腰",
        "target_audience": "年轻女性",
        "usage_scene": "通勤,约会",
        "season": "春季,夏季",
        "key_attributes": "中长款",
        "material": "聚酯纤维",
        "features": "透气",
        "selling_points": "修身显瘦",
        "anchor_text": "法式收腰连衣裙",
    }
    for field_name, expected_value in expected_fields.items():
        assert row[field_name] == expected_value
... ...