Commit a73a751f2d90d4544a7146eea38b8b68a736c98b

Authored by tangwang
1 parent e56fbdc1

enrich: extract prompts into product_enrich_prompts.py with per-language Markdown table headers; switch call_llm to Partial Mode assistant prefix and add parsed-result quality logging

config/config_loader.py
@@ -301,7 +301,12 @@ class ConfigLoader: @@ -301,7 +301,12 @@ class ConfigLoader:
301 301
302 # Parse tenant config 302 # Parse tenant config
303 tenant_config_data = config_data.get("tenant_config", {}) 303 tenant_config_data = config_data.get("tenant_config", {})
304 - 304 +
  305 + # Parse extensible services/provider registry
  306 + services_data = config_data.get("services", {}) or {}
  307 + if not isinstance(services_data, dict):
  308 + raise ConfigurationError("services must be a dictionary if provided")
  309 +
305 return SearchConfig( 310 return SearchConfig(
306 field_boosts=field_boosts, 311 field_boosts=field_boosts,
307 indexes=indexes, 312 indexes=indexes,
indexer/product_enrich.py
@@ -11,6 +11,7 @@ import json @@ -11,6 +11,7 @@ import json
11 import logging 11 import logging
12 import time 12 import time
13 import hashlib 13 import hashlib
  14 +from collections import OrderedDict
14 from datetime import datetime 15 from datetime import datetime
15 from typing import List, Dict, Tuple, Any, Optional 16 from typing import List, Dict, Tuple, Any, Optional
16 17
@@ -20,6 +21,12 @@ from pathlib import Path @@ -20,6 +21,12 @@ from pathlib import Path
20 21
21 from config.env_config import REDIS_CONFIG 22 from config.env_config import REDIS_CONFIG
22 from config.tenant_config_loader import SOURCE_LANG_CODE_MAP 23 from config.tenant_config_loader import SOURCE_LANG_CODE_MAP
  24 +from indexer.product_enrich_prompts import (
  25 + SYSTEM_MESSAGE,
  26 + USER_INSTRUCTION_TEMPLATE,
  27 + LANGUAGE_MARKDOWN_TABLE_HEADERS,
  28 + SHARED_ANALYSIS_INSTRUCTION,
  29 +)
23 30
24 # 配置 31 # 配置
25 BATCH_SIZE = 20 32 BATCH_SIZE = 20
@@ -32,6 +39,7 @@ API_KEY = os.environ.get("DASHSCOPE_API_KEY") @@ -32,6 +39,7 @@ API_KEY = os.environ.get("DASHSCOPE_API_KEY")
32 MAX_RETRIES = 3 39 MAX_RETRIES = 3
33 RETRY_DELAY = 5 # 秒 40 RETRY_DELAY = 5 # 秒
34 REQUEST_TIMEOUT = 180 # 秒 41 REQUEST_TIMEOUT = 180 # 秒
  42 +LOGGED_SHARED_CONTEXT_CACHE_SIZE = 256
35 43
36 # 日志路径 44 # 日志路径
37 OUTPUT_DIR = Path("output_logs") 45 OUTPUT_DIR = Path("output_logs")
@@ -42,6 +50,7 @@ LOG_DIR.mkdir(parents=True, exist_ok=True) @@ -42,6 +50,7 @@ LOG_DIR.mkdir(parents=True, exist_ok=True)
42 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 50 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
43 log_file = LOG_DIR / f"product_enrich_{timestamp}.log" 51 log_file = LOG_DIR / f"product_enrich_{timestamp}.log"
44 verbose_log_file = LOG_DIR / "product_enrich_verbose.log" 52 verbose_log_file = LOG_DIR / "product_enrich_verbose.log"
  53 +_logged_shared_context_keys: "OrderedDict[str, None]" = OrderedDict()
45 54
46 # 主日志 logger:执行流程、批次信息等 55 # 主日志 logger:执行流程、批次信息等
47 logger = logging.getLogger("product_enrich") 56 logger = logging.getLogger("product_enrich")
@@ -96,16 +105,11 @@ except Exception as e: @@ -96,16 +105,11 @@ except Exception as e:
96 logger.warning(f"Failed to initialize Redis for anchors cache: {e}") 105 logger.warning(f"Failed to initialize Redis for anchors cache: {e}")
97 _anchor_redis = None 106 _anchor_redis = None
98 107
99 -# 中文版本提示词(请勿删除):  
100 -# "你是一名电商平台的商品标注员,你的工作是对输入的每个商品进行理解、分析和标注,"  
101 -# "并按要求格式返回 Markdown 表格。所有输出内容必须为中文。"  
102 -  
103 -SYSTEM_MESSAGES = (  
104 - "You are a product annotator for an e-commerce platform. "  
105 - "For each input product, you must understand, analyze and label it, "  
106 - "and return a Markdown table strictly following the requested format. "  
107 - "All output must be in English."  
108 -) 108 +_missing_prompt_langs = sorted(set(SOURCE_LANG_CODE_MAP) - set(LANGUAGE_MARKDOWN_TABLE_HEADERS))
  109 +if _missing_prompt_langs:
  110 + raise RuntimeError(
  111 + f"Missing product_enrich prompt config for languages: {_missing_prompt_langs}"
  112 + )
109 113
110 114
111 def _make_anchor_cache_key( 115 def _make_anchor_cache_key(
@@ -153,108 +157,109 @@ def _set_cached_anchor_result( @@ -153,108 +157,109 @@ def _set_cached_anchor_result(
153 logger.warning(f"Failed to set anchor cache: {e}") 157 logger.warning(f"Failed to set anchor cache: {e}")
154 158
155 159
156 -def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str:  
157 - """根据目标语言创建 LLM 提示词和表头说明。 160 +def _build_assistant_prefix(headers: List[str]) -> str:
  161 + header_line = "| " + " | ".join(headers) + " |"
  162 + separator_line = "|" + "----|" * len(headers)
  163 + return f"{header_line}\n{separator_line}\n"
158 164
159 - 约定:  
160 - - 提示词始终使用英文;  
161 - - 当 target_lang == "en" 时,直接要求用英文分析并输出英文表头;  
162 - - 当 target_lang 为其他语言时,视作“多轮对话”的后续轮次:  
163 - * 默认上一轮已经用英文完成了分析;  
164 - * 当前轮只需要在保持结构和含义不变的前提下,将整张表格翻译为目标语言,  
165 - 包含表头与所有单元格内容。  
166 - """  
167 - lang_name = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang)  
168 -  
169 -# 中文版本提示词(请勿删除)  
170 -# prompt = """请对输入的每条商品标题,分析并提取以下信息:  
171 -  
172 -# 1. 商品标题:将输入商品名称翻译为自然、完整的中文商品标题  
173 -# 2. 品类路径:从大类到细分品类,用">"分隔(例如:服装>女装>裤子>工装裤)  
174 -# 3. 细分标签:商品的风格、特点、功能等(例如:碎花,收腰,法式)  
175 -# 4. 适用人群:性别/年龄段等(例如:年轻女性)  
176 -# 5. 使用场景  
177 -# 6. 适用季节  
178 -# 7. 关键属性  
179 -# 8. 材质说明  
180 -# 9. 功能特点  
181 -# 10. 商品卖点:分析和提取一句话核心卖点,用于推荐理由  
182 -# 11. 锚文本:生成一组能够代表该商品、并可能被用户用于搜索的词语或短语。这些词语应覆盖用户需求的各个维度,如品类、细分标签、功能特性、需求场景等等。  
183 -  
184 -# 输入商品列表:  
185 -  
186 -# """  
187 -# prompt_tail = """  
188 -# 请严格按照以下markdown表格格式返回,每列内部的多值内容都用逗号分隔,不要添加任何其他说明:  
189 -  
190 -# | 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 |  
191 -# |----|----|----|----|----|----|----|----|----|----|----|----|  
192 -# """  
193 -  
194 - prompt = """Please analyze each input product title and extract the following information:  
195 -  
196 -1. Product title: a natural English product name derived from the input title  
197 -2. Category path: from broad to fine-grained category, separated by ">" (e.g. Clothing>Women>Dresses>Work Dress)  
198 -3. Fine-grained tags: style / features / attributes (e.g. floral, waist-cinching, French style)  
199 -4. Target audience: gender / age group, etc. (e.g. young women)  
200 -5. Usage scene  
201 -6. Applicable season  
202 -7. Key attributes  
203 -8. Material description  
204 -9. Functional features  
205 -10. Selling point: one concise key selling sentence for recommendation  
206 -11. Anchor text: a set of words or phrases that could be used by users as search queries for this product, covering category, fine-grained tags, functional attributes, usage scenes, etc.  
207 -  
208 -Input product list:  
209 -  
210 -"""  
211 165
  166 +def _build_shared_context(products: List[Dict[str, str]]) -> str:
  167 + shared_context = SHARED_ANALYSIS_INSTRUCTION
212 for idx, product in enumerate(products, 1): 168 for idx, product in enumerate(products, 1):
213 - prompt += f'{idx}. {product["title"]}\n' 169 + shared_context += f'{idx}. {product["title"]}\n'
  170 + return shared_context
214 171
215 - if target_lang == "en":  
216 - # 英文首轮:直接要求英文表头 + 英文内容  
217 - prompt += """  
218 -Please strictly return a Markdown table in the following format. For any column that can contain multiple values, separate values with commas. Do not add any other explanations:  
219 172
220 -| No. | Product title | Category path | Fine-grained tags | Target audience | Usage scene | Season | Key attributes | Material | Features | Selling point | Anchor text |  
221 -|----|----|----|----|----|----|----|----|----|----|----|----|  
222 -"""  
223 - else:  
224 - # 非英文语言:视作“下一轮对话”,只做翻译,要求表头与内容全部用目标语言  
225 - prompt += f"""  
226 -Now we will output the same table in {lang_name}.  
227 -  
228 -IMPORTANT:  
229 -- Assume you have already generated the full table in English in a previous round.  
230 -- In this round, you must output exactly the same table structure and content,  
231 - but fully translated into {lang_name}, including ALL column headers and ALL cell values.  
232 -- Do NOT change the meaning, fields, or the number/order of rows and columns.  
233 -- Keep valid Markdown table syntax.  
234 -  
235 -Please return ONLY the Markdown table in {lang_name}, without any extra explanations.  
236 -""" 173 +def _hash_text(text: str) -> str:
  174 + return hashlib.md5((text or "").encode("utf-8")).hexdigest()[:12]
  175 +
  176 +
  177 +def _mark_shared_context_logged_once(shared_context_key: str) -> bool:
  178 + if shared_context_key in _logged_shared_context_keys:
  179 + _logged_shared_context_keys.move_to_end(shared_context_key)
  180 + return False
  181 +
  182 + _logged_shared_context_keys[shared_context_key] = None
  183 + if len(_logged_shared_context_keys) > LOGGED_SHARED_CONTEXT_CACHE_SIZE:
  184 + _logged_shared_context_keys.popitem(last=False)
  185 + return True
237 186
238 - return prompt  
239 187
  188 +def reset_logged_shared_context_keys() -> None:
  189 + """测试辅助:清理已记录的共享 prompt key。"""
  190 + _logged_shared_context_keys.clear()
240 191
241 -def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]:  
242 - """调用大模型API(带重试机制),按目标语言选择系统提示词。""" 192 +
  193 +def create_prompt(
  194 + products: List[Dict[str, str]],
  195 + target_lang: str = "zh",
  196 +) -> Tuple[Optional[str], Optional[str], Optional[str]]:
  197 + """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。"""
  198 + markdown_table_headers = LANGUAGE_MARKDOWN_TABLE_HEADERS.get(target_lang)
  199 + if not markdown_table_headers:
  200 + logger.warning(
  201 + "Unsupported target_lang for markdown table headers: %s",
  202 + target_lang,
  203 + )
  204 + return None, None, None
  205 + shared_context = _build_shared_context(products)
  206 + language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang)
  207 + user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip()
  208 + assistant_prefix = _build_assistant_prefix(markdown_table_headers)
  209 + return shared_context, user_prompt, assistant_prefix
  210 +
  211 +
  212 +def _merge_partial_response(assistant_prefix: str, generated_content: str) -> str:
  213 + """将 Partial Mode 的 assistant 前缀与补全文本拼成完整 markdown。"""
  214 + generated = (generated_content or "").lstrip()
  215 + prefix_lines = [line.strip() for line in assistant_prefix.strip().splitlines()]
  216 + generated_lines = generated.splitlines()
  217 +
  218 + if generated_lines:
  219 + first_line = generated_lines[0].strip()
  220 + if prefix_lines and first_line == prefix_lines[0]:
  221 + generated_lines = generated_lines[1:]
  222 + if generated_lines and len(prefix_lines) > 1 and generated_lines[0].strip() == prefix_lines[1]:
  223 + generated_lines = generated_lines[1:]
  224 + elif len(prefix_lines) > 1 and first_line == prefix_lines[1]:
  225 + generated_lines = generated_lines[1:]
  226 +
  227 + suffix = "\n".join(generated_lines).lstrip("\n")
  228 + if suffix:
  229 + return f"{assistant_prefix}{suffix}"
  230 + return assistant_prefix
  231 +
  232 +
  233 +def call_llm(
  234 + shared_context: str,
  235 + user_prompt: str,
  236 + assistant_prefix: str,
  237 + target_lang: str = "zh",
  238 +) -> Tuple[str, str]:
  239 + """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。"""
243 headers = { 240 headers = {
244 "Authorization": f"Bearer {API_KEY}", 241 "Authorization": f"Bearer {API_KEY}",
245 "Content-Type": "application/json", 242 "Content-Type": "application/json",
246 } 243 }
  244 + shared_context_key = _hash_text(shared_context)
  245 + localized_tail_key = _hash_text(f"{target_lang}\n{user_prompt}\n{assistant_prefix}")
  246 + combined_user_prompt = f"{shared_context.rstrip()}\n\n{user_prompt.strip()}"
247 247
248 payload = { 248 payload = {
249 "model": MODEL_NAME, 249 "model": MODEL_NAME,
250 "messages": [ 250 "messages": [
251 { 251 {
252 "role": "system", 252 "role": "system",
253 - "content": SYSTEM_MESSAGES, 253 + "content": SYSTEM_MESSAGE,
254 }, 254 },
255 { 255 {
256 "role": "user", 256 "role": "user",
257 - "content": prompt, 257 + "content": combined_user_prompt,
  258 + },
  259 + {
  260 + "role": "assistant",
  261 + "content": assistant_prefix,
  262 + "partial": True,
258 }, 263 },
259 ], 264 ],
260 "temperature": 0.3, 265 "temperature": 0.3,
@@ -266,16 +271,41 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: @@ -266,16 +271,41 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]:
266 "payload": payload, 271 "payload": payload,
267 } 272 }
268 273
269 - # 主日志 + 详尽日志:LLM Request  
270 - logger.info(f"\n{'=' * 80}")  
271 - logger.info(f"LLM Request (Model: {MODEL_NAME}):")  
272 - logger.info(json.dumps(request_data, ensure_ascii=False, indent=2))  
273 - logger.info(f"\nPrompt:\n{prompt}") 274 + if _mark_shared_context_logged_once(shared_context_key):
  275 + logger.info(f"\n{'=' * 80}")
  276 + logger.info(
  277 + "LLM Shared Context [model=%s, shared_key=%s, chars=%s] (logged once per process key)",
  278 + MODEL_NAME,
  279 + shared_context_key,
  280 + len(shared_context),
  281 + )
  282 + logger.info("\nSystem Message:\n%s", SYSTEM_MESSAGE)
  283 + logger.info("\nShared Context:\n%s", shared_context)
274 284
275 verbose_logger.info(f"\n{'=' * 80}") 285 verbose_logger.info(f"\n{'=' * 80}")
276 - verbose_logger.info(f"LLM Request (Model: {MODEL_NAME}):") 286 + verbose_logger.info(
  287 + "LLM Request [model=%s, lang=%s, shared_key=%s, tail_key=%s]:",
  288 + MODEL_NAME,
  289 + target_lang,
  290 + shared_context_key,
  291 + localized_tail_key,
  292 + )
277 verbose_logger.info(json.dumps(request_data, ensure_ascii=False, indent=2)) 293 verbose_logger.info(json.dumps(request_data, ensure_ascii=False, indent=2))
278 - verbose_logger.info(f"\nPrompt:\n{prompt}") 294 + verbose_logger.info(f"\nCombined User Prompt:\n{combined_user_prompt}")
  295 + verbose_logger.info(f"\nShared Context:\n{shared_context}")
  296 + verbose_logger.info(f"\nLocalized Requirement:\n{user_prompt}")
  297 + verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}")
  298 +
  299 + logger.info(
  300 + "\nLLM Request Variant [lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]",
  301 + target_lang,
  302 + shared_context_key,
  303 + localized_tail_key,
  304 + len(user_prompt),
  305 + len(assistant_prefix),
  306 + )
  307 + logger.info("\nLocalized Requirement:\n%s", user_prompt)
  308 + logger.info("\nAssistant Prefix:\n%s", assistant_prefix)
279 309
280 # 创建session,禁用代理 310 # 创建session,禁用代理
281 session = requests.Session() 311 session = requests.Session()
@@ -295,19 +325,37 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: @@ -295,19 +325,37 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]:
295 325
296 response.raise_for_status() 326 response.raise_for_status()
297 result = response.json() 327 result = response.json()
  328 + usage = result.get("usage") or {}
  329 +
  330 + verbose_logger.info(
  331 + "\nLLM Response [model=%s, lang=%s, shared_key=%s, tail_key=%s]:",
  332 + MODEL_NAME,
  333 + target_lang,
  334 + shared_context_key,
  335 + localized_tail_key,
  336 + )
  337 + verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2))
298 338
299 - # 主日志 + 详尽日志:LLM Response  
300 - logger.info(f"\nLLM Response:")  
301 - logger.info(json.dumps(result, ensure_ascii=False, indent=2)) 339 + generated_content = result["choices"][0]["message"]["content"]
  340 + full_markdown = _merge_partial_response(assistant_prefix, generated_content)
302 341
303 - verbose_logger.info(f"\nLLM Response:")  
304 - verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2)) 342 + logger.info(
  343 + "\nLLM Response Summary [lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]",
  344 + target_lang,
  345 + shared_context_key,
  346 + localized_tail_key,
  347 + len(generated_content or ""),
  348 + usage.get("completion_tokens"),
  349 + usage.get("prompt_tokens"),
  350 + usage.get("total_tokens"),
  351 + )
  352 + logger.info("\nGenerated Content:\n%s", generated_content)
  353 + logger.info("\nMerged Markdown:\n%s", full_markdown)
305 354
306 - content = result["choices"][0]["message"]["content"]  
307 - logger.info(f"\nExtracted Content:\n{content}")  
308 - verbose_logger.info(f"\nExtracted Content:\n{content}") 355 + verbose_logger.info(f"\nGenerated Content:\n{generated_content}")
  356 + verbose_logger.info(f"\nMerged Markdown:\n{full_markdown}")
309 357
310 - return content, json.dumps(result, ensure_ascii=False) 358 + return full_markdown, json.dumps(result, ensure_ascii=False)
311 359
312 except requests.exceptions.ProxyError as e: 360 except requests.exceptions.ProxyError as e:
313 logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Proxy error - {str(e)}") 361 logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Proxy error - {str(e)}")
@@ -385,6 +433,39 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: @@ -385,6 +433,39 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:
385 return data 433 return data
386 434
387 435
  436 +def _log_parsed_result_quality(
  437 + batch_data: List[Dict[str, str]],
  438 + parsed_results: List[Dict[str, str]],
  439 + target_lang: str,
  440 + batch_num: int,
  441 +) -> None:
  442 + expected = len(batch_data)
  443 + actual = len(parsed_results)
  444 + if actual != expected:
  445 + logger.warning(
  446 + "Parsed row count mismatch for batch=%s lang=%s: expected=%s actual=%s",
  447 + batch_num,
  448 + target_lang,
  449 + expected,
  450 + actual,
  451 + )
  452 +
  453 + missing_anchor = sum(1 for item in parsed_results if not str(item.get("anchor_text") or "").strip())
  454 + missing_category = sum(1 for item in parsed_results if not str(item.get("category_path") or "").strip())
  455 + missing_title = sum(1 for item in parsed_results if not str(item.get("title") or "").strip())
  456 +
  457 + logger.info(
  458 + "Parsed Quality Summary [batch=%s, lang=%s]: rows=%s/%s, missing_title=%s, missing_category=%s, missing_anchor=%s",
  459 + batch_num,
  460 + target_lang,
  461 + actual,
  462 + expected,
  463 + missing_title,
  464 + missing_category,
  465 + missing_anchor,
  466 + )
  467 +
  468 +
388 def process_batch( 469 def process_batch(
389 batch_data: List[Dict[str, str]], 470 batch_data: List[Dict[str, str]],
390 batch_num: int, 471 batch_num: int,
@@ -395,14 +476,52 @@ def process_batch( @@ -395,14 +476,52 @@ def process_batch(
395 logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)") 476 logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)")
396 477
397 # 创建提示词 478 # 创建提示词
398 - prompt = create_prompt(batch_data, target_lang=target_lang) 479 + shared_context, user_prompt, assistant_prefix = create_prompt(
  480 + batch_data,
  481 + target_lang=target_lang,
  482 + )
  483 +
  484 + # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM
  485 + if shared_context is None or user_prompt is None or assistant_prefix is None:
  486 + logger.error(
  487 + "Failed to create prompt for batch %s, target_lang=%s; "
  488 + "marking entire batch as failed without calling LLM",
  489 + batch_num,
  490 + target_lang,
  491 + )
  492 + return [
  493 + {
  494 + "id": item["id"],
  495 + "lang": target_lang,
  496 + "title_input": item.get("title", ""),
  497 + "title": "",
  498 + "category_path": "",
  499 + "tags": "",
  500 + "target_audience": "",
  501 + "usage_scene": "",
  502 + "season": "",
  503 + "key_attributes": "",
  504 + "material": "",
  505 + "features": "",
  506 + "selling_points": "",
  507 + "anchor_text": "",
  508 + "error": f"prompt_creation_failed: unsupported target_lang={target_lang}",
  509 + }
  510 + for item in batch_data
  511 + ]
399 512
400 # 调用LLM 513 # 调用LLM
401 try: 514 try:
402 - raw_response, full_response_json = call_llm(prompt, target_lang=target_lang) 515 + raw_response, full_response_json = call_llm(
  516 + shared_context,
  517 + user_prompt,
  518 + assistant_prefix,
  519 + target_lang=target_lang,
  520 + )
403 521
404 # 解析结果 522 # 解析结果
405 parsed_results = parse_markdown_table(raw_response) 523 parsed_results = parse_markdown_table(raw_response)
  524 + _log_parsed_result_quality(batch_data, parsed_results, target_lang, batch_num)
406 525
407 logger.info(f"\nParsed Results ({len(parsed_results)} items):") 526 logger.info(f"\nParsed Results ({len(parsed_results)} items):")
408 logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2)) 527 logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2))
@@ -541,4 +660,3 @@ def analyze_products( @@ -541,4 +660,3 @@ def analyze_products(
541 pass 660 pass
542 661
543 return all_results 662 return all_results
544 -  
indexer/product_enrich_prompts.py 0 → 100644
@@ -0,0 +1,556 @@ @@ -0,0 +1,556 @@
  1 +#!/usr/bin/env python3
  2 +
  3 +from typing import Any, Dict, List
  4 +
  5 +SYSTEM_MESSAGE = (
  6 + "You are an e-commerce product annotator. "
  7 + "Continue the provided assistant Markdown table prefix. "
  8 + "Do not repeat or modify the prefix, and do not add explanations outside the table."
  9 +)
  10 +
  11 +SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product title and fill these columns:
  12 +
  13 +1. Product title: a natural localized product name derived from the input title
  14 +2. Category path: broad to fine-grained category, separated by ">"
  15 +3. Fine-grained tags: style, features, functions, or notable attributes
  16 +4. Target audience: gender, age group, or suitable users
  17 +5. Usage scene
  18 +6. Applicable season
  19 +7. Key attributes
  20 +8. Material description
  21 +9. Functional features
  22 +10. Selling point: one concise core selling phrase
  23 +11. Anchor text: a set of search-oriented words or phrases covering category, attributes, scenes, and demand
  24 +
  25 +Rules:
  26 +- Keep the input order and row count exactly the same.
  27 +- Infer from the title only; if uncertain, prefer concise and broadly correct ecommerce wording.
  28 +- Keep category paths concise and use ">" as the separator.
  29 +- For columns with multiple values, the localized output requirement will define the delimiter.
  30 +
  31 +Input product list:
  32 +"""
  33 +
  34 +USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation.
  35 +Language: {language}"""
  36 +
  37 +LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, List[str]] = {
  38 + "en": [
  39 + "No.",
  40 + "Product title",
  41 + "Category path",
  42 + "Fine-grained tags",
  43 + "Target audience",
  44 + "Usage scene",
  45 + "Season",
  46 + "Key attributes",
  47 + "Material",
  48 + "Features",
  49 + "Selling point",
  50 + "Anchor text"
  51 + ],
  52 + "zh": [
  53 + "序号",
  54 + "商品标题",
  55 + "品类路径",
  56 + "细分标签",
  57 + "适用人群",
  58 + "使用场景",
  59 + "适用季节",
  60 + "关键属性",
  61 + "材质说明",
  62 + "功能特点",
  63 + "商品卖点",
  64 + "锚文本"
  65 + ],
  66 + "zh_tw": [
  67 + "序號",
  68 + "商品標題",
  69 + "品類路徑",
  70 + "細分標籤",
  71 + "適用人群",
  72 + "使用場景",
  73 + "適用季節",
  74 + "關鍵屬性",
  75 + "材質說明",
  76 + "功能特點",
  77 + "商品賣點",
  78 + "錨文本"
  79 + ],
  80 + "ru": [
  81 + "№",
  82 + "Название товара",
  83 + "Путь категории",
  84 + "Детализированные теги",
  85 + "Целевая аудитория",
  86 + "Сценарий использования",
  87 + "Сезон",
  88 + "Ключевые атрибуты",
  89 + "Материал",
  90 + "Особенности",
  91 + "Преимущество товара",
  92 + "Анкорный текст"
  93 + ],
  94 + "ja": [
  95 + "番号",
  96 + "商品タイトル",
  97 + "カテゴリパス",
  98 + "詳細タグ",
  99 + "対象ユーザー",
  100 + "利用シーン",
  101 + "季節",
  102 + "主要属性",
  103 + "素材",
  104 + "機能特徴",
  105 + "商品の訴求点",
  106 + "アンカーテキスト"
  107 + ],
  108 + "ko": [
  109 + "번호",
  110 + "상품 제목",
  111 + "카테고리 경로",
  112 + "세부 태그",
  113 + "대상 고객",
  114 + "사용 장면",
  115 + "계절",
  116 + "핵심 속성",
  117 + "소재",
  118 + "기능 특징",
  119 + "상품 포인트",
  120 + "앵커 텍스트"
  121 + ],
  122 + "es": [
  123 + "N.º",
  124 + "Titulo del producto",
  125 + "Ruta de categoria",
  126 + "Etiquetas detalladas",
  127 + "Publico objetivo",
  128 + "Escenario de uso",
  129 + "Temporada",
  130 + "Atributos clave",
  131 + "Material",
  132 + "Caracteristicas",
  133 + "Punto de venta",
  134 + "Texto ancla"
  135 + ],
  136 + "fr": [
  137 + "N°",
  138 + "Titre du produit",
  139 + "Chemin de categorie",
  140 + "Etiquettes detaillees",
  141 + "Public cible",
  142 + "Scenario d'utilisation",
  143 + "Saison",
  144 + "Attributs cles",
  145 + "Matiere",
  146 + "Caracteristiques",
  147 + "Argument de vente",
  148 + "Texte d'ancrage"
  149 + ],
  150 + "pt": [
  151 + "Nº",
  152 + "Titulo do produto",
  153 + "Caminho da categoria",
  154 + "Tags detalhadas",
  155 + "Publico-alvo",
  156 + "Cenario de uso",
  157 + "Estacao",
  158 + "Atributos principais",
  159 + "Material",
  160 + "Caracteristicas",
  161 + "Ponto de venda",
  162 + "Texto ancora"
  163 + ],
  164 + "de": [
  165 + "Nr.",
  166 + "Produkttitel",
  167 + "Kategoriepfad",
  168 + "Detaillierte Tags",
  169 + "Zielgruppe",
  170 + "Nutzungsszenario",
  171 + "Saison",
  172 + "Wichtige Attribute",
  173 + "Material",
  174 + "Funktionen",
  175 + "Verkaufsargument",
  176 + "Ankertext"
  177 + ],
  178 + "it": [
  179 + "N.",
  180 + "Titolo del prodotto",
  181 + "Percorso categoria",
  182 + "Tag dettagliati",
  183 + "Pubblico target",
  184 + "Scenario d'uso",
  185 + "Stagione",
  186 + "Attributi chiave",
  187 + "Materiale",
  188 + "Caratteristiche",
  189 + "Punto di forza",
  190 + "Testo ancora"
  191 + ],
  192 + "th": [
  193 + "ลำดับ",
  194 + "ชื่อสินค้า",
  195 + "เส้นทางหมวดหมู่",
  196 + "แท็กย่อย",
  197 + "กลุ่มเป้าหมาย",
  198 + "สถานการณ์การใช้งาน",
  199 + "ฤดูกาล",
  200 + "คุณสมบัติสำคัญ",
  201 + "วัสดุ",
  202 + "คุณสมบัติการใช้งาน",
  203 + "จุดขายสินค้า",
  204 + "แองเคอร์เท็กซ์"
  205 + ],
  206 + "vi": [
  207 + "STT",
  208 + "Tieu de san pham",
  209 + "Duong dan danh muc",
  210 + "The chi tiet",
  211 + "Doi tuong phu hop",
  212 + "Boi canh su dung",
  213 + "Mua phu hop",
  214 + "Thuoc tinh chinh",
  215 + "Chat lieu",
  216 + "Tinh nang",
  217 + "Diem ban hang",
  218 + "Van ban neo"
  219 + ],
  220 + "id": [
  221 + "No.",
  222 + "Judul produk",
  223 + "Jalur kategori",
  224 + "Tag terperinci",
  225 + "Target pengguna",
  226 + "Skenario penggunaan",
  227 + "Musim",
  228 + "Atribut utama",
  229 + "Bahan",
  230 + "Fitur",
  231 + "Nilai jual",
  232 + "Teks jangkar"
  233 + ],
  234 + "ms": [
  235 + "No.",
  236 + "Tajuk produk",
  237 + "Laluan kategori",
  238 + "Tag terperinci",
  239 + "Sasaran pengguna",
  240 + "Senario penggunaan",
  241 + "Musim",
  242 + "Atribut utama",
  243 + "Bahan",
  244 + "Ciri-ciri",
  245 + "Nilai jual",
  246 + "Teks sauh"
  247 + ],
  248 + "ar": [
  249 + "الرقم",
  250 + "عنوان المنتج",
  251 + "مسار الفئة",
  252 + "الوسوم التفصيلية",
  253 + "الفئة المستهدفة",
  254 + "سيناريو الاستخدام",
  255 + "الموسم",
  256 + "السمات الرئيسية",
  257 + "المادة",
  258 + "الميزات",
  259 + "نقطة البيع",
  260 + "نص الربط"
  261 + ],
  262 + "hi": [
  263 + "क्रमांक",
  264 + "उत्पाद शीर्षक",
  265 + "श्रेणी पथ",
  266 + "विस्तृत टैग",
  267 + "लक्षित उपभोक्ता",
  268 + "उपयोग परिदृश्य",
  269 + "मौसम",
  270 + "मुख्य गुण",
  271 + "सामग्री",
  272 + "विशेषताएं",
  273 + "बिक्री बिंदु",
  274 + "एंकर टेक्स्ट"
  275 + ],
  276 + "he": [
  277 + "מס׳",
  278 + "כותרת המוצר",
  279 + "נתיב קטגוריה",
  280 + "תגיות מפורטות",
  281 + "קהל יעד",
  282 + "תרחיש שימוש",
  283 + "עונה",
  284 + "מאפיינים מרכזיים",
  285 + "חומר",
  286 + "תכונות",
  287 + "נקודת מכירה",
  288 + "טקסט עוגן"
  289 + ],
  290 + "my": [
  291 + "အမှတ်စဉ်",
  292 + "ကုန်ပစ္စည်းခေါင်းစဉ်",
  293 + "အမျိုးအစားလမ်းကြောင်း",
  294 + "အသေးစိတ်တဂ်များ",
  295 + "ပစ်မှတ်အသုံးပြုသူ",
  296 + "အသုံးပြုမှုအခြေအနေ",
  297 + "ရာသီ",
  298 + "အဓိကဂုဏ်သတ္တိများ",
  299 + "ပစ္စည်း",
  300 + "လုပ်ဆောင်ချက်များ",
  301 + "အရောင်းထူးခြားချက်",
  302 + "အန်ကာစာသား"
  303 + ],
  304 + "ta": [
  305 + "எண்",
  306 + "தயாரிப்பு தலைப்பு",
  307 + "வகை பாதை",
  308 + "விரிவான குறிச்சொற்கள்",
  309 + "இலக்கு பயனர்கள்",
  310 + "பயன்பாட்டு நிலை",
  311 + "பருவம்",
  312 + "முக்கிய பண்புகள்",
  313 + "பொருள்",
  314 + "அம்சங்கள்",
  315 + "விற்பனை அம்சம்",
  316 + "ஆங்கர் உரை"
  317 + ],
  318 + "ur": [
  319 + "نمبر",
  320 + "پروڈکٹ عنوان",
  321 + "زمرہ راستہ",
  322 + "تفصیلی ٹیگز",
  323 + "ہدف صارفین",
  324 + "استعمال کا منظر",
  325 + "موسم",
  326 + "کلیدی خصوصیات",
  327 + "مواد",
  328 + "فیچرز",
  329 + "فروختی نقطہ",
  330 + "اینکر ٹیکسٹ"
  331 + ],
  332 + "bn": [
  333 + "ক্রম",
  334 + "পণ্যের শিরোনাম",
  335 + "শ্রেণি পথ",
  336 + "বিস্তারিত ট্যাগ",
  337 + "লক্ষ্য ব্যবহারকারী",
  338 + "ব্যবহারের দৃশ্য",
  339 + "মৌসুম",
  340 + "মূল বৈশিষ্ট্য",
  341 + "উপাদান",
  342 + "ফিচার",
  343 + "বিক্রয় পয়েন্ট",
  344 + "অ্যাঙ্কর টেক্সট"
  345 + ],
  346 + "pl": [
  347 + "Nr",
  348 + "Tytul produktu",
  349 + "Sciezka kategorii",
  350 + "Szczegolowe tagi",
  351 + "Grupa docelowa",
  352 + "Scenariusz uzycia",
  353 + "Sezon",
  354 + "Kluczowe atrybuty",
  355 + "Material",
  356 + "Cechy",
  357 + "Atut sprzedazowy",
  358 + "Tekst kotwicy"
  359 + ],
  360 + "nl": [
  361 + "Nr.",
  362 + "Producttitel",
  363 + "Categoriepad",
  364 + "Gedetailleerde tags",
  365 + "Doelgroep",
  366 + "Gebruikscontext",
  367 + "Seizoen",
  368 + "Belangrijke kenmerken",
  369 + "Materiaal",
  370 + "Functies",
  371 + "Verkooppunt",
  372 + "Ankertekst"
  373 + ],
  374 + "ro": [
  375 + "Nr.",
  376 + "Titlul produsului",
  377 + "Calea categoriei",
  378 + "Etichete detaliate",
  379 + "Public tinta",
  380 + "Scenariu de utilizare",
  381 + "Sezon",
  382 + "Atribute cheie",
  383 + "Material",
  384 + "Caracteristici",
  385 + "Punct de vanzare",
  386 + "Text ancora"
  387 + ],
  388 + "tr": [
  389 + "No.",
  390 + "Urun basligi",
  391 + "Kategori yolu",
  392 + "Ayrintili etiketler",
  393 + "Hedef kitle",
  394 + "Kullanim senaryosu",
  395 + "Sezon",
  396 + "Temel ozellikler",
  397 + "Malzeme",
  398 + "Ozellikler",
  399 + "Satis noktasi",
  400 + "Capa metni"
  401 + ],
  402 + "km": [
  403 + "ល.រ",
  404 + "ចំណងជើងផលិតផល",
  405 + "ផ្លូវប្រភេទ",
  406 + "ស្លាកលម្អិត",
  407 + "ក្រុមអ្នកប្រើគោលដៅ",
  408 + "សេណារីយ៉ូប្រើប្រាស់",
  409 + "រដូវកាល",
  410 + "លក្ខណៈសម្បត្តិសំខាន់",
  411 + "សម្ភារៈ",
  412 + "មុខងារ",
  413 + "ចំណុចលក់",
  414 + "អត្ថបទអង់ក័រ"
  415 + ],
  416 + "lo": [
  417 + "ລຳດັບ",
  418 + "ຊື່ສິນຄ້າ",
  419 + "ເສັ້ນທາງໝວດໝູ່",
  420 + "ແທັກລະອຽດ",
  421 + "ກຸ່ມເປົ້າໝາຍ",
  422 + "ສະຖານະການໃຊ້ງານ",
  423 + "ລະດູການ",
  424 + "ຄຸນລັກສະນະສຳຄັນ",
  425 + "ວັດສະດຸ",
  426 + "ຄຸນສົມບັດ",
  427 + "ຈຸດຂາຍ",
  428 + "ຂໍ້ຄວາມອັງເຄີ"
  429 + ],
  430 + "yue": [
  431 + "序號",
  432 + "商品標題",
  433 + "品類路徑",
  434 + "細分類標籤",
  435 + "適用人群",
  436 + "使用場景",
  437 + "適用季節",
  438 + "關鍵屬性",
  439 + "材質說明",
  440 + "功能特點",
  441 + "商品賣點",
  442 + "錨文本"
  443 + ],
  444 + "cs": [
  445 + "C.",
  446 + "Nazev produktu",
  447 + "Cesta kategorie",
  448 + "Podrobne stitky",
  449 + "Cilova skupina",
  450 + "Scenar pouziti",
  451 + "Sezona",
  452 + "Klicove atributy",
  453 + "Material",
  454 + "Vlastnosti",
  455 + "Prodejni argument",
  456 + "Kotvici text"
  457 + ],
  458 + "el": [
  459 + "Α/Α",
  460 + "Τίτλος προϊόντος",
  461 + "Διαδρομή κατηγορίας",
  462 + "Αναλυτικές ετικέτες",
  463 + "Κοινό-στόχος",
  464 + "Σενάριο χρήσης",
  465 + "Εποχή",
  466 + "Βασικά χαρακτηριστικά",
  467 + "Υλικό",
  468 + "Λειτουργίες",
  469 + "Σημείο πώλησης",
  470 + "Κείμενο άγκυρας"
  471 + ],
  472 + "sv": [
  473 + "Nr",
  474 + "Produkttitel",
  475 + "Kategorisokvag",
  476 + "Detaljerade taggar",
  477 + "Malgrupp",
  478 + "Anvandningsscenario",
  479 + "Sasong",
  480 + "Viktiga attribut",
  481 + "Material",
  482 + "Funktioner",
  483 + "Saljpunkt",
  484 + "Ankartext"
  485 + ],
  486 + "hu": [
  487 + "Sorszam",
  488 + "Termekcim",
  489 + "Kategoriavonal",
  490 + "Reszletes cimkek",
  491 + "Celcsoport",
  492 + "Hasznalati helyzet",
  493 + "Evszak",
  494 + "Fo jellemzok",
  495 + "Anyag",
  496 + "Funkciok",
  497 + "Ertekesitesi elony",
  498 + "Horgonyszoveg"
  499 + ],
  500 + "da": [
  501 + "Nr.",
  502 + "Produkttitel",
  503 + "Kategoristi",
  504 + "Detaljerede tags",
  505 + "Malgruppe",
  506 + "Brugsscenarie",
  507 + "Saeson",
  508 + "Nogleattributter",
  509 + "Materiale",
  510 + "Funktioner",
  511 + "Salgsargument",
  512 + "Ankertekst"
  513 + ],
  514 + "fi": [
  515 + "Nro",
  516 + "Tuotteen nimi",
  517 + "Kategoriapolku",
  518 + "Yksityiskohtaiset tunnisteet",
  519 + "Kohdeyleiso",
  520 + "Kayttotilanne",
  521 + "Kausi",
  522 + "Keskeiset ominaisuudet",
  523 + "Materiaali",
  524 + "Ominaisuudet",
  525 + "Myyntivaltti",
  526 + "Ankkuriteksti"
  527 + ],
  528 + "uk": [
  529 + "№",
  530 + "Назва товару",
  531 + "Шлях категорії",
  532 + "Детальні теги",
  533 + "Цільова аудиторія",
  534 + "Сценарій використання",
  535 + "Сезон",
  536 + "Ключові атрибути",
  537 + "Матеріал",
  538 + "Особливості",
  539 + "Продаюча перевага",
  540 + "Анкорний текст"
  541 + ],
  542 + "bg": [
  543 + "№",
  544 + "Заглавие на продукта",
  545 + "Път на категорията",
  546 + "Подробни тагове",
  547 + "Целева аудитория",
  548 + "Сценарий на употреба",
  549 + "Сезон",
  550 + "Ключови атрибути",
  551 + "Материал",
  552 + "Характеристики",
  553 + "Търговско предимство",
  554 + "Анкор текст"
  555 + ]
  556 +}
0 \ No newline at end of file 557 \ No newline at end of file
tests/test_product_enrich_partial_mode.py 0 → 100644
@@ -0,0 +1,229 @@ @@ -0,0 +1,229 @@
  1 +from __future__ import annotations
  2 +
  3 +import importlib.util
  4 +import io
  5 +import json
  6 +import logging
  7 +import sys
  8 +import types
  9 +from pathlib import Path
  10 +from unittest import mock
  11 +
  12 +
def _load_product_enrich_module():
    """Import ``indexer/product_enrich.py`` directly from its file path.

    Third-party modules the target imports at module level (``dotenv`` and
    ``redis``) are replaced with minimal in-memory stubs when not already
    importable, so running the tests does not require those packages.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: if a loadable spec cannot be built for the file.
    """
    if "dotenv" not in sys.modules:
        fake_dotenv = types.ModuleType("dotenv")
        fake_dotenv.load_dotenv = lambda *args, **kwargs: None
        sys.modules["dotenv"] = fake_dotenv

    if "redis" not in sys.modules:
        fake_redis = types.ModuleType("redis")

        class _FakeRedisClient:
            def __init__(self, *args, **kwargs):
                pass

            def ping(self):
                # Mimic a healthy connection so module-level ping checks pass.
                return True

        fake_redis.Redis = _FakeRedisClient
        sys.modules["redis"] = fake_redis

    # Make the repo root importable so the target's absolute imports resolve.
    repo_root = Path(__file__).resolve().parents[1]
    if str(repo_root) not in sys.path:
        sys.path.insert(0, str(repo_root))

    module_path = repo_root / "indexer" / "product_enrich.py"
    spec = importlib.util.spec_from_file_location("product_enrich_under_test", module_path)
    # Validate BEFORE using the spec: the original asserted only after
    # module_from_spec(), which would already have crashed on a None spec,
    # and assert is stripped under ``python -O``.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build import spec for {module_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before exec_module, per the importlib recipe, so the module
    # can be found under its own name while its top level executes.
    sys.modules[spec.name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Don't leave a half-initialized module behind on failure.
        sys.modules.pop(spec.name, None)
        raise
    return module
  42 +
  43 +
# Load the module under test exactly once at import time; every test below
# shares this module object (and therefore its logger/handler state).
product_enrich = _load_product_enrich_module()
  45 +
  46 +
  47 +def _attach_stream(logger_obj: logging.Logger):
  48 + stream = io.StringIO()
  49 + handler = logging.StreamHandler(stream)
  50 + handler.setFormatter(logging.Formatter("%(message)s"))
  51 + logger_obj.addHandler(handler)
  52 + return stream, handler
  53 +
  54 +
def test_create_prompt_splits_shared_context_and_localized_tail():
    """create_prompt keeps the product list in a language-agnostic shared
    part and isolates the per-language directive in the user tail."""
    sample_products = [
        {"id": "1", "title": "dress"},
        {"id": "2", "title": "linen shirt"},
    ]

    shared_zh, user_zh, prefix_zh = product_enrich.create_prompt(
        sample_products, target_lang="zh"
    )
    shared_en, user_en, prefix_en = product_enrich.create_prompt(
        sample_products, target_lang="en"
    )

    # Shared context is identical for both languages and carries the list.
    assert shared_zh == shared_en
    for fragment in ("Analyze each input product title", "1. dress", "2. linen shirt"):
        assert fragment in shared_zh

    # The localized tail holds only the language directive — no product list.
    for localized_tail in (user_zh, user_en):
        assert "Product list" not in localized_tail
    assert "specified language" in user_zh
    assert "Language: Chinese" in user_zh
    assert "Language: English" in user_en

    # Each table prefix opens with headers in the requested language.
    assert prefix_zh.startswith("| 序号 | 商品标题 | 品类路径 |")
    assert prefix_en.startswith("| No. | Product title | Category path |")
  75 +
  76 +
def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests():
    """End-to-end check of call_llm against a faked HTTP session.

    Verifies that two language variants sharing one context (a) send the
    shared product list plus a per-language tail and a trailing ``partial``
    assistant message, (b) log the shared context only once on the main
    logger, and (c) log full request/response payloads on the verbose logger.
    """
    # Captured request bodies; the fake session appends each POST's ``json``.
    payloads = []
    # Canned API responses, consumed in order: first zh, then en.
    response_bodies = [
        {
            "choices": [
                {
                    "message": {
                        "content": (
                            "| 1 | 连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | "
                            "通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | "
                            "修身显瘦 | 法式收腰连衣裙 |\n"
                        )
                    }
                }
            ],
            "usage": {"prompt_tokens": 120, "completion_tokens": 45, "total_tokens": 165},
        },
        {
            "choices": [
                {
                    "message": {
                        "content": (
                            "| 1 | Dress | Women>Dress | French,Waisted | Young women | "
                            "Commute,Date | Spring,Summer | Midi | Polyester | Breathable | "
                            "Slim fit | French waisted dress |\n"
                        )
                    }
                }
            ],
            "usage": {"prompt_tokens": 118, "completion_tokens": 43, "total_tokens": 161},
        },
    ]

    # Minimal stand-in for a requests.Response: just enough surface for call_llm.
    class _FakeResponse:
        def __init__(self, body):
            self.body = body

        def raise_for_status(self):
            return None

        def json(self):
            return self.body

    # Stand-in for requests.Session; ``trust_env`` presumably mirrors the real
    # attribute read by call_llm — confirm against the implementation.
    class _FakeSession:
        trust_env = True

        def post(self, url, headers=None, json=None, timeout=None, proxies=None):
            del url, headers, timeout, proxies
            payloads.append(json)
            # Serve responses in arrival order: Nth request gets Nth body.
            return _FakeResponse(response_bodies[len(payloads) - 1])

        def close(self):
            return None

    # Reset the once-per-context dedup state so the "logged once" count below
    # is not polluted by earlier tests against the shared module object.
    product_enrich.reset_logged_shared_context_keys()
    main_stream, main_handler = _attach_stream(product_enrich.logger)
    verbose_stream, verbose_handler = _attach_stream(product_enrich.verbose_logger)

    try:
        # Patch the API key and swap the HTTP session for the fake above.
        with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
            product_enrich.requests,
            "Session",
            lambda: _FakeSession(),
        ):
            zh_shared, zh_user, zh_prefix = product_enrich.create_prompt(
                [{"id": "1", "title": "dress"}],
                target_lang="zh",
            )
            en_shared, en_user, en_prefix = product_enrich.create_prompt(
                [{"id": "1", "title": "dress"}],
                target_lang="en",
            )

            zh_markdown, zh_raw = product_enrich.call_llm(
                zh_shared,
                zh_user,
                zh_prefix,
                target_lang="zh",
            )
            en_markdown, en_raw = product_enrich.call_llm(
                en_shared,
                en_user,
                en_prefix,
                target_lang="en",
            )
    finally:
        # Always detach the capture handlers, even if an assertion above raised.
        product_enrich.logger.removeHandler(main_handler)
        product_enrich.verbose_logger.removeHandler(verbose_handler)

    assert zh_shared == en_shared
    # One HTTP request per language variant.
    assert len(payloads) == 2
    # system + combined user + partial assistant prefill.
    assert len(payloads[0]["messages"]) == 3
    assert payloads[0]["messages"][1]["role"] == "user"
    # The user message carries both the shared list and the language tail.
    assert "1. dress" in payloads[0]["messages"][1]["content"]
    assert "Language: Chinese" in payloads[0]["messages"][1]["content"]
    assert "Language: English" in payloads[1]["messages"][1]["content"]
    # Last message is the table-prefix prefill flagged as partial.
    assert payloads[0]["messages"][-1]["partial"] is True
    assert payloads[1]["messages"][-1]["partial"] is True

    main_log = main_stream.getvalue()
    verbose_log = verbose_stream.getvalue()

    # Shared context deduplicated: logged once despite two requests.
    assert main_log.count("LLM Shared Context") == 1
    assert main_log.count("LLM Request Variant") == 2
    assert "Localized Requirement" in main_log
    assert "Shared Context" in main_log

    # Verbose log carries the full request and response per variant.
    assert verbose_log.count("LLM Request [model=") == 2
    assert verbose_log.count("LLM Response [model=") == 2
    assert '"partial": true' in verbose_log
    assert "Combined User Prompt" in verbose_log
    assert "French waisted dress" in verbose_log
    assert "法式收腰连衣裙" in verbose_log

    # The returned markdown is the prefix prefill plus the model continuation.
    assert zh_markdown.startswith(zh_prefix)
    assert en_markdown.startswith(en_prefix)
    assert json.loads(zh_raw)["usage"]["total_tokens"] == 165
    assert json.loads(en_raw)["usage"]["total_tokens"] == 161
  195 +
  196 +
def test_process_batch_reads_result_and_validates_expected_fields():
    """process_batch parses the markdown table from call_llm into one
    enriched row per input product, keyed by the expected field names."""
    merged_markdown = """| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 |
|----|----|----|----|----|----|----|----|----|----|----|----|
| 1 | 法式连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | 通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | 修身显瘦 | 法式收腰连衣裙 |
"""

    stub_raw = json.dumps({"choices": [{"message": {"content": "stub"}}]})
    # Stub out the network call entirely; only the parsing path runs.
    with mock.patch.object(product_enrich, "call_llm", return_value=(merged_markdown, stub_raw)):
        results = product_enrich.process_batch(
            [{"id": "sku-1", "title": "dress"}],
            batch_num=1,
            target_lang="zh",
        )

    assert len(results) == 1
    row = results[0]
    expected_fields = {
        "id": "sku-1",
        "lang": "zh",
        "title_input": "dress",
        "title": "法式连衣裙",
        "category_path": "女装>连衣裙",
        "tags": "法式,收腰",
        "target_audience": "年轻女性",
        "usage_scene": "通勤,约会",
        "season": "春季,夏季",
        "key_attributes": "中长款",
        "material": "聚酯纤维",
        "features": "透气",
        "selling_points": "修身显瘦",
        "anchor_text": "法式收腰连衣裙",
    }
    for field_name, expected_value in expected_fields.items():
        assert row[field_name] == expected_value