Commit a73a751f2d90d4544a7146eea38b8b68a736c98b
1 parent
e56fbdc1
enrich
Showing
4 changed files
with
1019 additions
and
111 deletions
Show diff stats
config/config_loader.py
| ... | ... | @@ -301,7 +301,12 @@ class ConfigLoader: |
| 301 | 301 | |
| 302 | 302 | # Parse tenant config |
| 303 | 303 | tenant_config_data = config_data.get("tenant_config", {}) |
| 304 | - | |
| 304 | + | |
| 305 | + # Parse extensible services/provider registry | |
| 306 | + services_data = config_data.get("services", {}) or {} | |
| 307 | + if not isinstance(services_data, dict): | |
| 308 | + raise ConfigurationError("services must be a dictionary if provided") | |
| 309 | + | |
| 305 | 310 | return SearchConfig( |
| 306 | 311 | field_boosts=field_boosts, |
| 307 | 312 | indexes=indexes, | ... | ... |
indexer/product_enrich.py
| ... | ... | @@ -11,6 +11,7 @@ import json |
| 11 | 11 | import logging |
| 12 | 12 | import time |
| 13 | 13 | import hashlib |
| 14 | +from collections import OrderedDict | |
| 14 | 15 | from datetime import datetime |
| 15 | 16 | from typing import List, Dict, Tuple, Any, Optional |
| 16 | 17 | |
| ... | ... | @@ -20,6 +21,12 @@ from pathlib import Path |
| 20 | 21 | |
| 21 | 22 | from config.env_config import REDIS_CONFIG |
| 22 | 23 | from config.tenant_config_loader import SOURCE_LANG_CODE_MAP |
| 24 | +from indexer.product_enrich_prompts import ( | |
| 25 | + SYSTEM_MESSAGE, | |
| 26 | + USER_INSTRUCTION_TEMPLATE, | |
| 27 | + LANGUAGE_MARKDOWN_TABLE_HEADERS, | |
| 28 | + SHARED_ANALYSIS_INSTRUCTION, | |
| 29 | +) | |
| 23 | 30 | |
| 24 | 31 | # 配置 |
| 25 | 32 | BATCH_SIZE = 20 |
| ... | ... | @@ -32,6 +39,7 @@ API_KEY = os.environ.get("DASHSCOPE_API_KEY") |
| 32 | 39 | MAX_RETRIES = 3 |
| 33 | 40 | RETRY_DELAY = 5 # 秒 |
| 34 | 41 | REQUEST_TIMEOUT = 180 # 秒 |
| 42 | +LOGGED_SHARED_CONTEXT_CACHE_SIZE = 256 | |
| 35 | 43 | |
| 36 | 44 | # 日志路径 |
| 37 | 45 | OUTPUT_DIR = Path("output_logs") |
| ... | ... | @@ -42,6 +50,7 @@ LOG_DIR.mkdir(parents=True, exist_ok=True) |
| 42 | 50 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| 43 | 51 | log_file = LOG_DIR / f"product_enrich_{timestamp}.log" |
| 44 | 52 | verbose_log_file = LOG_DIR / "product_enrich_verbose.log" |
| 53 | +_logged_shared_context_keys: "OrderedDict[str, None]" = OrderedDict() | |
| 45 | 54 | |
| 46 | 55 | # 主日志 logger:执行流程、批次信息等 |
| 47 | 56 | logger = logging.getLogger("product_enrich") |
| ... | ... | @@ -96,16 +105,11 @@ except Exception as e: |
| 96 | 105 | logger.warning(f"Failed to initialize Redis for anchors cache: {e}") |
| 97 | 106 | _anchor_redis = None |
| 98 | 107 | |
| 99 | -# 中文版本提示词(请勿删除): | |
| 100 | -# "你是一名电商平台的商品标注员,你的工作是对输入的每个商品进行理解、分析和标注," | |
| 101 | -# "并按要求格式返回 Markdown 表格。所有输出内容必须为中文。" | |
| 102 | - | |
| 103 | -SYSTEM_MESSAGES = ( | |
| 104 | - "You are a product annotator for an e-commerce platform. " | |
| 105 | - "For each input product, you must understand, analyze and label it, " | |
| 106 | - "and return a Markdown table strictly following the requested format. " | |
| 107 | - "All output must be in English." | |
| 108 | -) | |
| 108 | +_missing_prompt_langs = sorted(set(SOURCE_LANG_CODE_MAP) - set(LANGUAGE_MARKDOWN_TABLE_HEADERS)) | |
| 109 | +if _missing_prompt_langs: | |
| 110 | + raise RuntimeError( | |
| 111 | + f"Missing product_enrich prompt config for languages: {_missing_prompt_langs}" | |
| 112 | + ) | |
| 109 | 113 | |
| 110 | 114 | |
| 111 | 115 | def _make_anchor_cache_key( |
| ... | ... | @@ -153,108 +157,109 @@ def _set_cached_anchor_result( |
| 153 | 157 | logger.warning(f"Failed to set anchor cache: {e}") |
| 154 | 158 | |
| 155 | 159 | |
| 156 | -def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str: | |
| 157 | - """根据目标语言创建 LLM 提示词和表头说明。 | |
| 160 | +def _build_assistant_prefix(headers: List[str]) -> str: | |
| 161 | + header_line = "| " + " | ".join(headers) + " |" | |
| 162 | + separator_line = "|" + "----|" * len(headers) | |
| 163 | + return f"{header_line}\n{separator_line}\n" | |
| 158 | 164 | |
| 159 | - 约定: | |
| 160 | - - 提示词始终使用英文; | |
| 161 | - - 当 target_lang == "en" 时,直接要求用英文分析并输出英文表头; | |
| 162 | - - 当 target_lang 为其他语言时,视作“多轮对话”的后续轮次: | |
| 163 | - * 默认上一轮已经用英文完成了分析; | |
| 164 | - * 当前轮只需要在保持结构和含义不变的前提下,将整张表格翻译为目标语言, | |
| 165 | - 包含表头与所有单元格内容。 | |
| 166 | - """ | |
| 167 | - lang_name = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang) | |
| 168 | - | |
| 169 | -# 中文版本提示词(请勿删除) | |
| 170 | -# prompt = """请对输入的每条商品标题,分析并提取以下信息: | |
| 171 | - | |
| 172 | -# 1. 商品标题:将输入商品名称翻译为自然、完整的中文商品标题 | |
| 173 | -# 2. 品类路径:从大类到细分品类,用">"分隔(例如:服装>女装>裤子>工装裤) | |
| 174 | -# 3. 细分标签:商品的风格、特点、功能等(例如:碎花,收腰,法式) | |
| 175 | -# 4. 适用人群:性别/年龄段等(例如:年轻女性) | |
| 176 | -# 5. 使用场景 | |
| 177 | -# 6. 适用季节 | |
| 178 | -# 7. 关键属性 | |
| 179 | -# 8. 材质说明 | |
| 180 | -# 9. 功能特点 | |
| 181 | -# 10. 商品卖点:分析和提取一句话核心卖点,用于推荐理由 | |
| 182 | -# 11. 锚文本:生成一组能够代表该商品、并可能被用户用于搜索的词语或短语。这些词语应覆盖用户需求的各个维度,如品类、细分标签、功能特性、需求场景等等。 | |
| 183 | - | |
| 184 | -# 输入商品列表: | |
| 185 | - | |
| 186 | -# """ | |
| 187 | -# prompt_tail = """ | |
| 188 | -# 请严格按照以下markdown表格格式返回,每列内部的多值内容都用逗号分隔,不要添加任何其他说明: | |
| 189 | - | |
| 190 | -# | 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 | | |
| 191 | -# |----|----|----|----|----|----|----|----|----|----|----|----| | |
| 192 | -# """ | |
| 193 | - | |
| 194 | - prompt = """Please analyze each input product title and extract the following information: | |
| 195 | - | |
| 196 | -1. Product title: a natural English product name derived from the input title | |
| 197 | -2. Category path: from broad to fine-grained category, separated by ">" (e.g. Clothing>Women>Dresses>Work Dress) | |
| 198 | -3. Fine-grained tags: style / features / attributes (e.g. floral, waist-cinching, French style) | |
| 199 | -4. Target audience: gender / age group, etc. (e.g. young women) | |
| 200 | -5. Usage scene | |
| 201 | -6. Applicable season | |
| 202 | -7. Key attributes | |
| 203 | -8. Material description | |
| 204 | -9. Functional features | |
| 205 | -10. Selling point: one concise key selling sentence for recommendation | |
| 206 | -11. Anchor text: a set of words or phrases that could be used by users as search queries for this product, covering category, fine-grained tags, functional attributes, usage scenes, etc. | |
| 207 | - | |
| 208 | -Input product list: | |
| 209 | - | |
| 210 | -""" | |
| 211 | 165 | |
def _build_shared_context(products: List[Dict[str, str]]) -> str:
    """Build the language-independent part of the prompt for one batch.

    The result is ``SHARED_ANALYSIS_INSTRUCTION`` followed by one numbered
    line per product (``"<n>. <title>"``, numbering starts at 1).

    Args:
        products: Batch items; each is a mapping that normally carries a
            ``"title"`` entry.

    Returns:
        The shared prompt text, ending with a newline after the last product
        (or just the instruction text when ``products`` is empty).
    """
    numbered_lines = []
    for idx, product in enumerate(products, 1):
        # A missing/None title yields an empty entry instead of raising, so the
        # numbering (and therefore the expected row count) stays aligned.
        raw_title = product.get("title")
        title_text = "" if raw_title is None else str(raw_title)
        # Keep every product on exactly one line: embedded line breaks would
        # otherwise look like extra list entries to the model.
        single_line_title = " ".join(title_text.splitlines())
        numbered_lines.append(f"{idx}. {single_line_title}\n")
    return SHARED_ANALYSIS_INSTRUCTION + "".join(numbered_lines)
| 214 | 171 | |
| 215 | - if target_lang == "en": | |
| 216 | - # 英文首轮:直接要求英文表头 + 英文内容 | |
| 217 | - prompt += """ | |
| 218 | -Please strictly return a Markdown table in the following format. For any column that can contain multiple values, separate values with commas. Do not add any other explanations: | |
| 219 | 172 | |
| 220 | -| No. | Product title | Category path | Fine-grained tags | Target audience | Usage scene | Season | Key attributes | Material | Features | Selling point | Anchor text | | |
| 221 | -|----|----|----|----|----|----|----|----|----|----|----|----| | |
| 222 | -""" | |
| 223 | - else: | |
| 224 | - # 非英文语言:视作“下一轮对话”,只做翻译,要求表头与内容全部用目标语言 | |
| 225 | - prompt += f""" | |
| 226 | -Now we will output the same table in {lang_name}. | |
| 227 | - | |
| 228 | -IMPORTANT: | |
| 229 | -- Assume you have already generated the full table in English in a previous round. | |
| 230 | -- In this round, you must output exactly the same table structure and content, | |
| 231 | - but fully translated into {lang_name}, including ALL column headers and ALL cell values. | |
| 232 | -- Do NOT change the meaning, fields, or the number/order of rows and columns. | |
| 233 | -- Keep valid Markdown table syntax. | |
| 234 | - | |
| 235 | -Please return ONLY the Markdown table in {lang_name}, without any extra explanations. | |
| 236 | -""" | |
| 173 | +def _hash_text(text: str) -> str: | |
| 174 | + return hashlib.md5((text or "").encode("utf-8")).hexdigest()[:12] | |
| 175 | + | |
| 176 | + | |
def _mark_shared_context_logged_once(shared_context_key: str) -> bool:
    """Record ``shared_context_key`` and report whether it is new.

    Returns:
        ``True`` the first time a key is seen (the caller should log the
        shared context), ``False`` when the key is already recorded.

    The module-level ordered mapping acts as a bounded LRU set: a repeated
    key is moved to the most-recent end, and once the mapping grows beyond
    ``LOGGED_SHARED_CONTEXT_CACHE_SIZE`` the oldest key is dropped.
    """
    seen_keys = _logged_shared_context_keys
    already_logged = shared_context_key in seen_keys
    if already_logged:
        # Refresh recency so frequently used keys are evicted last.
        seen_keys.move_to_end(shared_context_key)
    else:
        seen_keys[shared_context_key] = None
        if len(seen_keys) > LOGGED_SHARED_CONTEXT_CACHE_SIZE:
            seen_keys.popitem(last=False)
    return not already_logged
| 237 | 186 | |
| 238 | - return prompt | |
| 239 | 187 | |
def reset_logged_shared_context_keys() -> None:
    """Test helper: forget every shared-prompt key recorded as already logged."""
    _logged_shared_context_keys.clear()
| 240 | 191 | |
| 241 | -def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: | |
| 242 | - """调用大模型API(带重试机制),按目标语言选择系统提示词。""" | |
| 192 | + | |
def create_prompt(
    products: List[Dict[str, str]],
    target_lang: str = "zh",
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Create the prompt pieces for one batch in the requested language.

    Args:
        products: Batch items whose titles are listed in the shared context.
        target_lang: Language code used to look up the Markdown table headers
            and the language label inserted into the user instruction.

    Returns:
        ``(shared_context, user_prompt, assistant_prefix)``:
        the language-independent analysis instruction with the numbered
        product list, the localized output requirement, and the Markdown
        table header/separator used as the Partial Mode assistant prefix.
        When ``target_lang`` has no table headers configured, a warning is
        logged and ``(None, None, None)`` is returned; callers must check for
        this before calling the LLM.
    """
    markdown_table_headers = LANGUAGE_MARKDOWN_TABLE_HEADERS.get(target_lang)
    if not markdown_table_headers:
        logger.warning(
            "Unsupported target_lang for markdown table headers: %s",
            target_lang,
        )
        return None, None, None
    shared_context = _build_shared_context(products)
    # Fall back to the raw code when no display label is registered for it.
    language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang)
    user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip()
    assistant_prefix = _build_assistant_prefix(markdown_table_headers)
    return shared_context, user_prompt, assistant_prefix
| 210 | + | |
| 211 | + | |
| 212 | +def _merge_partial_response(assistant_prefix: str, generated_content: str) -> str: | |
| 213 | + """将 Partial Mode 的 assistant 前缀与补全文本拼成完整 markdown。""" | |
| 214 | + generated = (generated_content or "").lstrip() | |
| 215 | + prefix_lines = [line.strip() for line in assistant_prefix.strip().splitlines()] | |
| 216 | + generated_lines = generated.splitlines() | |
| 217 | + | |
| 218 | + if generated_lines: | |
| 219 | + first_line = generated_lines[0].strip() | |
| 220 | + if prefix_lines and first_line == prefix_lines[0]: | |
| 221 | + generated_lines = generated_lines[1:] | |
| 222 | + if generated_lines and len(prefix_lines) > 1 and generated_lines[0].strip() == prefix_lines[1]: | |
| 223 | + generated_lines = generated_lines[1:] | |
| 224 | + elif len(prefix_lines) > 1 and first_line == prefix_lines[1]: | |
| 225 | + generated_lines = generated_lines[1:] | |
| 226 | + | |
| 227 | + suffix = "\n".join(generated_lines).lstrip("\n") | |
| 228 | + if suffix: | |
| 229 | + return f"{assistant_prefix}{suffix}" | |
| 230 | + return assistant_prefix | |
| 231 | + | |
| 232 | + | |
| 233 | +def call_llm( | |
| 234 | + shared_context: str, | |
| 235 | + user_prompt: str, | |
| 236 | + assistant_prefix: str, | |
| 237 | + target_lang: str = "zh", | |
| 238 | +) -> Tuple[str, str]: | |
| 239 | + """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。""" | |
| 243 | 240 | headers = { |
| 244 | 241 | "Authorization": f"Bearer {API_KEY}", |
| 245 | 242 | "Content-Type": "application/json", |
| 246 | 243 | } |
| 244 | + shared_context_key = _hash_text(shared_context) | |
| 245 | + localized_tail_key = _hash_text(f"{target_lang}\n{user_prompt}\n{assistant_prefix}") | |
| 246 | + combined_user_prompt = f"{shared_context.rstrip()}\n\n{user_prompt.strip()}" | |
| 247 | 247 | |
| 248 | 248 | payload = { |
| 249 | 249 | "model": MODEL_NAME, |
| 250 | 250 | "messages": [ |
| 251 | 251 | { |
| 252 | 252 | "role": "system", |
| 253 | - "content": SYSTEM_MESSAGES, | |
| 253 | + "content": SYSTEM_MESSAGE, | |
| 254 | 254 | }, |
| 255 | 255 | { |
| 256 | 256 | "role": "user", |
| 257 | - "content": prompt, | |
| 257 | + "content": combined_user_prompt, | |
| 258 | + }, | |
| 259 | + { | |
| 260 | + "role": "assistant", | |
| 261 | + "content": assistant_prefix, | |
| 262 | + "partial": True, | |
| 258 | 263 | }, |
| 259 | 264 | ], |
| 260 | 265 | "temperature": 0.3, |
| ... | ... | @@ -266,16 +271,41 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: |
| 266 | 271 | "payload": payload, |
| 267 | 272 | } |
| 268 | 273 | |
| 269 | - # 主日志 + 详尽日志:LLM Request | |
| 270 | - logger.info(f"\n{'=' * 80}") | |
| 271 | - logger.info(f"LLM Request (Model: {MODEL_NAME}):") | |
| 272 | - logger.info(json.dumps(request_data, ensure_ascii=False, indent=2)) | |
| 273 | - logger.info(f"\nPrompt:\n{prompt}") | |
| 274 | + if _mark_shared_context_logged_once(shared_context_key): | |
| 275 | + logger.info(f"\n{'=' * 80}") | |
| 276 | + logger.info( | |
| 277 | + "LLM Shared Context [model=%s, shared_key=%s, chars=%s] (logged once per process key)", | |
| 278 | + MODEL_NAME, | |
| 279 | + shared_context_key, | |
| 280 | + len(shared_context), | |
| 281 | + ) | |
| 282 | + logger.info("\nSystem Message:\n%s", SYSTEM_MESSAGE) | |
| 283 | + logger.info("\nShared Context:\n%s", shared_context) | |
| 274 | 284 | |
| 275 | 285 | verbose_logger.info(f"\n{'=' * 80}") |
| 276 | - verbose_logger.info(f"LLM Request (Model: {MODEL_NAME}):") | |
| 286 | + verbose_logger.info( | |
| 287 | + "LLM Request [model=%s, lang=%s, shared_key=%s, tail_key=%s]:", | |
| 288 | + MODEL_NAME, | |
| 289 | + target_lang, | |
| 290 | + shared_context_key, | |
| 291 | + localized_tail_key, | |
| 292 | + ) | |
| 277 | 293 | verbose_logger.info(json.dumps(request_data, ensure_ascii=False, indent=2)) |
| 278 | - verbose_logger.info(f"\nPrompt:\n{prompt}") | |
| 294 | + verbose_logger.info(f"\nCombined User Prompt:\n{combined_user_prompt}") | |
| 295 | + verbose_logger.info(f"\nShared Context:\n{shared_context}") | |
| 296 | + verbose_logger.info(f"\nLocalized Requirement:\n{user_prompt}") | |
| 297 | + verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}") | |
| 298 | + | |
| 299 | + logger.info( | |
| 300 | + "\nLLM Request Variant [lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]", | |
| 301 | + target_lang, | |
| 302 | + shared_context_key, | |
| 303 | + localized_tail_key, | |
| 304 | + len(user_prompt), | |
| 305 | + len(assistant_prefix), | |
| 306 | + ) | |
| 307 | + logger.info("\nLocalized Requirement:\n%s", user_prompt) | |
| 308 | + logger.info("\nAssistant Prefix:\n%s", assistant_prefix) | |
| 279 | 309 | |
| 280 | 310 | # 创建session,禁用代理 |
| 281 | 311 | session = requests.Session() |
| ... | ... | @@ -295,19 +325,37 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: |
| 295 | 325 | |
| 296 | 326 | response.raise_for_status() |
| 297 | 327 | result = response.json() |
| 328 | + usage = result.get("usage") or {} | |
| 329 | + | |
| 330 | + verbose_logger.info( | |
| 331 | + "\nLLM Response [model=%s, lang=%s, shared_key=%s, tail_key=%s]:", | |
| 332 | + MODEL_NAME, | |
| 333 | + target_lang, | |
| 334 | + shared_context_key, | |
| 335 | + localized_tail_key, | |
| 336 | + ) | |
| 337 | + verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2)) | |
| 298 | 338 | |
| 299 | - # 主日志 + 详尽日志:LLM Response | |
| 300 | - logger.info(f"\nLLM Response:") | |
| 301 | - logger.info(json.dumps(result, ensure_ascii=False, indent=2)) | |
| 339 | + generated_content = result["choices"][0]["message"]["content"] | |
| 340 | + full_markdown = _merge_partial_response(assistant_prefix, generated_content) | |
| 302 | 341 | |
| 303 | - verbose_logger.info(f"\nLLM Response:") | |
| 304 | - verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2)) | |
| 342 | + logger.info( | |
| 343 | + "\nLLM Response Summary [lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]", | |
| 344 | + target_lang, | |
| 345 | + shared_context_key, | |
| 346 | + localized_tail_key, | |
| 347 | + len(generated_content or ""), | |
| 348 | + usage.get("completion_tokens"), | |
| 349 | + usage.get("prompt_tokens"), | |
| 350 | + usage.get("total_tokens"), | |
| 351 | + ) | |
| 352 | + logger.info("\nGenerated Content:\n%s", generated_content) | |
| 353 | + logger.info("\nMerged Markdown:\n%s", full_markdown) | |
| 305 | 354 | |
| 306 | - content = result["choices"][0]["message"]["content"] | |
| 307 | - logger.info(f"\nExtracted Content:\n{content}") | |
| 308 | - verbose_logger.info(f"\nExtracted Content:\n{content}") | |
| 355 | + verbose_logger.info(f"\nGenerated Content:\n{generated_content}") | |
| 356 | + verbose_logger.info(f"\nMerged Markdown:\n{full_markdown}") | |
| 309 | 357 | |
| 310 | - return content, json.dumps(result, ensure_ascii=False) | |
| 358 | + return full_markdown, json.dumps(result, ensure_ascii=False) | |
| 311 | 359 | |
| 312 | 360 | except requests.exceptions.ProxyError as e: |
| 313 | 361 | logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Proxy error - {str(e)}") |
| ... | ... | @@ -385,6 +433,39 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: |
| 385 | 433 | return data |
| 386 | 434 | |
| 387 | 435 | |
def _log_parsed_result_quality(
    batch_data: List[Dict[str, str]],
    parsed_results: List[Dict[str, str]],
    target_lang: str,
    batch_num: int,
) -> None:
    """Log how complete the parsed LLM table is for one batch.

    Emits a warning when the number of parsed rows differs from the number of
    input items, then an info summary with the row counts and how many parsed
    rows have a blank ``title``, ``category_path`` or ``anchor_text``.
    """
    expected = len(batch_data)
    actual = len(parsed_results)

    def _count_blank(field_name: str) -> int:
        # A field is blank when it is missing, falsy, or whitespace only.
        blank_rows = 0
        for row in parsed_results:
            if not str(row.get(field_name) or "").strip():
                blank_rows += 1
        return blank_rows

    if actual != expected:
        logger.warning(
            "Parsed row count mismatch for batch=%s lang=%s: expected=%s actual=%s",
            batch_num,
            target_lang,
            expected,
            actual,
        )

    missing_title = _count_blank("title")
    missing_category = _count_blank("category_path")
    missing_anchor = _count_blank("anchor_text")

    logger.info(
        "Parsed Quality Summary [batch=%s, lang=%s]: rows=%s/%s, missing_title=%s, missing_category=%s, missing_anchor=%s",
        batch_num,
        target_lang,
        actual,
        expected,
        missing_title,
        missing_category,
        missing_anchor,
    )
| 467 | + | |
| 468 | + | |
| 388 | 469 | def process_batch( |
| 389 | 470 | batch_data: List[Dict[str, str]], |
| 390 | 471 | batch_num: int, |
| ... | ... | @@ -395,14 +476,52 @@ def process_batch( |
| 395 | 476 | logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)") |
| 396 | 477 | |
| 397 | 478 | # 创建提示词 |
| 398 | - prompt = create_prompt(batch_data, target_lang=target_lang) | |
| 479 | + shared_context, user_prompt, assistant_prefix = create_prompt( | |
| 480 | + batch_data, | |
| 481 | + target_lang=target_lang, | |
| 482 | + ) | |
| 483 | + | |
| 484 | + # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM | |
| 485 | + if shared_context is None or user_prompt is None or assistant_prefix is None: | |
| 486 | + logger.error( | |
| 487 | + "Failed to create prompt for batch %s, target_lang=%s; " | |
| 488 | + "marking entire batch as failed without calling LLM", | |
| 489 | + batch_num, | |
| 490 | + target_lang, | |
| 491 | + ) | |
| 492 | + return [ | |
| 493 | + { | |
| 494 | + "id": item["id"], | |
| 495 | + "lang": target_lang, | |
| 496 | + "title_input": item.get("title", ""), | |
| 497 | + "title": "", | |
| 498 | + "category_path": "", | |
| 499 | + "tags": "", | |
| 500 | + "target_audience": "", | |
| 501 | + "usage_scene": "", | |
| 502 | + "season": "", | |
| 503 | + "key_attributes": "", | |
| 504 | + "material": "", | |
| 505 | + "features": "", | |
| 506 | + "selling_points": "", | |
| 507 | + "anchor_text": "", | |
| 508 | + "error": f"prompt_creation_failed: unsupported target_lang={target_lang}", | |
| 509 | + } | |
| 510 | + for item in batch_data | |
| 511 | + ] | |
| 399 | 512 | |
| 400 | 513 | # 调用LLM |
| 401 | 514 | try: |
| 402 | - raw_response, full_response_json = call_llm(prompt, target_lang=target_lang) | |
| 515 | + raw_response, full_response_json = call_llm( | |
| 516 | + shared_context, | |
| 517 | + user_prompt, | |
| 518 | + assistant_prefix, | |
| 519 | + target_lang=target_lang, | |
| 520 | + ) | |
| 403 | 521 | |
| 404 | 522 | # 解析结果 |
| 405 | 523 | parsed_results = parse_markdown_table(raw_response) |
| 524 | + _log_parsed_result_quality(batch_data, parsed_results, target_lang, batch_num) | |
| 406 | 525 | |
| 407 | 526 | logger.info(f"\nParsed Results ({len(parsed_results)} items):") |
| 408 | 527 | logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2)) |
| ... | ... | @@ -541,4 +660,3 @@ def analyze_products( |
| 541 | 660 | pass |
| 542 | 661 | |
| 543 | 662 | return all_results |
| 544 | - | ... | ... |
| ... | ... | @@ -0,0 +1,556 @@ |
| 1 | +#!/usr/bin/env python3 | |
| 2 | + | |
| 3 | +from typing import Any, Dict | |
| 4 | + | |
| 5 | +SYSTEM_MESSAGE = ( | |
| 6 | + "You are an e-commerce product annotator. " | |
| 7 | + "Continue the provided assistant Markdown table prefix. " | |
| 8 | + "Do not repeat or modify the prefix, and do not add explanations outside the table." | |
| 9 | +) | |
| 10 | + | |
| 11 | +SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product title and fill these columns: | |
| 12 | + | |
| 13 | +1. Product title: a natural localized product name derived from the input title | |
| 14 | +2. Category path: broad to fine-grained category, separated by ">" | |
| 15 | +3. Fine-grained tags: style, features, functions, or notable attributes | |
| 16 | +4. Target audience: gender, age group, or suitable users | |
| 17 | +5. Usage scene | |
| 18 | +6. Applicable season | |
| 19 | +7. Key attributes | |
| 20 | +8. Material description | |
| 21 | +9. Functional features | |
| 22 | +10. Selling point: one concise core selling phrase | |
| 23 | +11. Anchor text: a set of search-oriented words or phrases covering category, attributes, scenes, and demand | |
| 24 | + | |
| 25 | +Rules: | |
| 26 | +- Keep the input order and row count exactly the same. | |
| 27 | +- Infer from the title only; if uncertain, prefer concise and broadly correct ecommerce wording. | |
| 28 | +- Keep category paths concise and use ">" as the separator. | |
| 29 | +- For columns with multiple values, the localized output requirement will define the delimiter. | |
| 30 | + | |
| 31 | +Input product list: | |
| 32 | +""" | |
| 33 | + | |
| 34 | +USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation. | |
| 35 | +Language: {language}""" | |
| 36 | + | |
| 37 | +LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { | |
| 38 | + "en": [ | |
| 39 | + "No.", | |
| 40 | + "Product title", | |
| 41 | + "Category path", | |
| 42 | + "Fine-grained tags", | |
| 43 | + "Target audience", | |
| 44 | + "Usage scene", | |
| 45 | + "Season", | |
| 46 | + "Key attributes", | |
| 47 | + "Material", | |
| 48 | + "Features", | |
| 49 | + "Selling point", | |
| 50 | + "Anchor text" | |
| 51 | + ], | |
| 52 | + "zh": [ | |
| 53 | + "序号", | |
| 54 | + "商品标题", | |
| 55 | + "品类路径", | |
| 56 | + "细分标签", | |
| 57 | + "适用人群", | |
| 58 | + "使用场景", | |
| 59 | + "适用季节", | |
| 60 | + "关键属性", | |
| 61 | + "材质说明", | |
| 62 | + "功能特点", | |
| 63 | + "商品卖点", | |
| 64 | + "锚文本" | |
| 65 | + ], | |
| 66 | + "zh_tw": [ | |
| 67 | + "序號", | |
| 68 | + "商品標題", | |
| 69 | + "品類路徑", | |
| 70 | + "細分標籤", | |
| 71 | + "適用人群", | |
| 72 | + "使用場景", | |
| 73 | + "適用季節", | |
| 74 | + "關鍵屬性", | |
| 75 | + "材質說明", | |
| 76 | + "功能特點", | |
| 77 | + "商品賣點", | |
| 78 | + "錨文本" | |
| 79 | + ], | |
| 80 | + "ru": [ | |
| 81 | + "№", | |
| 82 | + "Название товара", | |
| 83 | + "Путь категории", | |
| 84 | + "Детализированные теги", | |
| 85 | + "Целевая аудитория", | |
| 86 | + "Сценарий использования", | |
| 87 | + "Сезон", | |
| 88 | + "Ключевые атрибуты", | |
| 89 | + "Материал", | |
| 90 | + "Особенности", | |
| 91 | + "Преимущество товара", | |
| 92 | + "Анкорный текст" | |
| 93 | + ], | |
| 94 | + "ja": [ | |
| 95 | + "番号", | |
| 96 | + "商品タイトル", | |
| 97 | + "カテゴリパス", | |
| 98 | + "詳細タグ", | |
| 99 | + "対象ユーザー", | |
| 100 | + "利用シーン", | |
| 101 | + "季節", | |
| 102 | + "主要属性", | |
| 103 | + "素材", | |
| 104 | + "機能特徴", | |
| 105 | + "商品の訴求点", | |
| 106 | + "アンカーテキスト" | |
| 107 | + ], | |
| 108 | + "ko": [ | |
| 109 | + "번호", | |
| 110 | + "상품 제목", | |
| 111 | + "카테고리 경로", | |
| 112 | + "세부 태그", | |
| 113 | + "대상 고객", | |
| 114 | + "사용 장면", | |
| 115 | + "계절", | |
| 116 | + "핵심 속성", | |
| 117 | + "소재", | |
| 118 | + "기능 특징", | |
| 119 | + "상품 포인트", | |
| 120 | + "앵커 텍스트" | |
| 121 | + ], | |
| 122 | + "es": [ | |
| 123 | + "N.º", | |
| 124 | + "Titulo del producto", | |
| 125 | + "Ruta de categoria", | |
| 126 | + "Etiquetas detalladas", | |
| 127 | + "Publico objetivo", | |
| 128 | + "Escenario de uso", | |
| 129 | + "Temporada", | |
| 130 | + "Atributos clave", | |
| 131 | + "Material", | |
| 132 | + "Caracteristicas", | |
| 133 | + "Punto de venta", | |
| 134 | + "Texto ancla" | |
| 135 | + ], | |
| 136 | + "fr": [ | |
| 137 | + "N°", | |
| 138 | + "Titre du produit", | |
| 139 | + "Chemin de categorie", | |
| 140 | + "Etiquettes detaillees", | |
| 141 | + "Public cible", | |
| 142 | + "Scenario d'utilisation", | |
| 143 | + "Saison", | |
| 144 | + "Attributs cles", | |
| 145 | + "Matiere", | |
| 146 | + "Caracteristiques", | |
| 147 | + "Argument de vente", | |
| 148 | + "Texte d'ancrage" | |
| 149 | + ], | |
| 150 | + "pt": [ | |
| 151 | + "Nº", | |
| 152 | + "Titulo do produto", | |
| 153 | + "Caminho da categoria", | |
| 154 | + "Tags detalhadas", | |
| 155 | + "Publico-alvo", | |
| 156 | + "Cenario de uso", | |
| 157 | + "Estacao", | |
| 158 | + "Atributos principais", | |
| 159 | + "Material", | |
| 160 | + "Caracteristicas", | |
| 161 | + "Ponto de venda", | |
| 162 | + "Texto ancora" | |
| 163 | + ], | |
| 164 | + "de": [ | |
| 165 | + "Nr.", | |
| 166 | + "Produkttitel", | |
| 167 | + "Kategoriepfad", | |
| 168 | + "Detaillierte Tags", | |
| 169 | + "Zielgruppe", | |
| 170 | + "Nutzungsszenario", | |
| 171 | + "Saison", | |
| 172 | + "Wichtige Attribute", | |
| 173 | + "Material", | |
| 174 | + "Funktionen", | |
| 175 | + "Verkaufsargument", | |
| 176 | + "Ankertext" | |
| 177 | + ], | |
| 178 | + "it": [ | |
| 179 | + "N.", | |
| 180 | + "Titolo del prodotto", | |
| 181 | + "Percorso categoria", | |
| 182 | + "Tag dettagliati", | |
| 183 | + "Pubblico target", | |
| 184 | + "Scenario d'uso", | |
| 185 | + "Stagione", | |
| 186 | + "Attributi chiave", | |
| 187 | + "Materiale", | |
| 188 | + "Caratteristiche", | |
| 189 | + "Punto di forza", | |
| 190 | + "Testo ancora" | |
| 191 | + ], | |
| 192 | + "th": [ | |
| 193 | + "ลำดับ", | |
| 194 | + "ชื่อสินค้า", | |
| 195 | + "เส้นทางหมวดหมู่", | |
| 196 | + "แท็กย่อย", | |
| 197 | + "กลุ่มเป้าหมาย", | |
| 198 | + "สถานการณ์การใช้งาน", | |
| 199 | + "ฤดูกาล", | |
| 200 | + "คุณสมบัติสำคัญ", | |
| 201 | + "วัสดุ", | |
| 202 | + "คุณสมบัติการใช้งาน", | |
| 203 | + "จุดขายสินค้า", | |
| 204 | + "แองเคอร์เท็กซ์" | |
| 205 | + ], | |
| 206 | + "vi": [ | |
| 207 | + "STT", | |
| 208 | + "Tieu de san pham", | |
| 209 | + "Duong dan danh muc", | |
| 210 | + "The chi tiet", | |
| 211 | + "Doi tuong phu hop", | |
| 212 | + "Boi canh su dung", | |
| 213 | + "Mua phu hop", | |
| 214 | + "Thuoc tinh chinh", | |
| 215 | + "Chat lieu", | |
| 216 | + "Tinh nang", | |
| 217 | + "Diem ban hang", | |
| 218 | + "Van ban neo" | |
| 219 | + ], | |
| 220 | + "id": [ | |
| 221 | + "No.", | |
| 222 | + "Judul produk", | |
| 223 | + "Jalur kategori", | |
| 224 | + "Tag terperinci", | |
| 225 | + "Target pengguna", | |
| 226 | + "Skenario penggunaan", | |
| 227 | + "Musim", | |
| 228 | + "Atribut utama", | |
| 229 | + "Bahan", | |
| 230 | + "Fitur", | |
| 231 | + "Nilai jual", | |
| 232 | + "Teks jangkar" | |
| 233 | + ], | |
| 234 | + "ms": [ | |
| 235 | + "No.", | |
| 236 | + "Tajuk produk", | |
| 237 | + "Laluan kategori", | |
| 238 | + "Tag terperinci", | |
| 239 | + "Sasaran pengguna", | |
| 240 | + "Senario penggunaan", | |
| 241 | + "Musim", | |
| 242 | + "Atribut utama", | |
| 243 | + "Bahan", | |
| 244 | + "Ciri-ciri", | |
| 245 | + "Nilai jual", | |
| 246 | + "Teks sauh" | |
| 247 | + ], | |
| 248 | + "ar": [ | |
| 249 | + "الرقم", | |
| 250 | + "عنوان المنتج", | |
| 251 | + "مسار الفئة", | |
| 252 | + "الوسوم التفصيلية", | |
| 253 | + "الفئة المستهدفة", | |
| 254 | + "سيناريو الاستخدام", | |
| 255 | + "الموسم", | |
| 256 | + "السمات الرئيسية", | |
| 257 | + "المادة", | |
| 258 | + "الميزات", | |
| 259 | + "نقطة البيع", | |
| 260 | + "نص الربط" | |
| 261 | + ], | |
| 262 | + "hi": [ | |
| 263 | + "क्रमांक", | |
| 264 | + "उत्पाद शीर्षक", | |
| 265 | + "श्रेणी पथ", | |
| 266 | + "विस्तृत टैग", | |
| 267 | + "लक्षित उपभोक्ता", | |
| 268 | + "उपयोग परिदृश्य", | |
| 269 | + "मौसम", | |
| 270 | + "मुख्य गुण", | |
| 271 | + "सामग्री", | |
| 272 | + "विशेषताएं", | |
| 273 | + "बिक्री बिंदु", | |
| 274 | + "एंकर टेक्स्ट" | |
| 275 | + ], | |
| 276 | + "he": [ | |
| 277 | + "מס׳", | |
| 278 | + "כותרת המוצר", | |
| 279 | + "נתיב קטגוריה", | |
| 280 | + "תגיות מפורטות", | |
| 281 | + "קהל יעד", | |
| 282 | + "תרחיש שימוש", | |
| 283 | + "עונה", | |
| 284 | + "מאפיינים מרכזיים", | |
| 285 | + "חומר", | |
| 286 | + "תכונות", | |
| 287 | + "נקודת מכירה", | |
| 288 | + "טקסט עוגן" | |
| 289 | + ], | |
| 290 | + "my": [ | |
| 291 | + "အမှတ်စဉ်", | |
| 292 | + "ကုန်ပစ္စည်းခေါင်းစဉ်", | |
| 293 | + "အမျိုးအစားလမ်းကြောင်း", | |
| 294 | + "အသေးစိတ်တဂ်များ", | |
| 295 | + "ပစ်မှတ်အသုံးပြုသူ", | |
| 296 | + "အသုံးပြုမှုအခြေအနေ", | |
| 297 | + "ရာသီ", | |
| 298 | + "အဓိကဂုဏ်သတ္တိများ", | |
| 299 | + "ပစ္စည်း", | |
| 300 | + "လုပ်ဆောင်ချက်များ", | |
| 301 | + "အရောင်းထူးခြားချက်", | |
| 302 | + "အန်ကာစာသား" | |
| 303 | + ], | |
| 304 | + "ta": [ | |
| 305 | + "எண்", | |
| 306 | + "தயாரிப்பு தலைப்பு", | |
| 307 | + "வகை பாதை", | |
| 308 | + "விரிவான குறிச்சொற்கள்", | |
| 309 | + "இலக்கு பயனர்கள்", | |
| 310 | + "பயன்பாட்டு நிலை", | |
| 311 | + "பருவம்", | |
| 312 | + "முக்கிய பண்புகள்", | |
| 313 | + "பொருள்", | |
| 314 | + "அம்சங்கள்", | |
| 315 | + "விற்பனை அம்சம்", | |
| 316 | + "ஆங்கர் உரை" | |
| 317 | + ], | |
| 318 | + "ur": [ | |
| 319 | + "نمبر", | |
| 320 | + "پروڈکٹ عنوان", | |
| 321 | + "زمرہ راستہ", | |
| 322 | + "تفصیلی ٹیگز", | |
| 323 | + "ہدف صارفین", | |
| 324 | + "استعمال کا منظر", | |
| 325 | + "موسم", | |
| 326 | + "کلیدی خصوصیات", | |
| 327 | + "مواد", | |
| 328 | + "فیچرز", | |
| 329 | + "فروختی نقطہ", | |
| 330 | + "اینکر ٹیکسٹ" | |
| 331 | + ], | |
| 332 | + "bn": [ | |
| 333 | + "ক্রম", | |
| 334 | + "পণ্যের শিরোনাম", | |
| 335 | + "শ্রেণি পথ", | |
| 336 | + "বিস্তারিত ট্যাগ", | |
| 337 | + "লক্ষ্য ব্যবহারকারী", | |
| 338 | + "ব্যবহারের দৃশ্য", | |
| 339 | + "মৌসুম", | |
| 340 | + "মূল বৈশিষ্ট্য", | |
| 341 | + "উপাদান", | |
| 342 | + "ফিচার", | |
| 343 | + "বিক্রয় পয়েন্ট", | |
| 344 | + "অ্যাঙ্কর টেক্সট" | |
| 345 | + ], | |
| 346 | + "pl": [ | |
| 347 | + "Nr", | |
| 348 | + "Tytul produktu", | |
| 349 | + "Sciezka kategorii", | |
| 350 | + "Szczegolowe tagi", | |
| 351 | + "Grupa docelowa", | |
| 352 | + "Scenariusz uzycia", | |
| 353 | + "Sezon", | |
| 354 | + "Kluczowe atrybuty", | |
| 355 | + "Material", | |
| 356 | + "Cechy", | |
| 357 | + "Atut sprzedazowy", | |
| 358 | + "Tekst kotwicy" | |
| 359 | + ], | |
| 360 | + "nl": [ | |
| 361 | + "Nr.", | |
| 362 | + "Producttitel", | |
| 363 | + "Categoriepad", | |
| 364 | + "Gedetailleerde tags", | |
| 365 | + "Doelgroep", | |
| 366 | + "Gebruikscontext", | |
| 367 | + "Seizoen", | |
| 368 | + "Belangrijke kenmerken", | |
| 369 | + "Materiaal", | |
| 370 | + "Functies", | |
| 371 | + "Verkooppunt", | |
| 372 | + "Ankertekst" | |
| 373 | + ], | |
| 374 | + "ro": [ | |
| 375 | + "Nr.", | |
| 376 | + "Titlul produsului", | |
| 377 | + "Calea categoriei", | |
| 378 | + "Etichete detaliate", | |
| 379 | + "Public tinta", | |
| 380 | + "Scenariu de utilizare", | |
| 381 | + "Sezon", | |
| 382 | + "Atribute cheie", | |
| 383 | + "Material", | |
| 384 | + "Caracteristici", | |
| 385 | + "Punct de vanzare", | |
| 386 | + "Text ancora" | |
| 387 | + ], | |
| 388 | + "tr": [ | |
| 389 | + "No.", | |
| 390 | + "Urun basligi", | |
| 391 | + "Kategori yolu", | |
| 392 | + "Ayrintili etiketler", | |
| 393 | + "Hedef kitle", | |
| 394 | + "Kullanim senaryosu", | |
| 395 | + "Sezon", | |
| 396 | + "Temel ozellikler", | |
| 397 | + "Malzeme", | |
| 398 | + "Ozellikler", | |
| 399 | + "Satis noktasi", | |
| 400 | + "Capa metni" | |
| 401 | + ], | |
| 402 | + "km": [ | |
| 403 | + "ល.រ", | |
| 404 | + "ចំណងជើងផលិតផល", | |
| 405 | + "ផ្លូវប្រភេទ", | |
| 406 | + "ស្លាកលម្អិត", | |
| 407 | + "ក្រុមអ្នកប្រើគោលដៅ", | |
| 408 | + "សេណារីយ៉ូប្រើប្រាស់", | |
| 409 | + "រដូវកាល", | |
| 410 | + "លក្ខណៈសម្បត្តិសំខាន់", | |
| 411 | + "សម្ភារៈ", | |
| 412 | + "មុខងារ", | |
| 413 | + "ចំណុចលក់", | |
| 414 | + "អត្ថបទអង់ក័រ" | |
| 415 | + ], | |
| 416 | + "lo": [ | |
| 417 | + "ລຳດັບ", | |
| 418 | + "ຊື່ສິນຄ້າ", | |
| 419 | + "ເສັ້ນທາງໝວດໝູ່", | |
| 420 | + "ແທັກລະອຽດ", | |
| 421 | + "ກຸ່ມເປົ້າໝາຍ", | |
| 422 | + "ສະຖານະການໃຊ້ງານ", | |
| 423 | + "ລະດູການ", | |
| 424 | + "ຄຸນລັກສະນະສຳຄັນ", | |
| 425 | + "ວັດສະດຸ", | |
| 426 | + "ຄຸນສົມບັດ", | |
| 427 | + "ຈຸດຂາຍ", | |
| 428 | + "ຂໍ້ຄວາມອັງເຄີ" | |
| 429 | + ], | |
| 430 | + "yue": [ | |
| 431 | + "序號", | |
| 432 | + "商品標題", | |
| 433 | + "品類路徑", | |
| 434 | + "細分類標籤", | |
| 435 | + "適用人群", | |
| 436 | + "使用場景", | |
| 437 | + "適用季節", | |
| 438 | + "關鍵屬性", | |
| 439 | + "材質說明", | |
| 440 | + "功能特點", | |
| 441 | + "商品賣點", | |
| 442 | + "錨文本" | |
| 443 | + ], | |
| 444 | + "cs": [ | |
| 445 | + "C.", | |
| 446 | + "Nazev produktu", | |
| 447 | + "Cesta kategorie", | |
| 448 | + "Podrobne stitky", | |
| 449 | + "Cilova skupina", | |
| 450 | + "Scenar pouziti", | |
| 451 | + "Sezona", | |
| 452 | + "Klicove atributy", | |
| 453 | + "Material", | |
| 454 | + "Vlastnosti", | |
| 455 | + "Prodejni argument", | |
| 456 | + "Kotvici text" | |
| 457 | + ], | |
| 458 | + "el": [ | |
| 459 | + "Α/Α", | |
| 460 | + "Τίτλος προϊόντος", | |
| 461 | + "Διαδρομή κατηγορίας", | |
| 462 | + "Αναλυτικές ετικέτες", | |
| 463 | + "Κοινό-στόχος", | |
| 464 | + "Σενάριο χρήσης", | |
| 465 | + "Εποχή", | |
| 466 | + "Βασικά χαρακτηριστικά", | |
| 467 | + "Υλικό", | |
| 468 | + "Λειτουργίες", | |
| 469 | + "Σημείο πώλησης", | |
| 470 | + "Κείμενο άγκυρας" | |
| 471 | + ], | |
| 472 | + "sv": [ | |
| 473 | + "Nr", | |
| 474 | + "Produkttitel", | |
| 475 | + "Kategorisokvag", | |
| 476 | + "Detaljerade taggar", | |
| 477 | + "Malgrupp", | |
| 478 | + "Anvandningsscenario", | |
| 479 | + "Sasong", | |
| 480 | + "Viktiga attribut", | |
| 481 | + "Material", | |
| 482 | + "Funktioner", | |
| 483 | + "Saljpunkt", | |
| 484 | + "Ankartext" | |
| 485 | + ], | |
| 486 | + "hu": [ | |
| 487 | + "Sorszam", | |
| 488 | + "Termekcim", | |
| 489 | + "Kategoriavonal", | |
| 490 | + "Reszletes cimkek", | |
| 491 | + "Celcsoport", | |
| 492 | + "Hasznalati helyzet", | |
| 493 | + "Evszak", | |
| 494 | + "Fo jellemzok", | |
| 495 | + "Anyag", | |
| 496 | + "Funkciok", | |
| 497 | + "Ertekesitesi elony", | |
| 498 | + "Horgonyszoveg" | |
| 499 | + ], | |
| 500 | + "da": [ | |
| 501 | + "Nr.", | |
| 502 | + "Produkttitel", | |
| 503 | + "Kategoristi", | |
| 504 | + "Detaljerede tags", | |
| 505 | + "Malgruppe", | |
| 506 | + "Brugsscenarie", | |
| 507 | + "Saeson", | |
| 508 | + "Nogleattributter", | |
| 509 | + "Materiale", | |
| 510 | + "Funktioner", | |
| 511 | + "Salgsargument", | |
| 512 | + "Ankertekst" | |
| 513 | + ], | |
| 514 | + "fi": [ | |
| 515 | + "Nro", | |
| 516 | + "Tuotteen nimi", | |
| 517 | + "Kategoriapolku", | |
| 518 | + "Yksityiskohtaiset tunnisteet", | |
| 519 | + "Kohdeyleiso", | |
| 520 | + "Kayttotilanne", | |
| 521 | + "Kausi", | |
| 522 | + "Keskeiset ominaisuudet", | |
| 523 | + "Materiaali", | |
| 524 | + "Ominaisuudet", | |
| 525 | + "Myyntivaltti", | |
| 526 | + "Ankkuriteksti" | |
| 527 | + ], | |
| 528 | + "uk": [ | |
| 529 | + "№", | |
| 530 | + "Назва товару", | |
| 531 | + "Шлях категорії", | |
| 532 | + "Детальні теги", | |
| 533 | + "Цільова аудиторія", | |
| 534 | + "Сценарій використання", | |
| 535 | + "Сезон", | |
| 536 | + "Ключові атрибути", | |
| 537 | + "Матеріал", | |
| 538 | + "Особливості", | |
| 539 | + "Продаюча перевага", | |
| 540 | + "Анкорний текст" | |
| 541 | + ], | |
| 542 | + "bg": [ | |
| 543 | + "№", | |
| 544 | + "Заглавие на продукта", | |
| 545 | + "Път на категорията", | |
| 546 | + "Подробни тагове", | |
| 547 | + "Целева аудитория", | |
| 548 | + "Сценарий на употреба", | |
| 549 | + "Сезон", | |
| 550 | + "Ключови атрибути", | |
| 551 | + "Материал", | |
| 552 | + "Характеристики", | |
| 553 | + "Търговско предимство", | |
| 554 | + "Анкор текст" | |
| 555 | + ] | |
| 556 | +} | |
| 0 | 557 | \ No newline at end of file | ... | ... |
| ... | ... | @@ -0,0 +1,229 @@ |
| 1 | +from __future__ import annotations | |
| 2 | + | |
| 3 | +import importlib.util | |
| 4 | +import io | |
| 5 | +import json | |
| 6 | +import logging | |
| 7 | +import sys | |
| 8 | +import types | |
| 9 | +from pathlib import Path | |
| 10 | +from unittest import mock | |
| 11 | + | |
| 12 | + | |
def _load_product_enrich_module():
    """Load ``indexer/product_enrich.py`` from disk as a standalone module.

    Before loading, minimal stand-ins for ``dotenv`` and ``redis`` are placed
    in ``sys.modules`` when those names are not already imported, and the
    repository root (the parent of this file's directory) is put at the front
    of ``sys.path``.
    NOTE(review): presumably the module under test imports ``dotenv`` and
    ``redis`` at import time -- confirm against ``product_enrich.py``.

    Returns:
        The executed module object, created under the name
        ``product_enrich_under_test``.

    Raises:
        ImportError: if no import spec/loader can be built for the file.
    """
    # Only stub when absent so a real installation (or an earlier stub) wins.
    if "dotenv" not in sys.modules:
        fake_dotenv = types.ModuleType("dotenv")
        fake_dotenv.load_dotenv = lambda *args, **kwargs: None
        sys.modules["dotenv"] = fake_dotenv

    if "redis" not in sys.modules:
        fake_redis = types.ModuleType("redis")

        class _FakeRedisClient:
            """Accepts any constructor arguments and always answers ping()."""

            def __init__(self, *args, **kwargs):
                pass

            def ping(self):
                return True

        fake_redis.Redis = _FakeRedisClient
        sys.modules["redis"] = fake_redis

    # parents[1] is the directory above the one containing this test file.
    repo_root = Path(__file__).resolve().parents[1]
    if str(repo_root) not in sys.path:
        sys.path.insert(0, str(repo_root))

    module_path = repo_root / "indexer" / "product_enrich.py"
    spec = importlib.util.spec_from_file_location("product_enrich_under_test", module_path)
    # Validate the spec BEFORE it is used: module_from_spec(None) would fail
    # with an unrelated AttributeError, and an assert vanishes under -O.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
| 42 | + | |
| 43 | + | |
# Load the module under test once, at import time of this test file; every
# test below reaches it through this module-level name.
product_enrich = _load_product_enrich_module()
| 45 | + | |
| 46 | + | |
def _attach_stream(logger_obj: logging.Logger):
    """Attach an in-memory capture handler to ``logger_obj``.

    The handler writes each record as its bare message (``%(message)s``) into
    a fresh ``io.StringIO``.

    Returns:
        A ``(buffer, handler)`` pair: read captured text from the buffer, and
        keep the handler so the caller can remove it from the logger later.
    """
    buffer = io.StringIO()
    capture = logging.StreamHandler(buffer)
    capture.setFormatter(logging.Formatter("%(message)s"))
    logger_obj.addHandler(capture)
    return buffer, capture
| 53 | + | |
| 54 | + | |
def test_create_prompt_splits_shared_context_and_localized_tail():
    """``create_prompt`` yields a language-independent head and a localized tail.

    For the same product list, the first returned element (shared context)
    must be identical for ``zh`` and ``en`` and carry the numbered titles,
    while the second (user instruction) names the target language and the
    third (assistant prefix) starts with the localized table header.
    """
    catalog = [
        {"id": "1", "title": "dress"},
        {"id": "2", "title": "linen shirt"},
    ]

    zh_parts = product_enrich.create_prompt(catalog, target_lang="zh")
    en_parts = product_enrich.create_prompt(catalog, target_lang="en")
    shared_zh, user_zh, prefix_zh = zh_parts
    shared_en, user_en, prefix_en = en_parts

    # The shared part must not vary with the target language.
    assert shared_zh == shared_en
    for fragment in ("Analyze each input product title", "1. dress", "2. linen shirt"):
        assert fragment in shared_zh

    # The product list lives only in the shared part, never in the tail.
    for localized in (user_zh, user_en):
        assert "Product list" not in localized

    assert "specified language" in user_zh
    assert "Language: Chinese" in user_zh
    assert "Language: English" in user_en

    assert prefix_zh.startswith("| 序号 | 商品标题 | 品类路径 |")
    assert prefix_en.startswith("| No. | Product title | Category path |")
| 75 | + | |
| 76 | + | |
def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests():
    """Two language variants of one batch log the shared context only once.

    ``requests.Session`` is replaced by a stub that records each posted JSON
    payload and answers with a canned completion. The test then checks the
    request shape (three messages, last one flagged ``partial``), the main
    log (one shared-context entry, two variant entries), the verbose log
    (full request/response for both calls) and the values returned by
    ``call_llm``.
    """
    sent_payloads = []

    def _canned_body(content, prompt_tokens, completion_tokens, total_tokens):
        # Response JSON in the shape the assertions below read back.
        return {
            "choices": [{"message": {"content": content}}],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
            },
        }

    canned_replies = [
        _canned_body(
            "| 1 | 连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | "
            "通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | "
            "修身显瘦 | 法式收腰连衣裙 |\n",
            120,
            45,
            165,
        ),
        _canned_body(
            "| 1 | Dress | Women>Dress | French,Waisted | Young women | "
            "Commute,Date | Spring,Summer | Midi | Polyester | Breathable | "
            "Slim fit | French waisted dress |\n",
            118,
            43,
            161,
        ),
    ]

    class _StubResponse:
        """Minimal response object: never errors, returns the stored body."""

        def __init__(self, body):
            self._body = body

        def raise_for_status(self):
            return None

        def json(self):
            return self._body

    class _StubSession:
        """Records every posted payload and replies with the matching canned body."""

        trust_env = True

        def post(self, url, headers=None, json=None, timeout=None, proxies=None):
            del url, headers, timeout, proxies
            sent_payloads.append(json)
            # The n-th request receives the n-th canned reply.
            return _StubResponse(canned_replies[len(sent_payloads) - 1])

        def close(self):
            return None

    def _prompt_for(lang):
        shared, user, prefix = product_enrich.create_prompt(
            [{"id": "1", "title": "dress"}],
            target_lang=lang,
        )
        return shared, user, prefix

    def _invoke(parts, lang):
        markdown, raw = product_enrich.call_llm(
            parts[0],
            parts[1],
            parts[2],
            target_lang=lang,
        )
        return markdown, raw

    # Start from a clean "already logged" state so the once-only count is exact.
    product_enrich.reset_logged_shared_context_keys()
    main_stream, main_handler = _attach_stream(product_enrich.logger)
    verbose_stream, verbose_handler = _attach_stream(product_enrich.verbose_logger)

    try:
        with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
            product_enrich.requests,
            "Session",
            lambda: _StubSession(),
        ):
            zh_parts = _prompt_for("zh")
            en_parts = _prompt_for("en")
            zh_markdown, zh_raw = _invoke(zh_parts, "zh")
            en_markdown, en_raw = _invoke(en_parts, "en")
    finally:
        # Always detach the capture handlers, even when a call above fails.
        product_enrich.logger.removeHandler(main_handler)
        product_enrich.verbose_logger.removeHandler(verbose_handler)

    zh_shared, _, zh_prefix = zh_parts
    en_shared, _, en_prefix = en_parts

    # Request shape.
    assert zh_shared == en_shared
    assert len(sent_payloads) == 2
    zh_messages = sent_payloads[0]["messages"]
    en_messages = sent_payloads[1]["messages"]
    assert len(zh_messages) == 3
    assert zh_messages[1]["role"] == "user"
    assert "1. dress" in zh_messages[1]["content"]
    assert "Language: Chinese" in zh_messages[1]["content"]
    assert "Language: English" in en_messages[1]["content"]
    assert zh_messages[-1]["partial"] is True
    assert en_messages[-1]["partial"] is True

    main_log = main_stream.getvalue()
    verbose_log = verbose_stream.getvalue()

    # Main log: shared context once, one variant entry per language.
    assert main_log.count("LLM Shared Context") == 1
    assert main_log.count("LLM Request Variant") == 2
    assert "Localized Requirement" in main_log
    assert "Shared Context" in main_log

    # Verbose log: complete request and response for both calls.
    assert verbose_log.count("LLM Request [model=") == 2
    assert verbose_log.count("LLM Response [model=") == 2
    for expected in (
        '"partial": true',
        "Combined User Prompt",
        "French waisted dress",
        "法式收腰连衣裙",
    ):
        assert expected in verbose_log

    # Return values: markdown begins with the prefix, raw is the response JSON.
    assert zh_markdown.startswith(zh_prefix)
    assert en_markdown.startswith(en_prefix)
    assert json.loads(zh_raw)["usage"]["total_tokens"] == 165
    assert json.loads(en_raw)["usage"]["total_tokens"] == 161
| 195 | + | |
| 196 | + | |
def test_process_batch_reads_result_and_validates_expected_fields():
    """``process_batch`` turns the markdown returned by ``call_llm`` into row dicts.

    ``call_llm`` is patched to return a fixed one-row table; the single
    resulting row must carry the input id/title, the language, and every
    column of the table under its expected key.
    """
    merged_markdown = (
        "| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 |\n"
        "|----|----|----|----|----|----|----|----|----|----|----|----|\n"
        "| 1 | 法式连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | 通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | 修身显瘦 | 法式收腰连衣裙 |\n"
    )
    stub_raw = json.dumps({"choices": [{"message": {"content": "stub"}}]})

    with mock.patch.object(
        product_enrich,
        "call_llm",
        return_value=(merged_markdown, stub_raw),
    ):
        results = product_enrich.process_batch(
            [{"id": "sku-1", "title": "dress"}],
            batch_num=1,
            target_lang="zh",
        )

    assert len(results) == 1
    row = results[0]

    # Checked in this order: input identity first, then the table columns.
    expected_fields = {
        "id": "sku-1",
        "lang": "zh",
        "title_input": "dress",
        "title": "法式连衣裙",
        "category_path": "女装>连衣裙",
        "tags": "法式,收腰",
        "target_audience": "年轻女性",
        "usage_scene": "通勤,约会",
        "season": "春季,夏季",
        "key_attributes": "中长款",
        "material": "聚酯纤维",
        "features": "透气",
        "selling_points": "修身显瘦",
        "anchor_text": "法式收腰连衣裙",
    }
    for field, value in expected_fields.items():
        assert row[field] == value