Commit a73a751f2d90d4544a7146eea38b8b68a736c98b
1 parent
e56fbdc1
enrich
Showing
4 changed files
with
1019 additions
and
111 deletions
Show diff stats
config/config_loader.py
| @@ -301,7 +301,12 @@ class ConfigLoader: | @@ -301,7 +301,12 @@ class ConfigLoader: | ||
| 301 | 301 | ||
| 302 | # Parse tenant config | 302 | # Parse tenant config |
| 303 | tenant_config_data = config_data.get("tenant_config", {}) | 303 | tenant_config_data = config_data.get("tenant_config", {}) |
| 304 | - | 304 | + |
| 305 | + # Parse extensible services/provider registry | ||
| 306 | + services_data = config_data.get("services", {}) or {} | ||
| 307 | + if not isinstance(services_data, dict): | ||
| 308 | + raise ConfigurationError("services must be a dictionary if provided") | ||
| 309 | + | ||
| 305 | return SearchConfig( | 310 | return SearchConfig( |
| 306 | field_boosts=field_boosts, | 311 | field_boosts=field_boosts, |
| 307 | indexes=indexes, | 312 | indexes=indexes, |
indexer/product_enrich.py
| @@ -11,6 +11,7 @@ import json | @@ -11,6 +11,7 @@ import json | ||
| 11 | import logging | 11 | import logging |
| 12 | import time | 12 | import time |
| 13 | import hashlib | 13 | import hashlib |
| 14 | +from collections import OrderedDict | ||
| 14 | from datetime import datetime | 15 | from datetime import datetime |
| 15 | from typing import List, Dict, Tuple, Any, Optional | 16 | from typing import List, Dict, Tuple, Any, Optional |
| 16 | 17 | ||
| @@ -20,6 +21,12 @@ from pathlib import Path | @@ -20,6 +21,12 @@ from pathlib import Path | ||
| 20 | 21 | ||
| 21 | from config.env_config import REDIS_CONFIG | 22 | from config.env_config import REDIS_CONFIG |
| 22 | from config.tenant_config_loader import SOURCE_LANG_CODE_MAP | 23 | from config.tenant_config_loader import SOURCE_LANG_CODE_MAP |
| 24 | +from indexer.product_enrich_prompts import ( | ||
| 25 | + SYSTEM_MESSAGE, | ||
| 26 | + USER_INSTRUCTION_TEMPLATE, | ||
| 27 | + LANGUAGE_MARKDOWN_TABLE_HEADERS, | ||
| 28 | + SHARED_ANALYSIS_INSTRUCTION, | ||
| 29 | +) | ||
| 23 | 30 | ||
| 24 | # 配置 | 31 | # 配置 |
| 25 | BATCH_SIZE = 20 | 32 | BATCH_SIZE = 20 |
| @@ -32,6 +39,7 @@ API_KEY = os.environ.get("DASHSCOPE_API_KEY") | @@ -32,6 +39,7 @@ API_KEY = os.environ.get("DASHSCOPE_API_KEY") | ||
| 32 | MAX_RETRIES = 3 | 39 | MAX_RETRIES = 3 |
| 33 | RETRY_DELAY = 5 # 秒 | 40 | RETRY_DELAY = 5 # 秒 |
| 34 | REQUEST_TIMEOUT = 180 # 秒 | 41 | REQUEST_TIMEOUT = 180 # 秒 |
| 42 | +LOGGED_SHARED_CONTEXT_CACHE_SIZE = 256 | ||
| 35 | 43 | ||
| 36 | # 日志路径 | 44 | # 日志路径 |
| 37 | OUTPUT_DIR = Path("output_logs") | 45 | OUTPUT_DIR = Path("output_logs") |
| @@ -42,6 +50,7 @@ LOG_DIR.mkdir(parents=True, exist_ok=True) | @@ -42,6 +50,7 @@ LOG_DIR.mkdir(parents=True, exist_ok=True) | ||
| 42 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | 50 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| 43 | log_file = LOG_DIR / f"product_enrich_{timestamp}.log" | 51 | log_file = LOG_DIR / f"product_enrich_{timestamp}.log" |
| 44 | verbose_log_file = LOG_DIR / "product_enrich_verbose.log" | 52 | verbose_log_file = LOG_DIR / "product_enrich_verbose.log" |
| 53 | +_logged_shared_context_keys: "OrderedDict[str, None]" = OrderedDict() | ||
| 45 | 54 | ||
| 46 | # 主日志 logger:执行流程、批次信息等 | 55 | # 主日志 logger:执行流程、批次信息等 |
| 47 | logger = logging.getLogger("product_enrich") | 56 | logger = logging.getLogger("product_enrich") |
| @@ -96,16 +105,11 @@ except Exception as e: | @@ -96,16 +105,11 @@ except Exception as e: | ||
| 96 | logger.warning(f"Failed to initialize Redis for anchors cache: {e}") | 105 | logger.warning(f"Failed to initialize Redis for anchors cache: {e}") |
| 97 | _anchor_redis = None | 106 | _anchor_redis = None |
| 98 | 107 | ||
| 99 | -# 中文版本提示词(请勿删除): | ||
| 100 | -# "你是一名电商平台的商品标注员,你的工作是对输入的每个商品进行理解、分析和标注," | ||
| 101 | -# "并按要求格式返回 Markdown 表格。所有输出内容必须为中文。" | ||
| 102 | - | ||
| 103 | -SYSTEM_MESSAGES = ( | ||
| 104 | - "You are a product annotator for an e-commerce platform. " | ||
| 105 | - "For each input product, you must understand, analyze and label it, " | ||
| 106 | - "and return a Markdown table strictly following the requested format. " | ||
| 107 | - "All output must be in English." | ||
| 108 | -) | 108 | +_missing_prompt_langs = sorted(set(SOURCE_LANG_CODE_MAP) - set(LANGUAGE_MARKDOWN_TABLE_HEADERS)) |
| 109 | +if _missing_prompt_langs: | ||
| 110 | + raise RuntimeError( | ||
| 111 | + f"Missing product_enrich prompt config for languages: {_missing_prompt_langs}" | ||
| 112 | + ) | ||
| 109 | 113 | ||
| 110 | 114 | ||
| 111 | def _make_anchor_cache_key( | 115 | def _make_anchor_cache_key( |
| @@ -153,108 +157,109 @@ def _set_cached_anchor_result( | @@ -153,108 +157,109 @@ def _set_cached_anchor_result( | ||
| 153 | logger.warning(f"Failed to set anchor cache: {e}") | 157 | logger.warning(f"Failed to set anchor cache: {e}") |
| 154 | 158 | ||
| 155 | 159 | ||
| 156 | -def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str: | ||
| 157 | - """根据目标语言创建 LLM 提示词和表头说明。 | 160 | +def _build_assistant_prefix(headers: List[str]) -> str: |
| 161 | + header_line = "| " + " | ".join(headers) + " |" | ||
| 162 | + separator_line = "|" + "----|" * len(headers) | ||
| 163 | + return f"{header_line}\n{separator_line}\n" | ||
| 158 | 164 | ||
| 159 | - 约定: | ||
| 160 | - - 提示词始终使用英文; | ||
| 161 | - - 当 target_lang == "en" 时,直接要求用英文分析并输出英文表头; | ||
| 162 | - - 当 target_lang 为其他语言时,视作“多轮对话”的后续轮次: | ||
| 163 | - * 默认上一轮已经用英文完成了分析; | ||
| 164 | - * 当前轮只需要在保持结构和含义不变的前提下,将整张表格翻译为目标语言, | ||
| 165 | - 包含表头与所有单元格内容。 | ||
| 166 | - """ | ||
| 167 | - lang_name = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang) | ||
| 168 | - | ||
| 169 | -# 中文版本提示词(请勿删除) | ||
| 170 | -# prompt = """请对输入的每条商品标题,分析并提取以下信息: | ||
| 171 | - | ||
| 172 | -# 1. 商品标题:将输入商品名称翻译为自然、完整的中文商品标题 | ||
| 173 | -# 2. 品类路径:从大类到细分品类,用">"分隔(例如:服装>女装>裤子>工装裤) | ||
| 174 | -# 3. 细分标签:商品的风格、特点、功能等(例如:碎花,收腰,法式) | ||
| 175 | -# 4. 适用人群:性别/年龄段等(例如:年轻女性) | ||
| 176 | -# 5. 使用场景 | ||
| 177 | -# 6. 适用季节 | ||
| 178 | -# 7. 关键属性 | ||
| 179 | -# 8. 材质说明 | ||
| 180 | -# 9. 功能特点 | ||
| 181 | -# 10. 商品卖点:分析和提取一句话核心卖点,用于推荐理由 | ||
| 182 | -# 11. 锚文本:生成一组能够代表该商品、并可能被用户用于搜索的词语或短语。这些词语应覆盖用户需求的各个维度,如品类、细分标签、功能特性、需求场景等等。 | ||
| 183 | - | ||
| 184 | -# 输入商品列表: | ||
| 185 | - | ||
| 186 | -# """ | ||
| 187 | -# prompt_tail = """ | ||
| 188 | -# 请严格按照以下markdown表格格式返回,每列内部的多值内容都用逗号分隔,不要添加任何其他说明: | ||
| 189 | - | ||
| 190 | -# | 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 | | ||
| 191 | -# |----|----|----|----|----|----|----|----|----|----|----|----| | ||
| 192 | -# """ | ||
| 193 | - | ||
| 194 | - prompt = """Please analyze each input product title and extract the following information: | ||
| 195 | - | ||
| 196 | -1. Product title: a natural English product name derived from the input title | ||
| 197 | -2. Category path: from broad to fine-grained category, separated by ">" (e.g. Clothing>Women>Dresses>Work Dress) | ||
| 198 | -3. Fine-grained tags: style / features / attributes (e.g. floral, waist-cinching, French style) | ||
| 199 | -4. Target audience: gender / age group, etc. (e.g. young women) | ||
| 200 | -5. Usage scene | ||
| 201 | -6. Applicable season | ||
| 202 | -7. Key attributes | ||
| 203 | -8. Material description | ||
| 204 | -9. Functional features | ||
| 205 | -10. Selling point: one concise key selling sentence for recommendation | ||
| 206 | -11. Anchor text: a set of words or phrases that could be used by users as search queries for this product, covering category, fine-grained tags, functional attributes, usage scenes, etc. | ||
| 207 | - | ||
| 208 | -Input product list: | ||
| 209 | - | ||
| 210 | -""" | ||
| 211 | 165 | ||
| 166 | +def _build_shared_context(products: List[Dict[str, str]]) -> str: | ||
| 167 | + shared_context = SHARED_ANALYSIS_INSTRUCTION | ||
| 212 | for idx, product in enumerate(products, 1): | 168 | for idx, product in enumerate(products, 1): |
| 213 | - prompt += f'{idx}. {product["title"]}\n' | 169 | + shared_context += f'{idx}. {product["title"]}\n' |
| 170 | + return shared_context | ||
| 214 | 171 | ||
| 215 | - if target_lang == "en": | ||
| 216 | - # 英文首轮:直接要求英文表头 + 英文内容 | ||
| 217 | - prompt += """ | ||
| 218 | -Please strictly return a Markdown table in the following format. For any column that can contain multiple values, separate values with commas. Do not add any other explanations: | ||
| 219 | 172 | ||
| 220 | -| No. | Product title | Category path | Fine-grained tags | Target audience | Usage scene | Season | Key attributes | Material | Features | Selling point | Anchor text | | ||
| 221 | -|----|----|----|----|----|----|----|----|----|----|----|----| | ||
| 222 | -""" | ||
| 223 | - else: | ||
| 224 | - # 非英文语言:视作“下一轮对话”,只做翻译,要求表头与内容全部用目标语言 | ||
| 225 | - prompt += f""" | ||
| 226 | -Now we will output the same table in {lang_name}. | ||
| 227 | - | ||
| 228 | -IMPORTANT: | ||
| 229 | -- Assume you have already generated the full table in English in a previous round. | ||
| 230 | -- In this round, you must output exactly the same table structure and content, | ||
| 231 | - but fully translated into {lang_name}, including ALL column headers and ALL cell values. | ||
| 232 | -- Do NOT change the meaning, fields, or the number/order of rows and columns. | ||
| 233 | -- Keep valid Markdown table syntax. | ||
| 234 | - | ||
| 235 | -Please return ONLY the Markdown table in {lang_name}, without any extra explanations. | ||
| 236 | -""" | 173 | +def _hash_text(text: str) -> str: |
| 174 | + return hashlib.md5((text or "").encode("utf-8")).hexdigest()[:12] | ||
| 175 | + | ||
| 176 | + | ||
| 177 | +def _mark_shared_context_logged_once(shared_context_key: str) -> bool: | ||
| 178 | + if shared_context_key in _logged_shared_context_keys: | ||
| 179 | + _logged_shared_context_keys.move_to_end(shared_context_key) | ||
| 180 | + return False | ||
| 181 | + | ||
| 182 | + _logged_shared_context_keys[shared_context_key] = None | ||
| 183 | + if len(_logged_shared_context_keys) > LOGGED_SHARED_CONTEXT_CACHE_SIZE: | ||
| 184 | + _logged_shared_context_keys.popitem(last=False) | ||
| 185 | + return True | ||
| 237 | 186 | ||
| 238 | - return prompt | ||
| 239 | 187 | ||
| 188 | +def reset_logged_shared_context_keys() -> None: | ||
| 189 | + """测试辅助:清理已记录的共享 prompt key。""" | ||
| 190 | + _logged_shared_context_keys.clear() | ||
| 240 | 191 | ||
| 241 | -def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: | ||
| 242 | - """调用大模型API(带重试机制),按目标语言选择系统提示词。""" | 192 | + |
| 193 | +def create_prompt( | ||
| 194 | + products: List[Dict[str, str]], | ||
| 195 | + target_lang: str = "zh", | ||
| 196 | +) -> Tuple[str, str, str]: | ||
| 197 | + """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。""" | ||
| 198 | + markdown_table_headers = LANGUAGE_MARKDOWN_TABLE_HEADERS.get(target_lang) | ||
| 199 | + if not markdown_table_headers: | ||
| 200 | + logger.warning( | ||
| 201 | + "Unsupported target_lang for markdown table headers: %s", | ||
| 202 | + target_lang, | ||
| 203 | + ) | ||
| 204 | + return None, None, None | ||
| 205 | + shared_context = _build_shared_context(products) | ||
| 206 | + language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang) | ||
| 207 | + user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip() | ||
| 208 | + assistant_prefix = _build_assistant_prefix(markdown_table_headers) | ||
| 209 | + return shared_context, user_prompt, assistant_prefix | ||
| 210 | + | ||
| 211 | + | ||
| 212 | +def _merge_partial_response(assistant_prefix: str, generated_content: str) -> str: | ||
| 213 | + """将 Partial Mode 的 assistant 前缀与补全文本拼成完整 markdown。""" | ||
| 214 | + generated = (generated_content or "").lstrip() | ||
| 215 | + prefix_lines = [line.strip() for line in assistant_prefix.strip().splitlines()] | ||
| 216 | + generated_lines = generated.splitlines() | ||
| 217 | + | ||
| 218 | + if generated_lines: | ||
| 219 | + first_line = generated_lines[0].strip() | ||
| 220 | + if prefix_lines and first_line == prefix_lines[0]: | ||
| 221 | + generated_lines = generated_lines[1:] | ||
| 222 | + if generated_lines and len(prefix_lines) > 1 and generated_lines[0].strip() == prefix_lines[1]: | ||
| 223 | + generated_lines = generated_lines[1:] | ||
| 224 | + elif len(prefix_lines) > 1 and first_line == prefix_lines[1]: | ||
| 225 | + generated_lines = generated_lines[1:] | ||
| 226 | + | ||
| 227 | + suffix = "\n".join(generated_lines).lstrip("\n") | ||
| 228 | + if suffix: | ||
| 229 | + return f"{assistant_prefix}{suffix}" | ||
| 230 | + return assistant_prefix | ||
| 231 | + | ||
| 232 | + | ||
| 233 | +def call_llm( | ||
| 234 | + shared_context: str, | ||
| 235 | + user_prompt: str, | ||
| 236 | + assistant_prefix: str, | ||
| 237 | + target_lang: str = "zh", | ||
| 238 | +) -> Tuple[str, str]: | ||
| 239 | + """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。""" | ||
| 243 | headers = { | 240 | headers = { |
| 244 | "Authorization": f"Bearer {API_KEY}", | 241 | "Authorization": f"Bearer {API_KEY}", |
| 245 | "Content-Type": "application/json", | 242 | "Content-Type": "application/json", |
| 246 | } | 243 | } |
| 244 | + shared_context_key = _hash_text(shared_context) | ||
| 245 | + localized_tail_key = _hash_text(f"{target_lang}\n{user_prompt}\n{assistant_prefix}") | ||
| 246 | + combined_user_prompt = f"{shared_context.rstrip()}\n\n{user_prompt.strip()}" | ||
| 247 | 247 | ||
| 248 | payload = { | 248 | payload = { |
| 249 | "model": MODEL_NAME, | 249 | "model": MODEL_NAME, |
| 250 | "messages": [ | 250 | "messages": [ |
| 251 | { | 251 | { |
| 252 | "role": "system", | 252 | "role": "system", |
| 253 | - "content": SYSTEM_MESSAGES, | 253 | + "content": SYSTEM_MESSAGE, |
| 254 | }, | 254 | }, |
| 255 | { | 255 | { |
| 256 | "role": "user", | 256 | "role": "user", |
| 257 | - "content": prompt, | 257 | + "content": combined_user_prompt, |
| 258 | + }, | ||
| 259 | + { | ||
| 260 | + "role": "assistant", | ||
| 261 | + "content": assistant_prefix, | ||
| 262 | + "partial": True, | ||
| 258 | }, | 263 | }, |
| 259 | ], | 264 | ], |
| 260 | "temperature": 0.3, | 265 | "temperature": 0.3, |
| @@ -266,16 +271,41 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: | @@ -266,16 +271,41 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: | ||
| 266 | "payload": payload, | 271 | "payload": payload, |
| 267 | } | 272 | } |
| 268 | 273 | ||
| 269 | - # 主日志 + 详尽日志:LLM Request | ||
| 270 | - logger.info(f"\n{'=' * 80}") | ||
| 271 | - logger.info(f"LLM Request (Model: {MODEL_NAME}):") | ||
| 272 | - logger.info(json.dumps(request_data, ensure_ascii=False, indent=2)) | ||
| 273 | - logger.info(f"\nPrompt:\n{prompt}") | 274 | + if _mark_shared_context_logged_once(shared_context_key): |
| 275 | + logger.info(f"\n{'=' * 80}") | ||
| 276 | + logger.info( | ||
| 277 | + "LLM Shared Context [model=%s, shared_key=%s, chars=%s] (logged once per process key)", | ||
| 278 | + MODEL_NAME, | ||
| 279 | + shared_context_key, | ||
| 280 | + len(shared_context), | ||
| 281 | + ) | ||
| 282 | + logger.info("\nSystem Message:\n%s", SYSTEM_MESSAGE) | ||
| 283 | + logger.info("\nShared Context:\n%s", shared_context) | ||
| 274 | 284 | ||
| 275 | verbose_logger.info(f"\n{'=' * 80}") | 285 | verbose_logger.info(f"\n{'=' * 80}") |
| 276 | - verbose_logger.info(f"LLM Request (Model: {MODEL_NAME}):") | 286 | + verbose_logger.info( |
| 287 | + "LLM Request [model=%s, lang=%s, shared_key=%s, tail_key=%s]:", | ||
| 288 | + MODEL_NAME, | ||
| 289 | + target_lang, | ||
| 290 | + shared_context_key, | ||
| 291 | + localized_tail_key, | ||
| 292 | + ) | ||
| 277 | verbose_logger.info(json.dumps(request_data, ensure_ascii=False, indent=2)) | 293 | verbose_logger.info(json.dumps(request_data, ensure_ascii=False, indent=2)) |
| 278 | - verbose_logger.info(f"\nPrompt:\n{prompt}") | 294 | + verbose_logger.info(f"\nCombined User Prompt:\n{combined_user_prompt}") |
| 295 | + verbose_logger.info(f"\nShared Context:\n{shared_context}") | ||
| 296 | + verbose_logger.info(f"\nLocalized Requirement:\n{user_prompt}") | ||
| 297 | + verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}") | ||
| 298 | + | ||
| 299 | + logger.info( | ||
| 300 | + "\nLLM Request Variant [lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]", | ||
| 301 | + target_lang, | ||
| 302 | + shared_context_key, | ||
| 303 | + localized_tail_key, | ||
| 304 | + len(user_prompt), | ||
| 305 | + len(assistant_prefix), | ||
| 306 | + ) | ||
| 307 | + logger.info("\nLocalized Requirement:\n%s", user_prompt) | ||
| 308 | + logger.info("\nAssistant Prefix:\n%s", assistant_prefix) | ||
| 279 | 309 | ||
| 280 | # 创建session,禁用代理 | 310 | # 创建session,禁用代理 |
| 281 | session = requests.Session() | 311 | session = requests.Session() |
| @@ -295,19 +325,37 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: | @@ -295,19 +325,37 @@ def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]: | ||
| 295 | 325 | ||
| 296 | response.raise_for_status() | 326 | response.raise_for_status() |
| 297 | result = response.json() | 327 | result = response.json() |
| 328 | + usage = result.get("usage") or {} | ||
| 329 | + | ||
| 330 | + verbose_logger.info( | ||
| 331 | + "\nLLM Response [model=%s, lang=%s, shared_key=%s, tail_key=%s]:", | ||
| 332 | + MODEL_NAME, | ||
| 333 | + target_lang, | ||
| 334 | + shared_context_key, | ||
| 335 | + localized_tail_key, | ||
| 336 | + ) | ||
| 337 | + verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2)) | ||
| 298 | 338 | ||
| 299 | - # 主日志 + 详尽日志:LLM Response | ||
| 300 | - logger.info(f"\nLLM Response:") | ||
| 301 | - logger.info(json.dumps(result, ensure_ascii=False, indent=2)) | 339 | + generated_content = result["choices"][0]["message"]["content"] |
| 340 | + full_markdown = _merge_partial_response(assistant_prefix, generated_content) | ||
| 302 | 341 | ||
| 303 | - verbose_logger.info(f"\nLLM Response:") | ||
| 304 | - verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2)) | 342 | + logger.info( |
| 343 | + "\nLLM Response Summary [lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]", | ||
| 344 | + target_lang, | ||
| 345 | + shared_context_key, | ||
| 346 | + localized_tail_key, | ||
| 347 | + len(generated_content or ""), | ||
| 348 | + usage.get("completion_tokens"), | ||
| 349 | + usage.get("prompt_tokens"), | ||
| 350 | + usage.get("total_tokens"), | ||
| 351 | + ) | ||
| 352 | + logger.info("\nGenerated Content:\n%s", generated_content) | ||
| 353 | + logger.info("\nMerged Markdown:\n%s", full_markdown) | ||
| 305 | 354 | ||
| 306 | - content = result["choices"][0]["message"]["content"] | ||
| 307 | - logger.info(f"\nExtracted Content:\n{content}") | ||
| 308 | - verbose_logger.info(f"\nExtracted Content:\n{content}") | 355 | + verbose_logger.info(f"\nGenerated Content:\n{generated_content}") |
| 356 | + verbose_logger.info(f"\nMerged Markdown:\n{full_markdown}") | ||
| 309 | 357 | ||
| 310 | - return content, json.dumps(result, ensure_ascii=False) | 358 | + return full_markdown, json.dumps(result, ensure_ascii=False) |
| 311 | 359 | ||
| 312 | except requests.exceptions.ProxyError as e: | 360 | except requests.exceptions.ProxyError as e: |
| 313 | logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Proxy error - {str(e)}") | 361 | logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Proxy error - {str(e)}") |
| @@ -385,6 +433,39 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: | @@ -385,6 +433,39 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]: | ||
| 385 | return data | 433 | return data |
| 386 | 434 | ||
| 387 | 435 | ||
| 436 | +def _log_parsed_result_quality( | ||
| 437 | + batch_data: List[Dict[str, str]], | ||
| 438 | + parsed_results: List[Dict[str, str]], | ||
| 439 | + target_lang: str, | ||
| 440 | + batch_num: int, | ||
| 441 | +) -> None: | ||
| 442 | + expected = len(batch_data) | ||
| 443 | + actual = len(parsed_results) | ||
| 444 | + if actual != expected: | ||
| 445 | + logger.warning( | ||
| 446 | + "Parsed row count mismatch for batch=%s lang=%s: expected=%s actual=%s", | ||
| 447 | + batch_num, | ||
| 448 | + target_lang, | ||
| 449 | + expected, | ||
| 450 | + actual, | ||
| 451 | + ) | ||
| 452 | + | ||
| 453 | + missing_anchor = sum(1 for item in parsed_results if not str(item.get("anchor_text") or "").strip()) | ||
| 454 | + missing_category = sum(1 for item in parsed_results if not str(item.get("category_path") or "").strip()) | ||
| 455 | + missing_title = sum(1 for item in parsed_results if not str(item.get("title") or "").strip()) | ||
| 456 | + | ||
| 457 | + logger.info( | ||
| 458 | + "Parsed Quality Summary [batch=%s, lang=%s]: rows=%s/%s, missing_title=%s, missing_category=%s, missing_anchor=%s", | ||
| 459 | + batch_num, | ||
| 460 | + target_lang, | ||
| 461 | + actual, | ||
| 462 | + expected, | ||
| 463 | + missing_title, | ||
| 464 | + missing_category, | ||
| 465 | + missing_anchor, | ||
| 466 | + ) | ||
| 467 | + | ||
| 468 | + | ||
| 388 | def process_batch( | 469 | def process_batch( |
| 389 | batch_data: List[Dict[str, str]], | 470 | batch_data: List[Dict[str, str]], |
| 390 | batch_num: int, | 471 | batch_num: int, |
| @@ -395,14 +476,52 @@ def process_batch( | @@ -395,14 +476,52 @@ def process_batch( | ||
| 395 | logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)") | 476 | logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)") |
| 396 | 477 | ||
| 397 | # 创建提示词 | 478 | # 创建提示词 |
| 398 | - prompt = create_prompt(batch_data, target_lang=target_lang) | 479 | + shared_context, user_prompt, assistant_prefix = create_prompt( |
| 480 | + batch_data, | ||
| 481 | + target_lang=target_lang, | ||
| 482 | + ) | ||
| 483 | + | ||
| 484 | + # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM | ||
| 485 | + if shared_context is None or user_prompt is None or assistant_prefix is None: | ||
| 486 | + logger.error( | ||
| 487 | + "Failed to create prompt for batch %s, target_lang=%s; " | ||
| 488 | + "marking entire batch as failed without calling LLM", | ||
| 489 | + batch_num, | ||
| 490 | + target_lang, | ||
| 491 | + ) | ||
| 492 | + return [ | ||
| 493 | + { | ||
| 494 | + "id": item["id"], | ||
| 495 | + "lang": target_lang, | ||
| 496 | + "title_input": item.get("title", ""), | ||
| 497 | + "title": "", | ||
| 498 | + "category_path": "", | ||
| 499 | + "tags": "", | ||
| 500 | + "target_audience": "", | ||
| 501 | + "usage_scene": "", | ||
| 502 | + "season": "", | ||
| 503 | + "key_attributes": "", | ||
| 504 | + "material": "", | ||
| 505 | + "features": "", | ||
| 506 | + "selling_points": "", | ||
| 507 | + "anchor_text": "", | ||
| 508 | + "error": f"prompt_creation_failed: unsupported target_lang={target_lang}", | ||
| 509 | + } | ||
| 510 | + for item in batch_data | ||
| 511 | + ] | ||
| 399 | 512 | ||
| 400 | # 调用LLM | 513 | # 调用LLM |
| 401 | try: | 514 | try: |
| 402 | - raw_response, full_response_json = call_llm(prompt, target_lang=target_lang) | 515 | + raw_response, full_response_json = call_llm( |
| 516 | + shared_context, | ||
| 517 | + user_prompt, | ||
| 518 | + assistant_prefix, | ||
| 519 | + target_lang=target_lang, | ||
| 520 | + ) | ||
| 403 | 521 | ||
| 404 | # 解析结果 | 522 | # 解析结果 |
| 405 | parsed_results = parse_markdown_table(raw_response) | 523 | parsed_results = parse_markdown_table(raw_response) |
| 524 | + _log_parsed_result_quality(batch_data, parsed_results, target_lang, batch_num) | ||
| 406 | 525 | ||
| 407 | logger.info(f"\nParsed Results ({len(parsed_results)} items):") | 526 | logger.info(f"\nParsed Results ({len(parsed_results)} items):") |
| 408 | logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2)) | 527 | logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2)) |
| @@ -541,4 +660,3 @@ def analyze_products( | @@ -541,4 +660,3 @@ def analyze_products( | ||
| 541 | pass | 660 | pass |
| 542 | 661 | ||
| 543 | return all_results | 662 | return all_results |
| 544 | - |
| @@ -0,0 +1,556 @@ | @@ -0,0 +1,556 @@ | ||
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +from typing import Any, Dict | ||
| 4 | + | ||
| 5 | +SYSTEM_MESSAGE = ( | ||
| 6 | + "You are an e-commerce product annotator. " | ||
| 7 | + "Continue the provided assistant Markdown table prefix. " | ||
| 8 | + "Do not repeat or modify the prefix, and do not add explanations outside the table." | ||
| 9 | +) | ||
| 10 | + | ||
| 11 | +SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product title and fill these columns: | ||
| 12 | + | ||
| 13 | +1. Product title: a natural localized product name derived from the input title | ||
| 14 | +2. Category path: broad to fine-grained category, separated by ">" | ||
| 15 | +3. Fine-grained tags: style, features, functions, or notable attributes | ||
| 16 | +4. Target audience: gender, age group, or suitable users | ||
| 17 | +5. Usage scene | ||
| 18 | +6. Applicable season | ||
| 19 | +7. Key attributes | ||
| 20 | +8. Material description | ||
| 21 | +9. Functional features | ||
| 22 | +10. Selling point: one concise core selling phrase | ||
| 23 | +11. Anchor text: a set of search-oriented words or phrases covering category, attributes, scenes, and demand | ||
| 24 | + | ||
| 25 | +Rules: | ||
| 26 | +- Keep the input order and row count exactly the same. | ||
| 27 | +- Infer from the title only; if uncertain, prefer concise and broadly correct ecommerce wording. | ||
| 28 | +- Keep category paths concise and use ">" as the separator. | ||
| 29 | +- For columns with multiple values, the localized output requirement will define the delimiter. | ||
| 30 | + | ||
| 31 | +Input product list: | ||
| 32 | +""" | ||
| 33 | + | ||
| 34 | +USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation. | ||
| 35 | +Language: {language}""" | ||
| 36 | + | ||
| 37 | +LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { | ||
| 38 | + "en": [ | ||
| 39 | + "No.", | ||
| 40 | + "Product title", | ||
| 41 | + "Category path", | ||
| 42 | + "Fine-grained tags", | ||
| 43 | + "Target audience", | ||
| 44 | + "Usage scene", | ||
| 45 | + "Season", | ||
| 46 | + "Key attributes", | ||
| 47 | + "Material", | ||
| 48 | + "Features", | ||
| 49 | + "Selling point", | ||
| 50 | + "Anchor text" | ||
| 51 | + ], | ||
| 52 | + "zh": [ | ||
| 53 | + "序号", | ||
| 54 | + "商品标题", | ||
| 55 | + "品类路径", | ||
| 56 | + "细分标签", | ||
| 57 | + "适用人群", | ||
| 58 | + "使用场景", | ||
| 59 | + "适用季节", | ||
| 60 | + "关键属性", | ||
| 61 | + "材质说明", | ||
| 62 | + "功能特点", | ||
| 63 | + "商品卖点", | ||
| 64 | + "锚文本" | ||
| 65 | + ], | ||
| 66 | + "zh_tw": [ | ||
| 67 | + "序號", | ||
| 68 | + "商品標題", | ||
| 69 | + "品類路徑", | ||
| 70 | + "細分標籤", | ||
| 71 | + "適用人群", | ||
| 72 | + "使用場景", | ||
| 73 | + "適用季節", | ||
| 74 | + "關鍵屬性", | ||
| 75 | + "材質說明", | ||
| 76 | + "功能特點", | ||
| 77 | + "商品賣點", | ||
| 78 | + "錨文本" | ||
| 79 | + ], | ||
| 80 | + "ru": [ | ||
| 81 | + "№", | ||
| 82 | + "Название товара", | ||
| 83 | + "Путь категории", | ||
| 84 | + "Детализированные теги", | ||
| 85 | + "Целевая аудитория", | ||
| 86 | + "Сценарий использования", | ||
| 87 | + "Сезон", | ||
| 88 | + "Ключевые атрибуты", | ||
| 89 | + "Материал", | ||
| 90 | + "Особенности", | ||
| 91 | + "Преимущество товара", | ||
| 92 | + "Анкорный текст" | ||
| 93 | + ], | ||
| 94 | + "ja": [ | ||
| 95 | + "番号", | ||
| 96 | + "商品タイトル", | ||
| 97 | + "カテゴリパス", | ||
| 98 | + "詳細タグ", | ||
| 99 | + "対象ユーザー", | ||
| 100 | + "利用シーン", | ||
| 101 | + "季節", | ||
| 102 | + "主要属性", | ||
| 103 | + "素材", | ||
| 104 | + "機能特徴", | ||
| 105 | + "商品の訴求点", | ||
| 106 | + "アンカーテキスト" | ||
| 107 | + ], | ||
| 108 | + "ko": [ | ||
| 109 | + "번호", | ||
| 110 | + "상품 제목", | ||
| 111 | + "카테고리 경로", | ||
| 112 | + "세부 태그", | ||
| 113 | + "대상 고객", | ||
| 114 | + "사용 장면", | ||
| 115 | + "계절", | ||
| 116 | + "핵심 속성", | ||
| 117 | + "소재", | ||
| 118 | + "기능 특징", | ||
| 119 | + "상품 포인트", | ||
| 120 | + "앵커 텍스트" | ||
| 121 | + ], | ||
| 122 | + "es": [ | ||
| 123 | + "N.º", | ||
| 124 | + "Titulo del producto", | ||
| 125 | + "Ruta de categoria", | ||
| 126 | + "Etiquetas detalladas", | ||
| 127 | + "Publico objetivo", | ||
| 128 | + "Escenario de uso", | ||
| 129 | + "Temporada", | ||
| 130 | + "Atributos clave", | ||
| 131 | + "Material", | ||
| 132 | + "Caracteristicas", | ||
| 133 | + "Punto de venta", | ||
| 134 | + "Texto ancla" | ||
| 135 | + ], | ||
| 136 | + "fr": [ | ||
| 137 | + "N°", | ||
| 138 | + "Titre du produit", | ||
| 139 | + "Chemin de categorie", | ||
| 140 | + "Etiquettes detaillees", | ||
| 141 | + "Public cible", | ||
| 142 | + "Scenario d'utilisation", | ||
| 143 | + "Saison", | ||
| 144 | + "Attributs cles", | ||
| 145 | + "Matiere", | ||
| 146 | + "Caracteristiques", | ||
| 147 | + "Argument de vente", | ||
| 148 | + "Texte d'ancrage" | ||
| 149 | + ], | ||
| 150 | + "pt": [ | ||
| 151 | + "Nº", | ||
| 152 | + "Titulo do produto", | ||
| 153 | + "Caminho da categoria", | ||
| 154 | + "Tags detalhadas", | ||
| 155 | + "Publico-alvo", | ||
| 156 | + "Cenario de uso", | ||
| 157 | + "Estacao", | ||
| 158 | + "Atributos principais", | ||
| 159 | + "Material", | ||
| 160 | + "Caracteristicas", | ||
| 161 | + "Ponto de venda", | ||
| 162 | + "Texto ancora" | ||
| 163 | + ], | ||
| 164 | + "de": [ | ||
| 165 | + "Nr.", | ||
| 166 | + "Produkttitel", | ||
| 167 | + "Kategoriepfad", | ||
| 168 | + "Detaillierte Tags", | ||
| 169 | + "Zielgruppe", | ||
| 170 | + "Nutzungsszenario", | ||
| 171 | + "Saison", | ||
| 172 | + "Wichtige Attribute", | ||
| 173 | + "Material", | ||
| 174 | + "Funktionen", | ||
| 175 | + "Verkaufsargument", | ||
| 176 | + "Ankertext" | ||
| 177 | + ], | ||
| 178 | + "it": [ | ||
| 179 | + "N.", | ||
| 180 | + "Titolo del prodotto", | ||
| 181 | + "Percorso categoria", | ||
| 182 | + "Tag dettagliati", | ||
| 183 | + "Pubblico target", | ||
| 184 | + "Scenario d'uso", | ||
| 185 | + "Stagione", | ||
| 186 | + "Attributi chiave", | ||
| 187 | + "Materiale", | ||
| 188 | + "Caratteristiche", | ||
| 189 | + "Punto di forza", | ||
| 190 | + "Testo ancora" | ||
| 191 | + ], | ||
| 192 | + "th": [ | ||
| 193 | + "ลำดับ", | ||
| 194 | + "ชื่อสินค้า", | ||
| 195 | + "เส้นทางหมวดหมู่", | ||
| 196 | + "แท็กย่อย", | ||
| 197 | + "กลุ่มเป้าหมาย", | ||
| 198 | + "สถานการณ์การใช้งาน", | ||
| 199 | + "ฤดูกาล", | ||
| 200 | + "คุณสมบัติสำคัญ", | ||
| 201 | + "วัสดุ", | ||
| 202 | + "คุณสมบัติการใช้งาน", | ||
| 203 | + "จุดขายสินค้า", | ||
| 204 | + "แองเคอร์เท็กซ์" | ||
| 205 | + ], | ||
| 206 | + "vi": [ | ||
| 207 | + "STT", | ||
| 208 | + "Tieu de san pham", | ||
| 209 | + "Duong dan danh muc", | ||
| 210 | + "The chi tiet", | ||
| 211 | + "Doi tuong phu hop", | ||
| 212 | + "Boi canh su dung", | ||
| 213 | + "Mua phu hop", | ||
| 214 | + "Thuoc tinh chinh", | ||
| 215 | + "Chat lieu", | ||
| 216 | + "Tinh nang", | ||
| 217 | + "Diem ban hang", | ||
| 218 | + "Van ban neo" | ||
| 219 | + ], | ||
| 220 | + "id": [ | ||
| 221 | + "No.", | ||
| 222 | + "Judul produk", | ||
| 223 | + "Jalur kategori", | ||
| 224 | + "Tag terperinci", | ||
| 225 | + "Target pengguna", | ||
| 226 | + "Skenario penggunaan", | ||
| 227 | + "Musim", | ||
| 228 | + "Atribut utama", | ||
| 229 | + "Bahan", | ||
| 230 | + "Fitur", | ||
| 231 | + "Nilai jual", | ||
| 232 | + "Teks jangkar" | ||
| 233 | + ], | ||
| 234 | + "ms": [ | ||
| 235 | + "No.", | ||
| 236 | + "Tajuk produk", | ||
| 237 | + "Laluan kategori", | ||
| 238 | + "Tag terperinci", | ||
| 239 | + "Sasaran pengguna", | ||
| 240 | + "Senario penggunaan", | ||
| 241 | + "Musim", | ||
| 242 | + "Atribut utama", | ||
| 243 | + "Bahan", | ||
| 244 | + "Ciri-ciri", | ||
| 245 | + "Nilai jual", | ||
| 246 | + "Teks sauh" | ||
| 247 | + ], | ||
| 248 | + "ar": [ | ||
| 249 | + "الرقم", | ||
| 250 | + "عنوان المنتج", | ||
| 251 | + "مسار الفئة", | ||
| 252 | + "الوسوم التفصيلية", | ||
| 253 | + "الفئة المستهدفة", | ||
| 254 | + "سيناريو الاستخدام", | ||
| 255 | + "الموسم", | ||
| 256 | + "السمات الرئيسية", | ||
| 257 | + "المادة", | ||
| 258 | + "الميزات", | ||
| 259 | + "نقطة البيع", | ||
| 260 | + "نص الربط" | ||
| 261 | + ], | ||
| 262 | + "hi": [ | ||
| 263 | + "क्रमांक", | ||
| 264 | + "उत्पाद शीर्षक", | ||
| 265 | + "श्रेणी पथ", | ||
| 266 | + "विस्तृत टैग", | ||
| 267 | + "लक्षित उपभोक्ता", | ||
| 268 | + "उपयोग परिदृश्य", | ||
| 269 | + "मौसम", | ||
| 270 | + "मुख्य गुण", | ||
| 271 | + "सामग्री", | ||
| 272 | + "विशेषताएं", | ||
| 273 | + "बिक्री बिंदु", | ||
| 274 | + "एंकर टेक्स्ट" | ||
| 275 | + ], | ||
| 276 | + "he": [ | ||
| 277 | + "מס׳", | ||
| 278 | + "כותרת המוצר", | ||
| 279 | + "נתיב קטגוריה", | ||
| 280 | + "תגיות מפורטות", | ||
| 281 | + "קהל יעד", | ||
| 282 | + "תרחיש שימוש", | ||
| 283 | + "עונה", | ||
| 284 | + "מאפיינים מרכזיים", | ||
| 285 | + "חומר", | ||
| 286 | + "תכונות", | ||
| 287 | + "נקודת מכירה", | ||
| 288 | + "טקסט עוגן" | ||
| 289 | + ], | ||
| 290 | + "my": [ | ||
| 291 | + "အမှတ်စဉ်", | ||
| 292 | + "ကုန်ပစ္စည်းခေါင်းစဉ်", | ||
| 293 | + "အမျိုးအစားလမ်းကြောင်း", | ||
| 294 | + "အသေးစိတ်တဂ်များ", | ||
| 295 | + "ပစ်မှတ်အသုံးပြုသူ", | ||
| 296 | + "အသုံးပြုမှုအခြေအနေ", | ||
| 297 | + "ရာသီ", | ||
| 298 | + "အဓိကဂုဏ်သတ္တိများ", | ||
| 299 | + "ပစ္စည်း", | ||
| 300 | + "လုပ်ဆောင်ချက်များ", | ||
| 301 | + "အရောင်းထူးခြားချက်", | ||
| 302 | + "အန်ကာစာသား" | ||
| 303 | + ], | ||
| 304 | + "ta": [ | ||
| 305 | + "எண்", | ||
| 306 | + "தயாரிப்பு தலைப்பு", | ||
| 307 | + "வகை பாதை", | ||
| 308 | + "விரிவான குறிச்சொற்கள்", | ||
| 309 | + "இலக்கு பயனர்கள்", | ||
| 310 | + "பயன்பாட்டு நிலை", | ||
| 311 | + "பருவம்", | ||
| 312 | + "முக்கிய பண்புகள்", | ||
| 313 | + "பொருள்", | ||
| 314 | + "அம்சங்கள்", | ||
| 315 | + "விற்பனை அம்சம்", | ||
| 316 | + "ஆங்கர் உரை" | ||
| 317 | + ], | ||
| 318 | + "ur": [ | ||
| 319 | + "نمبر", | ||
| 320 | + "پروڈکٹ عنوان", | ||
| 321 | + "زمرہ راستہ", | ||
| 322 | + "تفصیلی ٹیگز", | ||
| 323 | + "ہدف صارفین", | ||
| 324 | + "استعمال کا منظر", | ||
| 325 | + "موسم", | ||
| 326 | + "کلیدی خصوصیات", | ||
| 327 | + "مواد", | ||
| 328 | + "فیچرز", | ||
| 329 | + "فروختی نقطہ", | ||
| 330 | + "اینکر ٹیکسٹ" | ||
| 331 | + ], | ||
| 332 | + "bn": [ | ||
| 333 | + "ক্রম", | ||
| 334 | + "পণ্যের শিরোনাম", | ||
| 335 | + "শ্রেণি পথ", | ||
| 336 | + "বিস্তারিত ট্যাগ", | ||
| 337 | + "লক্ষ্য ব্যবহারকারী", | ||
| 338 | + "ব্যবহারের দৃশ্য", | ||
| 339 | + "মৌসুম", | ||
| 340 | + "মূল বৈশিষ্ট্য", | ||
| 341 | + "উপাদান", | ||
| 342 | + "ফিচার", | ||
| 343 | + "বিক্রয় পয়েন্ট", | ||
| 344 | + "অ্যাঙ্কর টেক্সট" | ||
| 345 | + ], | ||
| 346 | + "pl": [ | ||
| 347 | + "Nr", | ||
| 348 | + "Tytul produktu", | ||
| 349 | + "Sciezka kategorii", | ||
| 350 | + "Szczegolowe tagi", | ||
| 351 | + "Grupa docelowa", | ||
| 352 | + "Scenariusz uzycia", | ||
| 353 | + "Sezon", | ||
| 354 | + "Kluczowe atrybuty", | ||
| 355 | + "Material", | ||
| 356 | + "Cechy", | ||
| 357 | + "Atut sprzedazowy", | ||
| 358 | + "Tekst kotwicy" | ||
| 359 | + ], | ||
| 360 | + "nl": [ | ||
| 361 | + "Nr.", | ||
| 362 | + "Producttitel", | ||
| 363 | + "Categoriepad", | ||
| 364 | + "Gedetailleerde tags", | ||
| 365 | + "Doelgroep", | ||
| 366 | + "Gebruikscontext", | ||
| 367 | + "Seizoen", | ||
| 368 | + "Belangrijke kenmerken", | ||
| 369 | + "Materiaal", | ||
| 370 | + "Functies", | ||
| 371 | + "Verkooppunt", | ||
| 372 | + "Ankertekst" | ||
| 373 | + ], | ||
| 374 | + "ro": [ | ||
| 375 | + "Nr.", | ||
| 376 | + "Titlul produsului", | ||
| 377 | + "Calea categoriei", | ||
| 378 | + "Etichete detaliate", | ||
| 379 | + "Public tinta", | ||
| 380 | + "Scenariu de utilizare", | ||
| 381 | + "Sezon", | ||
| 382 | + "Atribute cheie", | ||
| 383 | + "Material", | ||
| 384 | + "Caracteristici", | ||
| 385 | + "Punct de vanzare", | ||
| 386 | + "Text ancora" | ||
| 387 | + ], | ||
| 388 | + "tr": [ | ||
| 389 | + "No.", | ||
| 390 | + "Urun basligi", | ||
| 391 | + "Kategori yolu", | ||
| 392 | + "Ayrintili etiketler", | ||
| 393 | + "Hedef kitle", | ||
| 394 | + "Kullanim senaryosu", | ||
| 395 | + "Sezon", | ||
| 396 | + "Temel ozellikler", | ||
| 397 | + "Malzeme", | ||
| 398 | + "Ozellikler", | ||
| 399 | + "Satis noktasi", | ||
| 400 | + "Capa metni" | ||
| 401 | + ], | ||
| 402 | + "km": [ | ||
| 403 | + "ល.រ", | ||
| 404 | + "ចំណងជើងផលិតផល", | ||
| 405 | + "ផ្លូវប្រភេទ", | ||
| 406 | + "ស្លាកលម្អិត", | ||
| 407 | + "ក្រុមអ្នកប្រើគោលដៅ", | ||
| 408 | + "សេណារីយ៉ូប្រើប្រាស់", | ||
| 409 | + "រដូវកាល", | ||
| 410 | + "លក្ខណៈសម្បត្តិសំខាន់", | ||
| 411 | + "សម្ភារៈ", | ||
| 412 | + "មុខងារ", | ||
| 413 | + "ចំណុចលក់", | ||
| 414 | + "អត្ថបទអង់ក័រ" | ||
| 415 | + ], | ||
| 416 | + "lo": [ | ||
| 417 | + "ລຳດັບ", | ||
| 418 | + "ຊື່ສິນຄ້າ", | ||
| 419 | + "ເສັ້ນທາງໝວດໝູ່", | ||
| 420 | + "ແທັກລະອຽດ", | ||
| 421 | + "ກຸ່ມເປົ້າໝາຍ", | ||
| 422 | + "ສະຖານະການໃຊ້ງານ", | ||
| 423 | + "ລະດູການ", | ||
| 424 | + "ຄຸນລັກສະນະສຳຄັນ", | ||
| 425 | + "ວັດສະດຸ", | ||
| 426 | + "ຄຸນສົມບັດ", | ||
| 427 | + "ຈຸດຂາຍ", | ||
| 428 | + "ຂໍ້ຄວາມອັງເຄີ" | ||
| 429 | + ], | ||
| 430 | + "yue": [ | ||
| 431 | + "序號", | ||
| 432 | + "商品標題", | ||
| 433 | + "品類路徑", | ||
| 434 | + "細分類標籤", | ||
| 435 | + "適用人群", | ||
| 436 | + "使用場景", | ||
| 437 | + "適用季節", | ||
| 438 | + "關鍵屬性", | ||
| 439 | + "材質說明", | ||
| 440 | + "功能特點", | ||
| 441 | + "商品賣點", | ||
| 442 | + "錨文本" | ||
| 443 | + ], | ||
| 444 | + "cs": [ | ||
| 445 | + "C.", | ||
| 446 | + "Nazev produktu", | ||
| 447 | + "Cesta kategorie", | ||
| 448 | + "Podrobne stitky", | ||
| 449 | + "Cilova skupina", | ||
| 450 | + "Scenar pouziti", | ||
| 451 | + "Sezona", | ||
| 452 | + "Klicove atributy", | ||
| 453 | + "Material", | ||
| 454 | + "Vlastnosti", | ||
| 455 | + "Prodejni argument", | ||
| 456 | + "Kotvici text" | ||
| 457 | + ], | ||
| 458 | + "el": [ | ||
| 459 | + "Α/Α", | ||
| 460 | + "Τίτλος προϊόντος", | ||
| 461 | + "Διαδρομή κατηγορίας", | ||
| 462 | + "Αναλυτικές ετικέτες", | ||
| 463 | + "Κοινό-στόχος", | ||
| 464 | + "Σενάριο χρήσης", | ||
| 465 | + "Εποχή", | ||
| 466 | + "Βασικά χαρακτηριστικά", | ||
| 467 | + "Υλικό", | ||
| 468 | + "Λειτουργίες", | ||
| 469 | + "Σημείο πώλησης", | ||
| 470 | + "Κείμενο άγκυρας" | ||
| 471 | + ], | ||
| 472 | + "sv": [ | ||
| 473 | + "Nr", | ||
| 474 | + "Produkttitel", | ||
| 475 | + "Kategorisokvag", | ||
| 476 | + "Detaljerade taggar", | ||
| 477 | + "Malgrupp", | ||
| 478 | + "Anvandningsscenario", | ||
| 479 | + "Sasong", | ||
| 480 | + "Viktiga attribut", | ||
| 481 | + "Material", | ||
| 482 | + "Funktioner", | ||
| 483 | + "Saljpunkt", | ||
| 484 | + "Ankartext" | ||
| 485 | + ], | ||
| 486 | + "hu": [ | ||
| 487 | + "Sorszam", | ||
| 488 | + "Termekcim", | ||
| 489 | + "Kategoriavonal", | ||
| 490 | + "Reszletes cimkek", | ||
| 491 | + "Celcsoport", | ||
| 492 | + "Hasznalati helyzet", | ||
| 493 | + "Evszak", | ||
| 494 | + "Fo jellemzok", | ||
| 495 | + "Anyag", | ||
| 496 | + "Funkciok", | ||
| 497 | + "Ertekesitesi elony", | ||
| 498 | + "Horgonyszoveg" | ||
| 499 | + ], | ||
| 500 | + "da": [ | ||
| 501 | + "Nr.", | ||
| 502 | + "Produkttitel", | ||
| 503 | + "Kategoristi", | ||
| 504 | + "Detaljerede tags", | ||
| 505 | + "Malgruppe", | ||
| 506 | + "Brugsscenarie", | ||
| 507 | + "Saeson", | ||
| 508 | + "Nogleattributter", | ||
| 509 | + "Materiale", | ||
| 510 | + "Funktioner", | ||
| 511 | + "Salgsargument", | ||
| 512 | + "Ankertekst" | ||
| 513 | + ], | ||
| 514 | + "fi": [ | ||
| 515 | + "Nro", | ||
| 516 | + "Tuotteen nimi", | ||
| 517 | + "Kategoriapolku", | ||
| 518 | + "Yksityiskohtaiset tunnisteet", | ||
| 519 | + "Kohdeyleiso", | ||
| 520 | + "Kayttotilanne", | ||
| 521 | + "Kausi", | ||
| 522 | + "Keskeiset ominaisuudet", | ||
| 523 | + "Materiaali", | ||
| 524 | + "Ominaisuudet", | ||
| 525 | + "Myyntivaltti", | ||
| 526 | + "Ankkuriteksti" | ||
| 527 | + ], | ||
| 528 | + "uk": [ | ||
| 529 | + "№", | ||
| 530 | + "Назва товару", | ||
| 531 | + "Шлях категорії", | ||
| 532 | + "Детальні теги", | ||
| 533 | + "Цільова аудиторія", | ||
| 534 | + "Сценарій використання", | ||
| 535 | + "Сезон", | ||
| 536 | + "Ключові атрибути", | ||
| 537 | + "Матеріал", | ||
| 538 | + "Особливості", | ||
| 539 | + "Продаюча перевага", | ||
| 540 | + "Анкорний текст" | ||
| 541 | + ], | ||
| 542 | + "bg": [ | ||
| 543 | + "№", | ||
| 544 | + "Заглавие на продукта", | ||
| 545 | + "Път на категорията", | ||
| 546 | + "Подробни тагове", | ||
| 547 | + "Целева аудитория", | ||
| 548 | + "Сценарий на употреба", | ||
| 549 | + "Сезон", | ||
| 550 | + "Ключови атрибути", | ||
| 551 | + "Материал", | ||
| 552 | + "Характеристики", | ||
| 553 | + "Търговско предимство", | ||
| 554 | + "Анкор текст" | ||
| 555 | + ] | ||
| 556 | +} | ||
| 0 | \ No newline at end of file | 557 | \ No newline at end of file |
| @@ -0,0 +1,229 @@ | @@ -0,0 +1,229 @@ | ||
| 1 | +from __future__ import annotations | ||
| 2 | + | ||
| 3 | +import importlib.util | ||
| 4 | +import io | ||
| 5 | +import json | ||
| 6 | +import logging | ||
| 7 | +import sys | ||
| 8 | +import types | ||
| 9 | +from pathlib import Path | ||
| 10 | +from unittest import mock | ||
| 11 | + | ||
| 12 | + | ||
def _load_product_enrich_module():
    """Import ``indexer/product_enrich.py`` in isolation for testing.

    Installs lightweight stand-ins for the third-party ``dotenv`` and
    ``redis`` modules (so importing the module under test needs neither the
    package nor a running Redis server), makes the repository root
    importable, then executes the target file under a dedicated module name
    so it cannot collide with a previously imported ``indexer.product_enrich``.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: if an import spec cannot be built for the target file
            (e.g. the file is missing).
    """
    # Stub python-dotenv: a no-op load_dotenv is all the target module calls.
    if "dotenv" not in sys.modules:
        fake_dotenv = types.ModuleType("dotenv")
        fake_dotenv.load_dotenv = lambda *args, **kwargs: None
        sys.modules["dotenv"] = fake_dotenv

    # Stub redis with a client whose ping() always succeeds, so any
    # import-time connection check in the target module passes offline.
    if "redis" not in sys.modules:
        fake_redis = types.ModuleType("redis")

        class _FakeRedisClient:
            def __init__(self, *args, **kwargs):
                pass

            def ping(self):
                return True

        fake_redis.Redis = _FakeRedisClient
        sys.modules["redis"] = fake_redis

    repo_root = Path(__file__).resolve().parents[1]
    if str(repo_root) not in sys.path:
        sys.path.insert(0, str(repo_root))

    module_path = repo_root / "indexer" / "product_enrich.py"
    spec = importlib.util.spec_from_file_location("product_enrich_under_test", module_path)
    # Explicit raise instead of `assert`: assertions are stripped under -O,
    # which would turn a missing file into an opaque AttributeError below.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build import spec for {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
| 42 | + | ||
| 43 | + | ||
| 44 | +product_enrich = _load_product_enrich_module() | ||
| 45 | + | ||
| 46 | + | ||
| 47 | +def _attach_stream(logger_obj: logging.Logger): | ||
| 48 | + stream = io.StringIO() | ||
| 49 | + handler = logging.StreamHandler(stream) | ||
| 50 | + handler.setFormatter(logging.Formatter("%(message)s")) | ||
| 51 | + logger_obj.addHandler(handler) | ||
| 52 | + return stream, handler | ||
| 53 | + | ||
| 54 | + | ||
def test_create_prompt_splits_shared_context_and_localized_tail():
    """create_prompt must emit a language-independent shared context plus a
    per-language user instruction and markdown table-header prefix."""
    catalog = [
        {"id": "1", "title": "dress"},
        {"id": "2", "title": "linen shirt"},
    ]

    zh_shared, zh_user, zh_prefix = product_enrich.create_prompt(catalog, target_lang="zh")
    en_shared, en_user, en_prefix = product_enrich.create_prompt(catalog, target_lang="en")

    # The shared context is identical across target languages and carries
    # the analysis instruction plus the numbered product list.
    assert zh_shared == en_shared
    assert "Analyze each input product title" in zh_shared
    assert "1. dress" in zh_shared
    assert "2. linen shirt" in zh_shared

    # The localized tail never repeats the product list; it only names the
    # requested output language.
    for localized_tail in (zh_user, en_user):
        assert "Product list" not in localized_tail
    assert "specified language" in zh_user
    assert "Language: Chinese" in zh_user
    assert "Language: English" in en_user

    # Each prefix starts with that language's markdown header row.
    assert zh_prefix.startswith("| 序号 | 商品标题 | 品类路径 |")
    assert en_prefix.startswith("| No. | Product title | Category path |")
| 75 | + | ||
| 76 | + | ||
def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests():
    """End-to-end check of call_llm over a faked HTTP session.

    Verifies that (a) the shared context is logged once on the main logger
    while each language variant is logged separately, (b) the verbose logger
    records the full request/response payloads, and (c) call_llm prepends the
    language-specific table prefix to the model output.
    """
    # Captured request bodies, appended by the fake session in call order.
    payloads = []
    # Canned API responses: first for the "zh" call, second for "en".
    response_bodies = [
        {
            "choices": [
                {
                    "message": {
                        "content": (
                            "| 1 | 连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | "
                            "通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | "
                            "修身显瘦 | 法式收腰连衣裙 |\n"
                        )
                    }
                }
            ],
            "usage": {"prompt_tokens": 120, "completion_tokens": 45, "total_tokens": 165},
        },
        {
            "choices": [
                {
                    "message": {
                        "content": (
                            "| 1 | Dress | Women>Dress | French,Waisted | Young women | "
                            "Commute,Date | Spring,Summer | Midi | Polyester | Breathable | "
                            "Slim fit | French waisted dress |\n"
                        )
                    }
                }
            ],
            "usage": {"prompt_tokens": 118, "completion_tokens": 43, "total_tokens": 161},
        },
    ]

    # Minimal stand-in for requests.Response: never errors, returns its body.
    class _FakeResponse:
        def __init__(self, body):
            self.body = body

        def raise_for_status(self):
            return None

        def json(self):
            return self.body

    # Minimal stand-in for requests.Session: records each POST payload and
    # serves the canned responses in call order (index = call count - 1).
    class _FakeSession:
        trust_env = True

        def post(self, url, headers=None, json=None, timeout=None, proxies=None):
            del url, headers, timeout, proxies
            payloads.append(json)
            return _FakeResponse(response_bodies[len(payloads) - 1])

        def close(self):
            return None

    # Clear the once-per-context dedup state so this test observes the
    # first-time "shared context" log line.
    product_enrich.reset_logged_shared_context_keys()
    main_stream, main_handler = _attach_stream(product_enrich.logger)
    verbose_stream, verbose_handler = _attach_stream(product_enrich.verbose_logger)

    try:
        # Patch in a fake API key and the fake HTTP session for both calls.
        with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
            product_enrich.requests,
            "Session",
            lambda: _FakeSession(),
        ):
            zh_shared, zh_user, zh_prefix = product_enrich.create_prompt(
                [{"id": "1", "title": "dress"}],
                target_lang="zh",
            )
            en_shared, en_user, en_prefix = product_enrich.create_prompt(
                [{"id": "1", "title": "dress"}],
                target_lang="en",
            )

            zh_markdown, zh_raw = product_enrich.call_llm(
                zh_shared,
                zh_user,
                zh_prefix,
                target_lang="zh",
            )
            en_markdown, en_raw = product_enrich.call_llm(
                en_shared,
                en_user,
                en_prefix,
                target_lang="en",
            )
    finally:
        # Always detach the capture handlers, even if an assertion above or
        # the patched calls raise, so other tests' logging is unaffected.
        product_enrich.logger.removeHandler(main_handler)
        product_enrich.verbose_logger.removeHandler(verbose_handler)

    # Shared context must not vary by target language.
    assert zh_shared == en_shared
    # Exactly one request per language; each carries three messages where
    # the middle (user) message holds the product list and language line,
    # and the trailing message is a "partial" assistant prefill.
    assert len(payloads) == 2
    assert len(payloads[0]["messages"]) == 3
    assert payloads[0]["messages"][1]["role"] == "user"
    assert "1. dress" in payloads[0]["messages"][1]["content"]
    assert "Language: Chinese" in payloads[0]["messages"][1]["content"]
    assert "Language: English" in payloads[1]["messages"][1]["content"]
    assert payloads[0]["messages"][-1]["partial"] is True
    assert payloads[1]["messages"][-1]["partial"] is True

    main_log = main_stream.getvalue()
    verbose_log = verbose_stream.getvalue()

    # Main log: shared context appears once; each language variant logged.
    assert main_log.count("LLM Shared Context") == 1
    assert main_log.count("LLM Request Variant") == 2
    assert "Localized Requirement" in main_log
    assert "Shared Context" in main_log

    # Verbose log: full request and response payloads for both calls,
    # including the serialized "partial" flag and both response contents.
    assert verbose_log.count("LLM Request [model=") == 2
    assert verbose_log.count("LLM Response [model=") == 2
    assert '"partial": true' in verbose_log
    assert "Combined User Prompt" in verbose_log
    assert "French waisted dress" in verbose_log
    assert "法式收腰连衣裙" in verbose_log

    # call_llm prepends the language-specific table prefix to the output
    # and returns the raw response JSON verbatim.
    assert zh_markdown.startswith(zh_prefix)
    assert en_markdown.startswith(en_prefix)
    assert json.loads(zh_raw)["usage"]["total_tokens"] == 165
    assert json.loads(en_raw)["usage"]["total_tokens"] == 161
| 195 | + | ||
| 196 | + | ||
def test_process_batch_reads_result_and_validates_expected_fields():
    """process_batch must parse the markdown table produced by call_llm into
    one enriched record per input product, keyed by the expected fields."""
    merged_markdown = """| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 |
|----|----|----|----|----|----|----|----|----|----|----|----|
| 1 | 法式连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | 通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | 修身显瘦 | 法式收腰连衣裙 |
"""

    stub_raw = json.dumps({"choices": [{"message": {"content": "stub"}}]})
    # Stub call_llm so only the parsing/validation layer is exercised.
    with mock.patch.object(product_enrich, "call_llm", return_value=(merged_markdown, stub_raw)):
        rows = product_enrich.process_batch(
            [{"id": "sku-1", "title": "dress"}],
            batch_num=1,
            target_lang="zh",
        )

    assert len(rows) == 1
    record = rows[0]
    expected_fields = {
        "id": "sku-1",
        "lang": "zh",
        "title_input": "dress",
        "title": "法式连衣裙",
        "category_path": "女装>连衣裙",
        "tags": "法式,收腰",
        "target_audience": "年轻女性",
        "usage_scene": "通勤,约会",
        "season": "春季,夏季",
        "key_attributes": "中长款",
        "material": "聚酯纤维",
        "features": "透气",
        "selling_points": "修身显瘦",
        "anchor_text": "法式收腰连衣裙",
    }
    for field_name, expected_value in expected_fields.items():
        assert record[field_name] == expected_value