Commit 6f7840cfeb5d4f57eb81674aff77a1a0d60094ab

Authored by tangwang
1 parent 137455af

refactor: rename product annotator to enrich and expand multilingual prompts

- Rename indexer/product_annotator.py to indexer/product_enrich.py and remove CSV-based CLI entrypoint, keeping only in-memory analyze_products API
- Introduce dedicated product_enrich logging with separate verbose log file for full LLM requests/responses
- Change indexer and /indexer/enrich-content API wiring to use indexer.product_enrich instead of indexer.product_annotator, updating tests and docs accordingly
- Switch translate_prompts to share SOURCE_LANG_CODE_MAP (renamed from SUPPORTED_INDEX_LANGUAGES) from tenant_config_loader and reuse that mapping for language code → display name
- Remove hard SUPPORTED_LANGS constraint from LLM content-enrichment flow, driving languages directly from tenant/indexer configuration
- Redesign LLM prompt generation to support multi-round, multi-language tables: first round in English, subsequent rounds translate the entire table (headers + cells) into target languages using English instructions
api/routes/indexer.py
@@ -443,23 +443,12 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): @@ -443,23 +443,12 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest):
443 443
444 def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]: 444 def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]:
445 """ 445 """
446 - 同步执行内容理解:调用 product_annotator.analyze_products,按语言批量跑 LLM, 446 + 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM,
447 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。 447 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。
448 """ 448 """
449 - from indexer.product_annotator import analyze_products, SUPPORTED_LANGS  
450 -  
451 - llm_langs = [lang for lang in languages if lang in SUPPORTED_LANGS]  
452 - if not llm_langs:  
453 - return [  
454 - {  
455 - "spu_id": it["spu_id"],  
456 - "qanchors": {},  
457 - "semantic_attributes": [],  
458 - "tags": [],  
459 - "error": "no supported languages (supported: %s)" % sorted(SUPPORTED_LANGS),  
460 - }  
461 - for it in items  
462 - ] 449 + from indexer.product_enrich import analyze_products
  450 +
  451 + llm_langs = list(dict.fromkeys(languages)) or ["en"]
463 452
464 products = [{"id": it["spu_id"], "title": (it.get("title") or "").strip()} for it in items] 453 products = [{"id": it["spu_id"], "title": (it.get("title") or "").strip()} for it in items]
465 dim_keys = [ 454 dim_keys = [
@@ -544,7 +533,7 @@ async def enrich_content(request: EnrichContentRequest): @@ -544,7 +533,7 @@ async def enrich_content(request: EnrichContentRequest):
544 - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可 533 - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可
545 先拿不含 qanchors/tags 的 doc,再异步或离线补齐本接口结果后更新 ES。 534 先拿不含 qanchors/tags 的 doc,再异步或离线补齐本接口结果后更新 ES。
546 535
547 - 实现逻辑与 indexer.product_annotator.analyze_products 一致,支持多语言与 Redis 缓存。 536 + 实现逻辑与 indexer.product_enrich.analyze_products 一致,支持多语言与 Redis 缓存。
548 """ 537 """
549 try: 538 try:
550 if not request.items: 539 if not request.items:
api/translator_app.py
@@ -88,7 +88,7 @@ import sys @@ -88,7 +88,7 @@ import sys
88 import logging 88 import logging
89 import argparse 89 import argparse
90 import uvicorn 90 import uvicorn
91 -from typing import Optional, Dict 91 +from typing import Dict, List, Optional, Sequence, Union
92 from fastapi import FastAPI, HTTPException 92 from fastapi import FastAPI, HTTPException
93 from fastapi.responses import JSONResponse 93 from fastapi.responses import JSONResponse
94 from fastapi.middleware.cors import CORSMiddleware 94 from fastapi.middleware.cors import CORSMiddleware
@@ -162,7 +162,7 @@ def get_translator(model: str = "qwen") -> object: @@ -162,7 +162,7 @@ def get_translator(model: str = "qwen") -> object:
162 # Request/Response models 162 # Request/Response models
163 class TranslationRequest(BaseModel): 163 class TranslationRequest(BaseModel):
164 """Translation request model.""" 164 """Translation request model."""
165 - text: str = Field(..., description="Text to translate") 165 + text: Union[str, List[str]] = Field(..., description="Text to translate (string or list of strings)")
166 target_lang: str = Field(..., description="Target language code (zh, en, ru, etc.)") 166 target_lang: str = Field(..., description="Target language code (zh, en, ru, etc.)")
167 source_lang: Optional[str] = Field(None, description="Source language code (optional, auto-detect if not provided)") 167 source_lang: Optional[str] = Field(None, description="Source language code (optional, auto-detect if not provided)")
168 model: Optional[str] = Field(None, description="Translation model: qwen-mt | deepl | llm") 168 model: Optional[str] = Field(None, description="Translation model: qwen-mt | deepl | llm")
@@ -183,10 +183,13 @@ class TranslationRequest(BaseModel): @@ -183,10 +183,13 @@ class TranslationRequest(BaseModel):
183 183
184 class TranslationResponse(BaseModel): 184 class TranslationResponse(BaseModel):
185 """Translation response model.""" 185 """Translation response model."""
186 - text: str = Field(..., description="Original text") 186 + text: Union[str, List[str]] = Field(..., description="Original text (string or list)")
187 target_lang: str = Field(..., description="Target language code") 187 target_lang: str = Field(..., description="Target language code")
188 source_lang: Optional[str] = Field(None, description="Source language code (detected or provided)") 188 source_lang: Optional[str] = Field(None, description="Source language code (detected or provided)")
189 - translated_text: str = Field(..., description="Translated text") 189 + translated_text: Union[str, List[Optional[str]]] = Field(
  190 + ...,
  191 + description="Translated text (string or list; list elements may be null on failure)",
  192 + )
190 status: str = Field(..., description="Translation status") 193 status: str = Field(..., description="Translation status")
191 model: str = Field(..., description="Translation model used") 194 model: str = Field(..., description="Translation model used")
192 195
@@ -260,11 +263,19 @@ async def translate(request: TranslationRequest): @@ -260,11 +263,19 @@ async def translate(request: TranslationRequest):
260 263
261 Supports both Qwen (default) and DeepL models via the 'model' parameter. 264 Supports both Qwen (default) and DeepL models via the 'model' parameter.
262 """ 265 """
263 - if not request.text or not request.text.strip():  
264 - raise HTTPException(  
265 - status_code=400,  
266 - detail="Text cannot be empty"  
267 - ) 266 + # 允许 text 为字符串或字符串列表
  267 + if isinstance(request.text, list):
  268 + if not request.text:
  269 + raise HTTPException(
  270 + status_code=400,
  271 + detail="Text list cannot be empty"
  272 + )
  273 + else:
  274 + if not request.text or not request.text.strip():
  275 + raise HTTPException(
  276 + status_code=400,
  277 + detail="Text cannot be empty"
  278 + )
268 279
269 if not request.target_lang: 280 if not request.target_lang:
270 raise HTTPException( 281 raise HTTPException(
@@ -283,24 +294,96 @@ async def translate(request: TranslationRequest): @@ -283,24 +294,96 @@ async def translate(request: TranslationRequest):
283 try: 294 try:
284 # Get translator instance for the specified model 295 # Get translator instance for the specified model
285 translator = get_translator(model=model) 296 translator = get_translator(model=model)
286 -  
287 - # Translate using the fixed prompt 297 + raw_text = request.text
  298 +
  299 + # 如果是列表,并且底层 provider 声明支持 batch,则直接传 list
  300 + if isinstance(raw_text, list) and getattr(translator, "supports_batch", False):
  301 + try:
  302 + translated_list = translator.translate(
  303 + text=raw_text,
  304 + target_lang=request.target_lang,
  305 + source_lang=request.source_lang,
  306 + context=request.context,
  307 + prompt=request.prompt,
  308 + )
  309 + except Exception as exc:
  310 + logger.error("Batch translation failed: %s", exc, exc_info=True)
  311 + # 回退到逐条拆分逻辑
  312 + translated_list = None
  313 +
  314 + if translated_list is not None:
  315 + # 规范化为 List[Optional[str]],并保证长度对应
  316 + if not isinstance(translated_list, list):
  317 + raise HTTPException(
  318 + status_code=500,
  319 + detail="Batch translation provider returned non-list result",
  320 + )
  321 + normalized: List[Optional[str]] = []
  322 + for idx, item in enumerate(raw_text):
  323 + if idx < len(translated_list):
  324 + val = translated_list[idx]
  325 + else:
  326 + val = None
  327 + # 失败语义:失败位置为 None
  328 + normalized.append(val)
  329 +
  330 + return TranslationResponse(
  331 + text=raw_text,
  332 + target_lang=request.target_lang,
  333 + source_lang=request.source_lang,
  334 + translated_text=normalized,
  335 + status="success",
  336 + model=str(getattr(translator, "model", model)),
  337 + )
  338 +
  339 + # 否则:统一走逐条拆分逻辑(包括不支持 batch 的 provider)
  340 + if isinstance(raw_text, list):
  341 + results: List[Optional[str]] = []
  342 + for item in raw_text:
  343 + if item is None or not str(item).strip():
  344 + # 空元素不视为失败,直接返回原值
  345 + results.append(item) # type: ignore[arg-type]
  346 + continue
  347 + try:
  348 + out = translator.translate(
  349 + text=str(item),
  350 + target_lang=request.target_lang,
  351 + source_lang=request.source_lang,
  352 + context=request.context,
  353 + prompt=request.prompt,
  354 + )
  355 + except Exception as exc:
  356 + logger.warning("Per-item translation failed: %s", exc, exc_info=True)
  357 + out = None
  358 + # 失败语义:该元素为 None
  359 + results.append(out)
  360 +
  361 + return TranslationResponse(
  362 + text=raw_text,
  363 + target_lang=request.target_lang,
  364 + source_lang=request.source_lang,
  365 + translated_text=results,
  366 + status="success",
  367 + model=str(getattr(translator, "model", model)),
  368 + )
  369 +
  370 + # 单文本模式:保持原有严格失败语义
288 translated_text = translator.translate( 371 translated_text = translator.translate(
289 - text=request.text, 372 + text=raw_text,
290 target_lang=request.target_lang, 373 target_lang=request.target_lang,
291 source_lang=request.source_lang, 374 source_lang=request.source_lang,
292 context=request.context, 375 context=request.context,
293 prompt=request.prompt, 376 prompt=request.prompt,
294 ) 377 )
295 - 378 +
296 if translated_text is None: 379 if translated_text is None:
297 raise HTTPException( 380 raise HTTPException(
298 status_code=500, 381 status_code=500,
299 detail="Translation failed" 382 detail="Translation failed"
300 ) 383 )
301 - 384 +
302 return TranslationResponse( 385 return TranslationResponse(
303 - text=request.text, 386 + text=raw_text,
304 target_lang=request.target_lang, 387 target_lang=request.target_lang,
305 source_lang=request.source_lang, 388 source_lang=request.source_lang,
306 translated_text=translated_text, 389 translated_text=translated_text,
config/config.yaml
@@ -224,7 +224,7 @@ spu_config: @@ -224,7 +224,7 @@ spu_config:
224 224
225 # 租户配置(Tenant Configuration) 225 # 租户配置(Tenant Configuration)
226 # 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选) 226 # 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选)
227 -# 默认 index_languages: [en, zh],可配置为任意 SUPPORTED_INDEX_LANGUAGES 的子集 227 +# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集
228 tenant_config: 228 tenant_config:
229 default: 229 default:
230 primary_language: "en" 230 primary_language: "en"
config/tenant_config_loader.py
@@ -11,7 +11,8 @@ from typing import Dict, Any, Optional, List @@ -11,7 +11,8 @@ from typing import Dict, Any, Optional, List
11 logger = logging.getLogger(__name__) 11 logger = logging.getLogger(__name__)
12 12
13 # 支持的索引语言:code -> display name(供商家勾选主市场语言等场景使用) 13 # 支持的索引语言:code -> display name(供商家勾选主市场语言等场景使用)
14 -SUPPORTED_INDEX_LANGUAGES: Dict[str, str] = { 14 +# 语言代码与展示名的双向映射(供翻译/LLM 提示等统一使用)
  15 +SOURCE_LANG_CODE_MAP: Dict[str, str] = {
15 "en": "English", 16 "en": "English",
16 "zh": "Chinese", 17 "zh": "Chinese",
17 "zh_tw": "Traditional Chinese", 18 "zh_tw": "Traditional Chinese",
@@ -51,6 +52,9 @@ SUPPORTED_INDEX_LANGUAGES: Dict[str, str] = { @@ -51,6 +52,9 @@ SUPPORTED_INDEX_LANGUAGES: Dict[str, str] = {
51 "bg": "Bulgarian", 52 "bg": "Bulgarian",
52 } 53 }
53 54
  55 +TARGET_LANG_CODE_MAP: Dict[str, str] = {v: k for k, v in SOURCE_LANG_CODE_MAP.items()}
  56 +
  57 +
54 def normalize_index_languages(value: Any, primary_language: str = "en") -> List[str]: 58 def normalize_index_languages(value: Any, primary_language: str = "en") -> List[str]:
55 """ 59 """
56 将 index_languages 配置规范化为合法语言代码列表。 60 将 index_languages 配置规范化为合法语言代码列表。
@@ -67,7 +71,7 @@ def normalize_index_languages(value: Any, primary_language: str = "en") -> List[ @@ -67,7 +71,7 @@ def normalize_index_languages(value: Any, primary_language: str = "en") -> List[
67 code = (item or "").strip().lower() 71 code = (item or "").strip().lower()
68 if not code or code in seen: 72 if not code or code in seen:
69 continue 73 continue
70 - if code in SUPPORTED_INDEX_LANGUAGES: 74 + if code in SOURCE_LANG_CODE_MAP:
71 valid.append(code) 75 valid.append(code)
72 seen.add(code) 76 seen.add(code)
73 return valid 77 return valid
@@ -91,11 +95,11 @@ def resolve_index_languages( @@ -91,11 +95,11 @@ def resolve_index_languages(
91 to_en = bool(tenant_config.get("translate_to_en")) 95 to_en = bool(tenant_config.get("translate_to_en"))
92 to_zh = bool(tenant_config.get("translate_to_zh")) 96 to_zh = bool(tenant_config.get("translate_to_zh"))
93 langs: List[str] = [] 97 langs: List[str] = []
94 - if primary and primary in SUPPORTED_INDEX_LANGUAGES: 98 + if primary and primary in SOURCE_LANG_CODE_MAP:
95 langs.append(primary) 99 langs.append(primary)
96 for code in ("en", "zh"): 100 for code in ("en", "zh"):
97 if code not in langs and ((code == "en" and to_en) or (code == "zh" and to_zh)): 101 if code not in langs and ((code == "en" and to_en) or (code == "zh" and to_zh)):
98 - if code in SUPPORTED_INDEX_LANGUAGES: 102 + if code in SOURCE_LANG_CODE_MAP:
99 langs.append(code) 103 langs.append(code)
100 return langs if langs else list(default_index_languages) 104 return langs if langs else list(default_index_languages)
101 105
config/translate_prompts.py
1 -SOURCE_LANG_CODE_MAP = {  
2 - "en": "English",  
3 - "zh": "Chinese",  
4 - "zh_tw": "Traditional Chinese",  
5 - "ru": "Russian",  
6 - "ja": "Japanese",  
7 - "ko": "Korean",  
8 - "es": "Spanish",  
9 - "fr": "French",  
10 - "pt": "Portuguese",  
11 - "de": "German",  
12 - "it": "Italian",  
13 - "th": "Thai",  
14 - "vi": "Vietnamese",  
15 - "id": "Indonesian",  
16 - "ms": "Malay",  
17 - "ar": "Arabic",  
18 - "hi": "Hindi",  
19 - "he": "Hebrew",  
20 - "my": "Burmese",  
21 - "ta": "Tamil",  
22 - "ur": "Urdu",  
23 - "bn": "Bengali",  
24 - "pl": "Polish",  
25 - "nl": "Dutch",  
26 - "ro": "Romanian",  
27 - "tr": "Turkish",  
28 - "km": "Khmer",  
29 - "lo": "Lao",  
30 - "yue": "Cantonese",  
31 - "cs": "Czech",  
32 - "el": "Greek",  
33 - "sv": "Swedish",  
34 - "hu": "Hungarian",  
35 - "da": "Danish",  
36 - "fi": "Finnish",  
37 - "uk": "Ukrainian",  
38 - "bg": "Bulgarian",  
39 -}  
40 -  
41 -TARGET_LANG_CODE_MAP = {v: k for k, v in SOURCE_LANG_CODE_MAP.items()} 1 +from config.tenant_config_loader import SOURCE_LANG_CODE_MAP, TARGET_LANG_CODE_MAP
42 2
43 TRANSLATION_PROMPTS = { 3 TRANSLATION_PROMPTS = {
44 "general": { 4 "general": {
docs/DEVELOPER_GUIDE.md
@@ -92,7 +92,7 @@ MySQL (店匠 SPU/SKU) @@ -92,7 +92,7 @@ MySQL (店匠 SPU/SKU)
92 | indexer | 6004 | 索引 API(reindex/build-docs 等) | ✓ | 92 | indexer | 6004 | 索引 API(reindex/build-docs 等) | ✓ |
93 | frontend | 6003 | 调试 UI | ✓ | 93 | frontend | 6003 | 调试 UI | ✓ |
94 | embedding | 6005 | 向量服务(文本/图片) | 可选 | 94 | embedding | 6005 | 向量服务(文本/图片) | 可选 |
95 -| translator | 6006 | 翻译服务 | 可选 | 95 +| translator | 6006 | 翻译服务(`POST /translate` 支持单条或批量 list;批量失败用 `null` 占位) | 可选 |
96 | reranker | 6007 | 重排服务 | 可选 | 96 | reranker | 6007 | 重排服务 | 可选 |
97 97
98 - 启动:`./run.sh` 仅启动 backend / indexer / frontend;需全功能时通过环境变量或脚本另行启动 embedding / translator / reranker。 98 - 启动:`./run.sh` 仅启动 backend / indexer / frontend;需全功能时通过环境变量或脚本另行启动 embedding / translator / reranker。
@@ -170,6 +170,11 @@ docs/ # 文档(含本指南) @@ -170,6 +170,11 @@ docs/ # 文档(含本指南)
170 - **原则**:业务代码只依赖 Provider 接口,不依赖具体 URL 或后端类型;新增调用方式(如新 Provider 类型)在对应 `providers/<capability>.py` 中实现并在工厂中注册。 170 - **原则**:业务代码只依赖 Provider 接口,不依赖具体 URL 或后端类型;新增调用方式(如新 Provider 类型)在对应 `providers/<capability>.py` 中实现并在工厂中注册。
171 - **详见**:本指南 §7.2;[QUICKSTART.md](./QUICKSTART.md) §3。 171 - **详见**:本指南 §7.2;[QUICKSTART.md](./QUICKSTART.md) §3。
172 172
  173 +补充约定(翻译 provider):
  174 +
  175 +- `translate(text=...)` 支持 `str` 与 `List[str]` 两种输入;当输入为列表时,输出必须与输入 **等长且顺序对应**,失败位置为 `None`(HTTP JSON 表现为 `null`)。
  176 +- provider 可暴露 `supports_batch: bool`(property)用于标识其是否支持直接批量调用;上层在处理 `text` 为列表时可优先走 batch,否则逐条拆分调用。
  177 +
173 ### 4.9 suggestion 178 ### 4.9 suggestion
174 179
175 - **职责**:建议索引的构建与检索:从 ES 商品索引与 MySQL 日志等构建 suggestion 索引;搜索 API 的 `/search/suggestions` 使用本模块。 180 - **职责**:建议索引的构建与检索:从 ES 商品索引与 MySQL 日志等构建 suggestion 索引;搜索 API 的 `/search/suggestions` 使用本模块。
docs/工作总结-微服务性能优化与架构.md
@@ -101,9 +101,9 @@ instruction: "Given a shopping query, rank product titles by relevance" @@ -101,9 +101,9 @@ instruction: "Given a shopping query, rank product titles by relevance"
101 101
102 **具体内容**: 102 **具体内容**:
103 - **接口**:`POST /indexer/enrich-content`(Indexer 服务端口 **6004**)。请求体为 `items` 数组,每项含 `spu_id`、`title`(必填)及可选多语言标题等;单次请求最多 **50 条**,建议批量调用。响应 `results` 与 `items` 一一对应,每项含 `spu_id`、`qanchors`(按语言键,如 `qanchors.zh`、`qanchors.en`,逗号分隔短语)、`semantic_attributes`、`tags`。 103 - **接口**:`POST /indexer/enrich-content`(Indexer 服务端口 **6004**)。请求体为 `items` 数组,每项含 `spu_id`、`title`(必填)及可选多语言标题等;单次请求最多 **50 条**,建议批量调用。响应 `results` 与 `items` 一一对应,每项含 `spu_id`、`qanchors`(按语言键,如 `qanchors.zh`、`qanchors.en`,逗号分隔短语)、`semantic_attributes`、`tags`。
104 -- **索引侧**:微服务组合方式下,调用方先拿不含 qanchors/tags 的 doc,再调用本接口补齐后写入 ES 的 `qanchors.{lang}` 等字段;索引 transformer(`indexer/document_transformer.py`、`indexer/product_annotator.py`)内也可在构建 doc 时调用内容理解逻辑,写入 `qanchors.{lang}`。 104 +- **索引侧**:微服务组合方式下,调用方先拿不含 qanchors/tags 的 doc,再调用本接口补齐后写入 ES 的 `qanchors.{lang}` 等字段;索引 transformer(`indexer/document_transformer.py`、`indexer/product_enrich.py`)内也可在构建 doc 时调用内容理解逻辑,写入 `qanchors.{lang}`。
105 - **Suggest 侧**:`suggestion/builder.py` 从 ES 商品索引读取 `_source: ["id", "spu_id", "title", "qanchors"]`,对 `qanchors.{lang}` 用 `_split_qanchors` 拆成词条,以 `source="qanchor"` 加入候选,排序时 `qanchor` 权重大于纯 title(`add_product("qanchor", ...)`);suggest 配置中 `sources: ["query_log", "qanchor"]` 表示候选来源包含 qanchor。 105 - **Suggest 侧**:`suggestion/builder.py` 从 ES 商品索引读取 `_source: ["id", "spu_id", "title", "qanchors"]`,对 `qanchors.{lang}` 用 `_split_qanchors` 拆成词条,以 `source="qanchor"` 加入候选,排序时 `qanchor` 权重大于纯 title(`add_product("qanchor", ...)`);suggest 配置中 `sources: ["query_log", "qanchor"]` 表示候选来源包含 qanchor。
106 -- **实现与依赖**:内容理解内部使用大模型(需 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存(如 `product_anchors`);逻辑与 `indexer/product_annotator` 一致。 106 +- **实现与依赖**:内容理解内部使用大模型(需 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存(如 `product_anchors`);逻辑与 `indexer/product_enrich` 一致。
107 107
108 **状态**:内容理解字段已接入索引与 suggest 链路;依赖内容理解(qanchors/tags)的**全量数据尚未全部完成一轮**,后续需持续跑满并校验效果。 108 **状态**:内容理解字段已接入索引与 suggest 链路;依赖内容理解(qanchors/tags)的**全量数据尚未全部完成一轮**,后续需持续跑满并校验效果。
109 109
docs/搜索API对接指南.md
@@ -1484,7 +1484,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ @@ -1484,7 +1484,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \
1484 ### 5.8 内容理解字段生成接口 1484 ### 5.8 内容理解字段生成接口
1485 1485
1486 - **端点**: `POST /indexer/enrich-content` 1486 - **端点**: `POST /indexer/enrich-content`
1487 -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**semantic_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_annotator` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 1487 +- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**semantic_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_enrich` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。
1488 1488
1489 #### 请求参数 1489 #### 请求参数
1490 1490
@@ -1862,6 +1862,10 @@ curl "http://localhost:6007/health" @@ -1862,6 +1862,10 @@ curl "http://localhost:6007/health"
1862 } 1862 }
1863 ``` 1863 ```
1864 1864
  1865 +> **失败语义(批量)**:当 `text` 为列表时,如果其中某条翻译失败,对应位置返回 `null`(即 `translated_text[i] = null`),并保持数组长度与顺序不变;接口整体仍返回 `status="success"`,用于避免“部分失败”导致整批请求失败。
  1866 +
  1867 +> **实现提示(可忽略)**:服务端会尽可能使用底层翻译 provider 的批量能力(若支持),否则自动拆分逐条翻译;无论采用哪种方式,上述批量契约保持一致。
  1868 +
1865 **完整 curl 示例**: 1869 **完整 curl 示例**:
1866 1870
1867 中文 → 英文: 1871 中文 → 英文:
docs/翻译模块说明.md
@@ -24,3 +24,83 @@ TRANSLATION_MODEL=qwen # 或 deepl @@ -24,3 +24,83 @@ TRANSLATION_MODEL=qwen # 或 deepl
24 ## Provider 配置 24 ## Provider 配置
25 25
26 Provider 与 URL 在 `config/config.yaml` 的 `services.translation`。详见 [QUICKSTART.md](./QUICKSTART.md) §3 与 [DEVELOPER_GUIDE.md](./DEVELOPER_GUIDE.md) §7.2。 26 Provider 与 URL 在 `config/config.yaml` 的 `services.translation`。详见 [QUICKSTART.md](./QUICKSTART.md) §3 与 [DEVELOPER_GUIDE.md](./DEVELOPER_GUIDE.md) §7.2。
  27 +
  28 +## HTTP 接口契约(translator service,端口 6006)
  29 +
  30 +服务默认监听 `http://localhost:6006`,提供:
  31 +
  32 +- `POST /translate`: 文本翻译(支持 `qwen/qwen-mt`、`deepl`、`llm`)
  33 +- `GET /health`: 健康检查
  34 +
  35 +### `POST /translate`
  36 +
  37 +**请求体**:
  38 +
  39 +```json
  40 +{
  41 + "text": "商品名称",
  42 + "target_lang": "en",
  43 + "source_lang": "zh",
  44 + "model": "qwen",
  45 + "context": "sku_name",
  46 + "prompt": null
  47 +}
  48 +```
  49 +
  50 +- `text` 支持两种形式:
  51 + - 单条:`string`
  52 + - 批量:`string[]`(等长返回,顺序对应)
  53 +
  54 +**响应体**(单条):
  55 +
  56 +```json
  57 +{
  58 + "text": "商品名称",
  59 + "target_lang": "en",
  60 + "source_lang": "zh",
  61 + "translated_text": "Product name",
  62 + "status": "success",
  63 + "model": "qwen"
  64 +}
  65 +```
  66 +
  67 +**响应体**(批量):
  68 +
  69 +```json
  70 +{
  71 + "text": ["商品名称1", "商品名称2"],
  72 + "target_lang": "en",
  73 + "source_lang": "zh",
  74 + "translated_text": ["Product name 1", null],
  75 + "status": "success",
  76 + "model": "qwen"
  77 +}
  78 +```
  79 +
  80 +批量模式下,**单条失败用 `null` 占位**(即 `translated_text[i] = null`),保证长度与顺序一一对应,避免部分失败导致整批报错。
  81 +
  82 +---
  83 +
  84 +## 开发者接口约定(Provider / 代码调用)
  85 +
  86 +除 HTTP 微服务外,代码侧(如 query/indexer)通常通过 `providers.translation.create_translation_provider()` 获取翻译 provider 实例并调用 `translate()`。
  87 +
  88 +### 输入输出形状(Shape)
  89 +
  90 +- `translate(text=...)` 支持:
  91 + - **单条**:`text: str` → 返回 `Optional[str]`
  92 + - **批量**:`text: List[str]` → 返回 `List[Optional[str]]`
  93 +- **批量语义**:返回列表必须与输入 **等长且顺序对应**;某条翻译失败时,对应位置为 `None`(HTTP JSON 中表现为 `null`)。
  94 +
  95 +### 批量能力标识(supports_batch)
  96 +
  97 +不同 provider 对批量的实现方式可能不同(例如:真正一次请求传多条,或内部循环逐条翻译并保持 shape)。
  98 +
  99 +为便于上层(如 `api/translator_app.py`)做最优调用,provider 可暴露:
  100 +
  101 +- `supports_batch: bool`(property)
  102 +
  103 +上层在收到 `text` 为列表时:
  104 +
  105 +- **若 `supports_batch=True`**:可以直接将列表传给 `translate(text=[...])`
  106 +- **若 `supports_batch=False`**:上层会逐条拆分调用(仍保证输出列表一一对应、失败为 `null`)
indexer/document_transformer.py
@@ -14,7 +14,7 @@ import logging @@ -14,7 +14,7 @@ import logging
14 import re 14 import re
15 from typing import Dict, Any, Optional, List 15 from typing import Dict, Any, Optional, List
16 from config import ConfigLoader 16 from config import ConfigLoader
17 -from indexer.product_annotator import analyze_products, SUPPORTED_LANGS 17 +from indexer.product_enrich import analyze_products
18 18
19 logger = logging.getLogger(__name__) 19 logger = logging.getLogger(__name__)
20 20
@@ -225,9 +225,8 @@ class SPUDocumentTransformer: @@ -225,9 +225,8 @@ class SPUDocumentTransformer:
225 index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] 225 index_langs = self.tenant_config.get("index_languages") or ["en", "zh"]
226 except Exception: 226 except Exception:
227 index_langs = ["en", "zh"] 227 index_langs = ["en", "zh"]
228 - llm_langs = [lang for lang in index_langs if lang in SUPPORTED_LANGS]  
229 - if not llm_langs:  
230 - return 228 + # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用
  229 + llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序
231 230
232 # 只对有 title 的 SPU 参与 LLM;其余跳过 231 # 只对有 title 的 SPU 参与 LLM;其余跳过
233 id_to_idx: Dict[str, int] = {} 232 id_to_idx: Dict[str, int] = {}
@@ -651,7 +650,7 @@ class SPUDocumentTransformer: @@ -651,7 +650,7 @@ class SPUDocumentTransformer:
651 650
652 def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None: 651 def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None:
653 """ 652 """
654 - 调用 indexer.product_annotator.analyze_products,为当前 SPU 填充: 653 + 调用 indexer.product_enrich.analyze_products,为当前 SPU 填充:
655 - qanchors.{lang} 654 - qanchors.{lang}
656 - semantic_attributes (lang/name/value) 655 - semantic_attributes (lang/name/value)
657 """ 656 """
@@ -660,10 +659,8 @@ class SPUDocumentTransformer: @@ -660,10 +659,8 @@ class SPUDocumentTransformer:
660 except Exception: 659 except Exception:
661 index_langs = ["en", "zh"] 660 index_langs = ["en", "zh"]
662 661
663 - # 只在支持的语言集合内调用  
664 - llm_langs = [lang for lang in index_langs if lang in SUPPORTED_LANGS]  
665 - if not llm_langs:  
666 - return 662 + # 不再限制为固定 SUPPORTED_LANGS,直接按照租户配置的 index_languages 调用
  663 + llm_langs = list(dict.fromkeys(index_langs)) # 去重并保持顺序
667 664
668 spu_id = str(spu_row.get("id") or "").strip() 665 spu_id = str(spu_row.get("id") or "").strip()
669 title = str(spu_row.get("title") or "").strip() 666 title = str(spu_row.get("title") or "").strip()
indexer/product_annotator.py deleted
@@ -1,685 +0,0 @@ @@ -1,685 +0,0 @@
1 -#!/usr/bin/env python3  
2 -"""  
3 -商品品类分析脚本  
4 -批量读取商品标题,调用大模型进行品类分析,并保存结果  
5 -"""  
6 -  
7 -import csv  
8 -import os  
9 -import json  
10 -import logging  
11 -import time  
12 -import hashlib  
13 -from datetime import datetime  
14 -from typing import List, Dict, Tuple, Any, Optional  
15 -  
16 -import redis  
17 -import requests  
18 -from pathlib import Path  
19 -from requests.adapters import HTTPAdapter  
20 -from urllib3.util.retry import Retry  
21 -  
22 -from config.env_config import REDIS_CONFIG  
23 -  
24 -# 配置  
25 -BATCH_SIZE = 20  
26 -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1  
27 -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1  
28 -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1  
29 -API_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"  
30 -MODEL_NAME = "qwen-flash"  
31 -API_KEY = os.environ.get("DASHSCOPE_API_KEY")  
32 -MAX_RETRIES = 3  
33 -RETRY_DELAY = 5 # 秒  
34 -REQUEST_TIMEOUT = 180 # 秒  
35 -  
36 -# 文件路径  
37 -INPUT_FILE = "saas_170_products.csv"  
38 -OUTPUT_DIR = Path("output_logs")  
39 -OUTPUT_FILE = OUTPUT_DIR / "products_analyzed.csv"  
40 -LOG_DIR = OUTPUT_DIR / "logs"  
41 -  
42 -# 设置独立日志(不影响全局 indexer.log)  
43 -LOG_DIR.mkdir(parents=True, exist_ok=True)  
44 -timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  
45 -log_file = LOG_DIR / f"process_{timestamp}.log"  
46 -  
47 -logger = logging.getLogger("product_annotator")  
48 -logger.setLevel(logging.INFO)  
49 -  
50 -if not logger.handlers:  
51 - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")  
52 -  
53 - file_handler = logging.FileHandler(log_file, encoding="utf-8")  
54 - file_handler.setFormatter(formatter)  
55 -  
56 - stream_handler = logging.StreamHandler()  
57 - stream_handler.setFormatter(formatter)  
58 -  
59 - logger.addHandler(file_handler)  
60 - logger.addHandler(stream_handler)  
61 -  
62 - # 避免日志向根 logger 传播,防止写入 logs/indexer.log 等其他文件  
63 - logger.propagate = False  
64 -  
65 -  
66 -# Redis 缓存(用于 anchors / 语义属性)  
67 -ANCHOR_CACHE_PREFIX = REDIS_CONFIG.get("anchor_cache_prefix", "product_anchors")  
68 -ANCHOR_CACHE_EXPIRE_DAYS = int(REDIS_CONFIG.get("anchor_cache_expire_days", 30))  
69 -_anchor_redis: Optional[redis.Redis] = None  
70 -  
71 -try:  
72 - _anchor_redis = redis.Redis(  
73 - host=REDIS_CONFIG.get("host", "localhost"),  
74 - port=REDIS_CONFIG.get("port", 6479),  
75 - password=REDIS_CONFIG.get("password"),  
76 - decode_responses=True,  
77 - socket_timeout=REDIS_CONFIG.get("socket_timeout", 1),  
78 - socket_connect_timeout=REDIS_CONFIG.get("socket_connect_timeout", 1),  
79 - retry_on_timeout=REDIS_CONFIG.get("retry_on_timeout", False),  
80 - health_check_interval=10,  
81 - )  
82 - _anchor_redis.ping()  
83 - logger.info("Redis cache initialized for product anchors and semantic attributes")  
84 -except Exception as e:  
85 - logger.warning(f"Failed to initialize Redis for anchors cache: {e}")  
86 - _anchor_redis = None  
87 -  
88 -  
89 -LANG_LABELS: Dict[str, str] = {  
90 - "zh": "中文",  
91 - "en": "英文",  
92 - "de": "德文",  
93 - "ru": "俄文",  
94 - "fr": "法文",  
95 -}  
96 -  
97 -SUPPORTED_LANGS = set(LANG_LABELS.keys())  
98 -  
99 -SYSTEM_MESSAGES: Dict[str, str] = {  
100 - "zh": (  
101 - "你是一名电商平台的商品标注员,你的工作是对输入的每个商品进行理解、分析和标注,"  
102 - "并按要求格式返回 Markdown 表格。所有输出内容必须为中文。"  
103 - ),  
104 - "en": (  
105 - "You are a product annotator for an e-commerce platform. "  
106 - "For each input product, you must understand, analyze and label it, "  
107 - "and return a Markdown table strictly following the requested format. "  
108 - "All output must be in English."  
109 - ),  
110 - "de": (  
111 - "Du bist ein Produktannotator für eine E‑Commerce‑Plattform. "  
112 - "Du sollst jedes Eingabeprodukt verstehen, analysieren und beschriften "  
113 - "und eine Markdown-Tabelle im geforderten Format zurückgeben. "  
114 - "Alle Ausgaben müssen auf Deutsch sein."  
115 - ),  
116 - "ru": (  
117 - "Вы — разметчик товаров для платформы электронной коммерции. "  
118 - "Ваша задача — понимать, анализировать и размечать каждый товар "  
119 - "и возвращать таблицу Markdown в требуемом формате. "  
120 - "Весь вывод должен быть на русском языке."  
121 - ),  
122 - "fr": (  
123 - "Vous êtes annotateur de produits pour une plateforme e‑commerce. "  
124 - "Pour chaque produit en entrée, vous devez le comprendre, l’analyser et l’annoter, "  
125 - "puis renvoyer un tableau Markdown au format demandé. "  
126 - "Toute la sortie doit être en français."  
127 - ),  
128 -}  
129 -  
130 -  
131 -def _make_anchor_cache_key(  
132 - title: str,  
133 - target_lang: str,  
134 - tenant_id: Optional[str] = None,  
135 -) -> str:  
136 - """构造 anchors/语义属性的缓存 key。"""  
137 - base = (tenant_id or "global").strip()  
138 - h = hashlib.md5(title.encode("utf-8")).hexdigest()  
139 - return f"{ANCHOR_CACHE_PREFIX}:{base}:{target_lang}:{h}"  
140 -  
141 -  
142 -def _get_cached_anchor_result(  
143 - title: str,  
144 - target_lang: str,  
145 - tenant_id: Optional[str] = None,  
146 -) -> Optional[Dict[str, Any]]:  
147 - if not _anchor_redis:  
148 - return None  
149 - try:  
150 - key = _make_anchor_cache_key(title, target_lang, tenant_id)  
151 - raw = _anchor_redis.get(key)  
152 - if not raw:  
153 - return None  
154 - return json.loads(raw)  
155 - except Exception as e:  
156 - logger.warning(f"Failed to get anchor cache: {e}")  
157 - return None  
158 -  
159 -  
160 -def _set_cached_anchor_result(  
161 - title: str,  
162 - target_lang: str,  
163 - result: Dict[str, Any],  
164 - tenant_id: Optional[str] = None,  
165 -) -> None:  
166 - if not _anchor_redis:  
167 - return  
168 - try:  
169 - key = _make_anchor_cache_key(title, target_lang, tenant_id)  
170 - ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600  
171 - _anchor_redis.setex(key, ttl, json.dumps(result, ensure_ascii=False))  
172 - except Exception as e:  
173 - logger.warning(f"Failed to set anchor cache: {e}")  
174 -  
175 -  
176 -def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str:  
177 - """根据目标语言创建 LLM 提示词和表头说明。"""  
178 - if target_lang == "en":  
179 - prompt = """Please analyze each input product title and extract the following information:  
180 -  
181 -1. Product title: a natural English product name derived from the input title  
182 -2. Category path: from broad to fine-grained category, separated by ">" (e.g. Clothing>Women>Dresses>Work Dress)  
183 -3. Fine-grained tags: style / features / attributes (e.g. floral, waist-cinching, French style)  
184 -4. Target audience: gender / age group, etc. (e.g. young women)  
185 -5. Usage scene  
186 -6. Applicable season  
187 -7. Key attributes  
188 -8. Material description  
189 -9. Functional features  
190 -10. Selling point: one concise key selling sentence for recommendation  
191 -11. Anchor text: a set of words or phrases that could be used by users as search queries for this product, covering category, fine-grained tags, functional attributes, usage scenes, etc.  
192 -  
193 -Input product list:  
194 -  
195 -"""  
196 - prompt_tail = """  
197 -Please strictly return a Markdown table in the following format. For any column that can contain multiple values, separate values with commas. Do not add any other explanations:  
198 -  
199 -| No. | Product title | Category path | Fine-grained tags | Target audience | Usage scene | Season | Key attributes | Material | Features | Selling point | Anchor text |  
200 -|----|----|----|----|----|----|----|----|----|----|----|----|  
201 -"""  
202 - elif target_lang == "de":  
203 - prompt = """Bitte analysiere jeden eingegebenen Produkttitel und extrahiere die folgenden Informationen:  
204 -  
205 -1. Produkttitel: ein natürlicher deutscher Produkttitel basierend auf dem Eingangstitel  
206 -2. Kategoriepfad: von Oberkategorie bis Feinkategorie, getrennt durch ">" (z. B. Kleidung>Damen>Kleider>Businesskleid)  
207 -3. Feinkörnige Tags: Stil / Merkmale / Eigenschaften (z. B. Blumenmuster, tailliert, französischer Stil)  
208 -4. Zielgruppe: Geschlecht / Altersgruppe usw. (z. B. junge Frauen)  
209 -5. Einsatzszenario  
210 -6. Geeignete Saison  
211 -7. Wichtige Attribute  
212 -8. Materialbeschreibung  
213 -9. Funktionale Merkmale  
214 -10. Verkaufsargument: ein prägnanter, einzeiliger Haupt-Selling-Point für Empfehlungen  
215 -11. Ankertexte: eine Menge von Wörtern oder Phrasen, die Nutzer als Suchanfragen für dieses Produkt verwenden könnten und die Kategorie, feine Tags, Funktion und Nutzungsszenarien abdecken.  
216 -  
217 -Eingabeliste der Produkte:  
218 -  
219 -"""  
220 - prompt_tail = """  
221 -Gib bitte strikt eine Markdown-Tabelle im folgenden Format zurück. Mehrere Werte in einer Spalte werden durch Kommas getrennt. Füge keine weiteren Erklärungen hinzu:  
222 -  
223 -| Nr. | Produkttitel | Kategoriepfad | Feintags | Zielgruppe | Einsatzszenario | Saison | Wichtige Attribute | Material | Merkmale | Verkaufsargument | Ankertexte |  
224 -|----|----|----|----|----|----|----|----|----|----|----|----|  
225 -"""  
226 - elif target_lang == "ru":  
227 - prompt = """Пожалуйста, проанализируйте каждый входной заголовок товара и извлеките следующую информацию:  
228 -  
229 -1. Заголовок товара: естественное русскоязычное название товара на основе исходного заголовка  
230 -2. Путь категории: от широкой до узкой категории, разделённый символом ">" (например: Одежда>Женская одежда>Платья>Деловое платье)  
231 -3. Детализированные теги: стиль / особенности / характеристики (например: цветочный принт, приталенный, французский стиль)  
232 -4. Целевая аудитория: пол / возрастная группа и т. п. (например: молодые женщины)  
233 -5. Сценарий использования  
234 -6. Подходящий сезон  
235 -7. Ключевые характеристики  
236 -8. Описание материала  
237 -9. Функциональные особенности  
238 -10. Торговое преимущество: одно краткое ключевое предложение для рекомендаций  
239 -11. Якорные запросы: набор слов или фраз, которые пользователи могут использовать в качестве поисковых запросов для этого товара, покрывающих категорию, детализированные теги, функциональные характеристики, сценарии использования и т. д.  
240 -  
241 -Список входных товаров:  
242 -  
243 -"""  
244 - prompt_tail = """  
245 -Пожалуйста, строго верните Markdown‑таблицу в следующем формате. Для колонок с несколькими значениями разделяйте значения запятыми. Не добавляйте никаких дополнительных пояснений:  
246 -  
247 -| № | Заголовок товара | Путь категории | Детализированные теги | Целевая аудитория | Сценарий использования | Сезон | Ключевые характеристики | Материал | Особенности | Торговое преимущество | Якорные запросы |  
248 -|----|----|----|----|----|----|----|----|----|----|----|----|  
249 -"""  
250 - elif target_lang == "fr":  
251 - prompt = """Veuillez analyser chaque titre de produit en entrée et extraire les informations suivantes :  
252 -  
253 -1. Titre du produit : un titre de produit naturel en français basé sur le titre d’origine  
254 -2. Chemin de catégorie : de la catégorie la plus large à la plus fine, séparées par ">" (par ex. Vêtements>Femme>Robes>Robe de travail)  
255 -3. Tags détaillés : style / caractéristiques / attributs (par ex. fleuri, cintré, style français)  
256 -4. Public cible : sexe / tranche d’âge, etc. (par ex. jeunes femmes)  
257 -5. Scénario d’utilisation  
258 -6. Saison adaptée  
259 -7. Attributs clés  
260 -8. Description du matériau  
261 -9. Caractéristiques fonctionnelles  
262 -10. Argument de vente : une phrase concise résumant le principal atout pour la recommandation  
263 -11. Texte d’ancrage : un ensemble de mots ou d’expressions que les utilisateurs pourraient saisir comme requêtes de recherche pour ce produit, couvrant la catégorie, les tags détaillés, les fonctions, les scénarios d’usage, etc.  
264 -  
265 -Liste des produits en entrée :  
266 -  
267 -"""  
268 - prompt_tail = """  
269 -Veuillez strictement renvoyer un tableau Markdown au format suivant. Pour toute colonne pouvant contenir plusieurs valeurs, séparez‑les par des virgules. N’ajoutez aucune autre explication :  
270 -  
271 -| N° | Titre du produit | Chemin de catégorie | Tags détaillés | Public cible | Scénario d’utilisation | Saison | Attributs clés | Matériau | Caractéristiques | Argument de vente | Texte d’ancrage |  
272 -|----|----|----|----|----|----|----|----|----|----|----|----|  
273 -"""  
274 - else:  
275 - # 默认中文版本  
276 - prompt = """请对输入的每条商品标题,分析并提取以下信息:  
277 -  
278 -1. 商品标题:将输入商品名称翻译为自然、完整的中文商品标题  
279 -2. 品类路径:从大类到细分品类,用">"分隔(例如:服装>女装>裤子>工装裤)  
280 -3. 细分标签:商品的风格、特点、功能等(例如:碎花,收腰,法式)  
281 -4. 适用人群:性别/年龄段等(例如:年轻女性)  
282 -5. 使用场景  
283 -6. 适用季节  
284 -7. 关键属性  
285 -8. 材质说明  
286 -9. 功能特点  
287 -10. 商品卖点:分析和提取一句话核心卖点,用于推荐理由  
288 -11. 锚文本:生成一组能够代表该商品、并可能被用户用于搜索的词语或短语。这些词语应覆盖用户需求的各个维度,如品类、细分标签、功能特性、需求场景等等。  
289 -  
290 -输入商品列表:  
291 -  
292 -"""  
293 - prompt_tail = """  
294 -请严格按照以下markdown表格格式返回,每列内部的多值内容都用逗号分隔,不要添加任何其他说明:  
295 -  
296 -| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 |  
297 -|----|----|----|----|----|----|----|----|----|----|----|----|  
298 -"""  
299 -  
300 - for idx, product in enumerate(products, 1):  
301 - prompt += f'{idx}. {product["title"]}\n'  
302 - prompt += prompt_tail  
303 -  
304 - return prompt  
305 -  
306 -  
307 -def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]:  
308 - """调用大模型API(带重试机制),按目标语言选择系统提示词。"""  
309 - headers = {  
310 - "Authorization": f"Bearer {API_KEY}",  
311 - "Content-Type": "application/json"  
312 - }  
313 -  
314 - payload = {  
315 - "model": MODEL_NAME,  
316 - "messages": [  
317 - {  
318 - "role": "system",  
319 - "content": SYSTEM_MESSAGES.get(target_lang, SYSTEM_MESSAGES["zh"])  
320 - },  
321 - {  
322 - "role": "user",  
323 - "content": prompt  
324 - }  
325 - ],  
326 - "temperature": 0.3,  
327 - "top_p": 0.8  
328 - }  
329 -  
330 - request_data = {  
331 - "headers": {k: v for k, v in headers.items() if k != "Authorization"},  
332 - "payload": payload  
333 - }  
334 -  
335 - logger.info(f"\n{'='*80}")  
336 - logger.info(f"LLM Request (Model: {MODEL_NAME}):")  
337 - logger.info(json.dumps(request_data, ensure_ascii=False, indent=2))  
338 - logger.info(f"\nPrompt:\n{prompt}")  
339 -  
340 - # 创建session,禁用代理  
341 - session = requests.Session()  
342 - session.trust_env = False # 忽略系统代理设置  
343 -  
344 - try:  
345 - # 重试机制  
346 - for attempt in range(MAX_RETRIES):  
347 - try:  
348 - response = session.post(  
349 - f"{API_BASE_URL}/chat/completions",  
350 - headers=headers,  
351 - json=payload,  
352 - timeout=REQUEST_TIMEOUT,  
353 - proxies={"http": None, "https": None} # 明确禁用代理  
354 - )  
355 -  
356 - response.raise_for_status()  
357 - result = response.json()  
358 -  
359 - logger.info(f"\nLLM Response:")  
360 - logger.info(json.dumps(result, ensure_ascii=False, indent=2))  
361 -  
362 - content = result["choices"][0]["message"]["content"]  
363 - logger.info(f"\nExtracted Content:\n{content}")  
364 -  
365 - return content, json.dumps(result, ensure_ascii=False)  
366 -  
367 - except requests.exceptions.ProxyError as e:  
368 - logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Proxy error - {str(e)}")  
369 - if attempt < MAX_RETRIES - 1:  
370 - logger.info(f"Retrying in {RETRY_DELAY} seconds...")  
371 - time.sleep(RETRY_DELAY)  
372 - else:  
373 - raise  
374 -  
375 - except requests.exceptions.RequestException as e:  
376 - logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Request error - {str(e)}")  
377 - if attempt < MAX_RETRIES - 1:  
378 - logger.info(f"Retrying in {RETRY_DELAY} seconds...")  
379 - time.sleep(RETRY_DELAY)  
380 - else:  
381 - raise  
382 -  
383 - except Exception as e:  
384 - logger.error(f"Unexpected error on attempt {attempt + 1}/{MAX_RETRIES}: {str(e)}")  
385 - if attempt < MAX_RETRIES - 1:  
386 - logger.info(f"Retrying in {RETRY_DELAY} seconds...")  
387 - time.sleep(RETRY_DELAY)  
388 - else:  
389 - raise  
390 -  
391 - finally:  
392 - session.close()  
393 -  
394 -  
395 -def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:  
396 - """解析markdown表格内容"""  
397 - lines = markdown_content.strip().split('\n')  
398 - data = []  
399 - data_started = False  
400 -  
401 - for line in lines:  
402 - line = line.strip()  
403 - if not line:  
404 - continue  
405 -  
406 - # 表格行处理  
407 - if line.startswith('|'):  
408 - # 分隔行(---- 或 :---: 等;允许空格,如 "| ---- | ---- |")  
409 - sep_chars = line.replace('|', '').strip().replace(' ', '')  
410 - if sep_chars and set(sep_chars) <= {'-', ':'}:  
411 - data_started = True  
412 - continue  
413 -  
414 - # 首个表头行:无论语言如何,统一跳过  
415 - if not data_started:  
416 - # 等待下一行数据行  
417 - continue  
418 -  
419 - # 解析数据行  
420 - parts = [p.strip() for p in line.split('|')]  
421 - parts = [p for p in parts if p] # 移除空字符串  
422 -  
423 - if len(parts) >= 2:  
424 - row = {  
425 - "seq_no": parts[0],  
426 - "title": parts[1], # 商品标题(按目标语言)  
427 - "category_path": parts[2] if len(parts) > 2 else "", # 品类路径  
428 - "tags": parts[3] if len(parts) > 3 else "", # 细分标签  
429 - "target_audience": parts[4] if len(parts) > 4 else "", # 适用人群  
430 - "usage_scene": parts[5] if len(parts) > 5 else "", # 使用场景  
431 - "season": parts[6] if len(parts) > 6 else "", # 适用季节  
432 - "key_attributes": parts[7] if len(parts) > 7 else "", # 关键属性  
433 - "material": parts[8] if len(parts) > 8 else "", # 材质说明  
434 - "features": parts[9] if len(parts) > 9 else "", # 功能特点  
435 - "selling_points": parts[10] if len(parts) > 10 else "", # 商品卖点  
436 - "anchor_text": parts[11] if len(parts) > 11 else "" # 锚文本  
437 - }  
438 - data.append(row)  
439 -  
440 - return data  
441 -  
442 -  
443 -def process_batch(  
444 - batch_data: List[Dict[str, str]],  
445 - batch_num: int,  
446 - target_lang: str = "zh"  
447 -) -> List[Dict[str, str]]:  
448 - """处理一个批次的数据"""  
449 - logger.info(f"\n{'#'*80}")  
450 - logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)")  
451 -  
452 - # 创建提示词  
453 - prompt = create_prompt(batch_data, target_lang=target_lang)  
454 -  
455 - # 调用LLM  
456 - try:  
457 - raw_response, full_response_json = call_llm(prompt, target_lang=target_lang)  
458 -  
459 - # 解析结果  
460 - parsed_results = parse_markdown_table(raw_response)  
461 -  
462 - logger.info(f"\nParsed Results ({len(parsed_results)} items):")  
463 - logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2))  
464 -  
465 - # 映射回原始ID  
466 - results_with_ids = []  
467 - for i, parsed_item in enumerate(parsed_results):  
468 - if i < len(batch_data):  
469 - original_id = batch_data[i]["id"]  
470 - result = {  
471 - "id": original_id,  
472 - "lang": target_lang,  
473 - "title_input": batch_data[i]["title"], # 原始输入标题  
474 - "title": parsed_item.get("title", ""), # 模型生成的标题  
475 - "category_path": parsed_item.get("category_path", ""), # 品类路径  
476 - "tags": parsed_item.get("tags", ""), # 细分标签  
477 - "target_audience": parsed_item.get("target_audience", ""), # 适用人群  
478 - "usage_scene": parsed_item.get("usage_scene", ""), # 使用场景  
479 - "season": parsed_item.get("season", ""), # 适用季节  
480 - "key_attributes": parsed_item.get("key_attributes", ""), # 关键属性  
481 - "material": parsed_item.get("material", ""), # 材质说明  
482 - "features": parsed_item.get("features", ""), # 功能特点  
483 - "selling_points": parsed_item.get("selling_points", ""), # 商品卖点  
484 - "anchor_text": parsed_item.get("anchor_text", "") # 锚文本  
485 - }  
486 - results_with_ids.append(result)  
487 - logger.info(f"Mapped: seq={parsed_item['seq_no']} -> original_id={original_id}")  
488 -  
489 - # 保存日志  
490 - batch_log = {  
491 - "batch_num": batch_num,  
492 - "timestamp": datetime.now().isoformat(),  
493 - "input_products": batch_data,  
494 - "raw_response": raw_response,  
495 - "full_response_json": full_response_json,  
496 - "parsed_results": parsed_results,  
497 - "final_results": results_with_ids  
498 - }  
499 -  
500 - batch_log_file = LOG_DIR / f"batch_{batch_num:04d}_{timestamp}.json"  
501 - with open(batch_log_file, 'w', encoding='utf-8') as f:  
502 - json.dump(batch_log, f, ensure_ascii=False, indent=2)  
503 -  
504 - logger.info(f"Batch log saved to: {batch_log_file}")  
505 -  
506 - return results_with_ids  
507 -  
508 - except Exception as e:  
509 - logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True)  
510 - # 返回空结果,保持ID映射  
511 - return [{  
512 - "id": item["id"],  
513 - "lang": target_lang,  
514 - "title_input": item["title"],  
515 - "title": "",  
516 - "category_path": "",  
517 - "tags": "",  
518 - "target_audience": "",  
519 - "usage_scene": "",  
520 - "season": "",  
521 - "key_attributes": "",  
522 - "material": "",  
523 - "features": "",  
524 - "selling_points": "",  
525 - "anchor_text": "",  
526 - "error": str(e),  
527 - } for item in batch_data]  
528 -  
529 -  
530 -def read_products(input_file: str) -> List[Dict[str, str]]:  
531 - """读取CSV文件"""  
532 - products = []  
533 - with open(input_file, 'r', encoding='utf-8') as f:  
534 - reader = csv.DictReader(f)  
535 - for row in reader:  
536 - products.append({  
537 - "id": row["id"],  
538 - "title": row["title"]  
539 - })  
540 - return products  
541 -  
542 -  
543 -def write_results(results: List[Dict[str, str]], output_file: Path):  
544 - """写入结果到CSV文件"""  
545 - output_file.parent.mkdir(parents=True, exist_ok=True)  
546 -  
547 - fieldnames = [  
548 - "id",  
549 - "lang",  
550 - "title_input",  
551 - "title",  
552 - "category_path",  
553 - "tags",  
554 - "target_audience",  
555 - "usage_scene",  
556 - "season",  
557 - "key_attributes",  
558 - "material",  
559 - "features",  
560 - "selling_points",  
561 - "anchor_text",  
562 - ]  
563 -  
564 - with open(output_file, 'w', encoding='utf-8', newline='') as f:  
565 - writer = csv.DictWriter(f, fieldnames=fieldnames)  
566 - writer.writeheader()  
567 - writer.writerows(results)  
568 -  
569 - logger.info(f"\nResults written to: {output_file}")  
570 -  
571 -  
572 -def main():  
573 - """主函数"""  
574 - if not API_KEY:  
575 - logger.error("Error: DASHSCOPE_API_KEY environment variable not set!")  
576 - return  
577 -  
578 - logger.info(f"Starting product analysis process")  
579 - logger.info(f"Input file: {INPUT_FILE}")  
580 - logger.info(f"Output file: {OUTPUT_FILE}")  
581 - logger.info(f"Batch size: {BATCH_SIZE}")  
582 - logger.info(f"Model: {MODEL_NAME}")  
583 -  
584 - # 读取产品数据  
585 - logger.info(f"\nReading products from {INPUT_FILE}...")  
586 - products = read_products(INPUT_FILE)  
587 - logger.info(f"Total products to process: {len(products)}")  
588 -  
589 - # 分批处理  
590 - all_results = []  
591 - total_batches = (len(products) + BATCH_SIZE - 1) // BATCH_SIZE  
592 -  
593 - for i in range(0, len(products), BATCH_SIZE):  
594 - batch_num = i // BATCH_SIZE + 1  
595 - batch = products[i:i + BATCH_SIZE]  
596 -  
597 - logger.info(f"\nProgress: Batch {batch_num}/{total_batches}")  
598 -  
599 - results = process_batch(batch, batch_num, target_lang="zh")  
600 - all_results.extend(results)  
601 -  
602 - # 每处理完一个批次就写入一次(断点续传)  
603 - write_results(all_results, OUTPUT_FILE)  
604 - logger.info(f"Progress saved: {len(all_results)}/{len(products)} items completed")  
605 -  
606 - logger.info(f"\n{'='*80}")  
607 - logger.info(f"Processing completed!")  
608 - logger.info(f"Total processed: {len(all_results)} items")  
609 - logger.info(f"Output file: {OUTPUT_FILE}")  
610 - logger.info(f"Log file: {log_file}")  
611 -  
612 -  
613 -if __name__ == "__main__":  
614 - main()  
615 -  
616 -  
617 -def analyze_products(  
618 - products: List[Dict[str, str]],  
619 - target_lang: str = "zh",  
620 - batch_size: Optional[int] = None,  
621 - tenant_id: Optional[str] = None,  
622 -) -> List[Dict[str, Any]]:  
623 - """  
624 - 库调用入口:根据输入+语言,返回锚文本及各维度信息。  
625 -  
626 - Args:  
627 - products: [{"id": "...", "title": "..."}]  
628 - target_lang: 输出语言,需在 SUPPORTED_LANGS 内  
629 - batch_size: 批大小,默认使用全局 BATCH_SIZE  
630 - """  
631 - if not API_KEY:  
632 - raise RuntimeError("DASHSCOPE_API_KEY is not set, cannot call LLM")  
633 -  
634 - if target_lang not in SUPPORTED_LANGS:  
635 - raise ValueError(f"Unsupported target_lang={target_lang}, supported={sorted(SUPPORTED_LANGS)}")  
636 -  
637 - if not products:  
638 - return []  
639 -  
640 - # 简单路径:索引阶段通常 batch_size=1,这里优先做单条缓存命中  
641 - if len(products) == 1:  
642 - p = products[0]  
643 - title = str(p.get("title") or "").strip()  
644 - if title:  
645 - cached = _get_cached_anchor_result(title, target_lang, tenant_id=tenant_id)  
646 - if cached:  
647 - logger.info(  
648 - f"[analyze_products] Cache hit for title='{title[:50]}...', "  
649 - f"lang={target_lang}, tenant_id={tenant_id or 'global'}"  
650 - )  
651 - return [cached]  
652 -  
653 - # call_llm 一次处理上限固定为 BATCH_SIZE(默认 20):  
654 - # - 尽可能攒批处理;  
655 - # - 即便调用方传入更大的 batch_size,也会自动按上限拆批。  
656 - req_bs = BATCH_SIZE if batch_size is None else int(batch_size)  
657 - bs = max(1, min(req_bs, BATCH_SIZE))  
658 - all_results: List[Dict[str, Any]] = []  
659 - total_batches = (len(products) + bs - 1) // bs  
660 -  
661 - for i in range(0, len(products), bs):  
662 - batch_num = i // bs + 1  
663 - batch = products[i:i + bs]  
664 - logger.info(  
665 - f"[analyze_products] Processing batch {batch_num}/{total_batches}, "  
666 - f"size={len(batch)}, target_lang={target_lang}"  
667 - )  
668 - batch_results = process_batch(batch, batch_num=batch_num, target_lang=target_lang)  
669 - all_results.extend(batch_results)  
670 -  
671 - # 写入缓存  
672 - for item in batch_results:  
673 - title_input = str(item.get("title_input") or "").strip()  
674 - if not title_input:  
675 - continue  
676 - if item.get("error"):  
677 - # 不缓存错误结果,避免放大临时故障  
678 - continue  
679 - try:  
680 - _set_cached_anchor_result(title_input, target_lang, item, tenant_id=tenant_id)  
681 - except Exception:  
682 - # 已在内部记录 warning  
683 - pass  
684 -  
685 - return all_results  
indexer/product_enrich.py 0 → 100644
@@ -0,0 +1,516 @@ @@ -0,0 +1,516 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +商品内容理解与属性补充模块(product_enrich)
  4 +
  5 +提供基于 LLM 的商品锚文本 / 语义属性 / 标签等分析能力,
  6 +供 indexer 与 API 在内存中调用(不再负责 CSV 读写)。
  7 +"""
  8 +
  9 +import os
  10 +import json
  11 +import logging
  12 +import time
  13 +import hashlib
  14 +from datetime import datetime
  15 +from typing import List, Dict, Tuple, Any, Optional
  16 +
  17 +import redis
  18 +import requests
  19 +from pathlib import Path
  20 +
  21 +from config.env_config import REDIS_CONFIG
  22 +from config.tenant_config_loader import SOURCE_LANG_CODE_MAP
  23 +
# --- LLM / batching configuration ---
BATCH_SIZE = 20
# DashScope regional endpoints:
#   North China 2 (Beijing): https://dashscope.aliyuncs.com/compatible-mode/v1
#   Singapore:               https://dashscope-intl.aliyuncs.com/compatible-mode/v1
#   US (Virginia):           https://dashscope-us.aliyuncs.com/compatible-mode/v1
API_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
MODEL_NAME = "qwen-flash"
API_KEY = os.environ.get("DASHSCOPE_API_KEY")
MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds
REQUEST_TIMEOUT = 180  # seconds

# Log file locations
OUTPUT_DIR = Path("output_logs")
LOG_DIR = OUTPUT_DIR / "logs"

# Stand-alone logging setup (kept separate from the global indexer.log)
LOG_DIR.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = LOG_DIR / f"product_enrich_{timestamp}.log"
verbose_log_file = LOG_DIR / "product_enrich_verbose.log"

# Main logger: execution flow, batch progress, parse results, etc.
logger = logging.getLogger("product_enrich")
logger.setLevel(logging.INFO)

# Only attach handlers if the named logger does not already have some,
# so re-executing this module does not duplicate log output.
if not logger.handlers:
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    file_handler = logging.FileHandler(log_file, encoding="utf-8")
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)

    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)

    # Do not propagate to the root logger, so records never end up in
    # logs/indexer.log or other shared log files.
    logger.propagate = False

# Verbose logger: dedicated file for full LLM requests and responses.
verbose_logger = logging.getLogger("product_enrich_verbose")
verbose_logger.setLevel(logging.INFO)

if not verbose_logger.handlers:
    verbose_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    verbose_file_handler = logging.FileHandler(verbose_log_file, encoding="utf-8")
    verbose_file_handler.setFormatter(verbose_formatter)
    verbose_logger.addHandler(verbose_file_handler)
    verbose_logger.propagate = False


# Redis cache for anchors / semantic attributes (best-effort; disabled on failure).
ANCHOR_CACHE_PREFIX = REDIS_CONFIG.get("anchor_cache_prefix", "product_anchors")
ANCHOR_CACHE_EXPIRE_DAYS = int(REDIS_CONFIG.get("anchor_cache_expire_days", 30))
_anchor_redis: Optional[redis.Redis] = None

try:
    _anchor_redis = redis.Redis(
        host=REDIS_CONFIG.get("host", "localhost"),
        # NOTE(review): fallback port 6479 is unusual (Redis default is 6379) — confirm intentional.
        port=REDIS_CONFIG.get("port", 6479),
        password=REDIS_CONFIG.get("password"),
        decode_responses=True,
        socket_timeout=REDIS_CONFIG.get("socket_timeout", 1),
        socket_connect_timeout=REDIS_CONFIG.get("socket_connect_timeout", 1),
        retry_on_timeout=REDIS_CONFIG.get("retry_on_timeout", False),
        health_check_interval=10,
    )
    _anchor_redis.ping()
    logger.info("Redis cache initialized for product anchors and semantic attributes")
except Exception as e:
    # Caching is optional: when Redis is unreachable, every call goes to the LLM.
    logger.warning(f"Failed to initialize Redis for anchors cache: {e}")
    _anchor_redis = None


# Single English system prompt; per-language output is driven by create_prompt().
SYSTEM_MESSAGES = (
    "You are a product annotator for an e-commerce platform. "
    "For each input product, you must understand, analyze and label it, "
    "and return a Markdown table strictly following the requested format. "
    "All output must be in English."
)
  106 +
  107 +
def _make_anchor_cache_key(
    title: str,
    target_lang: str,
    tenant_id: Optional[str] = None,
) -> str:
    """Build the Redis key for a cached anchors / semantic-attributes result."""
    scope = (tenant_id or "global").strip()
    digest = hashlib.md5(title.encode("utf-8")).hexdigest()
    return ":".join((ANCHOR_CACHE_PREFIX, scope, target_lang, digest))
  117 +
  118 +
def _get_cached_anchor_result(
    title: str,
    target_lang: str,
    tenant_id: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Look up a cached analysis result; return None on miss, no Redis, or any error."""
    if not _anchor_redis:
        return None
    try:
        cached = _anchor_redis.get(_make_anchor_cache_key(title, target_lang, tenant_id))
        # Falsy payload (missing key / empty string) counts as a cache miss.
        return json.loads(cached) if cached else None
    except Exception as e:
        logger.warning(f"Failed to get anchor cache: {e}")
        return None
  135 +
  136 +
def _set_cached_anchor_result(
    title: str,
    target_lang: str,
    result: Dict[str, Any],
    tenant_id: Optional[str] = None,
) -> None:
    """Cache one analysis result with a TTL; errors are logged, never raised."""
    if not _anchor_redis:
        return
    try:
        _anchor_redis.setex(
            _make_anchor_cache_key(title, target_lang, tenant_id),
            ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600,  # TTL in seconds
            json.dumps(result, ensure_ascii=False),
        )
    except Exception as e:
        logger.warning(f"Failed to set anchor cache: {e}")
  151 +
  152 +
def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str:
    """Build the LLM prompt (instructions + product list + table spec) for one batch.

    Conventions:
    - The instructions themselves are always written in English.
    - target_lang == "en": first round — analyze and output English headers/content.
    - any other target_lang: treated as the follow-up round of a multi-round chat;
      the model is told the English table already exists and must translate the
      whole table (headers and every cell) into the target language, keeping
      structure and meaning unchanged.
    """
    lang_name = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang)

    pieces = ["""Please analyze each input product title and extract the following information:

1. Product title: a natural English product name derived from the input title
2. Category path: from broad to fine-grained category, separated by ">" (e.g. Clothing>Women>Dresses>Work Dress)
3. Fine-grained tags: style / features / attributes (e.g. floral, waist-cinching, French style)
4. Target audience: gender / age group, etc. (e.g. young women)
5. Usage scene
6. Applicable season
7. Key attributes
8. Material description
9. Functional features
10. Selling point: one concise key selling sentence for recommendation
11. Anchor text: a set of words or phrases that could be used by users as search queries for this product, covering category, fine-grained tags, functional attributes, usage scenes, etc.

Input product list:

"""]

    # Numbered product list, 1-based, one title per line.
    pieces.extend(f'{seq}. {item["title"]}\n' for seq, item in enumerate(products, 1))

    if target_lang == "en":
        # First (English) round: English table header + English content.
        pieces.append("""
Please strictly return a Markdown table in the following format. For any column that can contain multiple values, separate values with commas. Do not add any other explanations:

| No. | Product title | Category path | Fine-grained tags | Target audience | Usage scene | Season | Key attributes | Material | Features | Selling point | Anchor text |
|----|----|----|----|----|----|----|----|----|----|----|----|
""")
    else:
        # Translation round: same table, fully rendered in the target language.
        pieces.append(f"""
Now we will output the same table in {lang_name}.

IMPORTANT:
- Assume you have already generated the full table in English in a previous round.
- In this round, you must output exactly the same table structure and content,
  but fully translated into {lang_name}, including ALL column headers and ALL cell values.
- Do NOT change the meaning, fields, or the number/order of rows and columns.
- Keep valid Markdown table syntax.

Please return ONLY the Markdown table in {lang_name}, without any extra explanations.
""")

    return "".join(pieces)
  211 +
  212 +
def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]:
    """Call the LLM chat-completions API with a retry loop.

    Args:
        prompt: Fully rendered user prompt (see create_prompt).
        target_lang: Kept for signature compatibility; the system message is
            always the single English SYSTEM_MESSAGES now, so this no longer
            changes the request payload.

    Returns:
        Tuple of (message content of the first choice, raw response serialized
        as a JSON string).

    Raises:
        Re-raises the last exception from the HTTP call / response handling
        after MAX_RETRIES failed attempts.
    """

    def _log_exchange(message: str) -> None:
        # Full request/response details are mirrored to both the main log and
        # the dedicated verbose log file (previously duplicated inline).
        logger.info(message)
        verbose_logger.info(message)

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "system",
                "content": SYSTEM_MESSAGES,
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
        "temperature": 0.3,
        "top_p": 0.8,
    }

    # Never log the Authorization header.
    request_data = {
        "headers": {k: v for k, v in headers.items() if k != "Authorization"},
        "payload": payload,
    }

    _log_exchange(f"\n{'=' * 80}")
    _log_exchange(f"LLM Request (Model: {MODEL_NAME}):")
    _log_exchange(json.dumps(request_data, ensure_ascii=False, indent=2))
    _log_exchange(f"\nPrompt:\n{prompt}")

    # Dedicated session that ignores system proxy settings.
    session = requests.Session()
    session.trust_env = False

    try:
        for attempt in range(MAX_RETRIES):
            try:
                response = session.post(
                    f"{API_BASE_URL}/chat/completions",
                    headers=headers,
                    json=payload,
                    timeout=REQUEST_TIMEOUT,
                    proxies={"http": None, "https": None},  # explicitly disable proxies
                )

                response.raise_for_status()
                result = response.json()

                _log_exchange("\nLLM Response:")
                _log_exchange(json.dumps(result, ensure_ascii=False, indent=2))

                content = result["choices"][0]["message"]["content"]
                _log_exchange(f"\nExtracted Content:\n{content}")

                return content, json.dumps(result, ensure_ascii=False)

            except requests.exceptions.RequestException as e:
                # ProxyError is a RequestException subclass; only the log label
                # differed between the two original branches.
                label = (
                    "Proxy error"
                    if isinstance(e, requests.exceptions.ProxyError)
                    else "Request error"
                )
                logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: {label} - {str(e)}")
                if attempt >= MAX_RETRIES - 1:
                    raise
                logger.info(f"Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)

            except Exception as e:
                # Non-HTTP failures (e.g. unexpected response shape) are also retried.
                logger.error(f"Unexpected error on attempt {attempt + 1}/{MAX_RETRIES}: {str(e)}")
                if attempt >= MAX_RETRIES - 1:
                    raise
                logger.info(f"Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)

    finally:
        session.close()
  310 +
  311 +
def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:
    """Parse the Markdown table returned by the LLM into a list of row dicts.

    Lines before the header separator (``|----|...``) are skipped regardless of
    language; each later ``|``-row is mapped onto the fixed 12-column schema,
    with missing trailing columns defaulting to "".

    Bug fix: the previous version dropped ALL empty cells, so a blank interior
    cell shifted every later column one position left. Now only the empty
    boundary cells produced by the leading/trailing ``|`` are removed and
    interior empties are kept, so columns stay aligned.
    """
    # Fixed column schema, in table order.
    field_names = [
        "seq_no",           # 序号
        "title",            # product title (in the target language)
        "category_path",    # category path
        "tags",             # fine-grained tags
        "target_audience",  # target audience
        "usage_scene",      # usage scene
        "season",           # applicable season
        "key_attributes",   # key attributes
        "material",         # material description
        "features",         # functional features
        "selling_points",   # selling point
        "anchor_text",      # anchor text
    ]

    data: List[Dict[str, str]] = []
    data_started = False

    for line in markdown_content.strip().split("\n"):
        line = line.strip()
        if not line or not line.startswith("|"):
            continue

        # Separator row (---- or :---: etc.; spaces allowed, e.g. "| ---- | ---- |")
        sep_chars = line.replace("|", "").strip().replace(" ", "")
        if sep_chars and set(sep_chars) <= {"-", ":"}:
            data_started = True
            continue

        # Header row(s) before the separator: always skipped, in any language.
        if not data_started:
            continue

        # Split into cells; trim only the empty boundary cells that the
        # leading/trailing '|' produce, keeping interior empties aligned.
        cells = [c.strip() for c in line.split("|")]
        if cells and cells[0] == "":
            cells = cells[1:]
        if cells and cells[-1] == "":
            cells = cells[:-1]

        if len(cells) >= 2 and any(cells):
            data.append({
                name: (cells[i] if i < len(cells) else "")
                for i, name in enumerate(field_names)
            })

    return data
  358 +
  359 +
def process_batch(
    batch_data: List[Dict[str, str]],
    batch_num: int,
    target_lang: str = "zh",
) -> List[Dict[str, str]]:
    """Run one LLM batch: build the prompt, call the model, parse and map results.

    Args:
        batch_data: products for this batch, each with "id" and "title" keys.
        batch_num: 1-based batch index, used for logging and the log filename.
        target_lang: language the generated table fields should be written in.

    Returns:
        One result dict per product. On any failure the entire batch degrades
        to placeholder dicts (empty fields plus an "error" message) so the
        id mapping is preserved for callers.
    """
    logger.info(f"\n{'#' * 80}")
    logger.info(f"Processing Batch {batch_num} ({len(batch_data)} items)")

    # Build the prompt for this batch.
    prompt = create_prompt(batch_data, target_lang=target_lang)

    # Call the LLM and parse its markdown-table answer.
    try:
        raw_response, full_response_json = call_llm(prompt, target_lang=target_lang)

        # Parse the markdown table returned by the model.
        parsed_results = parse_markdown_table(raw_response)

        logger.info(f"\nParsed Results ({len(parsed_results)} items):")
        logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2))

        # Map parsed rows back to the original product ids.
        # NOTE(review): mapping is positional (row i -> batch_data[i]), not by
        # the row's seq_no; this assumes the model preserves input order —
        # confirm. If the model returns fewer rows than inputs, the trailing
        # products simply get no result entry.
        results_with_ids = []
        for i, parsed_item in enumerate(parsed_results):
            if i < len(batch_data):
                original_id = batch_data[i]["id"]
                result = {
                    "id": original_id,
                    "lang": target_lang,
                    "title_input": batch_data[i]["title"],  # original input title
                    "title": parsed_item.get("title", ""),  # model-generated title
                    "category_path": parsed_item.get("category_path", ""),
                    "tags": parsed_item.get("tags", ""),
                    "target_audience": parsed_item.get("target_audience", ""),
                    "usage_scene": parsed_item.get("usage_scene", ""),
                    "season": parsed_item.get("season", ""),
                    "key_attributes": parsed_item.get("key_attributes", ""),
                    "material": parsed_item.get("material", ""),
                    "features": parsed_item.get("features", ""),
                    "selling_points": parsed_item.get("selling_points", ""),
                    "anchor_text": parsed_item.get("anchor_text", ""),
                }
                results_with_ids.append(result)
                logger.info(f"Mapped: seq={parsed_item['seq_no']} -> original_id={original_id}")

        # Persist a per-batch JSON log (inputs, raw/parsed/final outputs) to a
        # standalone file for offline debugging.
        batch_log = {
            "batch_num": batch_num,
            "timestamp": datetime.now().isoformat(),
            "input_products": batch_data,
            "raw_response": raw_response,
            "full_response_json": full_response_json,
            "parsed_results": parsed_results,
            "final_results": results_with_ids,
        }

        # `timestamp` is presumably a module-level run identifier set at import
        # time — TODO confirm; it is not defined inside this function.
        batch_log_file = LOG_DIR / f"batch_{batch_num:04d}_{timestamp}.json"
        with open(batch_log_file, "w", encoding="utf-8") as f:
            json.dump(batch_log, f, ensure_ascii=False, indent=2)

        logger.info(f"Batch log saved to: {batch_log_file}")

        return results_with_ids

    except Exception as e:
        logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True)
        # Return empty placeholder results so the id mapping is preserved.
        return [
            {
                "id": item["id"],
                "lang": target_lang,
                "title_input": item["title"],
                "title": "",
                "category_path": "",
                "tags": "",
                "target_audience": "",
                "usage_scene": "",
                "season": "",
                "key_attributes": "",
                "material": "",
                "features": "",
                "selling_points": "",
                "anchor_text": "",
                "error": str(e),
            }
            for item in batch_data
        ]
  448 +
  449 +
def analyze_products(
    products: List[Dict[str, str]],
    target_lang: str = "zh",
    batch_size: Optional[int] = None,
    tenant_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Library entry point: enrich products with anchor text and per-dimension info.

    Args:
        products: [{"id": "...", "title": "..."}]
        target_lang: output language code
        batch_size: requested batch size; capped at the global BATCH_SIZE
        tenant_id: optional tenant scope for the anchor-result cache

    Returns:
        One result dict per product (input order preserved). Failed batches
        yield placeholder dicts carrying an "error" field.

    Raises:
        RuntimeError: if DASHSCOPE_API_KEY is not configured.
    """
    if not API_KEY:
        raise RuntimeError("DASHSCOPE_API_KEY is not set, cannot call LLM")

    if not products:
        return []

    # Fast path: the indexing stage usually calls with batch_size=1, so try a
    # per-title cache hit before spending an LLM call.
    if len(products) == 1:
        p = products[0]
        title = str(p.get("title") or "").strip()
        if title:
            cached = _get_cached_anchor_result(title, target_lang, tenant_id=tenant_id)
            if cached:
                logger.info(
                    f"[analyze_products] Cache hit for title='{title[:50]}...', "
                    f"lang={target_lang}, tenant_id={tenant_id or 'global'}"
                )
                # The cache is keyed by (title, lang, tenant) only, so the
                # stored payload carries the id of whichever product first
                # populated it. Re-stamp the requesting product's id (and
                # lang, defensively) instead of returning a stale id.
                hit = dict(cached)
                hit["id"] = p.get("id")
                hit["lang"] = target_lang
                return [hit]

    # call_llm handles at most BATCH_SIZE (default 20) items per request:
    # - batch as much as possible;
    # - even if the caller asks for more, split down to the cap automatically.
    req_bs = BATCH_SIZE if batch_size is None else int(batch_size)
    bs = max(1, min(req_bs, BATCH_SIZE))
    all_results: List[Dict[str, Any]] = []
    total_batches = (len(products) + bs - 1) // bs

    for i in range(0, len(products), bs):
        batch_num = i // bs + 1
        batch = products[i : i + bs]
        logger.info(
            f"[analyze_products] Processing batch {batch_num}/{total_batches}, "
            f"size={len(batch)}, target_lang={target_lang}"
        )
        batch_results = process_batch(batch, batch_num=batch_num, target_lang=target_lang)
        all_results.extend(batch_results)

        # Populate the cache with successful results only.
        for item in batch_results:
            title_input = str(item.get("title_input") or "").strip()
            if not title_input:
                continue
            if item.get("error"):
                # Do not cache error results, to avoid amplifying transient failures.
                continue
            try:
                _set_cached_anchor_result(title_input, target_lang, item, tenant_id=tenant_id)
            except Exception:
                # The helper already logs a warning internally.
                pass

    return all_results
  516 +
providers/translation.py
@@ -2,7 +2,7 @@ @@ -2,7 +2,7 @@
2 from __future__ import annotations 2 from __future__ import annotations
3 3
4 import logging 4 import logging
5 -from typing import Any, Dict, Optional 5 +from typing import Any, Dict, List, Optional, Sequence, Union
6 import requests 6 import requests
7 7
8 from config.services_config import get_translation_config, get_translation_base_url 8 from config.services_config import get_translation_config, get_translation_base_url
@@ -23,16 +23,49 @@ class HttpTranslationProvider: @@ -23,16 +23,49 @@ class HttpTranslationProvider:
23 self.model = model or "qwen" 23 self.model = model or "qwen"
24 self.timeout_sec = float(timeout_sec or 10.0) 24 self.timeout_sec = float(timeout_sec or 10.0)
25 25
    @property
    def supports_batch(self) -> bool:
        """
        Whether this provider supports list input natively.

        In the current implementation `_translate_once` already handles list
        input itself, so this provider can be reported as batch-capable.
        """
        return True
  35 +
26 def _translate_once( 36 def _translate_once(
27 self, 37 self,
28 - text: str, 38 + text: Union[str, Sequence[str]],
29 target_lang: str, 39 target_lang: str,
30 source_lang: Optional[str] = None, 40 source_lang: Optional[str] = None,
31 context: Optional[str] = None, 41 context: Optional[str] = None,
32 prompt: Optional[str] = None, 42 prompt: Optional[str] = None,
33 - ) -> Optional[str]: 43 + ) -> Union[Optional[str], List[Optional[str]]]:
  44 + # 允许 text 为单个字符串或字符串列表
  45 + if isinstance(text, (list, tuple)):
  46 + # 上游约定:列表输入时,输出列表一一对应;失败位置为 None
  47 + results: List[Optional[str]] = []
  48 + for item in text:
  49 + if item is None or not str(item).strip():
  50 + # 空字符串/None 不视为失败,原样返回以保持语义
  51 + results.append(item) # type: ignore[arg-type]
  52 + continue
  53 + try:
  54 + single = self._translate_once(
  55 + text=str(item),
  56 + target_lang=target_lang,
  57 + source_lang=source_lang,
  58 + context=context,
  59 + prompt=prompt,
  60 + )
  61 + results.append(single) # type: ignore[arg-type]
  62 + except Exception:
  63 + # 理论上不会进入,因为内部已捕获;兜底保持长度一致
  64 + results.append(None)
  65 + return results
  66 +
34 if not text or not str(text).strip(): 67 if not text or not str(text).strip():
35 - return text 68 + return text # type: ignore[return-value]
36 try: 69 try:
37 url = f"{self.base_url}/translate" 70 url = f"{self.base_url}/translate"
38 payload = { 71 payload = {
@@ -62,12 +95,12 @@ class HttpTranslationProvider: @@ -62,12 +95,12 @@ class HttpTranslationProvider:
62 95
63 def translate( 96 def translate(
64 self, 97 self,
65 - text: str, 98 + text: Union[str, Sequence[str]],
66 target_lang: str, 99 target_lang: str,
67 source_lang: Optional[str] = None, 100 source_lang: Optional[str] = None,
68 context: Optional[str] = None, 101 context: Optional[str] = None,
69 prompt: Optional[str] = None, 102 prompt: Optional[str] = None,
70 - ) -> Optional[str]: 103 + ) -> Union[Optional[str], List[Optional[str]]]:
71 return self._translate_once( 104 return self._translate_once(
72 text=text, 105 text=text,
73 target_lang=target_lang, 106 target_lang=target_lang,
query/deepl_provider.py
@@ -10,7 +10,7 @@ from __future__ import annotations @@ -10,7 +10,7 @@ from __future__ import annotations
10 import logging 10 import logging
11 import os 11 import os
12 import re 12 import re
13 -from typing import Dict, Optional, Tuple 13 +from typing import Dict, List, Optional, Sequence, Tuple, Union
14 14
15 import requests 15 import requests
16 from config.services_config import get_translation_config 16 from config.services_config import get_translation_config
@@ -88,6 +88,14 @@ class DeepLProvider: @@ -88,6 +88,14 @@ class DeepLProvider:
88 if not self.api_key: 88 if not self.api_key:
89 logger.warning("DEEPL_AUTH_KEY not set; DeepL translation is unavailable") 89 logger.warning("DEEPL_AUTH_KEY not set; DeepL translation is unavailable")
90 90
    @property
    def supports_batch(self) -> bool:
        """
        The DeepL HTTP API itself accepts multiple `text` values per request,
        but for now this returns False so the upper layer splits lists item by
        item; switch to real batching here later if needed.
        """
        return False
  98 +
91 def _resolve_request_context( 99 def _resolve_request_context(
92 self, 100 self,
93 target_lang: str, 101 target_lang: str,
@@ -108,12 +116,28 @@ class DeepLProvider: @@ -108,12 +116,28 @@ class DeepLProvider:
108 116
109 def translate( 117 def translate(
110 self, 118 self,
111 - text: str, 119 + text: Union[str, Sequence[str]],
112 target_lang: str, 120 target_lang: str,
113 source_lang: Optional[str] = None, 121 source_lang: Optional[str] = None,
114 context: Optional[str] = None, 122 context: Optional[str] = None,
115 prompt: Optional[str] = None, 123 prompt: Optional[str] = None,
116 - ) -> Optional[str]: 124 + ) -> Union[Optional[str], List[Optional[str]]]:
  125 + if isinstance(text, (list, tuple)):
  126 + results: List[Optional[str]] = []
  127 + for item in text:
  128 + if item is None or not str(item).strip():
  129 + results.append(item) # type: ignore[arg-type]
  130 + continue
  131 + out = self.translate(
  132 + text=str(item),
  133 + target_lang=target_lang,
  134 + source_lang=source_lang,
  135 + context=context,
  136 + prompt=prompt,
  137 + )
  138 + results.append(out)
  139 + return results
  140 +
117 if not self.api_key: 141 if not self.api_key:
118 return None 142 return None
119 143
query/llm_translate.py
@@ -11,13 +11,14 @@ from __future__ import annotations @@ -11,13 +11,14 @@ from __future__ import annotations
11 import logging 11 import logging
12 import os 12 import os
13 import time 13 import time
14 -from typing import Optional 14 +from typing import List, Optional, Sequence, Union
15 15
16 from openai import OpenAI 16 from openai import OpenAI
17 17
18 from config.env_config import DASHSCOPE_API_KEY 18 from config.env_config import DASHSCOPE_API_KEY
19 from config.services_config import get_translation_config 19 from config.services_config import get_translation_config
20 -from config.translate_prompts import TRANSLATION_PROMPTS, SOURCE_LANG_CODE_MAP 20 +from config.translate_prompts import TRANSLATION_PROMPTS
  21 +from config.tenant_config_loader import SOURCE_LANG_CODE_MAP, TARGET_LANG_CODE_MAP
21 22
22 23
23 logger = logging.getLogger(__name__) 24 logger = logging.getLogger(__name__)
@@ -96,6 +97,12 @@ class LLMTranslatorProvider: @@ -96,6 +97,12 @@ class LLMTranslatorProvider:
96 ) 97 )
97 self.client = self._create_client() 98 self.client = self._create_client()
98 99
    @property
    def supports_batch(self) -> bool:
        """Whether this provider efficiently supports list input."""
        # translate() natively accepts a list, so report True here.
        return True
  105 +
99 def _create_client(self) -> Optional[OpenAI]: 106 def _create_client(self) -> Optional[OpenAI]:
100 api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") 107 api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
101 if not api_key: 108 if not api_key:
@@ -107,7 +114,7 @@ class LLMTranslatorProvider: @@ -107,7 +114,7 @@ class LLMTranslatorProvider:
107 logger.error("Failed to initialize llm translation client: %s", exc, exc_info=True) 114 logger.error("Failed to initialize llm translation client: %s", exc, exc_info=True)
108 return None 115 return None
109 116
110 - def translate( 117 + def _translate_single(
111 self, 118 self,
112 text: str, 119 text: str,
113 target_lang: str, 120 target_lang: str,
@@ -148,7 +155,14 @@ class LLMTranslatorProvider: @@ -148,7 +155,14 @@ class LLMTranslatorProvider:
148 if not content: 155 if not content:
149 logger.warning("[llm] Empty result | src=%s tgt=%s latency=%.1fms", src, tgt, latency_ms) 156 logger.warning("[llm] Empty result | src=%s tgt=%s latency=%.1fms", src, tgt, latency_ms)
150 return None 157 return None
151 - logger.info("[llm] Success | src=%s tgt=%s src_text=%s response=%s latency=%.1fms", src, tgt, text, content, latency_ms) 158 + logger.info(
  159 + "[llm] Success | src=%s tgt=%s src_text=%s response=%s latency=%.1fms",
  160 + src,
  161 + tgt,
  162 + text,
  163 + content,
  164 + latency_ms,
  165 + )
152 return content 166 return content
153 except Exception as exc: 167 except Exception as exc:
154 latency_ms = (time.time() - start) * 1000 168 latency_ms = (time.time() - start) * 1000
@@ -162,16 +176,56 @@ class LLMTranslatorProvider: @@ -162,16 +176,56 @@ class LLMTranslatorProvider:
162 ) 176 )
163 return None 177 return None
164 178
  179 + def translate(
  180 + self,
  181 + text: Union[str, Sequence[str]],
  182 + target_lang: str,
  183 + source_lang: Optional[str] = None,
  184 + context: Optional[str] = None,
  185 + prompt: Optional[str] = None,
  186 + ) -> Union[Optional[str], List[Optional[str]]]:
  187 + """
  188 + Translate a single string or a list of strings.
  189 +
  190 + - If input is a list, returns a list of the same length.
  191 + - Per-item failures are returned as None.
  192 + """
  193 + if isinstance(text, (list, tuple)):
  194 + results: List[Optional[str]] = []
  195 + for item in text:
  196 + # 保证一一对应,即使某个元素为空也占位
  197 + if item is None:
  198 + results.append(None)
  199 + continue
  200 + results.append(
  201 + self._translate_single(
  202 + text=str(item),
  203 + target_lang=target_lang,
  204 + source_lang=source_lang,
  205 + context=context,
  206 + prompt=prompt,
  207 + )
  208 + )
  209 + return results
  210 +
  211 + return self._translate_single(
  212 + text=str(text),
  213 + target_lang=target_lang,
  214 + source_lang=source_lang,
  215 + context=context,
  216 + prompt=prompt,
  217 + )
  218 +
165 219
166 def llm_translate( 220 def llm_translate(
167 - text: str, 221 + text: Union[str, Sequence[str]],
168 target_lang: str, 222 target_lang: str,
169 *, 223 *,
170 source_lang: Optional[str] = None, 224 source_lang: Optional[str] = None,
171 source_lang_label: Optional[str] = None, 225 source_lang_label: Optional[str] = None,
172 target_lang_label: Optional[str] = None, 226 target_lang_label: Optional[str] = None,
173 timeout_sec: Optional[float] = None, 227 timeout_sec: Optional[float] = None,
174 -) -> Optional[str]: 228 +) -> Union[Optional[str], List[Optional[str]]]:
175 provider = LLMTranslatorProvider(timeout_sec=timeout_sec or 30.0) 229 provider = LLMTranslatorProvider(timeout_sec=timeout_sec or 30.0)
176 return provider.translate( 230 return provider.translate(
177 text=text, 231 text=text,
query/qwen_mt_translate.py
@@ -7,14 +7,14 @@ import logging @@ -7,14 +7,14 @@ import logging
7 import os 7 import os
8 import re 8 import re
9 import time 9 import time
10 -from typing import Dict, List, Optional 10 +from typing import Dict, List, Optional, Sequence, Union
11 11
12 import redis 12 import redis
13 from openai import OpenAI 13 from openai import OpenAI
14 14
15 from config.env_config import DASHSCOPE_API_KEY, REDIS_CONFIG 15 from config.env_config import DASHSCOPE_API_KEY, REDIS_CONFIG
16 from config.services_config import get_translation_cache_config 16 from config.services_config import get_translation_cache_config
17 -from config.translate_prompts import SOURCE_LANG_CODE_MAP 17 +from config.tenant_config_loader import SOURCE_LANG_CODE_MAP, TARGET_LANG_CODE_MAP
18 18
19 logger = logging.getLogger(__name__) 19 logger = logging.getLogger(__name__)
20 20
@@ -62,6 +62,16 @@ class Translator: @@ -62,6 +62,16 @@ class Translator:
62 if self.use_cache and bool(cache_cfg.get("enabled", True)): 62 if self.use_cache and bool(cache_cfg.get("enabled", True)):
63 self.redis_client = self._init_redis_client() 63 self.redis_client = self._init_redis_client()
64 64
    @property
    def supports_batch(self) -> bool:
        """
        Marks this provider as accepting list input.

        The current implementation loops over single-item calls (with
        caching) rather than issuing a truly parallel batch request, but
        callers may pass a list directly and will get a list back.
        """
        return True
65 @staticmethod 75 @staticmethod
66 def _normalize_model(model: str) -> str: 76 def _normalize_model(model: str) -> str:
67 m = (model or "qwen").strip().lower() 77 m = (model or "qwen").strip().lower()
@@ -117,14 +127,31 @@ class Translator: @@ -117,14 +127,31 @@ class Translator:
117 127
118 def translate( 128 def translate(
119 self, 129 self,
120 - text: str, 130 + text: Union[str, Sequence[str]],
121 target_lang: str, 131 target_lang: str,
122 source_lang: Optional[str] = None, 132 source_lang: Optional[str] = None,
123 context: Optional[str] = None, 133 context: Optional[str] = None,
124 prompt: Optional[str] = None, 134 prompt: Optional[str] = None,
125 - ) -> Optional[str]:  
126 - if not text or not text.strip():  
127 - return text 135 + ) -> Union[Optional[str], List[Optional[str]]]:
  136 + if isinstance(text, (list, tuple)):
  137 + results: List[Optional[str]] = []
  138 + for item in text:
  139 + if item is None or not str(item).strip():
  140 + results.append(item) # type: ignore[arg-type]
  141 + continue
  142 + # 对于 batch,这里沿用单条的缓存与规则,逐条调用
  143 + out = self.translate(
  144 + text=str(item),
  145 + target_lang=target_lang,
  146 + source_lang=source_lang,
  147 + context=context,
  148 + prompt=prompt,
  149 + )
  150 + results.append(out)
  151 + return results
  152 +
  153 + if not text or not str(text).strip():
  154 + return text # type: ignore[return-value]
128 155
129 tgt = (target_lang or "").strip().lower() 156 tgt = (target_lang or "").strip().lower()
130 src = (source_lang or "").strip().lower() or None 157 src = (source_lang or "").strip().lower() or None
tests/ci/test_service_api_contracts.py
@@ -342,7 +342,7 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient): @@ -342,7 +342,7 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient):
342 342
343 343
344 def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch): 344 def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch):
345 - import indexer.product_annotator as process_products 345 + import indexer.product_enrich as process_products
346 346
347 def _fake_analyze_products( 347 def _fake_analyze_products(
348 products: List[Dict[str, str]], 348 products: List[Dict[str, str]],
tests/test_process_products_batching.py
@@ -2,7 +2,7 @@ from __future__ import annotations @@ -2,7 +2,7 @@ from __future__ import annotations
2 2
3 from typing import Any, Dict, List 3 from typing import Any, Dict, List
4 4
5 -import indexer.product_annotator as process_products 5 +import indexer.product_enrich as process_products
6 6
7 7
8 def _mk_products(n: int) -> List[Dict[str, str]]: 8 def _mk_products(n: int) -> List[Dict[str, str]]: