Commit d4cadc13bd22491045c3580a54d0aaa1d4f625e6

Authored by tangwang
1 parent a0a173ae

翻译重构

api/routes/search.py
@@ -472,7 +472,6 @@ async def get_es_raw_document(spu_id: str, http_request: Request):
472 index_name = get_tenant_index_name(tenant_id) 472 index_name = get_tenant_index_name(tenant_id)
473 473
474 body = { 474 body = {
475 - "size": 5,  
476 "query": { 475 "query": {
477 "bool": { 476 "bool": {
478 "filter": [ 477 "filter": [
api/translator_app.py
@@ -98,7 +98,9 @@ from pydantic import BaseModel, Field
98 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 98 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
99 99
100 from query.qwen_mt_translate import Translator 100 from query.qwen_mt_translate import Translator
101 -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG 101 +from query.llm_translate import LLMTranslatorProvider
  102 +from query.deepl_provider import DeepLProvider
  103 +from config.services_config import get_translation_config
102 104
103 # Configure logging 105 # Configure logging
104 logging.basicConfig( 106 logging.basicConfig(
@@ -107,23 +109,52 @@ logging.basicConfig(
107 ) 109 )
108 logger = logging.getLogger(__name__) 110 logger = logging.getLogger(__name__)
109 111
110 -# Fixed translation prompt  
111 -TRANSLATION_PROMPT = "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language."  
112 -  
113 # Global translator instances cache (keyed by model) 112 # Global translator instances cache (keyed by model)
114 -_translators: Dict[str, Translator] = {} 113 +_translators: Dict[str, object] = {}
  114 +
115 115
  116 +def _resolve_default_model() -> str:
  117 + """
  118 + Resolve translator model from services.translation config first.
116 119
117 -def get_translator(model: str = "qwen") -> Translator: 120 + Priority:
  121 + 1) TRANSLATION_MODEL env (explicit runtime override)
  122 + 2) services.translation.provider + providers.<provider>.model
  123 + 3) qwen-mt
  124 + """
  125 + env_model = (os.getenv("TRANSLATION_MODEL") or "").strip()
  126 + if env_model:
  127 + return env_model
  128 + try:
  129 + cfg = get_translation_config()
  130 + provider = (cfg.provider or "").strip().lower()
  131 + provider_cfg = cfg.get_provider_cfg() if hasattr(cfg, "get_provider_cfg") else {}
  132 + model = (provider_cfg.get("model") or "").strip().lower() if isinstance(provider_cfg, dict) else ""
  133 + if provider == "llm":
  134 + return "llm"
  135 + if provider in {"qwen-mt", "direct", "http"}:
  136 + return model or "qwen-mt"
  137 + if provider == "deepl":
  138 + return "deepl"
  139 + except Exception:
  140 + pass
  141 + return "qwen-mt"
  142 +
  143 +
  144 +def get_translator(model: str = "qwen") -> object:
118 """Get or create translator instance for the specified model.""" 145 """Get or create translator instance for the specified model."""
119 global _translators 146 global _translators
120 if model not in _translators: 147 if model not in _translators:
121 logger.info(f"Initializing translator with model: {model}...") 148 logger.info(f"Initializing translator with model: {model}...")
122 - _translators[model] = Translator(  
123 - model=model,  
124 - use_cache=True,  
125 - timeout=10  
126 - ) 149 + normalized = (model or "qwen").strip().lower()
  150 + if normalized in {"qwen", "qwen-mt", "qwen-mt-flash", "qwen-mt-flush"}:
  151 + _translators[model] = Translator(model=normalized, use_cache=True, timeout=10)
  152 + elif normalized == "deepl":
  153 + _translators[model] = DeepLProvider(api_key=None, timeout=10.0)
  154 + elif normalized == "llm":
  155 + _translators[model] = LLMTranslatorProvider()
  156 + else:
  157 + raise ValueError(f"Unsupported model: {model}")
127 logger.info(f"Translator initialized with model: {model}") 158 logger.info(f"Translator initialized with model: {model}")
128 return _translators[model] 159 return _translators[model]
129 160
@@ -134,7 +165,9 @@ class TranslationRequest(BaseModel):
134 text: str = Field(..., description="Text to translate") 165 text: str = Field(..., description="Text to translate")
135 target_lang: str = Field(..., description="Target language code (zh, en, ru, etc.)") 166 target_lang: str = Field(..., description="Target language code (zh, en, ru, etc.)")
136 source_lang: Optional[str] = Field(None, description="Source language code (optional, auto-detect if not provided)") 167 source_lang: Optional[str] = Field(None, description="Source language code (optional, auto-detect if not provided)")
137 - model: Optional[str] = Field("qwen", description="Translation model: 'qwen' (default) or 'deepl'") 168 + model: Optional[str] = Field(None, description="Translation model: qwen-mt | deepl | llm")
  169 + context: Optional[str] = Field(None, description="Optional translation scene or context")
  170 + prompt: Optional[str] = Field(None, description="Optional prompt override")
138 171
139 class Config: 172 class Config:
140 json_schema_extra = { 173 json_schema_extra = {
@@ -142,7 +175,8 @@ class TranslationRequest(BaseModel):
142 "text": "商品名称", 175 "text": "商品名称",
143 "target_lang": "en", 176 "target_lang": "en",
144 "source_lang": "zh", 177 "source_lang": "zh",
145 - "model": "qwen" 178 + "model": "llm",
  179 + "context": "sku_name"
146 } 180 }
147 } 181 }
148 182
@@ -180,8 +214,7 @@ app.add_middleware(
180 async def startup_event(): 214 async def startup_event():
181 """Initialize translator on startup.""" 215 """Initialize translator on startup."""
182 logger.info("Starting Translation Service API on port 6006") 216 logger.info("Starting Translation Service API on port 6006")
183 - # Get default model from environment variable or use 'qwen'  
184 - default_model = os.getenv("TRANSLATION_MODEL", "qwen") 217 + default_model = _resolve_default_model()
185 try: 218 try:
186 get_translator(model=default_model) 219 get_translator(model=default_model)
187 logger.info(f"Translation service ready with default model: {default_model}") 220 logger.info(f"Translation service ready with default model: {default_model}")
@@ -194,15 +227,17 @@ async def startup_event():
194 async def health_check(): 227 async def health_check():
195 """Health check endpoint.""" 228 """Health check endpoint."""
196 try: 229 try:
197 - default_model = os.getenv("TRANSLATION_MODEL", "qwen")  
198 - translator = get_translator(model=default_model) 230 + # 仅做轻量级本地检查,避免在健康检查中触发潜在的阻塞初始化或外部依赖
  231 + default_model = _resolve_default_model()
  232 + # 如果启动事件成功,默认模型通常会已经初始化到缓存中
  233 + translator = _translators.get(default_model) or next(iter(_translators.values()), None)
199 return { 234 return {
200 "status": "healthy", 235 "status": "healthy",
201 "service": "translation", 236 "service": "translation",
202 "default_model": default_model, 237 "default_model": default_model,
203 "available_models": list(_translators.keys()), 238 "available_models": list(_translators.keys()),
204 "translator_initialized": translator is not None, 239 "translator_initialized": translator is not None,
205 - "cache_enabled": translator.use_cache if translator else False 240 + "cache_enabled": bool(getattr(translator, "use_cache", False))
206 } 241 }
207 except Exception as e: 242 except Exception as e:
208 logger.error(f"Health check failed: {e}") 243 logger.error(f"Health check failed: {e}")
@@ -238,11 +273,11 @@ async def translate(request: TranslationRequest):
238 ) 273 )
239 274
240 # Validate model parameter 275 # Validate model parameter
241 - model = request.model.lower() if request.model else "qwen"  
242 - if model not in ['qwen', 'deepl']: 276 + model = request.model.lower() if request.model else _resolve_default_model().lower()
  277 + if model not in ["qwen", "qwen-mt", "deepl", "llm"]:
243 raise HTTPException( 278 raise HTTPException(
244 status_code=400, 279 status_code=400,
245 - detail=f"Invalid model: {model}. Supported models: 'qwen', 'deepl'" 280 + detail="Invalid model. Supported models: 'qwen-mt', 'deepl', 'llm'"
246 ) 281 )
247 282
248 try: 283 try:
@@ -254,7 +289,8 @@ async def translate(request: TranslationRequest):
254 text=request.text, 289 text=request.text,
255 target_lang=request.target_lang, 290 target_lang=request.target_lang,
256 source_lang=request.source_lang, 291 source_lang=request.source_lang,
257 - prompt=TRANSLATION_PROMPT 292 + context=request.context,
  293 + prompt=request.prompt,
258 ) 294 )
259 295
260 if translated_text is None: 296 if translated_text is None:
@@ -269,7 +305,7 @@ async def translate(request: TranslationRequest):
269 source_lang=request.source_lang, 305 source_lang=request.source_lang,
270 translated_text=translated_text, 306 translated_text=translated_text,
271 status="success", 307 status="success",
272 - model=translator.model 308 + model=str(getattr(translator, "model", model))
273 ) 309 )
274 310
275 except HTTPException: 311 except HTTPException:
config/__init__.py
@@ -28,6 +28,7 @@ from .services_config import (
28 get_translation_base_url, 28 get_translation_base_url,
29 get_embedding_base_url, 29 get_embedding_base_url,
30 get_rerank_service_url, 30 get_rerank_service_url,
  31 + get_translation_cache_config,
31 ServiceConfig, 32 ServiceConfig,
32 ) 33 )
33 34
@@ -53,5 +54,6 @@ __all__ = [
53 'get_translation_base_url', 54 'get_translation_base_url',
54 'get_embedding_base_url', 55 'get_embedding_base_url',
55 'get_rerank_service_url', 56 'get_rerank_service_url',
  57 + 'get_translation_cache_config',
56 'ServiceConfig', 58 'ServiceConfig',
57 ] 59 ]
config/config.yaml
@@ -81,18 +81,6 @@ query_config:
81 translation_service: "deepl" 81 translation_service: "deepl"
82 translation_api_key: null # 通过环境变量设置 82 translation_api_key: null # 通过环境变量设置
83 83
84 - # 翻译提示词配置(用于提高翻译质量,作为DeepL API的context参数)  
85 - translation_prompts:  
86 - # 商品标题翻译提示词  
87 - product_title_zh: "请将原文翻译成中文商品SKU名称,要求:确保精确、完整地传达原文信息的基础上,语言简洁清晰、地道、专业。"  
88 - product_title_en: "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language."  
89 - # query翻译提示词  
90 - query_zh: "电商领域"  
91 - query_en: "e-commerce domain"  
92 - # 默认翻译用词  
93 - default_zh: "电商领域"  
94 - default_en: "e-commerce domain"  
95 -  
96 # 返回字段配置(_source includes) 84 # 返回字段配置(_source includes)
97 # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 85 # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段
98 source_fields: null 86 source_fields: null
@@ -119,16 +107,24 @@ rerank:
119 # 可扩展服务/provider 注册表(单一配置源) 107 # 可扩展服务/provider 注册表(单一配置源)
120 services: 108 services:
121 translation: 109 translation:
122 - provider: "llm" # direct | http | google(reserved) 110 + provider: "llm" # qwen-mt | deepl | http | llm
123 base_url: "http://127.0.0.1:6006" 111 base_url: "http://127.0.0.1:6006"
124 - model: "qwen" 112 + model: "qwen-flash"
125 timeout_sec: 10.0 113 timeout_sec: 10.0
  114 + cache:
  115 + enabled: true
  116 + key_prefix: "trans:v2"
  117 + ttl_seconds: 62208000
  118 + sliding_expiration: true
  119 + key_include_context: true
  120 + key_include_prompt: true
  121 + key_include_source_lang: true
126 providers: 122 providers:
127 - direct:  
128 - model: "qwen" 123 + qwen-mt:
  124 + model: "qwen-mt-flush"
129 http: 125 http:
130 base_url: "http://127.0.0.1:6006" 126 base_url: "http://127.0.0.1:6006"
131 - model: "qwen" 127 + model: "qwen-mt-flush"
132 timeout_sec: 10.0 128 timeout_sec: 10.0
133 llm: 129 llm:
134 model: "qwen-flash" 130 model: "qwen-flash"
@@ -136,6 +132,11 @@ services:
136 # base_url 留空则使用 DASHSCOPE_BASE_URL 或默认地域 132 # base_url 留空则使用 DASHSCOPE_BASE_URL 或默认地域
137 base_url: "" 133 base_url: ""
138 timeout_sec: 30.0 134 timeout_sec: 30.0
  135 + deepl:
  136 + model: "deepl"
  137 + timeout_sec: 10.0
  138 + # 可选:用于术语表翻译(由 query_config.translation_glossary_id 衔接)
  139 + glossary_id: ""
139 google: 140 google:
140 enabled: false 141 enabled: false
141 project_id: "" 142 project_id: ""
config/config_loader.py
@@ -42,7 +42,6 @@ class QueryConfig:
42 translation_api_key: Optional[str] = None 42 translation_api_key: Optional[str] = None
43 translation_glossary_id: Optional[str] = None 43 translation_glossary_id: Optional[str] = None
44 translation_context: str = "e-commerce product search" 44 translation_context: str = "e-commerce product search"
45 - translation_prompts: Dict[str, str] = field(default_factory=dict)  
46 45
47 # Embedding field names 46 # Embedding field names
48 text_embedding_field: Optional[str] = "title_embedding" 47 text_embedding_field: Optional[str] = "title_embedding"
@@ -250,7 +249,6 @@ class ConfigLoader:
250 translation_service=query_config_data.get("translation_service") or "deepl", 249 translation_service=query_config_data.get("translation_service") or "deepl",
251 translation_glossary_id=query_config_data.get("translation_glossary_id"), 250 translation_glossary_id=query_config_data.get("translation_glossary_id"),
252 translation_context=query_config_data.get("translation_context") or "e-commerce product search", 251 translation_context=query_config_data.get("translation_context") or "e-commerce product search",
253 - translation_prompts=query_config_data.get("translation_prompts", {}),  
254 text_embedding_field=query_config_data.get("text_embedding_field"), 252 text_embedding_field=query_config_data.get("text_embedding_field"),
255 image_embedding_field=query_config_data.get("image_embedding_field"), 253 image_embedding_field=query_config_data.get("image_embedding_field"),
256 source_fields=query_config_data.get("source_fields"), 254 source_fields=query_config_data.get("source_fields"),
config/services_config.py
@@ -72,12 +72,12 @@ def _resolve_translation() -> ServiceConfig:
72 config_provider=cfg.get("provider"), 72 config_provider=cfg.get("provider"),
73 capability="translation", 73 capability="translation",
74 ) 74 )
75 - if provider not in ("direct", "local", "inprocess", "http", "service"): 75 + if provider not in ("qwen-mt", "deepl", "direct", "local", "inprocess", "http", "service", "llm"):
76 raise ValueError(f"Unsupported translation provider: {provider}") 76 raise ValueError(f"Unsupported translation provider: {provider}")
77 77
78 # Env override for http base_url 78 # Env override for http base_url
79 env_url = os.getenv("TRANSLATION_SERVICE_URL") 79 env_url = os.getenv("TRANSLATION_SERVICE_URL")
80 - if env_url and provider == "http": 80 + if env_url and provider in ("http", "service"):
81 providers = dict(providers) 81 providers = dict(providers)
82 providers["http"] = dict(providers.get("http", {})) 82 providers["http"] = dict(providers.get("http", {}))
83 providers["http"]["base_url"] = env_url.rstrip("/") 83 providers["http"]["base_url"] = env_url.rstrip("/")
@@ -206,6 +206,27 @@ def get_translation_base_url() -> str:
206 return str(base).rstrip("/") 206 return str(base).rstrip("/")
207 207
208 208
  209 +def get_translation_cache_config() -> Dict[str, Any]:
  210 + """
  211 + Resolve translation cache policy from services.translation.cache.
  212 +
  213 + All translation cache key/TTL behavior should be configured in config.yaml,
  214 + not hardcoded in code.
  215 + """
  216 + raw = _load_services_raw()
  217 + cfg = raw.get("translation", {}) if isinstance(raw.get("translation"), dict) else {}
  218 + cache_cfg = cfg.get("cache", {}) if isinstance(cfg.get("cache"), dict) else {}
  219 + return {
  220 + "enabled": bool(cache_cfg.get("enabled", True)),
  221 + "key_prefix": str(cache_cfg.get("key_prefix", "trans:v2")),
  222 + "ttl_seconds": int(cache_cfg.get("ttl_seconds", 360 * 24 * 3600)),
  223 + "sliding_expiration": bool(cache_cfg.get("sliding_expiration", True)),
  224 + "key_include_context": bool(cache_cfg.get("key_include_context", True)),
  225 + "key_include_prompt": bool(cache_cfg.get("key_include_prompt", True)),
  226 + "key_include_source_lang": bool(cache_cfg.get("key_include_source_lang", True)),
  227 + }
  228 +
  229 +
209 def get_embedding_base_url() -> str: 230 def get_embedding_base_url() -> str:
210 """Resolve embedding HTTP base URL.""" 231 """Resolve embedding HTTP base URL."""
211 base = ( 232 base = (
config/translate_prompts.py 0 → 100644
@@ -0,0 +1,82 @@
  1 +SOURCE_LANG_CODE_MAP = {
  2 + "en": "English",
  3 + "zh": "Chinese",
  4 + "zh_tw": "Traditional Chinese",
  5 + "ru": "Russian",
  6 + "ja": "Japanese",
  7 + "ko": "Korean",
  8 + "es": "Spanish",
  9 + "fr": "French",
  10 + "pt": "Portuguese",
  11 + "de": "German",
  12 + "it": "Italian",
  13 + "th": "Thai",
  14 + "vi": "Vietnamese",
  15 + "id": "Indonesian",
  16 + "ms": "Malay",
  17 + "ar": "Arabic",
  18 + "hi": "Hindi",
  19 + "he": "Hebrew",
  20 + "my": "Burmese",
  21 + "ta": "Tamil",
  22 + "ur": "Urdu",
  23 + "bn": "Bengali",
  24 + "pl": "Polish",
  25 + "nl": "Dutch",
  26 + "ro": "Romanian",
  27 + "tr": "Turkish",
  28 + "km": "Khmer",
  29 + "lo": "Lao",
  30 + "yue": "Cantonese",
  31 + "cs": "Czech",
  32 + "el": "Greek",
  33 + "sv": "Swedish",
  34 + "hu": "Hungarian",
  35 + "da": "Danish",
  36 + "fi": "Finnish",
  37 + "uk": "Ukrainian",
  38 + "bg": "Bulgarian",
  39 +}
  40 +
  41 +TARGET_LANG_CODE_MAP = {v: k for k, v in SOURCE_LANG_CODE_MAP.items()}
  42 +
  43 +TRANSLATION_PROMPTS = {
  44 + "general": {
  45 + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译专家,请准确传达原文含义并符合{target_lang}语言习惯,只输出翻译结果:{text}",
  46 + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Accurately convey the meaning following {target_lang} grammar and usage, output only the translation: {text}",
  47 + "ru": "Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Точно передайте смысл текста, соблюдая нормы {target_lang}, выводите только перевод: {text}",
  48 + "ar": "أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). انقل المعنى بدقة وفق قواعد {target_lang} وأخرج الترجمة فقط: {text}",
  49 + "ja": "あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロ翻訳者です。意味を正確に伝え、{target_lang}の表現に従い、翻訳のみ出力してください:{text}",
  50 + "es": "Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Transmite con precisión el significado y devuelve solo la traducción: {text}",
  51 + "de": "Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Gib die Bedeutung korrekt wieder und gib nur die Übersetzung aus: {text}",
  52 + "fr": "Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Transmettez fidèlement le sens et produisez uniquement la traduction : {text}",
  53 + "it": "Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Trasmetti accuratamente il significato e restituisci solo la traduzione: {text}",
  54 + "pt": "Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Transmita o significado com precisão e produza apenas a tradução: {text}"
  55 + },
  56 +
  57 + "sku_name": {
  58 + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})电商翻译专家,请将原文翻译为{target_lang}商品SKU名称,要求准确完整、简洁专业,只输出结果:{text}",
  59 + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) ecommerce translator. Translate into a concise and accurate {target_lang} product SKU name, output only the result: {text}",
  60 + "ru": "Вы переводчик e-commerce с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Переведите в краткое и точное название SKU товара на {target_lang}, выводите только результат: {text}",
  61 + "ar": "أنت مترجم تجارة إلكترونية من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). ترجم إلى اسم SKU للمنتج بلغة {target_lang} بدقة واختصار، وأخرج النتيجة فقط: {text}",
  62 + "ja": "{source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのEC翻訳者として、簡潔で正確な{target_lang}の商品SKU名に翻訳し、結果のみ出力してください:{text}",
  63 + "es": "Eres un traductor ecommerce de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce a un nombre SKU de producto en {target_lang}, preciso y conciso, devuelve solo el resultado: {text}",
  64 + "de": "Du bist ein E-Commerce-Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Übersetze in einen präzisen und kurzen {target_lang} Produkt-SKU-Namen, nur Ergebnis ausgeben: {text}",
  65 + "fr": "Vous êtes un traducteur e-commerce de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Traduisez en un nom SKU produit {target_lang} précis et concis, sortie uniquement : {text}",
  66 + "it": "Sei un traduttore ecommerce da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce in un nome SKU prodotto {target_lang} conciso e accurato, restituisci solo il risultato: {text}",
  67 + "pt": "Você é um tradutor de e-commerce de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Traduza para um nome SKU de produto {target_lang} conciso e preciso, produza apenas o resultado: {text}"
  68 + },
  69 +
  70 + "ecommerce_search_query": {
  71 + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译助手,请将电商搜索词准确翻译为{target_lang}并符合搜索习惯,只输出结果:{text}",
  72 + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Translate the ecommerce search query accurately following {target_lang} search habits, output only the result: {text}",
  73 + "ru": "Вы переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Переведите поисковый запрос e-commerce с учётом привычек поиска, выводите только результат: {text}",
  74 + "ar": "أنت مترجم من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). ترجم عبارة البحث للتجارة الإلكترونية بما يناسب عادات البحث وأخرج النتيجة فقط: {text}",
  75 + "ja": "{source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})への翻訳者として、EC検索キーワードを{target_lang}の検索習慣に合わせて翻訳し、結果のみ出力してください:{text}",
  76 + "es": "Eres un traductor de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce la consulta de búsqueda ecommerce según los hábitos de búsqueda y devuelve solo el resultado: {text}",
  77 + "de": "Du bist ein Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Übersetze die E-Commerce-Suchanfrage entsprechend den Suchgewohnheiten, nur Ergebnis ausgeben: {text}",
  78 + "fr": "Vous êtes un traducteur de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Traduisez la requête de recherche e-commerce selon les habitudes de recherche, sortie uniquement : {text}",
  79 + "it": "Sei un traduttore da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce la query di ricerca ecommerce secondo le abitudini di ricerca e restituisci solo il risultato: {text}",
  80 + "pt": "Você é um tradutor de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Traduza a consulta de busca de ecommerce conforme os hábitos de busca e produza apenas o resultado: {text}"
  81 + }
  82 +}
docs/搜索API对接指南.md
@@ -1814,7 +1814,8 @@ curl "http://localhost:6007/health"
1814 "text": "商品名称", 1814 "text": "商品名称",
1815 "target_lang": "en", 1815 "target_lang": "en",
1816 "source_lang": "zh", 1816 "source_lang": "zh",
1817 - "model": "qwen" 1817 + "model": "qwen",
  1818 + "context": "sku_name"
1818 } 1819 }
1819 ``` 1820 ```
1820 1821
@@ -1823,7 +1824,8 @@ curl "http://localhost:6007/health"
1823 | `text` | string | Y | 待翻译文本 | 1824 | `text` | string | Y | 待翻译文本 |
1824 | `target_lang` | string | Y | 目标语言:`zh`、`en`、`ru` 等 | 1825 | `target_lang` | string | Y | 目标语言:`zh`、`en`、`ru` 等 |
1825 | `source_lang` | string | N | 源语言,不传则自动检测 | 1826 | `source_lang` | string | N | 源语言,不传则自动检测 |
1826 -| `model` | string | N | `qwen`(默认)或 `deepl` | 1827 +| `model` | string | N | `qwen`(默认)、`deepl` 或 `llm` |
  1828 +| `context` | string | N | 翻译场景参数:商品标题翻译使用 `sku_name`,搜索请求中的 query 翻译使用 `ecommerce_search_query`,其它通用场景可不传或使用 `general` |
1827 1829
1828 **响应**: 1830 **响应**:
1829 ```json 1831 ```json
indexer/document_transformer.py
@@ -36,7 +36,6 @@ class SPUDocumentTransformer:
36 searchable_option_dimensions: List[str], 36 searchable_option_dimensions: List[str],
37 tenant_config: Optional[Dict[str, Any]] = None, 37 tenant_config: Optional[Dict[str, Any]] = None,
38 translator: Optional[Any] = None, 38 translator: Optional[Any] = None,
39 - translation_prompts: Optional[Dict[str, str]] = None,  
40 encoder: Optional[Any] = None, 39 encoder: Optional[Any] = None,
41 enable_title_embedding: bool = True, 40 enable_title_embedding: bool = True,
42 image_encoder: Optional[Any] = None, 41 image_encoder: Optional[Any] = None,
@@ -50,7 +49,6 @@ class SPUDocumentTransformer:
50 searchable_option_dimensions: 可搜索的option维度列表 49 searchable_option_dimensions: 可搜索的option维度列表
51 tenant_config: 租户配置(包含主语言和翻译配置) 50 tenant_config: 租户配置(包含主语言和翻译配置)
52 translator: 翻译器实例(可选,如果提供则启用翻译功能) 51 translator: 翻译器实例(可选,如果提供则启用翻译功能)
53 - translation_prompts: 翻译提示词配置(可选)  
54 encoder: 文本编码器实例(可选,用于生成title_embedding) 52 encoder: 文本编码器实例(可选,用于生成title_embedding)
55 enable_title_embedding: 是否启用标题向量化(默认True) 53 enable_title_embedding: 是否启用标题向量化(默认True)
56 image_encoder: 图片编码器实例(可选,需实现 encode_image_urls(urls) -> List[Optional[np.ndarray]]) 54 image_encoder: 图片编码器实例(可选,需实现 encode_image_urls(urls) -> List[Optional[np.ndarray]])
@@ -60,12 +58,33 @@ class SPUDocumentTransformer:
60 self.searchable_option_dimensions = searchable_option_dimensions 58 self.searchable_option_dimensions = searchable_option_dimensions
61 self.tenant_config = tenant_config or {} 59 self.tenant_config = tenant_config or {}
62 self.translator = translator 60 self.translator = translator
63 - self.translation_prompts = translation_prompts or {}  
64 self.encoder = encoder 61 self.encoder = encoder
65 self.enable_title_embedding = enable_title_embedding 62 self.enable_title_embedding = enable_title_embedding
66 self.image_encoder = image_encoder 63 self.image_encoder = image_encoder
67 self.enable_image_embedding = bool(enable_image_embedding and image_encoder is not None) 64 self.enable_image_embedding = bool(enable_image_embedding and image_encoder is not None)
68 65
  66 + def _translate_index_languages(
  67 + self,
  68 + text: str,
  69 + source_lang: str,
  70 + index_languages: List[str],
  71 + scene: str,
  72 + ) -> Dict[str, Optional[str]]:
  73 + translations: Dict[str, Optional[str]] = {}
  74 + if not self.translator or not text or not str(text).strip():
  75 + return translations
  76 + for lang in index_languages:
  77 + if lang == source_lang:
  78 + translations[lang] = text
  79 + continue
  80 + translations[lang] = self.translator.translate(
  81 + text=text,
  82 + target_lang=lang,
  83 + source_lang=source_lang,
  84 + context=scene,
  85 + )
  86 + return translations
  87 +
69 def transform_spu_to_doc( 88 def transform_spu_to_doc(
70 self, 89 self,
71 tenant_id: str, 90 tenant_id: str,
@@ -322,15 +341,12 @@ class SPUDocumentTransformer:
322 title_text = str(spu_row['title']) 341 title_text = str(spu_row['title'])
323 translations: Dict[str, Optional[str]] = {} 342 translations: Dict[str, Optional[str]] = {}
324 if self.translator: 343 if self.translator:
325 - prompt_zh = self.translation_prompts.get('product_title_zh') or self.translation_prompts.get('default_zh')  
326 - prompt_en = self.translation_prompts.get('product_title_en') or self.translation_prompts.get('default_en')  
327 - translations = self.translator.translate_for_indexing(  
328 - title_text,  
329 - shop_language=primary_lang, 344 + translations = self._translate_index_languages(
  345 + text=title_text,
330 source_lang=primary_lang, 346 source_lang=primary_lang,
331 - prompt=prompt_zh if primary_lang == 'zh' else prompt_en,  
332 index_languages=index_langs, 347 index_languages=index_langs,
333 - ) or {} 348 + scene="product_title",
  349 + )
334 _set_lang_obj("title", title_text, translations) 350 _set_lang_obj("title", title_text, translations)
335 351
336 # Brief 352 # Brief
@@ -338,14 +354,12 @@ class SPUDocumentTransformer:
338 brief_text = str(spu_row['brief']) 354 brief_text = str(spu_row['brief'])
339 translations = {} 355 translations = {}
340 if self.translator: 356 if self.translator:
341 - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')  
342 - translations = self.translator.translate_for_indexing(  
343 - brief_text,  
344 - shop_language=primary_lang, 357 + translations = self._translate_index_languages(
  358 + text=brief_text,
345 source_lang=primary_lang, 359 source_lang=primary_lang,
346 - prompt=prompt,  
347 index_languages=index_langs, 360 index_languages=index_langs,
348 - ) or {} 361 + scene="default",
  362 + )
349 _set_lang_obj("brief", brief_text, translations) 363 _set_lang_obj("brief", brief_text, translations)
350 364
351 # Description 365 # Description
@@ -353,14 +367,12 @@ class SPUDocumentTransformer:
353 desc_text = str(spu_row['description']) 367 desc_text = str(spu_row['description'])
354 translations = {} 368 translations = {}
355 if self.translator: 369 if self.translator:
356 - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')  
357 - translations = self.translator.translate_for_indexing(  
358 - desc_text,  
359 - shop_language=primary_lang, 370 + translations = self._translate_index_languages(
  371 + text=desc_text,
360 source_lang=primary_lang, 372 source_lang=primary_lang,
361 - prompt=prompt,  
362 index_languages=index_langs, 373 index_languages=index_langs,
363 - ) or {} 374 + scene="default",
  375 + )
364 _set_lang_obj("description", desc_text, translations) 376 _set_lang_obj("description", desc_text, translations)
365 377
366 # Vendor 378 # Vendor
@@ -368,14 +380,12 @@ class SPUDocumentTransformer: @@ -368,14 +380,12 @@ class SPUDocumentTransformer:
368 vendor_text = str(spu_row['vendor']) 380 vendor_text = str(spu_row['vendor'])
369 translations = {} 381 translations = {}
370 if self.translator: 382 if self.translator:
371 - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')  
372 - translations = self.translator.translate_for_indexing(  
373 - vendor_text,  
374 - shop_language=primary_lang, 383 + translations = self._translate_index_languages(
  384 + text=vendor_text,
375 source_lang=primary_lang, 385 source_lang=primary_lang,
376 - prompt=prompt,  
377 index_languages=index_langs, 386 index_languages=index_langs,
378 - ) or {} 387 + scene="default",
  388 + )
379 _set_lang_obj("vendor", vendor_text, translations) 389 _set_lang_obj("vendor", vendor_text, translations)
380 390
381 def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series): 391 def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series):
indexer/incremental_service.py
@@ -39,7 +39,6 @@ class IncrementalIndexerService: @@ -39,7 +39,6 @@ class IncrementalIndexerService:
39 self._config: Optional[Any] = None 39 self._config: Optional[Any] = None
40 self._config_lock = threading.Lock() 40 self._config_lock = threading.Lock()
41 self._translator: Optional[Any] = None 41 self._translator: Optional[Any] = None
42 - self._translation_prompts: Optional[Dict[str, Any]] = None  
43 self._searchable_option_dimensions: Optional[List[str]] = None 42 self._searchable_option_dimensions: Optional[List[str]] = None
44 self._shared_text_encoder: Optional[Any] = None 43 self._shared_text_encoder: Optional[Any] = None
45 self._shared_image_encoder: Optional[Any] = None 44 self._shared_image_encoder: Optional[Any] = None
@@ -52,7 +51,6 @@ class IncrementalIndexerService: @@ -52,7 +51,6 @@ class IncrementalIndexerService:
52 def _eager_init(self) -> None: 51 def _eager_init(self) -> None:
53 """Strict eager initialization. Any dependency failure should fail fast.""" 52 """Strict eager initialization. Any dependency failure should fail fast."""
54 self._config = ConfigLoader("config/config.yaml").load_config() 53 self._config = ConfigLoader("config/config.yaml").load_config()
55 - self._translation_prompts = getattr(self._config.query_config, "translation_prompts", {}) or {}  
56 self._searchable_option_dimensions = ( 54 self._searchable_option_dimensions = (
57 getattr(self._config.spu_config, "searchable_option_dimensions", None) 55 getattr(self._config.spu_config, "searchable_option_dimensions", None)
58 or ["option1", "option2", "option3"] 56 or ["option1", "option2", "option3"]
@@ -110,7 +108,6 @@ class IncrementalIndexerService: @@ -110,7 +108,6 @@ class IncrementalIndexerService:
110 tenant_id=tenant_id, 108 tenant_id=tenant_id,
111 searchable_option_dimensions=self._searchable_option_dimensions, 109 searchable_option_dimensions=self._searchable_option_dimensions,
112 translator=self._translator, 110 translator=self._translator,
113 - translation_prompts=self._translation_prompts,  
114 encoder=encoder, 111 encoder=encoder,
115 enable_title_embedding=False, # batch fill later 112 enable_title_embedding=False, # batch fill later
116 image_encoder=image_encoder, 113 image_encoder=image_encoder,
indexer/indexing_utils.py
@@ -57,7 +57,6 @@ def create_document_transformer( @@ -57,7 +57,6 @@ def create_document_transformer(
57 tenant_id: str, 57 tenant_id: str,
58 searchable_option_dimensions: Optional[list] = None, 58 searchable_option_dimensions: Optional[list] = None,
59 translator: Optional[Any] = None, 59 translator: Optional[Any] = None,
60 - translation_prompts: Optional[Dict[str, str]] = None,  
61 encoder: Optional[Any] = None, 60 encoder: Optional[Any] = None,
62 enable_title_embedding: bool = True, 61 enable_title_embedding: bool = True,
63 image_encoder: Optional[Any] = None, 62 image_encoder: Optional[Any] = None,
@@ -72,7 +71,6 @@ def create_document_transformer( @@ -72,7 +71,6 @@ def create_document_transformer(
72 tenant_id: 租户ID 71 tenant_id: 租户ID
73 searchable_option_dimensions: 可搜索的option维度列表(如果为None则从配置加载) 72 searchable_option_dimensions: 可搜索的option维度列表(如果为None则从配置加载)
74 translator: 翻译器实例(如果为None则根据配置初始化) 73 translator: 翻译器实例(如果为None则根据配置初始化)
75 - translation_prompts: 翻译提示词配置(如果为None则从配置加载)  
76 encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化) 74 encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化)
77 enable_title_embedding: 是否启用标题向量化(默认True) 75 enable_title_embedding: 是否启用标题向量化(默认True)
78 image_encoder: 图片编码器(可选,需实现 encode_image_urls(urls)) 76 image_encoder: 图片编码器(可选,需实现 encode_image_urls(urls))
@@ -89,7 +87,6 @@ def create_document_transformer( @@ -89,7 +87,6 @@ def create_document_transformer(
89 if ( 87 if (
90 searchable_option_dimensions is None 88 searchable_option_dimensions is None
91 or translator is None 89 or translator is None
92 - or translation_prompts is None  
93 or (encoder is None and enable_title_embedding) 90 or (encoder is None and enable_title_embedding)
94 or config is None 91 or config is None
95 ): 92 ):
@@ -107,9 +104,6 @@ def create_document_transformer( @@ -107,9 +104,6 @@ def create_document_transformer(
107 104
108 translator = create_translation_provider(config.query_config) 105 translator = create_translation_provider(config.query_config)
109 106
110 - if translation_prompts is None:  
111 - translation_prompts = config.query_config.translation_prompts  
112 -  
113 # 初始化encoder(如果启用标题向量化且未提供encoder) 107 # 初始化encoder(如果启用标题向量化且未提供encoder)
114 if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding: 108 if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding:
115 from embeddings.text_encoder import TextEmbeddingEncoder 109 from embeddings.text_encoder import TextEmbeddingEncoder
@@ -122,7 +116,6 @@ def create_document_transformer( @@ -122,7 +116,6 @@ def create_document_transformer(
122 searchable_option_dimensions=searchable_option_dimensions, 116 searchable_option_dimensions=searchable_option_dimensions,
123 tenant_config=tenant_config, 117 tenant_config=tenant_config,
124 translator=translator, 118 translator=translator,
125 - translation_prompts=translation_prompts,  
126 encoder=encoder, 119 encoder=encoder,
127 enable_title_embedding=enable_title_embedding, 120 enable_title_embedding=enable_title_embedding,
128 image_encoder=image_encoder, 121 image_encoder=image_encoder,
indexer/test_indexing.py
@@ -285,7 +285,6 @@ def test_document_transformer(): @@ -285,7 +285,6 @@ def test_document_transformer():
285 searchable_option_dimensions=['option1', 'option2', 'option3'], 285 searchable_option_dimensions=['option1', 'option2', 'option3'],
286 tenant_config=tenant_config, 286 tenant_config=tenant_config,
287 translator=translator, 287 translator=translator,
288 - translation_prompts=config.query_config.translation_prompts  
289 ) 288 )
290 289
291 # 转换文档 290 # 转换文档
providers/translation.py
1 -"""  
2 -Translation provider - direct (in-process) or HTTP service.  
3 -""" 1 +"""Translation provider factory and HTTP provider implementation."""
4 from __future__ import annotations 2 from __future__ import annotations
5 3
6 import logging 4 import logging
7 -from typing import Any, Dict, List, Optional, Union  
8 -  
9 -from concurrent.futures import Future, ThreadPoolExecutor 5 +from typing import Any, Dict, Optional
10 import requests 6 import requests
11 7
12 from config.services_config import get_translation_config, get_translation_base_url 8 from config.services_config import get_translation_config, get_translation_base_url
@@ -22,19 +18,18 @@ class HttpTranslationProvider: @@ -22,19 +18,18 @@ class HttpTranslationProvider:
22 base_url: str, 18 base_url: str,
23 model: str = "qwen", 19 model: str = "qwen",
24 timeout_sec: float = 10.0, 20 timeout_sec: float = 10.0,
25 - translation_context: Optional[str] = None,  
26 ): 21 ):
27 self.base_url = (base_url or "").rstrip("/") 22 self.base_url = (base_url or "").rstrip("/")
28 self.model = model or "qwen" 23 self.model = model or "qwen"
29 self.timeout_sec = float(timeout_sec or 10.0) 24 self.timeout_sec = float(timeout_sec or 10.0)
30 - self.translation_context = translation_context or "e-commerce product search"  
31 - self.executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="http-translator")  
32 25
33 def _translate_once( 26 def _translate_once(
34 self, 27 self,
35 text: str, 28 text: str,
36 target_lang: str, 29 target_lang: str,
37 source_lang: Optional[str] = None, 30 source_lang: Optional[str] = None,
  31 + context: Optional[str] = None,
  32 + prompt: Optional[str] = None,
38 ) -> Optional[str]: 33 ) -> Optional[str]:
39 if not text or not str(text).strip(): 34 if not text or not str(text).strip():
40 return text 35 return text
@@ -46,6 +41,10 @@ class HttpTranslationProvider: @@ -46,6 +41,10 @@ class HttpTranslationProvider:
46 "source_lang": source_lang or "auto", 41 "source_lang": source_lang or "auto",
47 "model": self.model, 42 "model": self.model,
48 } 43 }
  44 + if context:
  45 + payload["context"] = context
  46 + if prompt:
  47 + payload["prompt"] = prompt
49 response = requests.post(url, json=payload, timeout=self.timeout_sec) 48 response = requests.post(url, json=payload, timeout=self.timeout_sec)
50 if response.status_code != 200: 49 if response.status_code != 200:
51 logger.warning( 50 logger.warning(
@@ -69,58 +68,13 @@ class HttpTranslationProvider: @@ -69,58 +68,13 @@ class HttpTranslationProvider:
69 context: Optional[str] = None, 68 context: Optional[str] = None,
70 prompt: Optional[str] = None, 69 prompt: Optional[str] = None,
71 ) -> Optional[str]: 70 ) -> Optional[str]:
72 - del context, prompt  
73 - result = self._translate_once(text=text, target_lang=target_lang, source_lang=source_lang)  
74 - return result if result is not None else text  
75 -  
76 - def translate_multi(  
77 - self,  
78 - text: str,  
79 - target_langs: List[str],  
80 - source_lang: Optional[str] = None,  
81 - context: Optional[str] = None,  
82 - async_mode: bool = True,  
83 - prompt: Optional[str] = None,  
84 - ) -> Dict[str, Optional[str]]:  
85 - del context, async_mode, prompt  
86 - out: Dict[str, Optional[str]] = {}  
87 - for lang in target_langs:  
88 - out[lang] = self.translate(text, lang, source_lang=source_lang)  
89 - return out  
90 -  
91 - def translate_multi_async(  
92 - self,  
93 - text: str,  
94 - target_langs: List[str],  
95 - source_lang: Optional[str] = None,  
96 - context: Optional[str] = None,  
97 - prompt: Optional[str] = None,  
98 - ) -> Dict[str, Union[str, Future]]:  
99 - del context, prompt  
100 - out: Dict[str, Union[str, Future]] = {}  
101 - for lang in target_langs:  
102 - out[lang] = self.executor.submit(self.translate, text, lang, source_lang)  
103 - return out  
104 -  
105 - def translate_for_indexing(  
106 - self,  
107 - text: str,  
108 - shop_language: str,  
109 - source_lang: Optional[str] = None,  
110 - context: Optional[str] = None,  
111 - prompt: Optional[str] = None,  
112 - index_languages: Optional[List[str]] = None,  
113 - ) -> Dict[str, Optional[str]]:  
114 - del context, prompt  
115 - langs = index_languages if index_languages else ["en", "zh"]  
116 - source = source_lang or shop_language or "auto"  
117 - out: Dict[str, Optional[str]] = {}  
118 - for lang in langs:  
119 - if lang == shop_language:  
120 - out[lang] = text  
121 - else:  
122 - out[lang] = self.translate(text, target_lang=lang, source_lang=source)  
123 - return out 71 + return self._translate_once(
  72 + text=text,
  73 + target_lang=target_lang,
  74 + source_lang=source_lang,
  75 + context=context,
  76 + prompt=prompt,
  77 + )
124 78
125 79
126 def create_translation_provider(query_config: Any = None) -> Any: 80 def create_translation_provider(query_config: Any = None) -> Any:
@@ -133,9 +87,9 @@ def create_translation_provider(query_config: Any = None) -&gt; Any: @@ -133,9 +87,9 @@ def create_translation_provider(query_config: Any = None) -&gt; Any:
133 provider = cfg.provider 87 provider = cfg.provider
134 pc = cfg.get_provider_cfg() 88 pc = cfg.get_provider_cfg()
135 89
136 - if provider in ("direct", "local", "inprocess"): 90 + if provider in ("qwen-mt", "direct", "local", "inprocess"):
137 from query.qwen_mt_translate import Translator 91 from query.qwen_mt_translate import Translator
138 - model = pc.get("model") or "qwen" 92 + model = pc.get("model") or "qwen-mt-flash"
139 qc = query_config or _empty_query_config() 93 qc = query_config or _empty_query_config()
140 return Translator( 94 return Translator(
141 model=model, 95 model=model,
@@ -145,7 +99,7 @@ def create_translation_provider(query_config: Any = None) -&gt; Any: @@ -145,7 +99,7 @@ def create_translation_provider(query_config: Any = None) -&gt; Any:
145 translation_context=getattr(qc, "translation_context", "e-commerce product search"), 99 translation_context=getattr(qc, "translation_context", "e-commerce product search"),
146 ) 100 )
147 101
148 - if provider in ("http", "service"): 102 + elif provider in ("http", "service"):
149 base_url = get_translation_base_url() 103 base_url = get_translation_base_url()
150 model = pc.get("model") or "qwen" 104 model = pc.get("model") or "qwen"
151 timeout = pc.get("timeout_sec", 10.0) 105 timeout = pc.get("timeout_sec", 10.0)
@@ -154,7 +108,26 @@ def create_translation_provider(query_config: Any = None) -&gt; Any: @@ -154,7 +108,26 @@ def create_translation_provider(query_config: Any = None) -&gt; Any:
154 base_url=base_url, 108 base_url=base_url,
155 model=model, 109 model=model,
156 timeout_sec=float(timeout), 110 timeout_sec=float(timeout),
157 - translation_context=getattr(qc, "translation_context", "e-commerce product search"), 111 + )
  112 +
  113 + elif provider == "llm":
  114 + from query.llm_translate import LLMTranslatorProvider
  115 + model = pc.get("model")
  116 + timeout = float(pc.get("timeout_sec", 30.0))
  117 + base_url = (pc.get("base_url") or "").strip() or None
  118 + return LLMTranslatorProvider(
  119 + model=model,
  120 + timeout_sec=timeout,
  121 + base_url=base_url,
  122 + )
  123 +
  124 + elif provider == "deepl":
  125 + from query.deepl_provider import DeepLProvider
  126 + qc = query_config or _empty_query_config()
  127 + return DeepLProvider(
  128 + api_key=getattr(qc, "translation_api_key", None),
  129 + timeout=float(pc.get("timeout_sec", 10.0)),
  130 + glossary_id=pc.get("glossary_id") or getattr(qc, "translation_glossary_id", None),
158 ) 131 )
159 132
160 raise ValueError(f"Unsupported translation provider: {provider}") 133 raise ValueError(f"Unsupported translation provider: {provider}")
query/deepl_provider.py 0 → 100644
@@ -0,0 +1,203 @@ @@ -0,0 +1,203 @@
  1 +"""
  2 +DeepL backend provider.
  3 +
  4 +This module only handles network calls to DeepL.
  5 +It does not handle cache, async fanout, or fallback semantics.
  6 +"""
  7 +
  8 +from __future__ import annotations
  9 +
  10 +import logging
  11 +import os
  12 +import re
  13 +from typing import Dict, Optional, Tuple
  14 +
  15 +import requests
  16 +from config.services_config import get_translation_config
  17 +
  18 +
  19 +logger = logging.getLogger(__name__)
  20 +
  21 +DEFAULT_CONTEXTS: Dict[str, Dict[str, str]] = {
  22 + "sku_name": {
  23 + "zh": "商品SKU名称",
  24 + "en": "product SKU name",
  25 + },
  26 + "ecommerce_search_query": {
  27 + "zh": "电商",
  28 + "en": "e-commerce",
  29 + },
  30 + "general": {
  31 + "zh": "",
  32 + "en": "",
  33 + },
  34 +}
  35 +SCENE_NAMES = frozenset(DEFAULT_CONTEXTS.keys())
  36 +
  37 +
  38 +def _merge_contexts(raw: object) -> Dict[str, Dict[str, str]]:
  39 + merged: Dict[str, Dict[str, str]] = {
  40 + scene: dict(lang_map) for scene, lang_map in DEFAULT_CONTEXTS.items()
  41 + }
  42 + if not isinstance(raw, dict):
  43 + return merged
  44 + for scene, lang_map in raw.items():
  45 + if not isinstance(lang_map, dict):
  46 + continue
  47 + scene_name = str(scene or "").strip()
  48 + if not scene_name:
  49 + continue
  50 + merged.setdefault(scene_name, {})
  51 + for lang, value in lang_map.items():
  52 + lang_key = str(lang or "").strip().lower()
  53 + context_value = str(value or "").strip()
  54 + if lang_key and context_value:
  55 + merged[scene_name][lang_key] = context_value
  56 + return merged
  57 +
  58 +
  59 +class DeepLProvider:
  60 + API_URL = "https://api.deepl.com/v2/translate" # Pro tier
  61 + LANG_CODE_MAP = {
  62 + "zh": "ZH",
  63 + "en": "EN",
  64 + "ru": "RU",
  65 + "ar": "AR",
  66 + "ja": "JA",
  67 + "es": "ES",
  68 + "de": "DE",
  69 + "fr": "FR",
  70 + "it": "IT",
  71 + "pt": "PT",
  72 + }
  73 +
  74 + def __init__(
  75 + self,
  76 + api_key: Optional[str],
  77 + *,
  78 + timeout: float = 10.0,
  79 + glossary_id: Optional[str] = None,
  80 + ) -> None:
  81 + cfg = get_translation_config()
  82 + provider_cfg = cfg.providers.get("deepl", {}) if isinstance(cfg.providers, dict) else {}
  83 + self.api_key = api_key or os.getenv("DEEPL_AUTH_KEY")
  84 + self.timeout = float(provider_cfg.get("timeout_sec") or timeout or 10.0)
  85 + self.glossary_id = glossary_id or provider_cfg.get("glossary_id")
  86 + self.model = "deepl"
  87 + self.context_presets = _merge_contexts(provider_cfg.get("contexts"))
  88 + if not self.api_key:
  89 + logger.warning("DEEPL_AUTH_KEY not set; DeepL translation is unavailable")
  90 +
  91 + def _resolve_request_context(
  92 + self,
  93 + target_lang: str,
  94 + context: Optional[str],
  95 + prompt: Optional[str],
  96 + ) -> Optional[str]:
  97 + if prompt:
  98 + return prompt
  99 + if context in SCENE_NAMES:
  100 + scene_map = self.context_presets.get(context) or self.context_presets.get("default") or {}
  101 + tgt = (target_lang or "").strip().lower()
  102 + return scene_map.get(tgt) or scene_map.get("en")
  103 + if context:
  104 + return context
  105 + scene_map = self.context_presets.get("default") or {}
  106 + tgt = (target_lang or "").strip().lower()
  107 + return scene_map.get(tgt) or scene_map.get("en")
  108 +
  109 + def translate(
  110 + self,
  111 + text: str,
  112 + target_lang: str,
  113 + source_lang: Optional[str] = None,
  114 + context: Optional[str] = None,
  115 + prompt: Optional[str] = None,
  116 + ) -> Optional[str]:
  117 + if not self.api_key:
  118 + return None
  119 +
  120 + target_code = self.LANG_CODE_MAP.get((target_lang or "").lower(), (target_lang or "").upper())
  121 + headers = {
  122 + "Authorization": f"DeepL-Auth-Key {self.api_key}",
  123 + "Content-Type": "application/json",
  124 + }
  125 +
  126 + api_context = self._resolve_request_context(target_lang, context, prompt)
  127 + text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)
  128 +
  129 + payload = {
  130 + "text": [text_to_translate],
  131 + "target_lang": target_code,
  132 + }
  133 + if source_lang:
  134 + payload["source_lang"] = self.LANG_CODE_MAP.get(source_lang.lower(), source_lang.upper())
  135 + if api_context:
  136 + payload["context"] = api_context
  137 + if self.glossary_id:
  138 + payload["glossary_id"] = self.glossary_id
  139 +
  140 + try:
  141 + response = requests.post(self.API_URL, headers=headers, json=payload, timeout=self.timeout)
  142 + if response.status_code != 200:
  143 + logger.warning(
  144 + "[deepl] Failed | status=%s tgt=%s body=%s",
  145 + response.status_code,
  146 + target_code,
  147 + (response.text or "")[:200],
  148 + )
  149 + return None
  150 +
  151 + data = response.json()
  152 + translations = data.get("translations") or []
  153 + if not translations:
  154 + return None
  155 + translated = translations[0].get("text")
  156 + if not translated:
  157 + return None
  158 + if needs_extraction:
  159 + translated = self._extract_term_from_translation(translated, text, target_code)
  160 + return translated
  161 + except requests.Timeout:
  162 + logger.warning("[deepl] Timeout | tgt=%s timeout=%.1fs", target_code, self.timeout)
  163 + return None
  164 + except Exception as exc:
  165 + logger.warning("[deepl] Exception | tgt=%s error=%s", target_code, exc, exc_info=True)
  166 + return None
  167 +
  168 + def _add_ecommerce_context(
  169 + self,
  170 + text: str,
  171 + source_lang: Optional[str],
  172 + context: Optional[str],
  173 + ) -> Tuple[str, bool]:
  174 + if not context or "e-commerce" not in context.lower():
  175 + return text, False
  176 + if (source_lang or "").lower() != "zh":
  177 + return text, False
  178 +
  179 + term = (text or "").strip()
  180 + if len(term.split()) == 1 and len(term) <= 2:
  181 + return f"购买 {term}", True
  182 + return text, False
  183 +
  184 + def _extract_term_from_translation(
  185 + self,
  186 + translated_text: str,
  187 + original_text: str,
  188 + target_lang_code: str,
  189 + ) -> str:
  190 + del original_text
  191 + if target_lang_code != "EN":
  192 + return translated_text
  193 +
  194 + words = translated_text.strip().split()
  195 + if len(words) <= 1:
  196 + return translated_text
  197 + context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
  198 + for word in reversed(words):
  199 + normalized = re.sub(r"[.,!?;:]+$", "", word.lower())
  200 + if normalized not in context_words:
  201 + return normalized
  202 + return re.sub(r"[.,!?;:]+$", "", words[-1].lower())
  203 +
query/llm_translate.py
1 """ 1 """
2 -LLM-based translation helper using Qwen chat model. 2 +LLM-based translation backend (DashScope-compatible OpenAI API).
3 3
4 -This module provides a thin wrapper around DashScope's `qwen-flash` model  
5 -for high-quality, prompt-controlled translation, independent of the main  
6 -`Translator` (machine translation) pipeline.  
7 -  
8 -Usage example:  
9 -  
10 - from query.llm_translate import llm_translate  
11 -  
12 - result = llm_translate(  
13 - text="我看到这个视频后没有笑",  
14 - target_lang="en",  
15 - source_lang="zh",  
16 - source_lang_label="中文",  
17 - target_lang_label="英文",  
18 - ) 4 +Failure semantics are strict:
  5 +- success: translated string
  6 +- failure: None
19 """ 7 """
20 8
21 from __future__ import annotations 9 from __future__ import annotations
@@ -23,113 +11,159 @@ from __future__ import annotations @@ -23,113 +11,159 @@ from __future__ import annotations
23 import logging 11 import logging
24 import os 12 import os
25 import time 13 import time
26 -from typing import Dict, Optional 14 +from typing import Optional
27 15
28 from openai import OpenAI 16 from openai import OpenAI
29 17
30 from config.env_config import DASHSCOPE_API_KEY 18 from config.env_config import DASHSCOPE_API_KEY
31 from config.services_config import get_translation_config 19 from config.services_config import get_translation_config
  20 +from config.translate_prompts import TRANSLATION_PROMPTS, SOURCE_LANG_CODE_MAP
  21 +
32 22
33 logger = logging.getLogger(__name__) 23 logger = logging.getLogger(__name__)
34 24
35 25
36 -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1  
37 -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1  
38 -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1  
39 -#  
40 -# 默认保持与现有翻译/索引脚本相同的美国地域,可通过环境变量覆盖:  
41 -# DASHSCOPE_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1  
42 DEFAULT_QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" 26 DEFAULT_QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
43 -QWEN_MODEL_NAME = "qwen-flash"  
44 -  
45 -  
46 -# 由调用方提供的语言标签/代码填充,占位符说明:  
47 -# - source_lang: 源语言的人类可读名称(按目标语言本地化,例如 "中文", "English")  
48 -# - target_lang: 目标语言的人类可读名称  
49 -# - src_lang_code: 源语言代码,例如 "zh"  
50 -# - tgt_lang_code: 目标语言代码,例如 "en"  
51 -TRANSLATION_PROMPTS: Dict[str, str] = {  
52 - "zh": """你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译员。你的目标是在遵循 {target_lang} 的语法、词汇和文化习惯的前提下,准确传达原始 {source_lang} 文本的含义和细微差别。请只输出 {target_lang} 的翻译内容,不要包含任何额外的解释或评论。请将以下 {source_lang} 文本翻译成 {target_lang}:  
53 -  
54 -{text}""",  
55 - "en": """You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Your goal is to accurately convey the meaning and nuances of the original {source_lang} text while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities. Produce only the {target_lang} translation, without any additional explanations or commentary. Please translate the following {source_lang} text into {target_lang}:  
56 -  
57 -{text}""",  
58 - "ru": """Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Ваша задача — точно передать смысл и нюансы исходного текста на {source_lang}, соблюдая грамматику, лексику и культурные особенности {target_lang}. Выводите только перевод на {target_lang}, без каких-либо дополнительных объяснений или комментариев. Пожалуйста, переведите следующий текст с {source_lang} на {target_lang}:  
59 -  
60 -{text}""",  
61 - "ar": """أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). هدفك هو نقل المعنى والدلالات الدقيقة للنص الأصلي بلغة {source_lang} بدقة، مع الالتزام بقواعد اللغة والمفردات والحساسيات الثقافية الخاصة بلغة {target_lang}. قم بإنتاج الترجمة إلى {target_lang} فقط دون أي شروحات أو تعليقات إضافية. يرجى ترجمة النص التالي من {source_lang} إلى {target_lang}:  
62 -  
63 -{text}""",  
64 - "ja": """あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロの翻訳者です。{target_lang} の文法、語彙、文化的配慮に従いながら、元の {source_lang} テキストの意味やニュアンスを正確に伝えることが目的です。追加の説明やコメントは一切含めず、{target_lang} の翻訳のみを出力してください。次の {source_lang} テキストを {target_lang} に翻訳してください:  
65 -  
66 -{text}""",  
67 - "es": """Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Tu objetivo es transmitir con precisión el significado y los matices del texto original en {source_lang}, respetando la gramática, el vocabulario y las sensibilidades culturales de {target_lang}. Produce únicamente la traducción en {target_lang}, sin explicaciones ni comentarios adicionales. Por favor, traduce el siguiente texto de {source_lang} a {target_lang}:  
68 -  
69 -{text}""",  
70 - "de": """Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Dein Ziel ist es, die Bedeutung und Nuancen des ursprünglichen {source_lang}-Textes genau zu vermitteln und dabei die Grammatik, den Wortschatz und die kulturellen Besonderheiten von {target_lang} zu berücksichtigen. Gib ausschließlich die Übersetzung in {target_lang} aus, ohne zusätzliche Erklärungen oder Kommentare. Bitte übersetze den folgenden {source_lang}-Text in {target_lang}:  
71 -  
72 -{text}""",  
73 - "fr": """Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Votre objectif est de transmettre fidèlement le sens et les nuances du texte original en {source_lang}, tout en respectant la grammaire, le vocabulaire et les sensibilités culturelles de {target_lang}. Produisez uniquement la traduction en {target_lang}, sans explications ni commentaires supplémentaires. Veuillez traduire le texte suivant de {source_lang} vers {target_lang} :  
74 -  
75 -{text}""",  
76 - "it": """Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Il tuo obiettivo è trasmettere con precisione il significato e le sfumature del testo originale in {source_lang}, rispettando la grammatica, il vocabolario e le sensibilità culturali di {target_lang}. Produci solo la traduzione in {target_lang}, senza spiegazioni o commenti aggiuntivi. Per favore traduci il seguente testo da {source_lang} a {target_lang}:  
77 -  
78 -{text}""",  
79 - "pt": """Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Seu objetivo é transmitir com precisão o significado e as nuances do texto original em {source_lang}, respeitando a gramática, o vocabulário e as sensibilidades culturais de {target_lang}. Produza apenas a tradução em {target_lang}, sem quaisquer explicações ou comentários adicionais. Por favor, traduza o seguinte texto de {source_lang} para {target_lang}:  
80 -  
81 -{text}""",  
82 -}  
83 -  
84 -  
85 -def _get_qwen_client(base_url: Optional[str] = None) -> Optional[OpenAI]:  
86 - """  
87 - Lazily construct an OpenAI-compatible client for DashScope.  
88 -  
89 - Uses DASHSCOPE_API_KEY and base_url (provider config / env) to configure endpoint.  
90 - """  
91 - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")  
92 - if not api_key:  
93 - logger.warning("DASHSCOPE_API_KEY not set; llm-based translation will be disabled")  
94 - return None  
95 -  
96 - # 优先使用显式传入的 base_url,其次环境变量,最后默认地域。  
97 - base_url = (  
98 - (base_url or "").strip()  
99 - or os.getenv("DASHSCOPE_BASE_URL")  
100 - or DEFAULT_QWEN_BASE_URL  
101 - )  
102 -  
103 - try:  
104 - client = OpenAI(api_key=api_key, base_url=base_url)  
105 - return client  
106 - except Exception as exc:  
107 - logger.error("Failed to initialize DashScope OpenAI client: %s", exc, exc_info=True)  
108 - return None 27 +DEFAULT_LLM_MODEL = "qwen-flash"
109 28
110 29
111 def _build_prompt( 30 def _build_prompt(
112 text: str, 31 text: str,
  32 + *,
  33 + source_lang: Optional[str],
113 target_lang: str, 34 target_lang: str,
114 - source_lang_label: str,  
115 - target_lang_label: str,  
116 - src_lang_code: str,  
117 - tgt_lang_code: str, 35 + scene: Optional[str],
118 ) -> str: 36 ) -> str:
119 """ 37 """
120 - Build translation prompt for given target language, defaulting to English template. 38 + 从 config.translate_prompts.TRANSLATION_PROMPTS 中构建提示词。
  39 +
  40 + 要求:模板必须包含 {source_lang}({src_lang_code}){target_lang}({tgt_lang_code})。
  41 + 这里统一使用 code 作为占位的 lang 与 label,外部接口仍然只传语言 code。
121 """ 42 """
122 - key = (target_lang or "").lower()  
123 - template = TRANSLATION_PROMPTS.get(key) or TRANSLATION_PROMPTS["en"] 43 + tgt = (target_lang or "").lower() or "en"
  44 + src = (source_lang or "auto").lower()
  45 +
  46 + # 将业务上下文 scene 映射为模板分组名
  47 + normalized_scene = (scene or "").strip() or "general"
  48 + # 兼容历史场景别名:静默映射为规范分组名(不报错,仅做归一化)
  49 + if normalized_scene in {"query", "ecommerce_search", "ecommerce_search_query"}:
  50 + group_key = "ecommerce_search_query"
  51 + elif normalized_scene in {"product_title", "sku_name"}:
  52 + group_key = "sku_name"
  53 + else:
  54 + group_key = normalized_scene
  55 + group = TRANSLATION_PROMPTS.get(group_key) or TRANSLATION_PROMPTS["general"]
  56 +
  57 + # 先按目标语言 code 取模板,取不到回退到英文
  58 + template = group.get(tgt) or group.get("en")
  59 + if not template:
  60 + # 理论上不会发生,兜底一个简单模板
  61 + template = (
  62 + "You are a professional {source_lang} ({src_lang_code}) to "
  63 + "{target_lang} ({tgt_lang_code}) translator, output only the translation: {text}"
  64 + )
  65 +
  66 + # 语言 label 优先取 SOURCE_LANG_CODE_MAP 的映射,取不到时回退为 code 本身
  67 + source_lang_label = SOURCE_LANG_CODE_MAP.get(src, src)
  68 + target_lang_label = SOURCE_LANG_CODE_MAP.get(tgt, tgt)
  69 +
124 return template.format( 70 return template.format(
125 source_lang=source_lang_label, 71 source_lang=source_lang_label,
  72 + src_lang_code=src,
126 target_lang=target_lang_label, 73 target_lang=target_lang_label,
127 - src_lang_code=src_lang_code,  
128 - tgt_lang_code=tgt_lang_code, 74 + tgt_lang_code=tgt,
129 text=text, 75 text=text,
130 ) 76 )
131 77
132 78
  79 +class LLMTranslatorProvider:
  80 + def __init__(
  81 + self,
  82 + *,
  83 + model: Optional[str] = None,
  84 + timeout_sec: float = 30.0,
  85 + base_url: Optional[str] = None,
  86 + ) -> None:
  87 + cfg = get_translation_config()
  88 + llm_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {}
  89 + self.model = model or llm_cfg.get("model") or DEFAULT_LLM_MODEL
  90 + self.timeout_sec = float(llm_cfg.get("timeout_sec") or timeout_sec or 30.0)
  91 + self.base_url = (
  92 + (base_url or "").strip()
  93 + or (llm_cfg.get("base_url") or "").strip()
  94 + or os.getenv("DASHSCOPE_BASE_URL")
  95 + or DEFAULT_QWEN_BASE_URL
  96 + )
  97 + self.client = self._create_client()
  98 +
  99 + def _create_client(self) -> Optional[OpenAI]:
  100 + api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
  101 + if not api_key:
  102 + logger.warning("DASHSCOPE_API_KEY not set; llm translation unavailable")
  103 + return None
  104 + try:
  105 + return OpenAI(api_key=api_key, base_url=self.base_url)
  106 + except Exception as exc:
  107 + logger.error("Failed to initialize llm translation client: %s", exc, exc_info=True)
  108 + return None
  109 +
  110 + def translate(
  111 + self,
  112 + text: str,
  113 + target_lang: str,
  114 + source_lang: Optional[str] = None,
  115 + context: Optional[str] = None,
  116 + prompt: Optional[str] = None,
  117 + ) -> Optional[str]:
  118 + if not text or not str(text).strip():
  119 + return text
  120 + if not self.client:
  121 + return None
  122 +
  123 + tgt = (target_lang or "").lower() or "en"
  124 + src = (source_lang or "auto").lower()
  125 + scene = context or "default"
  126 + user_prompt = prompt or _build_prompt(
  127 + text=text,
  128 + source_lang=src,
  129 + target_lang=tgt,
  130 + scene=scene,
  131 + )
  132 + start = time.time()
  133 + try:
  134 + logger.info(
  135 + "[llm] Request | src=%s tgt=%s model=%s prompt=%s",
  136 + src,
  137 + tgt,
  138 + self.model,
  139 + user_prompt,
  140 + )
  141 + completion = self.client.chat.completions.create(
  142 + model=self.model,
  143 + messages=[{"role": "user", "content": user_prompt}],
  144 + timeout=self.timeout_sec,
  145 + )
  146 + content = (completion.choices[0].message.content or "").strip()
  147 + latency_ms = (time.time() - start) * 1000
  148 + if not content:
  149 + logger.warning("[llm] Empty result | src=%s tgt=%s latency=%.1fms", src, tgt, latency_ms)
  150 + return None
  151 + logger.info("[llm] Response | src=%s tgt=%s response=%s", src, tgt, content)
  152 + logger.info("[llm] Success | src=%s tgt=%s latency=%.1fms", src, tgt, latency_ms)
  153 + return content
  154 + except Exception as exc:
  155 + latency_ms = (time.time() - start) * 1000
  156 + logger.warning(
  157 + "[llm] Failed | src=%s tgt=%s latency=%.1fms error=%s",
  158 + src,
  159 + tgt,
  160 + latency_ms,
  161 + exc,
  162 + exc_info=True,
  163 + )
  164 + return None
  165 +
  166 +
133 def llm_translate( 167 def llm_translate(
134 text: str, 168 text: str,
135 target_lang: str, 169 target_lang: str,
@@ -139,100 +173,13 @@ def llm_translate( @@ -139,100 +173,13 @@ def llm_translate(
139 target_lang_label: Optional[str] = None, 173 target_lang_label: Optional[str] = None,
140 timeout_sec: Optional[float] = None, 174 timeout_sec: Optional[float] = None,
141 ) -> Optional[str]: 175 ) -> Optional[str]:
142 - """  
143 - Translate text with Qwen chat model using rich prompts.  
144 -  
145 - - 根据目标语言选择提示词,如果没匹配到则退回英文模板。  
146 - - 不对 text 做语言检测或缓存,调用方自行控制。  
147 -  
148 - Args:  
149 - text: 原始文本  
150 - target_lang: 目标语言代码(如 "zh", "en")  
151 - source_lang: 源语言代码(可选,不影响提示词选择,仅用于日志)  
152 - source_lang_label: 源语言展示名称,用于 prompt(默认使用 source_lang)  
153 - target_lang_label: 目标语言展示名称,用于 prompt(默认使用 target_lang)  
154 - timeout_sec: 请求超时时间(秒,可选;若未配置则从 config 读取或采用默认)  
155 -  
156 - Returns:  
157 - 翻译后的文本;如失败则返回 None。  
158 - """  
159 - if not text or not str(text).strip():  
160 - return text  
161 -  
162 - cfg = get_translation_config()  
163 - provider_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {}  
164 -  
165 - model_name = provider_cfg.get("model") or QWEN_MODEL_NAME  
166 - req_timeout = float(provider_cfg.get("timeout_sec") or timeout_sec or 30.0)  
167 - base_url = (provider_cfg.get("base_url") or "").strip() or None  
168 -  
169 - client = _get_qwen_client(base_url=base_url)  
170 - if not client:  
171 - # 无法调用云端,直接回退  
172 - logger.warning(  
173 - "[llm_translate] Client init failed; returning original text. "  
174 - "text=%r target_lang=%s source_lang=%s",  
175 - text[:80],  
176 - target_lang,  
177 - source_lang or "auto",  
178 - )  
179 - return text  
180 -  
181 - tgt = (target_lang or "").lower() or "en"  
182 - src = (source_lang or "auto").lower()  
183 - src_label = source_lang_label or src  
184 - tgt_label = target_lang_label or tgt  
185 -  
186 - prompt = _build_prompt( 176 + provider = LLMTranslatorProvider(timeout_sec=timeout_sec or 30.0)
  177 + return provider.translate(
187 text=text, 178 text=text,
188 - target_lang=tgt,  
189 - source_lang_label=src_label,  
190 - target_lang_label=tgt_label,  
191 - src_lang_code=src,  
192 - tgt_lang_code=tgt, 179 + target_lang=target_lang,
  180 + source_lang=source_lang,
  181 + context=None,
193 ) 182 )
194 183
195 - start = time.time()  
196 - try:  
197 - completion = client.chat.completions.create(  
198 - model=model_name,  
199 - messages=[  
200 - {  
201 - "role": "user",  
202 - "content": prompt,  
203 - }  
204 - ],  
205 - timeout=req_timeout,  
206 - )  
207 - content = (completion.choices[0].message.content or "").strip()  
208 - duration_ms = (time.time() - start) * 1000  
209 - logger.info(  
210 - "[llm_translate] Success | model=%s src=%s tgt=%s latency=%.1fms text=%r -> %r",  
211 - model_name,  
212 - src,  
213 - tgt,  
214 - duration_ms,  
215 - text[:80],  
216 - content[:80],  
217 - )  
218 - return content or text  
219 - except Exception as exc:  
220 - duration_ms = (time.time() - start) * 1000  
221 - logger.warning(  
222 - "[llm_translate] Failed | model=%s src=%s tgt=%s latency=%.1fms error=%s",  
223 - model_name,  
224 - src,  
225 - tgt,  
226 - duration_ms,  
227 - exc,  
228 - exc_info=True,  
229 - )  
230 - # 安全回退:出错时返回原文,避免中断上游流程  
231 - return text  
232 -  
233 -  
234 -__all__ = [  
235 - "TRANSLATION_PROMPTS",  
236 - "llm_translate",  
237 -]  
238 184
  185 +__all__ = ["LLMTranslatorProvider", "llm_translate"]
query/query_parser.py
@@ -8,7 +8,7 @@ from typing import Dict, List, Optional, Any, Union @@ -8,7 +8,7 @@ from typing import Dict, List, Optional, Any, Union
8 import numpy as np 8 import numpy as np
9 import logging 9 import logging
10 import re 10 import re
11 -from concurrent.futures import Future, ThreadPoolExecutor, as_completed 11 +from concurrent.futures import ThreadPoolExecutor, as_completed, wait
12 12
13 from embeddings.text_encoder import TextEmbeddingEncoder 13 from embeddings.text_encoder import TextEmbeddingEncoder
14 from config import SearchConfig 14 from config import SearchConfig
@@ -135,6 +135,7 @@ class QueryParser: @@ -135,6 +135,7 @@ class QueryParser:
135 cfg = get_translation_config() 135 cfg = get_translation_config()
136 logger.info("Initializing translator at QueryParser construction (provider=%s)...", cfg.provider) 136 logger.info("Initializing translator at QueryParser construction (provider=%s)...", cfg.provider)
137 self._translator = create_translation_provider(self.config.query_config) 137 self._translator = create_translation_provider(self.config.query_config)
  138 + self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation")
138 139
139 @property 140 @property
140 def text_encoder(self) -> TextEmbeddingEncoder: 141 def text_encoder(self) -> TextEmbeddingEncoder:
@@ -265,6 +266,7 @@ class QueryParser: @@ -265,6 +266,7 @@ class QueryParser:
265 # Stage 4: Translation (with async support and conditional waiting) 266 # Stage 4: Translation (with async support and conditional waiting)
266 translations = {} 267 translations = {}
267 translation_futures = {} 268 translation_futures = {}
  269 + translation_executor = None
268 index_langs = ["en", "zh"] 270 index_langs = ["en", "zh"]
269 try: 271 try:
270 # 根据租户配置的 index_languages 决定翻译目标语言 272 # 根据租户配置的 index_languages 决定翻译目标语言
@@ -287,48 +289,33 @@ class QueryParser: @@ -287,48 +289,33 @@ class QueryParser:
287 target_langs = target_langs_for_translation 289 target_langs = target_langs_for_translation
288 290
289 if target_langs: 291 if target_langs:
290 - # Use e-commerce context for better disambiguation  
291 - translation_context = self.config.query_config.translation_context  
292 - # For query translation, we use a general prompt (not language-specific)  
293 - query_prompt = (  
294 - self.config.query_config.translation_prompts.get(f"query_{detected_lang}")  
295 - or self.config.query_config.translation_prompts.get("query_en")  
296 - or self.config.query_config.translation_prompts.get("default_en")  
297 - or self.config.query_config.translation_prompts.get("default_zh")  
298 - )  
299 -  
300 # Determine if we need to wait for translation results 292 # Determine if we need to wait for translation results
301 # If detected_lang is not in index_languages, we must wait for translation 293 # If detected_lang is not in index_languages, we must wait for translation
302 need_wait_translation = detected_lang not in index_langs 294 need_wait_translation = detected_lang not in index_langs
303 - 295 +
304 if need_wait_translation: 296 if need_wait_translation:
305 - # Use async method that returns Futures, so we can wait for results  
306 - translation_results = self.translator.translate_multi_async(  
307 - query_text,  
308 - target_langs,  
309 - source_lang=detected_lang,  
310 - context=translation_context,  
311 - prompt=query_prompt 297 + translation_executor = ThreadPoolExecutor(
  298 + max_workers=max(1, min(len(target_langs), 4)),
  299 + thread_name_prefix="query-translation-wait",
312 ) 300 )
313 - # Separate cached results and futures  
314 - for lang, result in translation_results.items():  
315 - if isinstance(result, Future):  
316 - translation_futures[lang] = result  
317 - else:  
318 - translations[lang] = result 301 + for lang in target_langs:
  302 + translation_futures[lang] = translation_executor.submit(
  303 + self.translator.translate,
  304 + query_text,
  305 + lang,
  306 + detected_lang,
  307 + "ecommerce_search_query",
  308 + )
319 else: 309 else:
320 - # Use async mode: returns cached translations immediately, missing ones translated in background  
321 - translations = self.translator.translate_multi(  
322 - query_text,  
323 - target_langs,  
324 - source_lang=detected_lang,  
325 - context=translation_context,  
326 - async_mode=True,  
327 - prompt=query_prompt  
328 - )  
329 - # Filter out None values (missing translations that are being processed async)  
330 - translations = {k: v for k, v in translations.items() if v is not None}  
331 - 310 + for lang in target_langs:
  311 + self._translation_executor.submit(
  312 + self.translator.translate,
  313 + query_text,
  314 + lang,
  315 + detected_lang,
  316 + "ecommerce_search_query",
  317 + )
  318 +
332 if translations: 319 if translations:
333 log_info(f"Translation completed (cache hit) | Query text: '{query_text}' | Results: {translations}") 320 log_info(f"Translation completed (cache hit) | Query text: '{query_text}' | Results: {translations}")
334 if translation_futures: 321 if translation_futures:
@@ -407,15 +394,18 @@ class QueryParser: @@ -407,15 +394,18 @@ class QueryParser:
407 all_futures.append(embedding_future) 394 all_futures.append(embedding_future)
408 future_to_lang[embedding_future] = ('embedding', None) 395 future_to_lang[embedding_future] = ('embedding', None)
409 396
410 - # Wait for all futures to complete  
411 - for future in as_completed(all_futures): 397 + # Enforce a hard timeout for translation-related work (300ms budget)
  398 + done, not_done = wait(all_futures, timeout=0.3)
  399 + for future in done:
412 task_type, lang = future_to_lang[future] 400 task_type, lang = future_to_lang[future]
413 try: 401 try:
414 result = future.result() 402 result = future.result()
415 if task_type == 'translation': 403 if task_type == 'translation':
416 if result: 404 if result:
417 translations[lang] = result 405 translations[lang] = result
418 - log_info(f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'") 406 + log_info(
  407 + f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'"
  408 + )
419 if context: 409 if context:
420 context.store_intermediate_result(f'translation_{lang}', result) 410 context.store_intermediate_result(f'translation_{lang}', result)
421 elif task_type == 'embedding': 411 elif task_type == 'embedding':
@@ -434,10 +424,27 @@ class QueryParser: @@ -434,10 +424,27 @@ class QueryParser:
434 log_info(error_msg) 424 log_info(error_msg)
435 if context: 425 if context:
436 context.add_warning(error_msg) 426 context.add_warning(error_msg)
437 - 427 +
  428 + # Log timeouts for any futures that did not finish within 300ms
  429 + if not_done:
  430 + for future in not_done:
  431 + task_type, lang = future_to_lang[future]
  432 + if task_type == 'translation':
  433 + timeout_msg = (
  434 + f"Translation timeout (>300ms) | Language: {lang} | "
  435 + f"Query text: '{query_text}'"
  436 + )
  437 + else:
  438 + timeout_msg = "Query vector generation timeout (>300ms), proceeding without embedding result"
  439 + log_info(timeout_msg)
  440 + if context:
  441 + context.add_warning(timeout_msg)
  442 +
438 # Clean up encoding executor 443 # Clean up encoding executor
439 if encoding_executor: 444 if encoding_executor:
440 encoding_executor.shutdown(wait=False) 445 encoding_executor.shutdown(wait=False)
  446 + if translation_executor:
  447 + translation_executor.shutdown(wait=False)
441 448
442 # Update translations in context after all are complete 449 # Update translations in context after all are complete
443 if translations and context: 450 if translations and context:
query/qwen_mt_translate.py
1 -"""  
2 -Translation service for multi-language query support. 1 +"""Qwen-MT translation orchestrator with cache and async helpers."""
3 2
4 -Supports multiple translation models:  
5 -- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model  
6 -- DeepL: DeepL API for high-quality translations  
7 -  
8 -重要说明(Qwen 机翻限速):  
9 -- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)**  
10 -- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流  
11 -- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端  
12 -  
13 -使用方法 (Usage):  
14 -  
15 -```python  
16 -from query.translator import Translator  
17 -  
18 -# 使用默认的 qwen 模型(推荐)  
19 -translator = Translator() # 默认使用 qwen 模型  
20 -  
21 -# 或显式指定模型  
22 -translator = Translator(model='qwen') # 使用 qwen 模型  
23 -translator = Translator(model='deepl') # 使用 DeepL 模型  
24 -  
25 -# 翻译文本  
26 -result = translator.translate(  
27 - text="我看到这个视频后没有笑",  
28 - target_lang="en",  
29 - source_lang="auto" # 自动检测源语言  
30 -)  
31 -```  
32 -  
33 -配置说明 (Configuration):  
34 -- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中)  
35 -- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中)  
36 -  
37 -Qwen 模型参考文档:  
38 -- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key  
39 -- 模型:qwen-mt-flash(快速翻译模型)  
40 -  
41 -DeepL 官方文档:  
42 -https://developers.deepl.com/api-reference/translate/request-translation  
43 -""" 3 +from __future__ import annotations
44 4
  5 +import hashlib
  6 +import logging
45 import os 7 import os
46 -import requests  
47 import re 8 import re
48 -import redis  
49 -from concurrent.futures import ThreadPoolExecutor, Future  
50 -from datetime import timedelta  
51 -from typing import Dict, List, Optional, Union  
52 -import logging  
53 import time 9 import time
  10 +from typing import Dict, List, Optional
54 11
55 -logger = logging.getLogger(__name__)  
56 -  
57 -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG 12 +import redis
58 from openai import OpenAI 13 from openai import OpenAI
59 14
  15 +from config.env_config import DASHSCOPE_API_KEY, REDIS_CONFIG
  16 +from config.services_config import get_translation_cache_config
  17 +from config.translate_prompts import SOURCE_LANG_CODE_MAP
60 18
61 -class Translator:  
62 - """  
63 - Multi-language translator supporting Qwen and DeepL APIs.  
64 -  
65 - Default model is 'qwen' which uses Alibaba Cloud DashScope API.  
66 - """  
67 -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1  
68 -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1  
69 -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 19 +logger = logging.getLogger(__name__)
70 20
71 - DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier  
72 - QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" # 北京地域  
73 - # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡  
74 - # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1  
75 - QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型  
76 21
77 - # Language code mapping  
78 - LANG_CODE_MAP = {  
79 - 'zh': 'ZH',  
80 - 'en': 'EN',  
81 - 'ru': 'RU',  
82 - 'ar': 'AR',  
83 - 'ja': 'JA',  
84 - 'es': 'ES',  
85 - 'de': 'DE',  
86 - 'fr': 'FR',  
87 - 'it': 'IT',  
88 - 'pt': 'PT',  
89 - } 22 +class Translator:
  23 + QWEN_DEFAULT_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
  24 + QWEN_MODEL = "qwen-mt-flash"
90 25
91 def __init__( 26 def __init__(
92 self, 27 self,
@@ -95,77 +30,90 @@ class Translator: @@ -95,77 +30,90 @@ class Translator:
95 use_cache: bool = True, 30 use_cache: bool = True,
96 timeout: int = 10, 31 timeout: int = 10,
97 glossary_id: Optional[str] = None, 32 glossary_id: Optional[str] = None,
98 - translation_context: Optional[str] = None 33 + translation_context: Optional[str] = None,
99 ): 34 ):
100 - """  
101 - Initialize translator.  
102 -  
103 - Args:  
104 - model: Translation model to use. Options: 'qwen' (default) or 'deepl'  
105 - api_key: API key for the selected model (or None to use from config/env)  
106 - use_cache: Whether to cache translations  
107 - timeout: Request timeout in seconds  
108 - glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL)  
109 - translation_context: Context hint for translation (e.g., "e-commerce", "product search")  
110 - """  
111 - self.model = model.lower()  
112 - if self.model not in ['qwen', 'deepl']:  
113 - raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'")  
114 -  
115 - # Get API key from config if not provided  
116 - if api_key is None:  
117 - if self.model == 'qwen':  
118 - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")  
119 - else: # deepl  
120 - api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY")  
121 -  
122 - self.api_key = api_key  
123 - self.timeout = timeout  
124 - self.use_cache = use_cache 35 + self.model = self._normalize_model(model)
  36 + self.timeout = int(timeout)
  37 + self.use_cache = bool(use_cache)
125 self.glossary_id = glossary_id 38 self.glossary_id = glossary_id
126 self.translation_context = translation_context or "e-commerce product search" 39 self.translation_context = translation_context or "e-commerce product search"
127 -  
128 - # Initialize OpenAI client for Qwen if needed  
129 - self.qwen_client = None  
130 - if self.model == 'qwen':  
131 - if not self.api_key:  
132 - logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.")  
133 - else:  
134 - self.qwen_client = OpenAI(  
135 - api_key=self.api_key,  
136 - base_url=self.QWEN_BASE_URL,  
137 - )  
138 40
139 - # Initialize Redis cache if enabled  
140 - if use_cache: 41 + cache_cfg = get_translation_cache_config()
  42 + self.cache_prefix = str(cache_cfg.get("key_prefix", "trans:v2"))
  43 + self.expire_seconds = int(cache_cfg.get("ttl_seconds", 360 * 24 * 3600))
  44 + self.cache_sliding_expiration = bool(cache_cfg.get("sliding_expiration", True))
  45 + self.cache_include_context = bool(cache_cfg.get("key_include_context", True))
  46 + self.cache_include_prompt = bool(cache_cfg.get("key_include_prompt", True))
  47 + self.cache_include_source_lang = bool(cache_cfg.get("key_include_source_lang", True))
  48 +
  49 + self.qwen_model_name = self._resolve_qwen_model_name(model)
  50 + self._api_key = api_key or self._default_api_key(self.model)
  51 + self._qwen_client: Optional[OpenAI] = None
  52 + base_url = os.getenv("DASHSCOPE_BASE_URL") or self.QWEN_DEFAULT_BASE_URL
  53 + if self._api_key:
141 try: 54 try:
142 - self.redis_client = redis.Redis(  
143 - host=REDIS_CONFIG.get('host', 'localhost'),  
144 - port=REDIS_CONFIG.get('port', 6479),  
145 - password=REDIS_CONFIG.get('password'),  
146 - decode_responses=True, # Return str instead of bytes  
147 - socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),  
148 - socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),  
149 - retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),  
150 - health_check_interval=10, # 避免复用坏连接  
151 - )  
152 - # Test connection  
153 - self.redis_client.ping()  
154 - expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360)  
155 - self.expire_time = timedelta(days=expire_days)  
156 - self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数  
157 - self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans')  
158 - logger.info("Redis cache initialized for translations")  
159 - except Exception as e:  
160 - logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache")  
161 - self.redis_client = None  
162 - self.cache = None 55 + self._qwen_client = OpenAI(api_key=self._api_key, base_url=base_url)
  56 + except Exception as exc:
  57 + logger.warning("Failed to initialize qwen-mt client: %s", exc, exc_info=True)
163 else: 58 else:
164 - self.redis_client = None  
165 - self.cache = None  
166 -  
167 - # Thread pool for async translation  
168 - self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") 59 + logger.warning("DASHSCOPE_API_KEY not set; qwen-mt translation unavailable")
  60 +
  61 + self.redis_client = None
  62 + if self.use_cache and bool(cache_cfg.get("enabled", True)):
  63 + self.redis_client = self._init_redis_client()
  64 +
  65 + @staticmethod
  66 + def _normalize_model(model: str) -> str:
  67 + m = (model or "qwen").strip().lower()
  68 + if m.startswith("qwen"):
  69 + return "qwen-mt"
  70 + raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'qwen-mt', 'qwen-mt-flash'")
  71 +
  72 + @staticmethod
  73 + def _resolve_qwen_model_name(model: str) -> str:
  74 + m = (model or "qwen").strip().lower()
  75 + if m in {"qwen", "qwen-mt"}:
  76 + return "qwen-mt-flash"
  77 + return m
  78 +
  79 + @staticmethod
  80 + def _default_api_key(model: str) -> Optional[str]:
  81 + del model
  82 + return DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
  83 +
  84 + def _init_redis_client(self):
  85 + try:
  86 + client = redis.Redis(
  87 + host=REDIS_CONFIG.get("host", "localhost"),
  88 + port=REDIS_CONFIG.get("port", 6479),
  89 + password=REDIS_CONFIG.get("password"),
  90 + decode_responses=True,
  91 + socket_timeout=REDIS_CONFIG.get("socket_timeout", 1),
  92 + socket_connect_timeout=REDIS_CONFIG.get("socket_connect_timeout", 1),
  93 + retry_on_timeout=REDIS_CONFIG.get("retry_on_timeout", False),
  94 + health_check_interval=10,
  95 + )
  96 + client.ping()
  97 + return client
  98 + except Exception as exc:
  99 + logger.warning("Failed to initialize translation redis cache: %s", exc)
  100 + return None
  101 +
  102 + def _build_cache_key(
  103 + self,
  104 + text: str,
  105 + target_lang: str,
  106 + source_lang: Optional[str],
  107 + context: Optional[str],
  108 + prompt: Optional[str],
  109 + ) -> str:
  110 + src = (source_lang or "auto").strip().lower() if self.cache_include_source_lang else "-"
  111 + tgt = (target_lang or "").strip().lower()
  112 + ctx = (context or "").strip() if self.cache_include_context else ""
  113 + prm = (prompt or "").strip() if self.cache_include_prompt else ""
  114 + payload = f"model={self.model}\nsrc={src}\ntgt={tgt}\nctx={ctx}\nprm={prm}\ntext={text}"
  115 + digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()
  116 + return f"{self.cache_prefix}:{self.model}:{src}:{tgt}:{digest}"
169 117
170 def translate( 118 def translate(
171 self, 119 self,
@@ -173,99 +121,27 @@ class Translator: @@ -173,99 +121,27 @@ class Translator:
173 target_lang: str, 121 target_lang: str,
174 source_lang: Optional[str] = None, 122 source_lang: Optional[str] = None,
175 context: Optional[str] = None, 123 context: Optional[str] = None,
176 - prompt: Optional[str] = None 124 + prompt: Optional[str] = None,
177 ) -> Optional[str]: 125 ) -> Optional[str]:
178 - """  
179 - Translate text to target language (synchronous mode).  
180 -  
181 - Args:  
182 - text: Text to translate  
183 - target_lang: Target language code ('zh', 'en', 'ru', etc.)  
184 - source_lang: Source language code (option al, auto-detect if None)  
185 - context: Additional context for translation (overrides default context)  
186 - prompt: Translation prompt/instruction (optional, for better translation quality)  
187 -  
188 - Returns:  
189 - Translated text or None if translation fails  
190 - """  
191 if not text or not text.strip(): 126 if not text or not text.strip():
192 return text 127 return text
193 128
194 - # Normalize language codes  
195 - target_lang = target_lang.lower()  
196 - if source_lang:  
197 - source_lang = source_lang.lower()  
198 -  
199 - # Optimization: Skip translation if not needed  
200 - if target_lang == 'en' and self._is_english_text(text):  
201 - logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") 129 + tgt = (target_lang or "").strip().lower()
  130 + src = (source_lang or "").strip().lower() or None
  131 + if tgt == "en" and self._is_english_text(text):
202 return text 132 return text
203 -  
204 - if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):  
205 - logger.info(  
206 - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "  
207 - f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)"  
208 - ) 133 + if tgt == "zh" and (self._contains_chinese(text) or self._is_pure_number(text)):
209 return text 134 return text
210 135
211 - # Use provided context or default context  
212 translation_context = context or self.translation_context 136 translation_context = context or self.translation_context
213 -  
214 - # Build cache key (include prompt in cache key if provided)  
215 - cache_key_parts = [source_lang or 'auto', target_lang, translation_context]  
216 - if prompt:  
217 - cache_key_parts.append(prompt)  
218 - cache_key_parts.append(text)  
219 - cache_key = ':'.join(cache_key_parts) 137 + cached = self._get_cached_translation_redis(text, tgt, src, translation_context, prompt)
  138 + if cached is not None:
  139 + return cached
220 140
221 - # Check cache (include context and prompt in cache key for accuracy)  
222 - if self.use_cache and self.redis_client:  
223 - cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt)  
224 - if cached:  
225 - logger.info(  
226 - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "  
227 - f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit"  
228 - )  
229 - return cached  
230 -  
231 - # If no API key, return mock translation (for testing)  
232 - if not self.api_key:  
233 - logger.info(  
234 - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "  
235 - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)"  
236 - )  
237 - return text  
238 -  
239 - # Translate using selected model  
240 - logger.info(  
241 - f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | "  
242 - f"Source language: {source_lang or 'auto'} | Context: {translation_context} | "  
243 - f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation"  
244 - )  
245 -  
246 - if self.model == 'qwen':  
247 - result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt)  
248 - else: # deepl  
249 - result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)  
250 -  
251 - # Surface translation failure to the caller instead of silently  
252 - # masquerading the source text as a successful translation.  
253 - if result is None:  
254 - logger.warning(  
255 - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "  
256 - f"Source language: {source_lang or 'auto'} | Status: Translation failed"  
257 - )  
258 - else:  
259 - logger.info(  
260 - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "  
261 - f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful"  
262 - )  
263 -  
264 - # Cache only successful translations. Failed attempts must not poison  
265 - # Redis with the original text.  
266 - if result is not None and self.use_cache and self.redis_client:  
267 - self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) 141 + result = self._translate_qwen(text, tgt, src)
268 142
  143 + if result is not None:
  144 + self._set_cached_translation_redis(text, tgt, result, src, translation_context, prompt)
269 return result 145 return result
270 146
271 def _translate_qwen( 147 def _translate_qwen(
@@ -273,412 +149,63 @@ class Translator: @@ -273,412 +149,63 @@ class Translator:
273 text: str, 149 text: str,
274 target_lang: str, 150 target_lang: str,
275 source_lang: Optional[str], 151 source_lang: Optional[str],
276 - context: Optional[str] = None,  
277 - prompt: Optional[str] = None  
278 ) -> Optional[str]: 152 ) -> Optional[str]:
279 - """  
280 - Translate using Qwen MT Flash model via Alibaba Cloud DashScope API.  
281 -  
282 - Args:  
283 - text: Text to translate  
284 - target_lang: Target language code ('zh', 'en', 'ru', etc.)  
285 - source_lang: Source language code (optional, 'auto' if None)  
286 - context: Context hint for translation (optional)  
287 - prompt: Translation prompt/instruction (optional)  
288 -  
289 - Returns:  
290 - Translated text or None if translation fails  
291 - """  
292 - if not self.qwen_client:  
293 - logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.") 153 + if not self._qwen_client:
294 return None 154 return None
295 -  
296 - # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping  
297 - # 标准来自:你提供的“语言 / 英文名 / 代码”表  
298 - qwen_lang_map = {  
299 - "en": "English",  
300 - "zh": "Chinese",  
301 - "zh_tw": "Traditional Chinese",  
302 - "ru": "Russian",  
303 - "ja": "Japanese",  
304 - "ko": "Korean",  
305 - "es": "Spanish",  
306 - "fr": "French",  
307 - "pt": "Portuguese",  
308 - "de": "German",  
309 - "it": "Italian",  
310 - "th": "Thai",  
311 - "vi": "Vietnamese",  
312 - "id": "Indonesian",  
313 - "ms": "Malay",  
314 - "ar": "Arabic",  
315 - "hi": "Hindi",  
316 - "he": "Hebrew",  
317 - "my": "Burmese",  
318 - "ta": "Tamil",  
319 - "ur": "Urdu",  
320 - "bn": "Bengali",  
321 - "pl": "Polish",  
322 - "nl": "Dutch",  
323 - "ro": "Romanian",  
324 - "tr": "Turkish",  
325 - "km": "Khmer",  
326 - "lo": "Lao",  
327 - "yue": "Cantonese",  
328 - "cs": "Czech",  
329 - "el": "Greek",  
330 - "sv": "Swedish",  
331 - "hu": "Hungarian",  
332 - "da": "Danish",  
333 - "fi": "Finnish",  
334 - "uk": "Ukrainian",  
335 - "bg": "Bulgarian",  
336 - }  
337 -  
338 - # Convert target language  
339 - target_lang_normalized = target_lang.lower()  
340 - target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize())  
341 -  
342 - # Convert source language  
343 - source_lang_normalized = (source_lang or "").strip().lower()  
344 - if not source_lang_normalized or source_lang_normalized == "auto":  
345 - source_lang_qwen = "auto"  
346 - else:  
347 - source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize())  
348 -  
349 - # Prepare translation options  
350 - translation_options = {  
351 - "source_lang": source_lang_qwen,  
352 - "target_lang": target_lang_qwen,  
353 - }  
354 -  
355 - # Prepare messages  
356 - messages = [  
357 - {  
358 - "role": "user",  
359 - "content": text  
360 - }  
361 - ]  
362 -  
363 - start_time = time.time() 155 + tgt_norm = (target_lang or "").strip().lower()
  156 + src_norm = (source_lang or "").strip().lower()
  157 + tgt_qwen = self.SOURCE_LANG_CODE_MAP.get(tgt_norm, tgt_norm.capitalize())
  158 + src_qwen = "auto" if not src_norm or src_norm == "auto" else self.SOURCE_LANG_CODE_MAP.get(src_norm, src_norm.capitalize())
  159 + start = time.time()
364 try: 160 try:
365 - completion = self.qwen_client.chat.completions.create(  
366 - model=self.QWEN_MODEL,  
367 - messages=messages, 161 + completion = self._qwen_client.chat.completions.create(
  162 + model=self.qwen_model_name,
  163 + messages=[{"role": "user", "content": text}],
368 extra_body={ 164 extra_body={
369 - "translation_options": translation_options  
370 - }  
371 - )  
372 -  
373 - translated_text = completion.choices[0].message.content.strip()  
374 - duration_ms = (time.time() - start_time) * 1000  
375 -  
376 - logger.info(  
377 - f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | "  
378 - f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms"  
379 - )  
380 - return translated_text  
381 -  
382 - except Exception as e:  
383 - duration_ms = (time.time() - start_time) * 1000  
384 - logger.error(  
385 - f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | "  
386 - f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True  
387 - )  
388 - return None  
389 -  
390 - def _translate_deepl(  
391 - self,  
392 - text: str,  
393 - target_lang: str,  
394 - source_lang: Optional[str],  
395 - context: Optional[str] = None,  
396 - prompt: Optional[str] = None  
397 - ) -> Optional[str]:  
398 - """  
399 - Translate using DeepL API with context and glossary support.  
400 -  
401 - Args:  
402 - text: Text to translate  
403 - target_lang: Target language code  
404 - source_lang: Source language code (optional)  
405 - context: Context hint for translation (e.g., "e-commerce product search")  
406 - """  
407 - # Map to DeepL language codes  
408 - target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())  
409 -  
410 - headers = {  
411 - "Authorization": f"DeepL-Auth-Key {self.api_key}",  
412 - "Content-Type": "application/json",  
413 - }  
414 -  
415 - # Use prompt as context parameter for DeepL API (not as text prefix)  
416 - # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"  
417 - # If prompt is provided, use it as context; otherwise use the default context  
418 - api_context = prompt if prompt else context  
419 -  
420 - # For e-commerce, add context words to help DeepL understand the domain  
421 - # This is especially important for single-word ambiguous terms like "车" (car vs rook)  
422 - text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)  
423 -  
424 - payload = {  
425 - "text": [text_to_translate],  
426 - "target_lang": target_code,  
427 - }  
428 -  
429 - if source_lang:  
430 - source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())  
431 - payload["source_lang"] = source_code  
432 -  
433 - # Add context parameter (prompt or default context)  
434 - # Context influences translation but is not translated itself  
435 - if api_context:  
436 - payload["context"] = api_context  
437 -  
438 - # Add glossary if configured  
439 - if self.glossary_id:  
440 - payload["glossary_id"] = self.glossary_id  
441 -  
442 - # Note: DeepL API v2 supports "context" parameter for additional context  
443 - # that influences translation but is not translated itself.  
444 - # We use prompt as context parameter when provided.  
445 -  
446 - try:  
447 - response = requests.post(  
448 - self.DEEPL_API_URL,  
449 - headers=headers,  
450 - json=payload,  
451 - timeout=self.timeout 165 + "translation_options": {
  166 + "source_lang": src_qwen,
  167 + "target_lang": tgt_qwen,
  168 + }
  169 + },
  170 + timeout=self.timeout,
452 ) 171 )
453 -  
454 - if response.status_code == 200:  
455 - data = response.json()  
456 - if "translations" in data and len(data["translations"]) > 0:  
457 - translated_text = data["translations"][0]["text"]  
458 - # If we added context, extract just the term from the result  
459 - if needs_extraction:  
460 - translated_text = self._extract_term_from_translation(  
461 - translated_text, text, target_code  
462 - )  
463 - logger.debug(  
464 - f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "  
465 - f"Translation result: '{translated_text}'"  
466 - )  
467 - return translated_text  
468 - else:  
469 - logger.error(  
470 - f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "  
471 - f"Status code: {response.status_code} | Error message: {response.text}"  
472 - ) 172 + content = (completion.choices[0].message.content or "").strip()
  173 + if not content:
473 return None 174 return None
474 -  
475 - except requests.Timeout: 175 + logger.info("[qwen-mt] Success | src=%s tgt=%s latency=%.1fms", src_qwen, tgt_qwen, (time.time() - start) * 1000)
  176 + return content
  177 + except Exception as exc:
476 logger.warning( 178 logger.warning(
477 - f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "  
478 - f"Timeout: {self.timeout}s"  
479 - )  
480 - return None  
481 - except Exception as e:  
482 - logger.error(  
483 - f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "  
484 - f"Error: {e}", exc_info=True 179 + "[qwen-mt] Failed | src=%s tgt=%s latency=%.1fms error=%s",
  180 + src_qwen,
  181 + tgt_qwen,
  182 + (time.time() - start) * 1000,
  183 + exc,
  184 + exc_info=True,
485 ) 185 )
486 return None 186 return None
487 187
488 - # NOTE: _translate_deepl_free is intentionally not implemented.  
489 - # We do not support automatic fallback to the free endpoint, to avoid  
490 - # mixing Pro keys with https://api-free.deepl.com and related 403 errors.  
491 -  
492 - def translate_multi(  
493 - self,  
494 - text: str,  
495 - target_langs: List[str],  
496 - source_lang: Optional[str] = None,  
497 - context: Optional[str] = None,  
498 - async_mode: bool = True,  
499 - prompt: Optional[str] = None  
500 - ) -> Dict[str, Optional[str]]:  
501 - """  
502 - Translate text to multiple target languages.  
503 -  
504 - In async_mode=True (default):  
505 - - Returns cached translations immediately if available  
506 - - For translations that can be optimized (e.g., pure numbers, already in target language),  
507 - returns result immediately via synchronous call  
508 - - Launches async tasks for other missing translations (non-blocking)  
509 - - Returns None for missing translations that require async processing  
510 -  
511 - In async_mode=False:  
512 - - Waits for all translations to complete (blocking)  
513 -  
514 - Args:  
515 - text: Text to translate  
516 - target_langs: List of target language codes  
517 - source_lang: Source language code (optional)  
518 - context: Context hint for translation (optional)  
519 - async_mode: If True, return cached results immediately and translate missing ones async  
520 - prompt: Translation prompt/instruction (optional)  
521 188
522 - Returns:  
523 - Dictionary mapping language code to translated text (only cached results in async mode)  
524 - """  
525 - results = {}  
526 - missing_langs = []  
527 - async_langs = []  
528 -  
529 - # First, get cached translations  
530 - for lang in target_langs:  
531 - cached = self._get_cached_translation(text, lang, source_lang, context, prompt)  
532 - if cached is not None:  
533 - results[lang] = cached  
534 - else:  
535 - missing_langs.append(lang)  
536 -  
537 - # If async mode and there are missing translations  
538 - if async_mode and missing_langs:  
539 - # Check if translation can be optimized (immediate return)  
540 - for lang in missing_langs:  
541 - target_lang = lang.lower()  
542 - # Check optimization conditions (same as in translate method)  
543 - can_optimize = False  
544 - if target_lang == 'en' and self._is_english_text(text):  
545 - can_optimize = True  
546 - elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):  
547 - can_optimize = True  
548 -  
549 - if can_optimize:  
550 - # Can be optimized, call translate synchronously for immediate result  
551 - results[lang] = self.translate(text, lang, source_lang, context, prompt)  
552 - else:  
553 - # Requires actual translation, add to async list  
554 - async_langs.append(lang)  
555 -  
556 - # Launch async tasks for translations that require actual API calls  
557 - if async_langs:  
558 - for lang in async_langs:  
559 - self._translate_async(text, lang, source_lang, context, prompt)  
560 - # Return None for async translations  
561 - for lang in async_langs:  
562 - results[lang] = None  
563 - else:  
564 - # Synchronous mode: wait for all translations  
565 - for lang in missing_langs:  
566 - results[lang] = self.translate(text, lang, source_lang, context, prompt)  
567 -  
568 - return results  
569 -  
570 - def translate_multi_async(  
571 - self,  
572 - text: str,  
573 - target_langs: List[str],  
574 - source_lang: Optional[str] = None,  
575 - context: Optional[str] = None,  
576 - prompt: Optional[str] = None  
577 - ) -> Dict[str, Union[str, Future]]:  
578 - """  
579 - Translate text to multiple target languages asynchronously, returning Futures that can be awaited.  
580 -  
581 - This method returns a dictionary where:  
582 - - If translation is cached, the value is the translation string (immediate)  
583 - - If translation needs to be done, the value is a Future object that can be awaited  
584 -  
585 - Args:  
586 - text: Text to translate  
587 - target_langs: List of target language codes  
588 - source_lang: Source language code (optional)  
589 - context: Context hint for translation (optional)  
590 - prompt: Translation prompt/instruction (optional)  
591 -  
592 - Returns:  
593 - Dictionary mapping language code to either translation string (cached) or Future object  
594 - """  
595 - results = {}  
596 - missing_langs = []  
597 -  
598 - # First, get cached translations  
599 - for lang in target_langs:  
600 - cached = self._get_cached_translation(text, lang, source_lang, context, prompt)  
601 - if cached is not None:  
602 - results[lang] = cached  
603 - else:  
604 - missing_langs.append(lang)  
605 -  
606 - # For missing translations, submit async tasks and return Futures  
607 - for lang in missing_langs:  
608 - future = self.executor.submit(  
609 - self.translate,  
610 - text,  
611 - lang,  
612 - source_lang,  
613 - context,  
614 - prompt  
615 - )  
616 - results[lang] = future  
617 -  
618 - return results  
619 -  
620 - def _get_cached_translation(  
621 - self,  
622 - text: str,  
623 - target_lang: str,  
624 - source_lang: Optional[str] = None,  
625 - context: Optional[str] = None,  
626 - prompt: Optional[str] = None  
627 - ) -> Optional[str]:  
628 - """Get translation from cache if available."""  
629 - if not self.redis_client:  
630 - return None  
631 - return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)  
632 -  
633 def _get_cached_translation_redis( 189 def _get_cached_translation_redis(
634 self, 190 self,
635 text: str, 191 text: str,
636 target_lang: str, 192 target_lang: str,
637 source_lang: Optional[str] = None, 193 source_lang: Optional[str] = None,
638 context: Optional[str] = None, 194 context: Optional[str] = None,
639 - prompt: Optional[str] = None 195 + prompt: Optional[str] = None,
640 ) -> Optional[str]: 196 ) -> Optional[str]:
641 - """  
642 - Get translation from Redis cache with sliding expiration.  
643 -  
644 - 滑动过期机制:每次访问缓存时,重置过期时间为配置的过期时间(默认720天)。  
645 - 这样缓存会在最后一次访问后的720天才过期,而不是写入后的720天。  
646 - 这确保了常用的翻译缓存不会被过早删除。  
647 - """  
648 if not self.redis_client: 197 if not self.redis_client:
649 return None 198 return None
650 - 199 + key = self._build_cache_key(text, target_lang, source_lang, context, prompt)
651 try: 200 try:
652 - # Build cache key: prefix:target_lang:text  
653 - # For simplicity, we use target_lang and text as key  
654 - # Context and prompt are not included in key to maximize cache hits  
655 - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"  
656 - value = self.redis_client.get(cache_key)  
657 - if value:  
658 - # Sliding expiration: reset expiration time on access  
659 - # 每次读取缓存时,重置过期时间为配置的过期时间(最后一次访问后的N天才过期)  
660 - try:  
661 - self.redis_client.expire(cache_key, self.expire_seconds)  
662 - except Exception as expire_error:  
663 - # 即使 expire 失败,也返回缓存值(不影响功能)  
664 - logger.warning(  
665 - f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}"  
666 - )  
667 -  
668 - logger.debug(  
669 - f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "  
670 - f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s"  
671 - )  
672 - return value  
673 - logger.debug(  
674 - f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "  
675 - f"Cache key: {cache_key}"  
676 - ) 201 + value = self.redis_client.get(key)
  202 + if value and self.cache_sliding_expiration:
  203 + self.redis_client.expire(key, self.expire_seconds)
  204 + return value
  205 + except Exception as exc:
  206 + logger.warning("Redis get translation cache failed: %s", exc)
677 return None 207 return None
678 - except Exception as e:  
679 - logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}")  
680 - return None  
681 - 208 +
682 def _set_cached_translation_redis( 209 def _set_cached_translation_redis(
683 self, 210 self,
684 text: str, 211 text: str,
@@ -686,128 +213,17 @@ class Translator: @@ -686,128 +213,17 @@ class Translator:
686 translation: str, 213 translation: str,
687 source_lang: Optional[str] = None, 214 source_lang: Optional[str] = None,
688 context: Optional[str] = None, 215 context: Optional[str] = None,
689 - prompt: Optional[str] = None 216 + prompt: Optional[str] = None,
690 ) -> None: 217 ) -> None:
691 - """Store translation in Redis cache."""  
692 if not self.redis_client: 218 if not self.redis_client:
693 return 219 return
694 - 220 + key = self._build_cache_key(text, target_lang, source_lang, context, prompt)
695 try: 221 try:
696 - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"  
697 - self.redis_client.setex(cache_key, self.expire_seconds, translation)  
698 - logger.info(  
699 - f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | "  
700 - f"Cache key: {cache_key} | Translation result: '{translation}'"  
701 - )  
702 - except Exception as e:  
703 - logger.error(  
704 - f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | "  
705 - f"Error: {e}"  
706 - )  
707 -  
708 - def _translate_async(  
709 - self,  
710 - text: str,  
711 - target_lang: str,  
712 - source_lang: Optional[str] = None,  
713 - context: Optional[str] = None,  
714 - prompt: Optional[str] = None  
715 - ):  
716 - """Launch async translation task."""  
717 - def _do_translate():  
718 - try:  
719 - result = self.translate(text, target_lang, source_lang, context, prompt)  
720 - if result:  
721 - logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")  
722 - except Exception as e:  
723 - logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}")  
724 -  
725 - self.executor.submit(_do_translate)  
726 -  
727 - def _add_ecommerce_context(  
728 - self,  
729 - text: str,  
730 - source_lang: Optional[str],  
731 - context: Optional[str]  
732 - ) -> tuple:  
733 - """  
734 - Add e-commerce context to text for better disambiguation.  
735 -  
736 - For single-word ambiguous Chinese terms, we add context words that help  
737 - DeepL understand this is an e-commerce/product search context.  
738 -  
739 - Args:  
740 - text: Original text to translate  
741 - source_lang: Source language code  
742 - context: Context hint  
743 -  
744 - Returns:  
745 - Tuple of (text_with_context, needs_extraction)  
746 - - text_with_context: Text to send to DeepL  
747 - - needs_extraction: Whether we need to extract the term from the result  
748 - """  
749 - # Only apply for e-commerce context and Chinese source  
750 - if not context or "e-commerce" not in context.lower():  
751 - return text, False  
752 -  
753 - if not source_lang or source_lang.lower() != 'zh':  
754 - return text, False  
755 -  
756 - # For single-word queries, add context to help disambiguation  
757 - text_stripped = text.strip()  
758 - if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:  
759 - # Common ambiguous Chinese e-commerce terms like "车" (car vs rook)  
760 - # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term])  
761 - # This helps DeepL understand the e-commerce context  
762 - # We'll need to extract just the term from the translation result  
763 - context_phrase = f"购买 {text_stripped}"  
764 - return context_phrase, True  
765 -  
766 - # For multi-word queries, DeepL usually has enough context  
767 - return text, False  
768 -  
769 - def _extract_term_from_translation(  
770 - self,  
771 - translated_text: str,  
772 - original_text: str,  
773 - target_lang_code: str  
774 - ) -> str:  
775 - """  
776 - Extract the actual term from a translation that included context.  
777 -  
778 - For example, if we translated "购买 车" (buy car) and got "buy car",  
779 - we want to extract just "car".  
780 -  
781 - Args:  
782 - translated_text: Full translation result  
783 - original_text: Original single-word query  
784 - target_lang_code: Target language code (EN, ZH, etc.)  
785 -  
786 - Returns:  
787 - Extracted term or original translation if extraction fails  
788 - """  
789 - # For English target, try to extract the last word (the actual term)  
790 - if target_lang_code == "EN":  
791 - words = translated_text.strip().split()  
792 - if len(words) > 1:  
793 - # Usually the last word is the term we want  
794 - # But we need to be smart - if it's "buy car", we want "car"  
795 - # Common context words to skip: buy, purchase, product, item, etc.  
796 - context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}  
797 - # Try to find the term (not a context word)  
798 - for word in reversed(words):  
799 - word_lower = word.lower().rstrip('.,!?;:')  
800 - if word_lower not in context_words:  
801 - return word_lower  
802 - # If all words are context words, return the last one  
803 - return words[-1].lower().rstrip('.,!?;:')  
804 -  
805 - # For other languages or if extraction fails, return as-is  
806 - # The user can configure a glossary for better results  
807 - return translated_text 222 + self.redis_client.setex(key, self.expire_seconds, translation)
  223 + except Exception as exc:
  224 + logger.warning("Redis set translation cache failed: %s", exc)
808 225
809 def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool: 226 def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool:
810 - """True if shop language matches index language (use source, no translate)."""  
811 if not shop_lang_lower or not lang_code: 227 if not shop_lang_lower or not lang_code:
812 return False 228 return False
813 if shop_lang_lower == lang_code: 229 if shop_lang_lower == lang_code:
@@ -818,146 +234,27 @@ class Translator: @@ -818,146 +234,27 @@ class Translator:
818 return True 234 return True
819 return False 235 return False
820 236
821 - def translate_for_indexing(  
822 - self,  
823 - text: str,  
824 - shop_language: str,  
825 - source_lang: Optional[str] = None,  
826 - context: Optional[str] = None,  
827 - prompt: Optional[str] = None,  
828 - index_languages: Optional[List[str]] = None,  
829 - ) -> Dict[str, Optional[str]]:  
830 - """  
831 - Translate text for indexing based on shop language and tenant index_languages.  
832 -  
833 - For each language in index_languages: use source text if shop language matches,  
834 - otherwise translate to that language.  
835 -  
836 - Args:  
837 - text: Text to translate  
838 - shop_language: Shop primary language (e.g. 'zh', 'en', 'ru')  
839 - source_lang: Source language code (optional)  
840 - context: Additional context for translation (optional)  
841 - prompt: Translation prompt (optional)  
842 - index_languages: Languages to index (from tenant_config). Default ["en", "zh"].  
843 -  
844 - Returns:  
845 - Dict keyed by each index_language with translated or source text (or None).  
846 - """  
847 - langs = index_languages if index_languages else ["en", "zh"]  
848 - results = {lang: None for lang in langs}  
849 - if not text or not text.strip():  
850 - return results  
851 - if re.match(r'^[\d\s_-]+$', text):  
852 - logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")  
853 - return results  
854 -  
855 - shop_lang_lower = (shop_language or "").strip().lower()  
856 - targets = []  
857 - for lang in langs:  
858 - if self._shop_lang_matches(shop_lang_lower, lang):  
859 - results[lang] = text  
860 - else:  
861 - targets.append(lang)  
862 -  
863 - for target_lang in targets:  
864 - cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)  
865 - if cached:  
866 - results[target_lang] = cached  
867 - logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}")  
868 - continue  
869 - translated = self.translate(  
870 - text,  
871 - target_lang=target_lang,  
872 - source_lang=source_lang or shop_language,  
873 - context=context,  
874 - prompt=prompt,  
875 - )  
876 - results[target_lang] = translated  
877 - return results  
878 -  
879 - def get_translation_needs(  
880 - self,  
881 - detected_lang: str,  
882 - supported_langs: List[str]  
883 - ) -> List[str]:  
884 - """  
885 - Determine which languages need translation.  
886 -  
887 - Args:  
888 - detected_lang: Detected query language  
889 - supported_langs: List of supported languages  
890 -  
891 - Returns:  
892 - List of language codes to translate to  
893 - """  
894 - # If detected language is in supported list, translate to others 237 + def get_translation_needs(self, detected_lang: str, supported_langs: List[str]) -> List[str]:
895 if detected_lang in supported_langs: 238 if detected_lang in supported_langs:
896 - return [lang for lang in supported_langs if detected_lang != lang]  
897 -  
898 - # Otherwise, translate to all supported languages 239 + return [lang for lang in supported_langs if lang != detected_lang]
899 return supported_langs 240 return supported_langs
900 - 241 +
901 def _is_english_text(self, text: str) -> bool: 242 def _is_english_text(self, text: str) -> bool:
902 - """  
903 - Check if text is primarily English (ASCII letters, numbers, common punctuation).  
904 -  
905 - Args:  
906 - text: Text to check  
907 -  
908 - Returns:  
909 - True if text appears to be English  
910 - """  
911 if not text or not text.strip(): 243 if not text or not text.strip():
912 return True 244 return True
913 -  
914 - # Remove whitespace and common punctuation  
915 - text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) 245 + text_clean = re.sub(r"[\s\.,!?;:\-\'\"\(\)\[\]{}]", "", text)
916 if not text_clean: 246 if not text_clean:
917 return True 247 return True
918 -  
919 - # Check if all remaining characters are ASCII (letters, numbers)  
920 - # This is a simple heuristic: if most characters are ASCII, it's likely English  
921 ascii_count = sum(1 for c in text_clean if ord(c) < 128) 248 ascii_count = sum(1 for c in text_clean if ord(c) < 128)
922 - ratio = ascii_count / len(text_clean) if text_clean else 0  
923 -  
924 - # If more than 80% are ASCII characters, consider it English  
925 - return ratio > 0.8  
926 - 249 + return (ascii_count / len(text_clean)) > 0.8
  250 +
927 def _contains_chinese(self, text: str) -> bool: 251 def _contains_chinese(self, text: str) -> bool:
928 - """  
929 - Check if text contains Chinese characters (Han characters).  
930 -  
931 - Args:  
932 - text: Text to check  
933 -  
934 - Returns:  
935 - True if text contains Chinese characters  
936 - """  
937 if not text: 252 if not text:
938 return False 253 return False
939 -  
940 - # Check for Chinese characters (Unicode range: \u4e00-\u9fff)  
941 - chinese_pattern = re.compile(r'[\u4e00-\u9fff]')  
942 - return bool(chinese_pattern.search(text))  
943 - 254 + return bool(re.search(r"[\u4e00-\u9fff]", text))
  255 +
944 def _is_pure_number(self, text: str) -> bool: 256 def _is_pure_number(self, text: str) -> bool:
945 - """  
946 - Check if text is purely numeric (digits, possibly with spaces, dots, commas).  
947 -  
948 - Args:  
949 - text: Text to check  
950 -  
951 - Returns:  
952 - True if text is purely numeric  
953 - """  
954 if not text or not text.strip(): 257 if not text or not text.strip():
955 return False 258 return False
956 -  
957 - # Remove whitespace, dots, commas (common number separators)  
958 - text_clean = re.sub(r'[\s\.,]', '', text.strip())  
959 - if not text_clean:  
960 - return False  
961 -  
962 - # Check if all remaining characters are digits  
963 - return text_clean.isdigit() 259 + text_clean = re.sub(r"[\s\.,]", "", text.strip())
  260 + return bool(text_clean) and text_clean.isdigit()
query/test_translation.py
@@ -14,6 +14,7 @@ Test content: @@ -14,6 +14,7 @@ Test content:
14 import sys 14 import sys
15 import os 15 import os
16 from pathlib import Path 16 from pathlib import Path
  17 +from concurrent.futures import ThreadPoolExecutor
17 18
18 # Add parent directory to path 19 # Add parent directory to path
19 sys.path.insert(0, str(Path(__file__).parent.parent)) 20 sys.path.insert(0, str(Path(__file__).parent.parent))
@@ -42,9 +43,6 @@ def test_config_loading(): @@ -42,9 +43,6 @@ def test_config_loading():
42 43
43 print(f"✓ Configuration loaded successfully") 44 print(f"✓ Configuration loaded successfully")
44 print(f" Translation service: {config.query_config.translation_service}") 45 print(f" Translation service: {config.query_config.translation_service}")
45 - print(f" Translation prompt configuration:")  
46 - for key, value in config.query_config.translation_prompts.items():  
47 - print(f" {key}: {value[:60]}..." if len(value) > 60 else f" {key}: {value}")  
48 46
49 return config 47 return config
50 except Exception as e: 48 except Exception as e:
@@ -72,34 +70,23 @@ def test_translator_sync(config): @@ -72,34 +70,23 @@ def test_translator_sync(config):
72 translation_context=config.query_config.translation_context 70 translation_context=config.query_config.translation_context
73 ) 71 )
74 72
75 - # 测试商品标题翻译(使用product_title提示词) 73 + # 测试商品标题翻译(使用sku_name提示词)
76 test_texts = [ 74 test_texts = [
77 - ("蓝牙耳机", "zh", "en", "product_title"),  
78 - ("Wireless Headphones", "en", "zh", "product_title"), 75 + ("蓝牙耳机", "zh", "en", "sku_name"),
  76 + ("Wireless Headphones", "en", "zh", "sku_name"),
79 ] 77 ]
80 78
81 - for text, source_lang, target_lang, prompt_type in test_texts:  
82 - if prompt_type == "product_title":  
83 - if target_lang == "zh":  
84 - prompt = config.query_config.translation_prompts.get('product_title_zh')  
85 - else:  
86 - prompt = config.query_config.translation_prompts.get('product_title_en')  
87 - else:  
88 - if target_lang == "zh":  
89 - prompt = config.query_config.translation_prompts.get('default_zh')  
90 - else:  
91 - prompt = config.query_config.translation_prompts.get('default_en')  
92 - 79 + for text, source_lang, target_lang, scene in test_texts:
93 print(f"\nTranslation test:") 80 print(f"\nTranslation test:")
94 print(f" Original text ({source_lang}): {text}") 81 print(f" Original text ({source_lang}): {text}")
95 print(f" Target language: {target_lang}") 82 print(f" Target language: {target_lang}")
96 - print(f" Prompt: {prompt[:50] if prompt else 'None'}...") 83 + print(f" Scene: {scene}")
97 84
98 result = translator.translate( 85 result = translator.translate(
99 text, 86 text,
100 target_lang=target_lang, 87 target_lang=target_lang,
101 source_lang=source_lang, 88 source_lang=source_lang,
102 - prompt=prompt 89 + context=scene,
103 ) 90 )
104 91
105 if result: 92 if result:
@@ -131,43 +118,25 @@ def test_translator_async(config, translator): @@ -131,43 +118,25 @@ def test_translator_async(config, translator):
131 query_text = "手机" 118 query_text = "手机"
132 target_langs = ['en'] 119 target_langs = ['en']
133 source_lang = 'zh' 120 source_lang = 'zh'
134 -  
135 - query_prompt = config.query_config.translation_prompts.get('query_zh')  
136 - 121 +
137 print(f"Query text: {query_text}") 122 print(f"Query text: {query_text}")
138 print(f"Target languages: {target_langs}") 123 print(f"Target languages: {target_langs}")
139 - print(f"Prompt: {query_prompt}")  
140 -  
141 - # 异步模式(立即返回,后台翻译)  
142 - results = translator.translate_multi(  
143 - query_text,  
144 - target_langs,  
145 - source_lang=source_lang,  
146 - context=config.query_config.translation_context,  
147 - async_mode=True,  
148 - prompt=query_prompt  
149 - )  
150 -  
151 - print(f"\nAsynchronous translation results:")  
152 - for lang, translation in results.items():  
153 - if translation:  
154 - print(f" {lang}: {translation} (cache hit)")  
155 - else:  
156 - print(f" {lang}: None (translating in background...)")  
157 -  
158 - # 同步模式(等待完成)  
159 - print(f"\nSynchronous translation (waiting for completion):")  
160 - results_sync = translator.translate_multi(  
161 - query_text,  
162 - target_langs,  
163 - source_lang=source_lang,  
164 - context=config.query_config.translation_context,  
165 - async_mode=False,  
166 - prompt=query_prompt  
167 - ) 124 + print("Scene: ecommerce_search_query")
168 125
169 - for lang, translation in results_sync.items():  
170 - print(f" {lang}: {translation}") 126 + print(f"\nConcurrent translation via generic translate():")
  127 + with ThreadPoolExecutor(max_workers=len(target_langs)) as executor:
  128 + futures = {
  129 + lang: executor.submit(
  130 + translator.translate,
  131 + query_text,
  132 + lang,
  133 + source_lang,
  134 + "ecommerce_search_query",
  135 + )
  136 + for lang in target_langs
  137 + }
  138 + for lang, future in futures.items():
  139 + print(f" {lang}: {future.result()}")
171 140
172 except Exception as e: 141 except Exception as e:
173 print(f"✗ Asynchronous translation test failed: {e}") 142 print(f"✗ Asynchronous translation test failed: {e}")
@@ -193,14 +162,13 @@ def test_cache(): @@ -193,14 +162,13 @@ def test_cache():
193 test_text = "测试文本" 162 test_text = "测试文本"
194 target_lang = "en" 163 target_lang = "en"
195 source_lang = "zh" 164 source_lang = "zh"
196 - prompt = config.query_config.translation_prompts.get('default_zh')  
197 165
198 print(f"First translation (should call API or return mock):") 166 print(f"First translation (should call API or return mock):")
199 - result1 = translator.translate(test_text, target_lang, source_lang, prompt=prompt) 167 + result1 = translator.translate(test_text, target_lang, source_lang, context="default")
200 print(f" Result: {result1}") 168 print(f" Result: {result1}")
201 169
202 print(f"\nSecond translation (should use cache):") 170 print(f"\nSecond translation (should use cache):")
203 - result2 = translator.translate(test_text, target_lang, source_lang, prompt=prompt) 171 + result2 = translator.translate(test_text, target_lang, source_lang, context="default")
204 print(f" Result: {result2}") 172 print(f" Result: {result2}")
205 173
206 if result1 == result2: 174 if result1 == result2:
@@ -231,17 +199,16 @@ def test_context_parameter(): @@ -231,17 +199,16 @@ def test_context_parameter():
231 199
232 # 测试带context和不带context的翻译 200 # 测试带context和不带context的翻译
233 text = "手机" 201 text = "手机"
234 - prompt = config.query_config.translation_prompts.get('query_zh')  
235 202
236 print(f"Test text: {text}") 203 print(f"Test text: {text}")
237 - print(f"Prompt (as context): {prompt}") 204 + print("Scene: ecommerce_search_query")
238 205
239 # 带context的翻译 206 # 带context的翻译
240 result_with_context = translator.translate( 207 result_with_context = translator.translate(
241 text, 208 text,
242 target_lang='en', 209 target_lang='en',
243 source_lang='zh', 210 source_lang='zh',
244 - prompt=prompt 211 + context="ecommerce_search_query",
245 ) 212 )
246 print(f"\nTranslation result with context: {result_with_context}") 213 print(f"\nTranslation result with context: {result_with_context}")
247 214
query/translator.py deleted
@@ -1,963 +0,0 @@ @@ -1,963 +0,0 @@
1 -"""  
2 -Translation service for multi-language query support.  
3 -  
4 -Supports multiple translation models:  
5 -- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model  
6 -- DeepL: DeepL API for high-quality translations  
7 -  
8 -重要说明(Qwen 机翻限速):  
9 -- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)**  
10 -- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流  
11 -- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端  
12 -  
13 -使用方法 (Usage):  
14 -  
15 -```python  
16 -from query.qwen_mt_translate import Translator  
17 -  
18 -# 使用默认的 qwen 模型(推荐)  
19 -translator = Translator() # 默认使用 qwen 模型  
20 -  
21 -# 或显式指定模型  
22 -translator = Translator(model='qwen') # 使用 qwen 模型  
23 -translator = Translator(model='deepl') # 使用 DeepL 模型  
24 -  
25 -# 翻译文本  
26 -result = translator.translate(  
27 - text="我看到这个视频后没有笑",  
28 - target_lang="en",  
29 - source_lang="auto" # 自动检测源语言  
30 -)  
31 -```  
32 -  
33 -配置说明 (Configuration):  
34 -- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中)  
35 -- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中)  
36 -  
37 -Qwen 模型参考文档:  
38 -- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key  
39 -- 模型:qwen-mt-flash(快速翻译模型)  
40 -  
41 -DeepL 官方文档:  
42 -https://developers.deepl.com/api-reference/translate/request-translation  
43 -"""  
44 -  
45 -import os  
46 -import requests  
47 -import re  
48 -import redis  
49 -from concurrent.futures import ThreadPoolExecutor, Future  
50 -from datetime import timedelta  
51 -from typing import Dict, List, Optional, Union  
52 -import logging  
53 -import time  
54 -  
55 -logger = logging.getLogger(__name__)  
56 -  
57 -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG  
58 -from openai import OpenAI  
59 -  
60 -  
61 -class Translator:  
62 - """  
63 - Multi-language translator supporting Qwen and DeepL APIs.  
64 -  
65 - Default model is 'qwen' which uses Alibaba Cloud DashScope API.  
66 - """  
67 -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1  
68 -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1  
69 -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1  
70 -  
71 - DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier  
 72 -    QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"  # 美国(弗吉尼亚)地域  
73 - # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡  
74 - # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1  
75 - QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型  
76 -  
77 - # Language code mapping  
78 - LANG_CODE_MAP = {  
79 - 'zh': 'ZH',  
80 - 'en': 'EN',  
81 - 'ru': 'RU',  
82 - 'ar': 'AR',  
83 - 'ja': 'JA',  
84 - 'es': 'ES',  
85 - 'de': 'DE',  
86 - 'fr': 'FR',  
87 - 'it': 'IT',  
88 - 'pt': 'PT',  
89 - }  
90 -  
91 - def __init__(  
92 - self,  
93 - model: str = "qwen",  
94 - api_key: Optional[str] = None,  
95 - use_cache: bool = True,  
96 - timeout: int = 10,  
97 - glossary_id: Optional[str] = None,  
98 - translation_context: Optional[str] = None  
99 - ):  
100 - """  
101 - Initialize translator.  
102 -  
103 - Args:  
104 - model: Translation model to use. Options: 'qwen' (default) or 'deepl'  
105 - api_key: API key for the selected model (or None to use from config/env)  
106 - use_cache: Whether to cache translations  
107 - timeout: Request timeout in seconds  
108 - glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL)  
109 - translation_context: Context hint for translation (e.g., "e-commerce", "product search")  
110 - """  
111 - self.model = model.lower()  
112 - if self.model not in ['qwen', 'deepl']:  
113 - raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'")  
114 -  
115 - # Get API key from config if not provided  
116 - if api_key is None:  
117 - if self.model == 'qwen':  
118 - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")  
119 - else: # deepl  
120 - api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY")  
121 -  
122 - self.api_key = api_key  
123 - self.timeout = timeout  
124 - self.use_cache = use_cache  
125 - self.glossary_id = glossary_id  
126 - self.translation_context = translation_context or "e-commerce product search"  
127 -  
128 - # Initialize OpenAI client for Qwen if needed  
129 - self.qwen_client = None  
130 - if self.model == 'qwen':  
131 - if not self.api_key:  
132 - logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.")  
133 - else:  
134 - self.qwen_client = OpenAI(  
135 - api_key=self.api_key,  
136 - base_url=self.QWEN_BASE_URL,  
137 - )  
138 -  
139 - # Initialize Redis cache if enabled  
140 - if use_cache:  
141 - try:  
142 - self.redis_client = redis.Redis(  
143 - host=REDIS_CONFIG.get('host', 'localhost'),  
144 - port=REDIS_CONFIG.get('port', 6479),  
145 - password=REDIS_CONFIG.get('password'),  
146 - decode_responses=True, # Return str instead of bytes  
147 - socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),  
148 - socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),  
149 - retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),  
150 - health_check_interval=10, # 避免复用坏连接  
151 - )  
152 - # Test connection  
153 - self.redis_client.ping()  
154 - expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360)  
155 - self.expire_time = timedelta(days=expire_days)  
156 - self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数  
157 - self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans')  
158 - logger.info("Redis cache initialized for translations")  
159 - except Exception as e:  
160 - logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache")  
161 - self.redis_client = None  
162 - self.cache = None  
163 - else:  
164 - self.redis_client = None  
165 - self.cache = None  
166 -  
167 - # Thread pool for async translation  
168 - self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")  
169 -  
170 - def translate(  
171 - self,  
172 - text: str,  
173 - target_lang: str,  
174 - source_lang: Optional[str] = None,  
175 - context: Optional[str] = None,  
176 - prompt: Optional[str] = None  
177 - ) -> Optional[str]:  
178 - """  
179 - Translate text to target language (synchronous mode).  
180 -  
181 - Args:  
182 - text: Text to translate  
183 - target_lang: Target language code ('zh', 'en', 'ru', etc.)  
184 -            source_lang: Source language code (optional, auto-detect if None)  
185 - context: Additional context for translation (overrides default context)  
186 - prompt: Translation prompt/instruction (optional, for better translation quality)  
187 -  
188 - Returns:  
189 - Translated text or None if translation fails  
190 - """  
191 - if not text or not text.strip():  
192 - return text  
193 -  
194 - # Normalize language codes  
195 - target_lang = target_lang.lower()  
196 - if source_lang:  
197 - source_lang = source_lang.lower()  
198 -  
199 - # Optimization: Skip translation if not needed  
200 - if target_lang == 'en' and self._is_english_text(text):  
201 - logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'")  
202 - return text  
203 -  
204 - if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):  
205 - logger.info(  
206 - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "  
207 - f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)"  
208 - )  
209 - return text  
210 -  
211 - # Use provided context or default context  
212 - translation_context = context or self.translation_context  
213 -  
214 - # Build cache key (include prompt in cache key if provided)  
215 - cache_key_parts = [source_lang or 'auto', target_lang, translation_context]  
216 - if prompt:  
217 - cache_key_parts.append(prompt)  
218 - cache_key_parts.append(text)  
219 - cache_key = ':'.join(cache_key_parts)  
220 -  
221 - # Check cache (include context and prompt in cache key for accuracy)  
222 - if self.use_cache and self.redis_client:  
223 - cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt)  
224 - if cached:  
225 - logger.info(  
226 - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "  
227 - f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit"  
228 - )  
229 - return cached  
230 -  
231 - # If no API key, return mock translation (for testing)  
232 - if not self.api_key:  
233 - logger.info(  
234 - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "  
235 - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)"  
236 - )  
237 - return text  
238 -  
239 - # Translate using selected model  
240 - logger.info(  
241 - f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | "  
242 - f"Source language: {source_lang or 'auto'} | Context: {translation_context} | "  
243 - f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation"  
244 - )  
245 -  
246 - if self.model == 'qwen':  
247 - result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt)  
248 - else: # deepl  
249 - result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)  
250 -  
251 - # Surface translation failure to the caller instead of silently  
252 - # masquerading the source text as a successful translation.  
253 - if result is None:  
254 - logger.warning(  
255 - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "  
256 - f"Source language: {source_lang or 'auto'} | Status: Translation failed"  
257 - )  
258 - else:  
259 - logger.info(  
260 - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "  
261 - f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful"  
262 - )  
263 -  
264 - # Cache only successful translations. Failed attempts must not poison  
265 - # Redis with the original text.  
266 - if result is not None and self.use_cache and self.redis_client:  
267 - self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt)  
268 -  
269 - return result  
270 -  
271 - def _translate_qwen(  
272 - self,  
273 - text: str,  
274 - target_lang: str,  
275 - source_lang: Optional[str],  
276 - context: Optional[str] = None,  
277 - prompt: Optional[str] = None  
278 - ) -> Optional[str]:  
279 - """  
280 - Translate using Qwen MT Flash model via Alibaba Cloud DashScope API.  
281 -  
282 - Args:  
283 - text: Text to translate  
284 - target_lang: Target language code ('zh', 'en', 'ru', etc.)  
285 - source_lang: Source language code (optional, 'auto' if None)  
286 - context: Context hint for translation (optional)  
287 - prompt: Translation prompt/instruction (optional)  
288 -  
289 - Returns:  
290 - Translated text or None if translation fails  
291 - """  
292 - if not self.qwen_client:  
293 - logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.")  
294 - return None  
295 -  
296 - # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping  
297 - # 标准来自:你提供的“语言 / 英文名 / 代码”表  
298 - qwen_lang_map = {  
299 - "en": "English",  
300 - "zh": "Chinese",  
301 - "zh_tw": "Traditional Chinese",  
302 - "ru": "Russian",  
303 - "ja": "Japanese",  
304 - "ko": "Korean",  
305 - "es": "Spanish",  
306 - "fr": "French",  
307 - "pt": "Portuguese",  
308 - "de": "German",  
309 - "it": "Italian",  
310 - "th": "Thai",  
311 - "vi": "Vietnamese",  
312 - "id": "Indonesian",  
313 - "ms": "Malay",  
314 - "ar": "Arabic",  
315 - "hi": "Hindi",  
316 - "he": "Hebrew",  
317 - "my": "Burmese",  
318 - "ta": "Tamil",  
319 - "ur": "Urdu",  
320 - "bn": "Bengali",  
321 - "pl": "Polish",  
322 - "nl": "Dutch",  
323 - "ro": "Romanian",  
324 - "tr": "Turkish",  
325 - "km": "Khmer",  
326 - "lo": "Lao",  
327 - "yue": "Cantonese",  
328 - "cs": "Czech",  
329 - "el": "Greek",  
330 - "sv": "Swedish",  
331 - "hu": "Hungarian",  
332 - "da": "Danish",  
333 - "fi": "Finnish",  
334 - "uk": "Ukrainian",  
335 - "bg": "Bulgarian",  
336 - }  
337 -  
338 - # Convert target language  
339 - target_lang_normalized = target_lang.lower()  
340 - target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize())  
341 -  
342 - # Convert source language  
343 - source_lang_normalized = (source_lang or "").strip().lower()  
344 - if not source_lang_normalized or source_lang_normalized == "auto":  
345 - source_lang_qwen = "auto"  
346 - else:  
347 - source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize())  
348 -  
349 - # Prepare translation options  
350 - translation_options = {  
351 - "source_lang": source_lang_qwen,  
352 - "target_lang": target_lang_qwen,  
353 - }  
354 -  
355 - # Prepare messages  
356 - messages = [  
357 - {  
358 - "role": "user",  
359 - "content": text  
360 - }  
361 - ]  
362 -  
363 - start_time = time.time()  
364 - try:  
365 - completion = self.qwen_client.chat.completions.create(  
366 - model=self.QWEN_MODEL,  
367 - messages=messages,  
368 - extra_body={  
369 - "translation_options": translation_options  
370 - }  
371 - )  
372 -  
373 - translated_text = completion.choices[0].message.content.strip()  
374 - duration_ms = (time.time() - start_time) * 1000  
375 -  
376 - logger.info(  
377 - f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | "  
378 - f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms"  
379 - )  
380 - return translated_text  
381 -  
382 - except Exception as e:  
383 - duration_ms = (time.time() - start_time) * 1000  
384 - logger.error(  
385 - f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | "  
386 - f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True  
387 - )  
388 - return None  
389 -  
390 - def _translate_deepl(  
391 - self,  
392 - text: str,  
393 - target_lang: str,  
394 - source_lang: Optional[str],  
395 - context: Optional[str] = None,  
396 - prompt: Optional[str] = None  
397 - ) -> Optional[str]:  
398 - """  
399 - Translate using DeepL API with context and glossary support.  
400 -  
401 - Args:  
402 - text: Text to translate  
403 - target_lang: Target language code  
404 - source_lang: Source language code (optional)  
405 - context: Context hint for translation (e.g., "e-commerce product search")  
406 - """  
407 - # Map to DeepL language codes  
408 - target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())  
409 -  
410 - headers = {  
411 - "Authorization": f"DeepL-Auth-Key {self.api_key}",  
412 - "Content-Type": "application/json",  
413 - }  
414 -  
415 - # Use prompt as context parameter for DeepL API (not as text prefix)  
416 - # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"  
417 - # If prompt is provided, use it as context; otherwise use the default context  
418 - api_context = prompt if prompt else context  
419 -  
420 - # For e-commerce, add context words to help DeepL understand the domain  
421 - # This is especially important for single-word ambiguous terms like "车" (car vs rook)  
422 - text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)  
423 -  
424 - payload = {  
425 - "text": [text_to_translate],  
426 - "target_lang": target_code,  
427 - }  
428 -  
429 - if source_lang:  
430 - source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())  
431 - payload["source_lang"] = source_code  
432 -  
433 - # Add context parameter (prompt or default context)  
434 - # Context influences translation but is not translated itself  
435 - if api_context:  
436 - payload["context"] = api_context  
437 -  
438 - # Add glossary if configured  
439 - if self.glossary_id:  
440 - payload["glossary_id"] = self.glossary_id  
441 -  
442 - # Note: DeepL API v2 supports "context" parameter for additional context  
443 - # that influences translation but is not translated itself.  
444 - # We use prompt as context parameter when provided.  
445 -  
446 - try:  
447 - response = requests.post(  
448 - self.DEEPL_API_URL,  
449 - headers=headers,  
450 - json=payload,  
451 - timeout=self.timeout  
452 - )  
453 -  
454 - if response.status_code == 200:  
455 - data = response.json()  
456 - if "translations" in data and len(data["translations"]) > 0:  
457 - translated_text = data["translations"][0]["text"]  
458 - # If we added context, extract just the term from the result  
459 - if needs_extraction:  
460 - translated_text = self._extract_term_from_translation(  
461 - translated_text, text, target_code  
462 - )  
463 - logger.debug(  
464 - f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "  
465 - f"Translation result: '{translated_text}'"  
466 - )  
467 - return translated_text  
468 - else:  
469 - logger.error(  
470 - f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "  
471 - f"Status code: {response.status_code} | Error message: {response.text}"  
472 - )  
473 - return None  
474 -  
475 - except requests.Timeout:  
476 - logger.warning(  
477 - f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "  
478 - f"Timeout: {self.timeout}s"  
479 - )  
480 - return None  
481 - except Exception as e:  
482 - logger.error(  
483 - f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "  
484 - f"Error: {e}", exc_info=True  
485 - )  
486 - return None  
487 -  
488 - # NOTE: _translate_deepl_free is intentionally not implemented.  
489 - # We do not support automatic fallback to the free endpoint, to avoid  
490 - # mixing Pro keys with https://api-free.deepl.com and related 403 errors.  
491 -  
492 - def translate_multi(  
493 - self,  
494 - text: str,  
495 - target_langs: List[str],  
496 - source_lang: Optional[str] = None,  
497 - context: Optional[str] = None,  
498 - async_mode: bool = True,  
499 - prompt: Optional[str] = None  
500 - ) -> Dict[str, Optional[str]]:  
501 - """  
502 - Translate text to multiple target languages.  
503 -  
504 - In async_mode=True (default):  
505 - - Returns cached translations immediately if available  
506 - - For translations that can be optimized (e.g., pure numbers, already in target language),  
507 - returns result immediately via synchronous call  
508 - - Launches async tasks for other missing translations (non-blocking)  
509 - - Returns None for missing translations that require async processing  
510 -  
511 - In async_mode=False:  
512 - - Waits for all translations to complete (blocking)  
513 -  
514 - Args:  
515 - text: Text to translate  
516 - target_langs: List of target language codes  
517 - source_lang: Source language code (optional)  
518 - context: Context hint for translation (optional)  
519 - async_mode: If True, return cached results immediately and translate missing ones async  
520 - prompt: Translation prompt/instruction (optional)  
521 -  
522 - Returns:  
523 - Dictionary mapping language code to translated text (only cached results in async mode)  
524 - """  
525 - results = {}  
526 - missing_langs = []  
527 - async_langs = []  
528 -  
529 - # First, get cached translations  
530 - for lang in target_langs:  
531 - cached = self._get_cached_translation(text, lang, source_lang, context, prompt)  
532 - if cached is not None:  
533 - results[lang] = cached  
534 - else:  
535 - missing_langs.append(lang)  
536 -  
537 - # If async mode and there are missing translations  
538 - if async_mode and missing_langs:  
539 - # Check if translation can be optimized (immediate return)  
540 - for lang in missing_langs:  
541 - target_lang = lang.lower()  
542 - # Check optimization conditions (same as in translate method)  
543 - can_optimize = False  
544 - if target_lang == 'en' and self._is_english_text(text):  
545 - can_optimize = True  
546 - elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):  
547 - can_optimize = True  
548 -  
549 - if can_optimize:  
550 - # Can be optimized, call translate synchronously for immediate result  
551 - results[lang] = self.translate(text, lang, source_lang, context, prompt)  
552 - else:  
553 - # Requires actual translation, add to async list  
554 - async_langs.append(lang)  
555 -  
556 - # Launch async tasks for translations that require actual API calls  
557 - if async_langs:  
558 - for lang in async_langs:  
559 - self._translate_async(text, lang, source_lang, context, prompt)  
560 - # Return None for async translations  
561 - for lang in async_langs:  
562 - results[lang] = None  
563 - else:  
564 - # Synchronous mode: wait for all translations  
565 - for lang in missing_langs:  
566 - results[lang] = self.translate(text, lang, source_lang, context, prompt)  
567 -  
568 - return results  
569 -  
570 - def translate_multi_async(  
571 - self,  
572 - text: str,  
573 - target_langs: List[str],  
574 - source_lang: Optional[str] = None,  
575 - context: Optional[str] = None,  
576 - prompt: Optional[str] = None  
577 - ) -> Dict[str, Union[str, Future]]:  
578 - """  
579 - Translate text to multiple target languages asynchronously, returning Futures that can be awaited.  
580 -  
581 - This method returns a dictionary where:  
582 - - If translation is cached, the value is the translation string (immediate)  
583 - - If translation needs to be done, the value is a Future object that can be awaited  
584 -  
585 - Args:  
586 - text: Text to translate  
587 - target_langs: List of target language codes  
588 - source_lang: Source language code (optional)  
589 - context: Context hint for translation (optional)  
590 - prompt: Translation prompt/instruction (optional)  
591 -  
592 - Returns:  
593 - Dictionary mapping language code to either translation string (cached) or Future object  
594 - """  
595 - results = {}  
596 - missing_langs = []  
597 -  
598 - # First, get cached translations  
599 - for lang in target_langs:  
600 - cached = self._get_cached_translation(text, lang, source_lang, context, prompt)  
601 - if cached is not None:  
602 - results[lang] = cached  
603 - else:  
604 - missing_langs.append(lang)  
605 -  
606 - # For missing translations, submit async tasks and return Futures  
607 - for lang in missing_langs:  
608 - future = self.executor.submit(  
609 - self.translate,  
610 - text,  
611 - lang,  
612 - source_lang,  
613 - context,  
614 - prompt  
615 - )  
616 - results[lang] = future  
617 -  
618 - return results  
619 -  
620 - def _get_cached_translation(  
621 - self,  
622 - text: str,  
623 - target_lang: str,  
624 - source_lang: Optional[str] = None,  
625 - context: Optional[str] = None,  
626 - prompt: Optional[str] = None  
627 - ) -> Optional[str]:  
628 - """Get translation from cache if available."""  
629 - if not self.redis_client:  
630 - return None  
631 - return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)  
632 -  
633 - def _get_cached_translation_redis(  
634 - self,  
635 - text: str,  
636 - target_lang: str,  
637 - source_lang: Optional[str] = None,  
638 - context: Optional[str] = None,  
639 - prompt: Optional[str] = None  
640 - ) -> Optional[str]:  
641 - """  
642 - Get translation from Redis cache with sliding expiration.  
643 -  
644 -        滑动过期机制:每次访问缓存时,重置过期时间为配置的过期时间(默认360天)。  
645 -        这样缓存会在最后一次访问后的360天才过期,而不是写入后的360天。  
646 - 这确保了常用的翻译缓存不会被过早删除。  
647 - """  
648 - if not self.redis_client:  
649 - return None  
650 -  
651 - try:  
652 - # Build cache key: prefix:target_lang:text  
653 - # For simplicity, we use target_lang and text as key  
654 - # Context and prompt are not included in key to maximize cache hits  
655 - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"  
656 - value = self.redis_client.get(cache_key)  
657 - if value:  
658 - # Sliding expiration: reset expiration time on access  
659 - # 每次读取缓存时,重置过期时间为配置的过期时间(最后一次访问后的N天才过期)  
660 - try:  
661 - self.redis_client.expire(cache_key, self.expire_seconds)  
662 - except Exception as expire_error:  
663 - # 即使 expire 失败,也返回缓存值(不影响功能)  
664 - logger.warning(  
665 - f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}"  
666 - )  
667 -  
668 - logger.debug(  
669 - f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "  
670 - f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s"  
671 - )  
672 - return value  
673 - logger.debug(  
674 - f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "  
675 - f"Cache key: {cache_key}"  
676 - )  
677 - return None  
678 - except Exception as e:  
679 - logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}")  
680 - return None  
681 -  
682 - def _set_cached_translation_redis(  
683 - self,  
684 - text: str,  
685 - target_lang: str,  
686 - translation: str,  
687 - source_lang: Optional[str] = None,  
688 - context: Optional[str] = None,  
689 - prompt: Optional[str] = None  
690 - ) -> None:  
691 - """Store translation in Redis cache."""  
692 - if not self.redis_client:  
693 - return  
694 -  
695 - try:  
696 - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"  
697 - self.redis_client.setex(cache_key, self.expire_seconds, translation)  
698 - logger.info(  
699 - f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | "  
700 - f"Cache key: {cache_key} | Translation result: '{translation}'"  
701 - )  
702 - except Exception as e:  
703 - logger.error(  
704 - f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | "  
705 - f"Error: {e}"  
706 - )  
707 -  
708 - def _translate_async(  
709 - self,  
710 - text: str,  
711 - target_lang: str,  
712 - source_lang: Optional[str] = None,  
713 - context: Optional[str] = None,  
714 - prompt: Optional[str] = None  
715 - ):  
716 - """Launch async translation task."""  
717 - def _do_translate():  
718 - try:  
719 - result = self.translate(text, target_lang, source_lang, context, prompt)  
720 - if result:  
721 - logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")  
722 - except Exception as e:  
723 - logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}")  
724 -  
725 - self.executor.submit(_do_translate)  
726 -  
727 - def _add_ecommerce_context(  
728 - self,  
729 - text: str,  
730 - source_lang: Optional[str],  
731 - context: Optional[str]  
732 - ) -> tuple:  
733 - """  
734 - Add e-commerce context to text for better disambiguation.  
735 -  
736 - For single-word ambiguous Chinese terms, we add context words that help  
737 - DeepL understand this is an e-commerce/product search context.  
738 -  
739 - Args:  
740 - text: Original text to translate  
741 - source_lang: Source language code  
742 - context: Context hint  
743 -  
744 - Returns:  
745 - Tuple of (text_with_context, needs_extraction)  
746 - - text_with_context: Text to send to DeepL  
747 - - needs_extraction: Whether we need to extract the term from the result  
748 - """  
749 - # Only apply for e-commerce context and Chinese source  
750 - if not context or "e-commerce" not in context.lower():  
751 - return text, False  
752 -  
753 - if not source_lang or source_lang.lower() != 'zh':  
754 - return text, False  
755 -  
756 - # For single-word queries, add context to help disambiguation  
757 - text_stripped = text.strip()  
758 - if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:  
759 - # Common ambiguous Chinese e-commerce terms like "车" (car vs rook)  
760 - # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term])  
761 - # This helps DeepL understand the e-commerce context  
762 - # We'll need to extract just the term from the translation result  
763 - context_phrase = f"购买 {text_stripped}"  
764 - return context_phrase, True  
765 -  
766 - # For multi-word queries, DeepL usually has enough context  
767 - return text, False  
768 -  
769 - def _extract_term_from_translation(  
770 - self,  
771 - translated_text: str,  
772 - original_text: str,  
773 - target_lang_code: str  
774 - ) -> str:  
775 - """  
776 - Extract the actual term from a translation that included context.  
777 -  
778 - For example, if we translated "购买 车" (buy car) and got "buy car",  
779 - we want to extract just "car".  
780 -  
781 - Args:  
782 - translated_text: Full translation result  
783 - original_text: Original single-word query  
784 - target_lang_code: Target language code (EN, ZH, etc.)  
785 -  
786 - Returns:  
787 - Extracted term or original translation if extraction fails  
788 - """  
789 - # For English target, try to extract the last word (the actual term)  
790 - if target_lang_code == "EN":  
791 - words = translated_text.strip().split()  
792 - if len(words) > 1:  
793 - # Usually the last word is the term we want  
794 - # But we need to be smart - if it's "buy car", we want "car"  
795 - # Common context words to skip: buy, purchase, product, item, etc.  
796 - context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}  
797 - # Try to find the term (not a context word)  
798 - for word in reversed(words):  
799 - word_lower = word.lower().rstrip('.,!?;:')  
800 - if word_lower not in context_words:  
801 - return word_lower  
802 - # If all words are context words, return the last one  
803 - return words[-1].lower().rstrip('.,!?;:')  
804 -  
805 - # For other languages or if extraction fails, return as-is  
806 - # The user can configure a glossary for better results  
807 - return translated_text  
808 -  
809 - def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool:  
810 - """True if shop language matches index language (use source, no translate)."""  
811 - if not shop_lang_lower or not lang_code:  
812 - return False  
813 - if shop_lang_lower == lang_code:  
814 - return True  
815 - if lang_code == "zh" and "zh" in shop_lang_lower:  
816 - return True  
817 - if lang_code == "en" and "en" in shop_lang_lower:  
818 - return True  
819 - return False  
820 -  
def translate_for_indexing(
    self,
    text: str,
    shop_language: str,
    source_lang: Optional[str] = None,
    context: Optional[str] = None,
    prompt: Optional[str] = None,
    index_languages: Optional[List[str]] = None,
) -> Dict[str, Optional[str]]:
    """
    Produce per-language text for indexing.

    Each language in index_languages receives either the source text
    as-is (when the shop language already matches it) or a translation;
    a cached Redis translation is preferred over a fresh call.

    Args:
        text: Text to translate
        shop_language: Shop primary language (e.g. 'zh', 'en', 'ru')
        source_lang: Source language code (optional)
        context: Additional context for translation (optional)
        prompt: Translation prompt (optional)
        index_languages: Languages to index (from tenant_config). Default ["en", "zh"].

    Returns:
        Dict keyed by each index_language with translated or source text (or None).
    """
    languages = index_languages or ["en", "zh"]
    output: Dict[str, Optional[str]] = dict.fromkeys(languages)
    if not text or not text.strip():
        return output
    # Digits/separators only: nothing meaningful to translate.
    if re.match(r'^[\d\s_-]+$', text):
        logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
        return output

    normalized_shop = (shop_language or "").strip().lower()
    pending = []
    for lang in languages:
        if self._shop_lang_matches(normalized_shop, lang):
            # Shop language already matches: index the source text directly.
            output[lang] = text
        else:
            pending.append(lang)

    for target_lang in pending:
        hit = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
        if hit:
            output[target_lang] = hit
            logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {hit}")
            continue
        output[target_lang] = self.translate(
            text,
            target_lang=target_lang,
            source_lang=source_lang or shop_language,
            context=context,
            prompt=prompt,
        )
    return output
878 -  
def get_translation_needs(
    self,
    detected_lang: str,
    supported_langs: List[str]
) -> List[str]:
    """
    Determine which languages need translation.

    Args:
        detected_lang: Detected query language
        supported_langs: List of supported languages

    Returns:
        List of language codes to translate to. Always a NEW list, so
        callers may mutate the result without corrupting supported_langs.
    """
    # Query already in a supported language: translate to the others only.
    if detected_lang in supported_langs:
        return [lang for lang in supported_langs if lang != detected_lang]

    # Unknown/unsupported language: translate to every supported language.
    # Fix: return a copy — the old code handed back the caller's own list
    # object, so mutating the result aliased the supported-language config.
    return list(supported_langs)
900 -  
901 - def _is_english_text(self, text: str) -> bool:  
902 - """  
903 - Check if text is primarily English (ASCII letters, numbers, common punctuation).  
904 -  
905 - Args:  
906 - text: Text to check  
907 -  
908 - Returns:  
909 - True if text appears to be English  
910 - """  
911 - if not text or not text.strip():  
912 - return True  
913 -  
914 - # Remove whitespace and common punctuation  
915 - text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text)  
916 - if not text_clean:  
917 - return True  
918 -  
919 - # Check if all remaining characters are ASCII (letters, numbers)  
920 - # This is a simple heuristic: if most characters are ASCII, it's likely English  
921 - ascii_count = sum(1 for c in text_clean if ord(c) < 128)  
922 - ratio = ascii_count / len(text_clean) if text_clean else 0  
923 -  
924 - # If more than 80% are ASCII characters, consider it English  
925 - return ratio > 0.8  
926 -  
927 - def _contains_chinese(self, text: str) -> bool:  
928 - """  
929 - Check if text contains Chinese characters (Han characters).  
930 -  
931 - Args:  
932 - text: Text to check  
933 -  
934 - Returns:  
935 - True if text contains Chinese characters  
936 - """  
937 - if not text:  
938 - return False  
939 -  
940 - # Check for Chinese characters (Unicode range: \u4e00-\u9fff)  
941 - chinese_pattern = re.compile(r'[\u4e00-\u9fff]')  
942 - return bool(chinese_pattern.search(text))  
943 -  
944 - def _is_pure_number(self, text: str) -> bool:  
945 - """  
946 - Check if text is purely numeric (digits, possibly with spaces, dots, commas).  
947 -  
948 - Args:  
949 - text: Text to check  
950 -  
951 - Returns:  
952 - True if text is purely numeric  
953 - """  
954 - if not text or not text.strip():  
955 - return False  
956 -  
957 - # Remove whitespace, dots, commas (common number separators)  
958 - text_clean = re.sub(r'[\s\.,]', '', text.strip())  
959 - if not text_clean:  
960 - return False  
961 -  
962 - # Check if all remaining characters are digits  
963 - return text_clean.isdigit()  
services.translation.providers.llm 0 → 100644
tests/test_embedding_pipeline.py
@@ -77,12 +77,10 @@ def _build_test_config() -> SearchConfig: @@ -77,12 +77,10 @@ def _build_test_config() -> SearchConfig:
77 enable_text_embedding=True, 77 enable_text_embedding=True,
78 enable_query_rewrite=False, 78 enable_query_rewrite=False,
79 rewrite_dictionary={}, 79 rewrite_dictionary={},
80 - translation_prompts={"query_zh": "e-commerce domain", "query_en": "e-commerce domain"},  
81 text_embedding_field="title_embedding", 80 text_embedding_field="title_embedding",
82 image_embedding_field=None, 81 image_embedding_field=None,
83 ), 82 ),
84 function_score=FunctionScoreConfig(), 83 function_score=FunctionScoreConfig(),
85 - function_score=FunctionScoreConfig(),  
86 rerank=RerankConfig(), 84 rerank=RerankConfig(),
87 spu_config=SPUConfig(enabled=True, spu_field="spu_id", inner_hits_size=3), 85 spu_config=SPUConfig(enabled=True, spu_field="spu_id", inner_hits_size=3),
88 es_index_name="test_products", 86 es_index_name="test_products",