Commit d4cadc13bd22491045c3580a54d0aaa1d4f625e6

Authored by tangwang
1 parent a0a173ae

翻译重构

api/routes/search.py
... ... @@ -472,7 +472,6 @@ async def get_es_raw_document(spu_id: str, http_request: Request):
472 472 index_name = get_tenant_index_name(tenant_id)
473 473  
474 474 body = {
475   - "size": 5,
476 475 "query": {
477 476 "bool": {
478 477 "filter": [
... ...
api/translator_app.py
... ... @@ -98,7 +98,9 @@ from pydantic import BaseModel, Field
98 98 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
99 99  
100 100 from query.qwen_mt_translate import Translator
101   -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG
  101 +from query.llm_translate import LLMTranslatorProvider
  102 +from query.deepl_provider import DeepLProvider
  103 +from config.services_config import get_translation_config
102 104  
103 105 # Configure logging
104 106 logging.basicConfig(
... ... @@ -107,23 +109,52 @@ logging.basicConfig(
107 109 )
108 110 logger = logging.getLogger(__name__)
109 111  
110   -# Fixed translation prompt
111   -TRANSLATION_PROMPT = "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language."
112   -
113 112 # Global translator instances cache (keyed by model)
114   -_translators: Dict[str, Translator] = {}
  113 +_translators: Dict[str, object] = {}
  114 +
115 115  
  116 +def _resolve_default_model() -> str:
  117 + """
  118 + Resolve translator model from services.translation config first.
116 119  
117   -def get_translator(model: str = "qwen") -> Translator:
  120 + Priority:
  121 + 1) TRANSLATION_MODEL env (explicit runtime override)
  122 + 2) services.translation.provider + providers.<provider>.model
  123 + 3) qwen-mt
  124 + """
  125 + env_model = (os.getenv("TRANSLATION_MODEL") or "").strip()
  126 + if env_model:
  127 + return env_model
  128 + try:
  129 + cfg = get_translation_config()
  130 + provider = (cfg.provider or "").strip().lower()
  131 + provider_cfg = cfg.get_provider_cfg() if hasattr(cfg, "get_provider_cfg") else {}
  132 + model = (provider_cfg.get("model") or "").strip().lower() if isinstance(provider_cfg, dict) else ""
  133 + if provider == "llm":
  134 + return "llm"
  135 + if provider in {"qwen-mt", "direct", "http"}:
  136 + return model or "qwen-mt"
  137 + if provider == "deepl":
  138 + return "deepl"
  139 + except Exception:
  140 + pass
  141 + return "qwen-mt"
  142 +
  143 +
  144 +def get_translator(model: str = "qwen") -> object:
118 145 """Get or create translator instance for the specified model."""
119 146 global _translators
120 147 if model not in _translators:
121 148 logger.info(f"Initializing translator with model: {model}...")
122   - _translators[model] = Translator(
123   - model=model,
124   - use_cache=True,
125   - timeout=10
126   - )
  149 + normalized = (model or "qwen").strip().lower()
  150 + if normalized in {"qwen", "qwen-mt", "qwen-mt-flash", "qwen-mt-flush"}:
  151 + _translators[model] = Translator(model=normalized, use_cache=True, timeout=10)
  152 + elif normalized == "deepl":
  153 + _translators[model] = DeepLProvider(api_key=None, timeout=10.0)
  154 + elif normalized == "llm":
  155 + _translators[model] = LLMTranslatorProvider()
  156 + else:
  157 + raise ValueError(f"Unsupported model: {model}")
127 158 logger.info(f"Translator initialized with model: {model}")
128 159 return _translators[model]
129 160  
... ... @@ -134,7 +165,9 @@ class TranslationRequest(BaseModel):
134 165 text: str = Field(..., description="Text to translate")
135 166 target_lang: str = Field(..., description="Target language code (zh, en, ru, etc.)")
136 167 source_lang: Optional[str] = Field(None, description="Source language code (optional, auto-detect if not provided)")
137   - model: Optional[str] = Field("qwen", description="Translation model: 'qwen' (default) or 'deepl'")
  168 + model: Optional[str] = Field(None, description="Translation model: qwen | qwen-mt | deepl | llm (defaults to server-configured model)")
  169 + context: Optional[str] = Field(None, description="Optional translation scene or context")
  170 + prompt: Optional[str] = Field(None, description="Optional prompt override")
138 171  
139 172 class Config:
140 173 json_schema_extra = {
... ... @@ -142,7 +175,8 @@ class TranslationRequest(BaseModel):
142 175 "text": "商品名称",
143 176 "target_lang": "en",
144 177 "source_lang": "zh",
145   - "model": "qwen"
  178 + "model": "llm",
  179 + "context": "sku_name"
146 180 }
147 181 }
148 182  
... ... @@ -180,8 +214,7 @@ app.add_middleware(
180 214 async def startup_event():
181 215 """Initialize translator on startup."""
182 216 logger.info("Starting Translation Service API on port 6006")
183   - # Get default model from environment variable or use 'qwen'
184   - default_model = os.getenv("TRANSLATION_MODEL", "qwen")
  217 + default_model = _resolve_default_model()
185 218 try:
186 219 get_translator(model=default_model)
187 220 logger.info(f"Translation service ready with default model: {default_model}")
... ... @@ -194,15 +227,17 @@ async def startup_event():
194 227 async def health_check():
195 228 """Health check endpoint."""
196 229 try:
197   - default_model = os.getenv("TRANSLATION_MODEL", "qwen")
198   - translator = get_translator(model=default_model)
  230 + # 仅做轻量级本地检查,避免在健康检查中触发潜在的阻塞初始化或外部依赖
  231 + default_model = _resolve_default_model()
  232 + # 如果启动事件成功,默认模型通常会已经初始化到缓存中
  233 + translator = _translators.get(default_model) or next(iter(_translators.values()), None)
199 234 return {
200 235 "status": "healthy",
201 236 "service": "translation",
202 237 "default_model": default_model,
203 238 "available_models": list(_translators.keys()),
204 239 "translator_initialized": translator is not None,
205   - "cache_enabled": translator.use_cache if translator else False
  240 + "cache_enabled": bool(getattr(translator, "use_cache", False))
206 241 }
207 242 except Exception as e:
208 243 logger.error(f"Health check failed: {e}")
... ... @@ -238,11 +273,11 @@ async def translate(request: TranslationRequest):
238 273 )
239 274  
240 275 # Validate model parameter
241   - model = request.model.lower() if request.model else "qwen"
242   - if model not in ['qwen', 'deepl']:
  276 + model = request.model.lower() if request.model else _resolve_default_model().lower()
  277 + if model not in ["qwen", "qwen-mt", "qwen-mt-flash", "qwen-mt-flush", "deepl", "llm"]:
243 278 raise HTTPException(
244 279 status_code=400,
245   - detail=f"Invalid model: {model}. Supported models: 'qwen', 'deepl'"
  280 + detail="Invalid model. Supported models: 'qwen', 'qwen-mt', 'qwen-mt-flash', 'deepl', 'llm'"
246 281 )
247 282  
248 283 try:
... ... @@ -254,7 +289,8 @@ async def translate(request: TranslationRequest):
254 289 text=request.text,
255 290 target_lang=request.target_lang,
256 291 source_lang=request.source_lang,
257   - prompt=TRANSLATION_PROMPT
  292 + context=request.context,
  293 + prompt=request.prompt,
258 294 )
259 295  
260 296 if translated_text is None:
... ... @@ -269,7 +305,7 @@ async def translate(request: TranslationRequest):
269 305 source_lang=request.source_lang,
270 306 translated_text=translated_text,
271 307 status="success",
272   - model=translator.model
  308 + model=str(getattr(translator, "model", model))
273 309 )
274 310  
275 311 except HTTPException:
... ...
config/__init__.py
... ... @@ -28,6 +28,7 @@ from .services_config import (
28 28 get_translation_base_url,
29 29 get_embedding_base_url,
30 30 get_rerank_service_url,
  31 + get_translation_cache_config,
31 32 ServiceConfig,
32 33 )
33 34  
... ... @@ -53,5 +54,6 @@ __all__ = [
53 54 'get_translation_base_url',
54 55 'get_embedding_base_url',
55 56 'get_rerank_service_url',
  57 + 'get_translation_cache_config',
56 58 'ServiceConfig',
57 59 ]
... ...
config/config.yaml
... ... @@ -81,18 +81,6 @@ query_config:
81 81 translation_service: "deepl"
82 82 translation_api_key: null # 通过环境变量设置
83 83  
84   - # 翻译提示词配置(用于提高翻译质量,作为DeepL API的context参数)
85   - translation_prompts:
86   - # 商品标题翻译提示词
87   - product_title_zh: "请将原文翻译成中文商品SKU名称,要求:确保精确、完整地传达原文信息的基础上,语言简洁清晰、地道、专业。"
88   - product_title_en: "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language."
89   - # query翻译提示词
90   - query_zh: "电商领域"
91   - query_en: "e-commerce domain"
92   - # 默认翻译用词
93   - default_zh: "电商领域"
94   - default_en: "e-commerce domain"
95   -
96 84 # 返回字段配置(_source includes)
97 85 # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段
98 86 source_fields: null
... ... @@ -119,16 +107,24 @@ rerank:
119 107 # 可扩展服务/provider 注册表(单一配置源)
120 108 services:
121 109 translation:
122   - provider: "llm" # direct | http | google(reserved)
  110 + provider: "llm" # qwen-mt | deepl | http | llm
123 111 base_url: "http://127.0.0.1:6006"
124   - model: "qwen"
  112 + model: "qwen-flash"
125 113 timeout_sec: 10.0
  114 + cache:
  115 + enabled: true
  116 + key_prefix: "trans:v2"
  117 + ttl_seconds: 62208000
  118 + sliding_expiration: true
  119 + key_include_context: true
  120 + key_include_prompt: true
  121 + key_include_source_lang: true
126 122 providers:
127   - direct:
128   - model: "qwen"
  123 + qwen-mt:
  124 + model: "qwen-mt-flush"
129 125 http:
130 126 base_url: "http://127.0.0.1:6006"
131   - model: "qwen"
  127 + model: "qwen-mt-flush"
132 128 timeout_sec: 10.0
133 129 llm:
134 130 model: "qwen-flash"
... ... @@ -136,6 +132,11 @@ services:
136 132 # base_url 留空则使用 DASHSCOPE_BASE_URL 或默认地域
137 133 base_url: ""
138 134 timeout_sec: 30.0
  135 + deepl:
  136 + model: "deepl"
  137 + timeout_sec: 10.0
  138 + # 可选:用于术语表翻译(由 query_config.translation_glossary_id 衔接)
  139 + glossary_id: ""
139 140 google:
140 141 enabled: false
141 142 project_id: ""
... ...
config/config_loader.py
... ... @@ -42,7 +42,6 @@ class QueryConfig:
42 42 translation_api_key: Optional[str] = None
43 43 translation_glossary_id: Optional[str] = None
44 44 translation_context: str = "e-commerce product search"
45   - translation_prompts: Dict[str, str] = field(default_factory=dict)
46 45  
47 46 # Embedding field names
48 47 text_embedding_field: Optional[str] = "title_embedding"
... ... @@ -250,7 +249,6 @@ class ConfigLoader:
250 249 translation_service=query_config_data.get("translation_service") or "deepl",
251 250 translation_glossary_id=query_config_data.get("translation_glossary_id"),
252 251 translation_context=query_config_data.get("translation_context") or "e-commerce product search",
253   - translation_prompts=query_config_data.get("translation_prompts", {}),
254 252 text_embedding_field=query_config_data.get("text_embedding_field"),
255 253 image_embedding_field=query_config_data.get("image_embedding_field"),
256 254 source_fields=query_config_data.get("source_fields"),
... ...
config/services_config.py
... ... @@ -72,12 +72,12 @@ def _resolve_translation() -&gt; ServiceConfig:
72 72 config_provider=cfg.get("provider"),
73 73 capability="translation",
74 74 )
75   - if provider not in ("direct", "local", "inprocess", "http", "service"):
  75 + if provider not in ("qwen-mt", "deepl", "direct", "local", "inprocess", "http", "service", "llm"):
76 76 raise ValueError(f"Unsupported translation provider: {provider}")
77 77  
78 78 # Env override for http base_url
79 79 env_url = os.getenv("TRANSLATION_SERVICE_URL")
80   - if env_url and provider == "http":
  80 + if env_url and provider in ("http", "service"):
81 81 providers = dict(providers)
82 82 providers["http"] = dict(providers.get("http", {}))
83 83 providers["http"]["base_url"] = env_url.rstrip("/")
... ... @@ -206,6 +206,27 @@ def get_translation_base_url() -&gt; str:
206 206 return str(base).rstrip("/")
207 207  
208 208  
  209 +def get_translation_cache_config() -> Dict[str, Any]:
  210 + """
  211 + Resolve translation cache policy from services.translation.cache.
  212 +
  213 + All translation cache key/TTL behavior should be configured in config.yaml,
  214 + not hardcoded in code.
  215 + """
  216 + raw = _load_services_raw()
  217 + cfg = raw.get("translation", {}) if isinstance(raw.get("translation"), dict) else {}
  218 + cache_cfg = cfg.get("cache", {}) if isinstance(cfg.get("cache"), dict) else {}
  219 + return {
  220 + "enabled": bool(cache_cfg.get("enabled", True)),
  221 + "key_prefix": str(cache_cfg.get("key_prefix", "trans:v2")),
  222 + "ttl_seconds": int(cache_cfg.get("ttl_seconds", 360 * 24 * 3600)),
  223 + "sliding_expiration": bool(cache_cfg.get("sliding_expiration", True)),
  224 + "key_include_context": bool(cache_cfg.get("key_include_context", True)),
  225 + "key_include_prompt": bool(cache_cfg.get("key_include_prompt", True)),
  226 + "key_include_source_lang": bool(cache_cfg.get("key_include_source_lang", True)),
  227 + }
  228 +
  229 +
209 230 def get_embedding_base_url() -> str:
210 231 """Resolve embedding HTTP base URL."""
211 232 base = (
... ...
config/translate_prompts.py 0 → 100644
... ... @@ -0,0 +1,82 @@
  1 +SOURCE_LANG_CODE_MAP = {
  2 + "en": "English",
  3 + "zh": "Chinese",
  4 + "zh_tw": "Traditional Chinese",
  5 + "ru": "Russian",
  6 + "ja": "Japanese",
  7 + "ko": "Korean",
  8 + "es": "Spanish",
  9 + "fr": "French",
  10 + "pt": "Portuguese",
  11 + "de": "German",
  12 + "it": "Italian",
  13 + "th": "Thai",
  14 + "vi": "Vietnamese",
  15 + "id": "Indonesian",
  16 + "ms": "Malay",
  17 + "ar": "Arabic",
  18 + "hi": "Hindi",
  19 + "he": "Hebrew",
  20 + "my": "Burmese",
  21 + "ta": "Tamil",
  22 + "ur": "Urdu",
  23 + "bn": "Bengali",
  24 + "pl": "Polish",
  25 + "nl": "Dutch",
  26 + "ro": "Romanian",
  27 + "tr": "Turkish",
  28 + "km": "Khmer",
  29 + "lo": "Lao",
  30 + "yue": "Cantonese",
  31 + "cs": "Czech",
  32 + "el": "Greek",
  33 + "sv": "Swedish",
  34 + "hu": "Hungarian",
  35 + "da": "Danish",
  36 + "fi": "Finnish",
  37 + "uk": "Ukrainian",
  38 + "bg": "Bulgarian",
  39 +}
  40 +
  41 +TARGET_LANG_CODE_MAP = {v: k for k, v in SOURCE_LANG_CODE_MAP.items()}
  42 +
  43 +TRANSLATION_PROMPTS = {
  44 + "general": {
  45 + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译专家,请准确传达原文含义并符合{target_lang}语言习惯,只输出翻译结果:{text}",
  46 + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Accurately convey the meaning following {target_lang} grammar and usage, output only the translation: {text}",
  47 + "ru": "Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Точно передайте смысл текста, соблюдая нормы {target_lang}, выводите только перевод: {text}",
  48 + "ar": "أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). انقل المعنى بدقة وفق قواعد {target_lang} وأخرج الترجمة فقط: {text}",
  49 + "ja": "あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロ翻訳者です。意味を正確に伝え、{target_lang}の表現に従い、翻訳のみ出力してください:{text}",
  50 + "es": "Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Transmite con precisión el significado y devuelve solo la traducción: {text}",
  51 + "de": "Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Gib die Bedeutung korrekt wieder und gib nur die Übersetzung aus: {text}",
  52 + "fr": "Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Transmettez fidèlement le sens et produisez uniquement la traduction : {text}",
  53 + "it": "Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Trasmetti accuratamente il significato e restituisci solo la traduzione: {text}",
  54 + "pt": "Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Transmita o significado com precisão e produza apenas a tradução: {text}"
  55 + },
  56 +
  57 + "sku_name": {
  58 + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})电商翻译专家,请将原文翻译为{target_lang}商品SKU名称,要求准确完整、简洁专业,只输出结果:{text}",
  59 + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) ecommerce translator. Translate into a concise and accurate {target_lang} product SKU name, output only the result: {text}",
  60 + "ru": "Вы переводчик e-commerce с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Переведите в краткое и точное название SKU товара на {target_lang}, выводите только результат: {text}",
  61 + "ar": "أنت مترجم تجارة إلكترونية من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). ترجم إلى اسم SKU للمنتج بلغة {target_lang} بدقة واختصار، وأخرج النتيجة فقط: {text}",
  62 + "ja": "{source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのEC翻訳者として、簡潔で正確な{target_lang}の商品SKU名に翻訳し、結果のみ出力してください:{text}",
  63 + "es": "Eres un traductor ecommerce de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce a un nombre SKU de producto en {target_lang}, preciso y conciso, devuelve solo el resultado: {text}",
  64 + "de": "Du bist ein E-Commerce-Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Übersetze in einen präzisen und kurzen {target_lang} Produkt-SKU-Namen, nur Ergebnis ausgeben: {text}",
  65 + "fr": "Vous êtes un traducteur e-commerce de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Traduisez en un nom SKU produit {target_lang} précis et concis, sortie uniquement : {text}",
  66 + "it": "Sei un traduttore ecommerce da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce in un nome SKU prodotto {target_lang} conciso e accurato, restituisci solo il risultato: {text}",
  67 + "pt": "Você é um tradutor de e-commerce de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Traduza para um nome SKU de produto {target_lang} conciso e preciso, produza apenas o resultado: {text}"
  68 + },
  69 +
  70 + "ecommerce_search_query": {
  71 + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译助手,请将电商搜索词准确翻译为{target_lang}并符合搜索习惯,只输出结果:{text}",
  72 + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Translate the ecommerce search query accurately following {target_lang} search habits, output only the result: {text}",
  73 + "ru": "Вы переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Переведите поисковый запрос e-commerce с учётом привычек поиска, выводите только результат: {text}",
  74 + "ar": "أنت مترجم من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). ترجم عبارة البحث للتجارة الإلكترونية بما يناسب عادات البحث وأخرج النتيجة فقط: {text}",
  75 + "ja": "{source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})への翻訳者として、EC検索キーワードを{target_lang}の検索習慣に合わせて翻訳し、結果のみ出力してください:{text}",
  76 + "es": "Eres un traductor de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce la consulta de búsqueda ecommerce según los hábitos de búsqueda y devuelve solo el resultado: {text}",
  77 + "de": "Du bist ein Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Übersetze die E-Commerce-Suchanfrage entsprechend den Suchgewohnheiten, nur Ergebnis ausgeben: {text}",
  78 + "fr": "Vous êtes un traducteur de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Traduisez la requête de recherche e-commerce selon les habitudes de recherche, sortie uniquement : {text}",
  79 + "it": "Sei un traduttore da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce la query di ricerca ecommerce secondo le abitudini di ricerca e restituisci solo il risultato: {text}",
  80 + "pt": "Você é um tradutor de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Traduza a consulta de busca de ecommerce conforme os hábitos de busca e produza apenas o resultado: {text}"
  81 + }
  82 +}
... ...
docs/搜索API对接指南.md
... ... @@ -1814,7 +1814,8 @@ curl &quot;http://localhost:6007/health&quot;
1814 1814 "text": "商品名称",
1815 1815 "target_lang": "en",
1816 1816 "source_lang": "zh",
1817   - "model": "qwen"
  1817 + "model": "qwen",
  1818 + "context": "sku_name"
1818 1819 }
1819 1820 ```
1820 1821  
... ... @@ -1823,7 +1824,8 @@ curl &quot;http://localhost:6007/health&quot;
1823 1824 | `text` | string | Y | 待翻译文本 |
1824 1825 | `target_lang` | string | Y | 目标语言:`zh`、`en`、`ru` 等 |
1825 1826 | `source_lang` | string | N | 源语言,不传则自动检测 |
1826   -| `model` | string | N | `qwen`(默认)或 `deepl` |
  1827 +| `model` | string | N | `qwen`、`deepl` 或 `llm`;不传时由服务端配置解析默认模型 |
  1828 +| `context` | string | N | 翻译场景参数:商品标题翻译使用 `sku_name`,搜索请求中的 query 翻译使用 `ecommerce_search_query`,其它通用场景可不传或使用 `general` |
1827 1829  
1828 1830 **响应**:
1829 1831 ```json
... ...
indexer/document_transformer.py
... ... @@ -36,7 +36,6 @@ class SPUDocumentTransformer:
36 36 searchable_option_dimensions: List[str],
37 37 tenant_config: Optional[Dict[str, Any]] = None,
38 38 translator: Optional[Any] = None,
39   - translation_prompts: Optional[Dict[str, str]] = None,
40 39 encoder: Optional[Any] = None,
41 40 enable_title_embedding: bool = True,
42 41 image_encoder: Optional[Any] = None,
... ... @@ -50,7 +49,6 @@ class SPUDocumentTransformer:
50 49 searchable_option_dimensions: 可搜索的option维度列表
51 50 tenant_config: 租户配置(包含主语言和翻译配置)
52 51 translator: 翻译器实例(可选,如果提供则启用翻译功能)
53   - translation_prompts: 翻译提示词配置(可选)
54 52 encoder: 文本编码器实例(可选,用于生成title_embedding)
55 53 enable_title_embedding: 是否启用标题向量化(默认True)
56 54 image_encoder: 图片编码器实例(可选,需实现 encode_image_urls(urls) -> List[Optional[np.ndarray]])
... ... @@ -60,12 +58,33 @@ class SPUDocumentTransformer:
60 58 self.searchable_option_dimensions = searchable_option_dimensions
61 59 self.tenant_config = tenant_config or {}
62 60 self.translator = translator
63   - self.translation_prompts = translation_prompts or {}
64 61 self.encoder = encoder
65 62 self.enable_title_embedding = enable_title_embedding
66 63 self.image_encoder = image_encoder
67 64 self.enable_image_embedding = bool(enable_image_embedding and image_encoder is not None)
68 65  
  66 + def _translate_index_languages(
  67 + self,
  68 + text: str,
  69 + source_lang: str,
  70 + index_languages: List[str],
  71 + scene: str,
  72 + ) -> Dict[str, Optional[str]]:
  73 + translations: Dict[str, Optional[str]] = {}
  74 + if not self.translator or not text or not str(text).strip():
  75 + return translations
  76 + for lang in index_languages:
  77 + if lang == source_lang:
  78 + translations[lang] = text
  79 + continue
  80 + translations[lang] = self.translator.translate(
  81 + text=text,
  82 + target_lang=lang,
  83 + source_lang=source_lang,
  84 + context=scene,
  85 + )
  86 + return translations
  87 +
69 88 def transform_spu_to_doc(
70 89 self,
71 90 tenant_id: str,
... ... @@ -322,15 +341,12 @@ class SPUDocumentTransformer:
322 341 title_text = str(spu_row['title'])
323 342 translations: Dict[str, Optional[str]] = {}
324 343 if self.translator:
325   - prompt_zh = self.translation_prompts.get('product_title_zh') or self.translation_prompts.get('default_zh')
326   - prompt_en = self.translation_prompts.get('product_title_en') or self.translation_prompts.get('default_en')
327   - translations = self.translator.translate_for_indexing(
328   - title_text,
329   - shop_language=primary_lang,
  344 + translations = self._translate_index_languages(
  345 + text=title_text,
330 346 source_lang=primary_lang,
331   - prompt=prompt_zh if primary_lang == 'zh' else prompt_en,
332 347 index_languages=index_langs,
333   - ) or {}
  348 + scene="product_title",
  349 + )
334 350 _set_lang_obj("title", title_text, translations)
335 351  
336 352 # Brief
... ... @@ -338,14 +354,12 @@ class SPUDocumentTransformer:
338 354 brief_text = str(spu_row['brief'])
339 355 translations = {}
340 356 if self.translator:
341   - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')
342   - translations = self.translator.translate_for_indexing(
343   - brief_text,
344   - shop_language=primary_lang,
  357 + translations = self._translate_index_languages(
  358 + text=brief_text,
345 359 source_lang=primary_lang,
346   - prompt=prompt,
347 360 index_languages=index_langs,
348   - ) or {}
  361 + scene="default",
  362 + )
349 363 _set_lang_obj("brief", brief_text, translations)
350 364  
351 365 # Description
... ... @@ -353,14 +367,12 @@ class SPUDocumentTransformer:
353 367 desc_text = str(spu_row['description'])
354 368 translations = {}
355 369 if self.translator:
356   - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')
357   - translations = self.translator.translate_for_indexing(
358   - desc_text,
359   - shop_language=primary_lang,
  370 + translations = self._translate_index_languages(
  371 + text=desc_text,
360 372 source_lang=primary_lang,
361   - prompt=prompt,
362 373 index_languages=index_langs,
363   - ) or {}
  374 + scene="default",
  375 + )
364 376 _set_lang_obj("description", desc_text, translations)
365 377  
366 378 # Vendor
... ... @@ -368,14 +380,12 @@ class SPUDocumentTransformer:
368 380 vendor_text = str(spu_row['vendor'])
369 381 translations = {}
370 382 if self.translator:
371   - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en')
372   - translations = self.translator.translate_for_indexing(
373   - vendor_text,
374   - shop_language=primary_lang,
  383 + translations = self._translate_index_languages(
  384 + text=vendor_text,
375 385 source_lang=primary_lang,
376   - prompt=prompt,
377 386 index_languages=index_langs,
378   - ) or {}
  387 + scene="default",
  388 + )
379 389 _set_lang_obj("vendor", vendor_text, translations)
380 390  
381 391 def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series):
... ...
indexer/incremental_service.py
... ... @@ -39,7 +39,6 @@ class IncrementalIndexerService:
39 39 self._config: Optional[Any] = None
40 40 self._config_lock = threading.Lock()
41 41 self._translator: Optional[Any] = None
42   - self._translation_prompts: Optional[Dict[str, Any]] = None
43 42 self._searchable_option_dimensions: Optional[List[str]] = None
44 43 self._shared_text_encoder: Optional[Any] = None
45 44 self._shared_image_encoder: Optional[Any] = None
... ... @@ -52,7 +51,6 @@ class IncrementalIndexerService:
52 51 def _eager_init(self) -> None:
53 52 """Strict eager initialization. Any dependency failure should fail fast."""
54 53 self._config = ConfigLoader("config/config.yaml").load_config()
55   - self._translation_prompts = getattr(self._config.query_config, "translation_prompts", {}) or {}
56 54 self._searchable_option_dimensions = (
57 55 getattr(self._config.spu_config, "searchable_option_dimensions", None)
58 56 or ["option1", "option2", "option3"]
... ... @@ -110,7 +108,6 @@ class IncrementalIndexerService:
110 108 tenant_id=tenant_id,
111 109 searchable_option_dimensions=self._searchable_option_dimensions,
112 110 translator=self._translator,
113   - translation_prompts=self._translation_prompts,
114 111 encoder=encoder,
115 112 enable_title_embedding=False, # batch fill later
116 113 image_encoder=image_encoder,
... ...
indexer/indexing_utils.py
... ... @@ -57,7 +57,6 @@ def create_document_transformer(
57 57 tenant_id: str,
58 58 searchable_option_dimensions: Optional[list] = None,
59 59 translator: Optional[Any] = None,
60   - translation_prompts: Optional[Dict[str, str]] = None,
61 60 encoder: Optional[Any] = None,
62 61 enable_title_embedding: bool = True,
63 62 image_encoder: Optional[Any] = None,
... ... @@ -72,7 +71,6 @@ def create_document_transformer(
72 71 tenant_id: 租户ID
73 72 searchable_option_dimensions: 可搜索的option维度列表(如果为None则从配置加载)
74 73 translator: 翻译器实例(如果为None则根据配置初始化)
75   - translation_prompts: 翻译提示词配置(如果为None则从配置加载)
76 74 encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化)
77 75 enable_title_embedding: 是否启用标题向量化(默认True)
78 76 image_encoder: 图片编码器(可选,需实现 encode_image_urls(urls))
... ... @@ -89,7 +87,6 @@ def create_document_transformer(
89 87 if (
90 88 searchable_option_dimensions is None
91 89 or translator is None
92   - or translation_prompts is None
93 90 or (encoder is None and enable_title_embedding)
94 91 or config is None
95 92 ):
... ... @@ -107,9 +104,6 @@ def create_document_transformer(
107 104  
108 105 translator = create_translation_provider(config.query_config)
109 106  
110   - if translation_prompts is None:
111   - translation_prompts = config.query_config.translation_prompts
112   -
113 107 # 初始化encoder(如果启用标题向量化且未提供encoder)
114 108 if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding:
115 109 from embeddings.text_encoder import TextEmbeddingEncoder
... ... @@ -122,7 +116,6 @@ def create_document_transformer(
122 116 searchable_option_dimensions=searchable_option_dimensions,
123 117 tenant_config=tenant_config,
124 118 translator=translator,
125   - translation_prompts=translation_prompts,
126 119 encoder=encoder,
127 120 enable_title_embedding=enable_title_embedding,
128 121 image_encoder=image_encoder,
... ...
indexer/test_indexing.py
... ... @@ -285,7 +285,6 @@ def test_document_transformer():
285 285 searchable_option_dimensions=['option1', 'option2', 'option3'],
286 286 tenant_config=tenant_config,
287 287 translator=translator,
288   - translation_prompts=config.query_config.translation_prompts
289 288 )
290 289  
291 290 # 转换文档
... ...
providers/translation.py
1   -"""
2   -Translation provider - direct (in-process) or HTTP service.
3   -"""
  1 +"""Translation provider factory and HTTP provider implementation."""
4 2 from __future__ import annotations
5 3  
6 4 import logging
7   -from typing import Any, Dict, List, Optional, Union
8   -
9   -from concurrent.futures import Future, ThreadPoolExecutor
  5 +from typing import Any, Dict, Optional
10 6 import requests
11 7  
12 8 from config.services_config import get_translation_config, get_translation_base_url
... ... @@ -22,19 +18,18 @@ class HttpTranslationProvider:
22 18 base_url: str,
23 19 model: str = "qwen",
24 20 timeout_sec: float = 10.0,
25   - translation_context: Optional[str] = None,
26 21 ):
27 22 self.base_url = (base_url or "").rstrip("/")
28 23 self.model = model or "qwen"
29 24 self.timeout_sec = float(timeout_sec or 10.0)
30   - self.translation_context = translation_context or "e-commerce product search"
31   - self.executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="http-translator")
32 25  
33 26 def _translate_once(
34 27 self,
35 28 text: str,
36 29 target_lang: str,
37 30 source_lang: Optional[str] = None,
  31 + context: Optional[str] = None,
  32 + prompt: Optional[str] = None,
38 33 ) -> Optional[str]:
39 34 if not text or not str(text).strip():
40 35 return text
... ... @@ -46,6 +41,10 @@ class HttpTranslationProvider:
46 41 "source_lang": source_lang or "auto",
47 42 "model": self.model,
48 43 }
  44 + if context:
  45 + payload["context"] = context
  46 + if prompt:
  47 + payload["prompt"] = prompt
49 48 response = requests.post(url, json=payload, timeout=self.timeout_sec)
50 49 if response.status_code != 200:
51 50 logger.warning(
... ... @@ -69,58 +68,13 @@ class HttpTranslationProvider:
69 68 context: Optional[str] = None,
70 69 prompt: Optional[str] = None,
71 70 ) -> Optional[str]:
72   - del context, prompt
73   - result = self._translate_once(text=text, target_lang=target_lang, source_lang=source_lang)
74   - return result if result is not None else text
75   -
76   - def translate_multi(
77   - self,
78   - text: str,
79   - target_langs: List[str],
80   - source_lang: Optional[str] = None,
81   - context: Optional[str] = None,
82   - async_mode: bool = True,
83   - prompt: Optional[str] = None,
84   - ) -> Dict[str, Optional[str]]:
85   - del context, async_mode, prompt
86   - out: Dict[str, Optional[str]] = {}
87   - for lang in target_langs:
88   - out[lang] = self.translate(text, lang, source_lang=source_lang)
89   - return out
90   -
91   - def translate_multi_async(
92   - self,
93   - text: str,
94   - target_langs: List[str],
95   - source_lang: Optional[str] = None,
96   - context: Optional[str] = None,
97   - prompt: Optional[str] = None,
98   - ) -> Dict[str, Union[str, Future]]:
99   - del context, prompt
100   - out: Dict[str, Union[str, Future]] = {}
101   - for lang in target_langs:
102   - out[lang] = self.executor.submit(self.translate, text, lang, source_lang)
103   - return out
104   -
105   - def translate_for_indexing(
106   - self,
107   - text: str,
108   - shop_language: str,
109   - source_lang: Optional[str] = None,
110   - context: Optional[str] = None,
111   - prompt: Optional[str] = None,
112   - index_languages: Optional[List[str]] = None,
113   - ) -> Dict[str, Optional[str]]:
114   - del context, prompt
115   - langs = index_languages if index_languages else ["en", "zh"]
116   - source = source_lang or shop_language or "auto"
117   - out: Dict[str, Optional[str]] = {}
118   - for lang in langs:
119   - if lang == shop_language:
120   - out[lang] = text
121   - else:
122   - out[lang] = self.translate(text, target_lang=lang, source_lang=source)
123   - return out
  71 + return self._translate_once(
  72 + text=text,
  73 + target_lang=target_lang,
  74 + source_lang=source_lang,
  75 + context=context,
  76 + prompt=prompt,
  77 + )
124 78  
125 79  
126 80 def create_translation_provider(query_config: Any = None) -> Any:
... ... @@ -133,9 +87,9 @@ def create_translation_provider(query_config: Any = None) -&gt; Any:
133 87 provider = cfg.provider
134 88 pc = cfg.get_provider_cfg()
135 89  
136   - if provider in ("direct", "local", "inprocess"):
  90 + if provider in ("qwen-mt", "direct", "local", "inprocess"):
137 91 from query.qwen_mt_translate import Translator
138   - model = pc.get("model") or "qwen"
  92 + model = pc.get("model") or "qwen-mt-flash"
139 93 qc = query_config or _empty_query_config()
140 94 return Translator(
141 95 model=model,
... ... @@ -145,7 +99,7 @@ def create_translation_provider(query_config: Any = None) -&gt; Any:
145 99 translation_context=getattr(qc, "translation_context", "e-commerce product search"),
146 100 )
147 101  
148   - if provider in ("http", "service"):
  102 + elif provider in ("http", "service"):
149 103 base_url = get_translation_base_url()
150 104 model = pc.get("model") or "qwen"
151 105 timeout = pc.get("timeout_sec", 10.0)
... ... @@ -154,7 +108,26 @@ def create_translation_provider(query_config: Any = None) -&gt; Any:
154 108 base_url=base_url,
155 109 model=model,
156 110 timeout_sec=float(timeout),
157   - translation_context=getattr(qc, "translation_context", "e-commerce product search"),
  111 + )
  112 +
  113 + elif provider == "llm":
  114 + from query.llm_translate import LLMTranslatorProvider
  115 + model = pc.get("model")
  116 + timeout = float(pc.get("timeout_sec", 30.0))
  117 + base_url = (pc.get("base_url") or "").strip() or None
  118 + return LLMTranslatorProvider(
  119 + model=model,
  120 + timeout_sec=timeout,
  121 + base_url=base_url,
  122 + )
  123 +
  124 + elif provider == "deepl":
  125 + from query.deepl_provider import DeepLProvider
  126 + qc = query_config or _empty_query_config()
  127 + return DeepLProvider(
  128 + api_key=getattr(qc, "translation_api_key", None),
  129 + timeout=float(pc.get("timeout_sec", 10.0)),
  130 + glossary_id=pc.get("glossary_id") or getattr(qc, "translation_glossary_id", None),
158 131 )
159 132  
160 133 raise ValueError(f"Unsupported translation provider: {provider}")
... ...
query/deepl_provider.py 0 → 100644
... ... @@ -0,0 +1,203 @@
  1 +"""
  2 +DeepL backend provider.
  3 +
  4 +This module only handles network calls to DeepL.
  5 +It does not handle cache, async fanout, or fallback semantics.
  6 +"""
  7 +
  8 +from __future__ import annotations
  9 +
  10 +import logging
  11 +import os
  12 +import re
  13 +from typing import Dict, Optional, Tuple
  14 +
  15 +import requests
  16 +from config.services_config import get_translation_config
  17 +
  18 +
  19 +logger = logging.getLogger(__name__)
  20 +
  21 +DEFAULT_CONTEXTS: Dict[str, Dict[str, str]] = {
  22 + "sku_name": {
  23 + "zh": "商品SKU名称",
  24 + "en": "product SKU name",
  25 + },
  26 + "ecommerce_search_query": {
  27 + "zh": "电商",
  28 + "en": "e-commerce",
  29 + },
  30 + "general": {
  31 + "zh": "",
  32 + "en": "",
  33 + },
  34 +}
  35 +SCENE_NAMES = frozenset(DEFAULT_CONTEXTS.keys())
  36 +
  37 +
  38 +def _merge_contexts(raw: object) -> Dict[str, Dict[str, str]]:
  39 + merged: Dict[str, Dict[str, str]] = {
  40 + scene: dict(lang_map) for scene, lang_map in DEFAULT_CONTEXTS.items()
  41 + }
  42 + if not isinstance(raw, dict):
  43 + return merged
  44 + for scene, lang_map in raw.items():
  45 + if not isinstance(lang_map, dict):
  46 + continue
  47 + scene_name = str(scene or "").strip()
  48 + if not scene_name:
  49 + continue
  50 + merged.setdefault(scene_name, {})
  51 + for lang, value in lang_map.items():
  52 + lang_key = str(lang or "").strip().lower()
  53 + context_value = str(value or "").strip()
  54 + if lang_key and context_value:
  55 + merged[scene_name][lang_key] = context_value
  56 + return merged
  57 +
  58 +
class DeepLProvider:
    """DeepL HTTP translation backend.

    Strict failure semantics: ``translate`` returns the translated string on
    success and ``None`` on any failure (missing key, HTTP error, timeout).
    Caching, async fan-out, and fallback are the caller's responsibility.
    """

    API_URL = "https://api.deepl.com/v2/translate"  # Pro tier
    # ISO-639-1 lowercase code -> DeepL language code.
    LANG_CODE_MAP = {
        "zh": "ZH",
        "en": "EN",
        "ru": "RU",
        "ar": "AR",
        "ja": "JA",
        "es": "ES",
        "de": "DE",
        "fr": "FR",
        "it": "IT",
        "pt": "PT",
    }

    def __init__(
        self,
        api_key: Optional[str],
        *,
        timeout: float = 10.0,
        glossary_id: Optional[str] = None,
    ) -> None:
        """Read provider config (services.translation.providers.deepl) and
        fall back to constructor args / environment for the API key."""
        cfg = get_translation_config()
        provider_cfg = cfg.providers.get("deepl", {}) if isinstance(cfg.providers, dict) else {}
        self.api_key = api_key or os.getenv("DEEPL_AUTH_KEY")
        # Config timeout wins over the constructor argument.
        self.timeout = float(provider_cfg.get("timeout_sec") or timeout or 10.0)
        self.glossary_id = glossary_id or provider_cfg.get("glossary_id")
        self.model = "deepl"
        # Built-in scene presets merged with tenant-configured overrides.
        self.context_presets = _merge_contexts(provider_cfg.get("contexts"))
        if not self.api_key:
            logger.warning("DEEPL_AUTH_KEY not set; DeepL translation is unavailable")

    def _resolve_request_context(
        self,
        target_lang: str,
        context: Optional[str],
        prompt: Optional[str],
    ) -> Optional[str]:
        """Pick the `context` string sent to DeepL.

        Priority: explicit prompt > scene preset (built-in or tenant-configured)
        > free-form context string > the "general" preset.
        """
        if prompt:
            return prompt
        # Check against the merged presets (not just SCENE_NAMES) so scenes
        # added via provider config are recognized as scenes too.
        if context and context in self.context_presets:
            scene_map = self.context_presets.get(context) or self.context_presets.get("general") or {}
            tgt = (target_lang or "").strip().lower()
            return scene_map.get(tgt) or scene_map.get("en")
        if context:
            # Unknown scene name: treat as a free-form context string.
            return context
        # No context given: fall back to the "general" preset ("default" was
        # never a preset key, so the old lookup could never match).
        scene_map = self.context_presets.get("general") or {}
        tgt = (target_lang or "").strip().lower()
        return scene_map.get(tgt) or scene_map.get("en")

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> Optional[str]:
        """Translate ``text`` via DeepL; return the translation or ``None``."""
        if not self.api_key:
            return None

        target_code = self.LANG_CODE_MAP.get((target_lang or "").lower(), (target_lang or "").upper())
        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        api_context = self._resolve_request_context(target_lang, context, prompt)
        # Very short Chinese terms are wrapped with a purchase verb so DeepL
        # disambiguates them as product terms; unwrapped again afterwards.
        text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)

        payload = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }
        if source_lang:
            payload["source_lang"] = self.LANG_CODE_MAP.get(source_lang.lower(), source_lang.upper())
        if api_context:
            payload["context"] = api_context
        if self.glossary_id:
            payload["glossary_id"] = self.glossary_id

        try:
            response = requests.post(self.API_URL, headers=headers, json=payload, timeout=self.timeout)
            if response.status_code != 200:
                logger.warning(
                    "[deepl] Failed | status=%s tgt=%s body=%s",
                    response.status_code,
                    target_code,
                    (response.text or "")[:200],
                )
                return None

            data = response.json()
            translations = data.get("translations") or []
            if not translations:
                return None
            translated = translations[0].get("text")
            if not translated:
                return None
            if needs_extraction:
                translated = self._extract_term_from_translation(translated, text, target_code)
            return translated
        except requests.Timeout:
            logger.warning("[deepl] Timeout | tgt=%s timeout=%.1fs", target_code, self.timeout)
            return None
        except Exception as exc:
            logger.warning("[deepl] Exception | tgt=%s error=%s", target_code, exc, exc_info=True)
            return None

    def _add_ecommerce_context(
        self,
        text: str,
        source_lang: Optional[str],
        context: Optional[str],
    ) -> Tuple[str, bool]:
        """Wrap very short zh terms as '购买 <term>' for e-commerce contexts.

        Returns (text_to_send, needs_extraction). Only applies when the
        resolved context mentions "e-commerce" and the source is Chinese.
        """
        if not context or "e-commerce" not in context.lower():
            return text, False
        if (source_lang or "").lower() != "zh":
            return text, False

        term = (text or "").strip()
        # Single token of at most 2 characters is too ambiguous alone.
        if len(term.split()) == 1 and len(term) <= 2:
            return f"购买 {term}", True
        return text, False

    def _extract_term_from_translation(
        self,
        translated_text: str,
        original_text: str,
        target_lang_code: str,
    ) -> str:
        """Strip the injected purchase-verb wording from an EN translation.

        Scans from the end for the first word that is not a known filler
        ("buy", "product", ...); non-English targets are returned untouched.
        """
        del original_text  # kept for signature stability; not needed
        if target_lang_code != "EN":
            return translated_text

        words = translated_text.strip().split()
        if len(words) <= 1:
            return translated_text
        context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
        for word in reversed(words):
            normalized = re.sub(r"[.,!?;:]+$", "", word.lower())
            if normalized not in context_words:
                return normalized
        # Everything was filler: fall back to the last word.
        return re.sub(r"[.,!?;:]+$", "", words[-1].lower())
  203 +
... ...
query/llm_translate.py
1 1 """
2   -LLM-based translation helper using Qwen chat model.
  2 +LLM-based translation backend (DashScope-compatible OpenAI API).
3 3  
4   -This module provides a thin wrapper around DashScope's `qwen-flash` model
5   -for high-quality, prompt-controlled translation, independent of the main
6   -`Translator` (machine translation) pipeline.
7   -
8   -Usage example:
9   -
10   - from query.llm_translate import llm_translate
11   -
12   - result = llm_translate(
13   - text="我看到这个视频后没有笑",
14   - target_lang="en",
15   - source_lang="zh",
16   - source_lang_label="中文",
17   - target_lang_label="英文",
18   - )
  4 +Failure semantics are strict:
  5 +- success: translated string
  6 +- failure: None
19 7 """
20 8  
21 9 from __future__ import annotations
... ... @@ -23,113 +11,159 @@ from __future__ import annotations
23 11 import logging
24 12 import os
25 13 import time
26   -from typing import Dict, Optional
  14 +from typing import Optional
27 15  
28 16 from openai import OpenAI
29 17  
30 18 from config.env_config import DASHSCOPE_API_KEY
31 19 from config.services_config import get_translation_config
  20 +from config.translate_prompts import TRANSLATION_PROMPTS, SOURCE_LANG_CODE_MAP
  21 +
32 22  
33 23 logger = logging.getLogger(__name__)
34 24  
35 25  
36   -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1
37   -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1
38   -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1
39   -#
40   -# 默认保持与现有翻译/索引脚本相同的美国地域,可通过环境变量覆盖:
41   -# DASHSCOPE_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
42 26 DEFAULT_QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
43   -QWEN_MODEL_NAME = "qwen-flash"
44   -
45   -
46   -# 由调用方提供的语言标签/代码填充,占位符说明:
47   -# - source_lang: 源语言的人类可读名称(按目标语言本地化,例如 "中文", "English")
48   -# - target_lang: 目标语言的人类可读名称
49   -# - src_lang_code: 源语言代码,例如 "zh"
50   -# - tgt_lang_code: 目标语言代码,例如 "en"
51   -TRANSLATION_PROMPTS: Dict[str, str] = {
52   - "zh": """你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译员。你的目标是在遵循 {target_lang} 的语法、词汇和文化习惯的前提下,准确传达原始 {source_lang} 文本的含义和细微差别。请只输出 {target_lang} 的翻译内容,不要包含任何额外的解释或评论。请将以下 {source_lang} 文本翻译成 {target_lang}:
53   -
54   -{text}""",
55   - "en": """You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Your goal is to accurately convey the meaning and nuances of the original {source_lang} text while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities. Produce only the {target_lang} translation, without any additional explanations or commentary. Please translate the following {source_lang} text into {target_lang}:
56   -
57   -{text}""",
58   - "ru": """Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Ваша задача — точно передать смысл и нюансы исходного текста на {source_lang}, соблюдая грамматику, лексику и культурные особенности {target_lang}. Выводите только перевод на {target_lang}, без каких-либо дополнительных объяснений или комментариев. Пожалуйста, переведите следующий текст с {source_lang} на {target_lang}:
59   -
60   -{text}""",
61   - "ar": """أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). هدفك هو نقل المعنى والدلالات الدقيقة للنص الأصلي بلغة {source_lang} بدقة، مع الالتزام بقواعد اللغة والمفردات والحساسيات الثقافية الخاصة بلغة {target_lang}. قم بإنتاج الترجمة إلى {target_lang} فقط دون أي شروحات أو تعليقات إضافية. يرجى ترجمة النص التالي من {source_lang} إلى {target_lang}:
62   -
63   -{text}""",
64   - "ja": """あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロの翻訳者です。{target_lang} の文法、語彙、文化的配慮に従いながら、元の {source_lang} テキストの意味やニュアンスを正確に伝えることが目的です。追加の説明やコメントは一切含めず、{target_lang} の翻訳のみを出力してください。次の {source_lang} テキストを {target_lang} に翻訳してください:
65   -
66   -{text}""",
67   - "es": """Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Tu objetivo es transmitir con precisión el significado y los matices del texto original en {source_lang}, respetando la gramática, el vocabulario y las sensibilidades culturales de {target_lang}. Produce únicamente la traducción en {target_lang}, sin explicaciones ni comentarios adicionales. Por favor, traduce el siguiente texto de {source_lang} a {target_lang}:
68   -
69   -{text}""",
70   - "de": """Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Dein Ziel ist es, die Bedeutung und Nuancen des ursprünglichen {source_lang}-Textes genau zu vermitteln und dabei die Grammatik, den Wortschatz und die kulturellen Besonderheiten von {target_lang} zu berücksichtigen. Gib ausschließlich die Übersetzung in {target_lang} aus, ohne zusätzliche Erklärungen oder Kommentare. Bitte übersetze den folgenden {source_lang}-Text in {target_lang}:
71   -
72   -{text}""",
73   - "fr": """Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Votre objectif est de transmettre fidèlement le sens et les nuances du texte original en {source_lang}, tout en respectant la grammaire, le vocabulaire et les sensibilités culturelles de {target_lang}. Produisez uniquement la traduction en {target_lang}, sans explications ni commentaires supplémentaires. Veuillez traduire le texte suivant de {source_lang} vers {target_lang} :
74   -
75   -{text}""",
76   - "it": """Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Il tuo obiettivo è trasmettere con precisione il significato e le sfumature del testo originale in {source_lang}, rispettando la grammatica, il vocabolario e le sensibilità culturali di {target_lang}. Produci solo la traduzione in {target_lang}, senza spiegazioni o commenti aggiuntivi. Per favore traduci il seguente testo da {source_lang} a {target_lang}:
77   -
78   -{text}""",
79   - "pt": """Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Seu objetivo é transmitir com precisão o significado e as nuances do texto original em {source_lang}, respeitando a gramática, o vocabulário e as sensibilidades culturais de {target_lang}. Produza apenas a tradução em {target_lang}, sem quaisquer explicações ou comentários adicionais. Por favor, traduza o seguinte texto de {source_lang} para {target_lang}:
80   -
81   -{text}""",
82   -}
83   -
84   -
85   -def _get_qwen_client(base_url: Optional[str] = None) -> Optional[OpenAI]:
86   - """
87   - Lazily construct an OpenAI-compatible client for DashScope.
88   -
89   - Uses DASHSCOPE_API_KEY and base_url (provider config / env) to configure endpoint.
90   - """
91   - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
92   - if not api_key:
93   - logger.warning("DASHSCOPE_API_KEY not set; llm-based translation will be disabled")
94   - return None
95   -
96   - # 优先使用显式传入的 base_url,其次环境变量,最后默认地域。
97   - base_url = (
98   - (base_url or "").strip()
99   - or os.getenv("DASHSCOPE_BASE_URL")
100   - or DEFAULT_QWEN_BASE_URL
101   - )
102   -
103   - try:
104   - client = OpenAI(api_key=api_key, base_url=base_url)
105   - return client
106   - except Exception as exc:
107   - logger.error("Failed to initialize DashScope OpenAI client: %s", exc, exc_info=True)
108   - return None
  27 +DEFAULT_LLM_MODEL = "qwen-flash"
109 28  
110 29  
def _build_prompt(
    text: str,
    *,
    source_lang: Optional[str],
    target_lang: str,
    scene: Optional[str],
) -> str:
    """Render a translation prompt from config.translate_prompts templates.

    Templates are grouped by scene, then keyed by target-language code, and
    must expose the placeholders {source_lang}, {src_lang_code},
    {target_lang}, {tgt_lang_code} and {text}. Language codes double as the
    human-readable labels unless SOURCE_LANG_CODE_MAP provides a nicer one.
    """
    tgt_code = (target_lang or "").lower() or "en"
    src_code = (source_lang or "auto").lower()

    # Normalize the business scene and map legacy names onto the canonical
    # template groups (unknown scenes fall through unchanged).
    scene_key = (scene or "").strip() or "general"
    legacy_aliases = {
        "query": "ecommerce_search_query",
        "ecommerce_search": "ecommerce_search_query",
        "ecommerce_search_query": "ecommerce_search_query",
        "product_title": "sku_name",
        "sku_name": "sku_name",
    }
    group_key = legacy_aliases.get(scene_key, scene_key)
    group = TRANSLATION_PROMPTS.get(group_key) or TRANSLATION_PROMPTS["general"]

    # Prefer the target-language template, then English, then a minimal
    # last-resort template (should not happen with a complete config).
    template = group.get(tgt_code) or group.get("en")
    if not template:
        template = (
            "You are a professional {source_lang} ({src_lang_code}) to "
            "{target_lang} ({tgt_lang_code}) translator, output only the translation: {text}"
        )

    # No dedicated label table is maintained; codes are used as labels.
    src_label = SOURCE_LANG_CODE_MAP.get(src_code, src_code)
    tgt_label = SOURCE_LANG_CODE_MAP.get(tgt_code, tgt_code)

    return template.format(
        source_lang=src_label,
        src_lang_code=src_code,
        target_lang=tgt_label,
        tgt_lang_code=tgt_code,
        text=text,
    )
132 78  
class LLMTranslatorProvider:
    """LLM translation backend over a DashScope-compatible OpenAI API.

    Strict failure semantics: ``translate`` returns the translated string on
    success and ``None`` on any failure (no API key, empty completion,
    request error). Empty input is echoed back unchanged.
    """

    def __init__(
        self,
        *,
        model: Optional[str] = None,
        timeout_sec: float = 30.0,
        base_url: Optional[str] = None,
    ) -> None:
        """Resolve model/timeout/base_url from args, provider config
        (services.translation.providers.llm), env, then built-in defaults."""
        cfg = get_translation_config()
        llm_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {}
        self.model = model or llm_cfg.get("model") or DEFAULT_LLM_MODEL
        # NOTE(review): config timeout_sec wins over the constructor argument.
        self.timeout_sec = float(llm_cfg.get("timeout_sec") or timeout_sec or 30.0)
        self.base_url = (
            (base_url or "").strip()
            or (llm_cfg.get("base_url") or "").strip()
            or os.getenv("DASHSCOPE_BASE_URL")
            or DEFAULT_QWEN_BASE_URL
        )
        # Client may be None when no API key is available; translate() then
        # short-circuits to None.
        self.client = self._create_client()

    def _create_client(self) -> Optional[OpenAI]:
        """Build the OpenAI-compatible client, or return None if unconfigured."""
        api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
        if not api_key:
            logger.warning("DASHSCOPE_API_KEY not set; llm translation unavailable")
            return None
        try:
            return OpenAI(api_key=api_key, base_url=self.base_url)
        except Exception as exc:
            logger.error("Failed to initialize llm translation client: %s", exc, exc_info=True)
            return None

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> Optional[str]:
        """Translate ``text`` into ``target_lang``.

        ``context`` selects the prompt scene (see _build_prompt); an explicit
        ``prompt`` bypasses the template entirely. Returns the translation,
        the original text for empty input, or None on failure.
        """
        if not text or not str(text).strip():
            return text
        if not self.client:
            return None

        tgt = (target_lang or "").lower() or "en"
        src = (source_lang or "auto").lower()
        # "default" is not a template group; _build_prompt falls back to the
        # "general" group for unknown scene names.
        scene = context or "default"
        user_prompt = prompt or _build_prompt(
            text=text,
            source_lang=src,
            target_lang=tgt,
            scene=scene,
        )
        start = time.time()
        try:
            # NOTE(review): logs the full rendered prompt at INFO level.
            logger.info(
                "[llm] Request | src=%s tgt=%s model=%s prompt=%s",
                src,
                tgt,
                self.model,
                user_prompt,
            )
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": user_prompt}],
                timeout=self.timeout_sec,
            )
            content = (completion.choices[0].message.content or "").strip()
            latency_ms = (time.time() - start) * 1000
            if not content:
                logger.warning("[llm] Empty result | src=%s tgt=%s latency=%.1fms", src, tgt, latency_ms)
                return None
            logger.info("[llm] Response | src=%s tgt=%s response=%s", src, tgt, content)
            logger.info("[llm] Success | src=%s tgt=%s latency=%.1fms", src, tgt, latency_ms)
            return content
        except Exception as exc:
            latency_ms = (time.time() - start) * 1000
            logger.warning(
                "[llm] Failed | src=%s tgt=%s latency=%.1fms error=%s",
                src,
                tgt,
                latency_ms,
                exc,
                exc_info=True,
            )
            return None
  166 +
133 167 def llm_translate(
134 168 text: str,
135 169 target_lang: str,
... ... @@ -139,100 +173,13 @@ def llm_translate(
139 173 target_lang_label: Optional[str] = None,
140 174 timeout_sec: Optional[float] = None,
141 175 ) -> Optional[str]:
142   - """
143   - Translate text with Qwen chat model using rich prompts.
144   -
145   - - 根据目标语言选择提示词,如果没匹配到则退回英文模板。
146   - - 不对 text 做语言检测或缓存,调用方自行控制。
147   -
148   - Args:
149   - text: 原始文本
150   - target_lang: 目标语言代码(如 "zh", "en")
151   - source_lang: 源语言代码(可选,不影响提示词选择,仅用于日志)
152   - source_lang_label: 源语言展示名称,用于 prompt(默认使用 source_lang)
153   - target_lang_label: 目标语言展示名称,用于 prompt(默认使用 target_lang)
154   - timeout_sec: 请求超时时间(秒,可选;若未配置则从 config 读取或采用默认)
155   -
156   - Returns:
157   - 翻译后的文本;如失败则返回 None。
158   - """
159   - if not text or not str(text).strip():
160   - return text
161   -
162   - cfg = get_translation_config()
163   - provider_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {}
164   -
165   - model_name = provider_cfg.get("model") or QWEN_MODEL_NAME
166   - req_timeout = float(provider_cfg.get("timeout_sec") or timeout_sec or 30.0)
167   - base_url = (provider_cfg.get("base_url") or "").strip() or None
168   -
169   - client = _get_qwen_client(base_url=base_url)
170   - if not client:
171   - # 无法调用云端,直接回退
172   - logger.warning(
173   - "[llm_translate] Client init failed; returning original text. "
174   - "text=%r target_lang=%s source_lang=%s",
175   - text[:80],
176   - target_lang,
177   - source_lang or "auto",
178   - )
179   - return text
180   -
181   - tgt = (target_lang or "").lower() or "en"
182   - src = (source_lang or "auto").lower()
183   - src_label = source_lang_label or src
184   - tgt_label = target_lang_label or tgt
185   -
186   - prompt = _build_prompt(
  176 + provider = LLMTranslatorProvider(timeout_sec=timeout_sec or 30.0)
  177 + return provider.translate(
187 178 text=text,
188   - target_lang=tgt,
189   - source_lang_label=src_label,
190   - target_lang_label=tgt_label,
191   - src_lang_code=src,
192   - tgt_lang_code=tgt,
  179 + target_lang=target_lang,
  180 + source_lang=source_lang,
  181 + context=None,
193 182 )
194 183  
195   - start = time.time()
196   - try:
197   - completion = client.chat.completions.create(
198   - model=model_name,
199   - messages=[
200   - {
201   - "role": "user",
202   - "content": prompt,
203   - }
204   - ],
205   - timeout=req_timeout,
206   - )
207   - content = (completion.choices[0].message.content or "").strip()
208   - duration_ms = (time.time() - start) * 1000
209   - logger.info(
210   - "[llm_translate] Success | model=%s src=%s tgt=%s latency=%.1fms text=%r -> %r",
211   - model_name,
212   - src,
213   - tgt,
214   - duration_ms,
215   - text[:80],
216   - content[:80],
217   - )
218   - return content or text
219   - except Exception as exc:
220   - duration_ms = (time.time() - start) * 1000
221   - logger.warning(
222   - "[llm_translate] Failed | model=%s src=%s tgt=%s latency=%.1fms error=%s",
223   - model_name,
224   - src,
225   - tgt,
226   - duration_ms,
227   - exc,
228   - exc_info=True,
229   - )
230   - # 安全回退:出错时返回原文,避免中断上游流程
231   - return text
232   -
233   -
234   -__all__ = [
235   - "TRANSLATION_PROMPTS",
236   - "llm_translate",
237   -]
238 184  
  185 +__all__ = ["LLMTranslatorProvider", "llm_translate"]
... ...
query/query_parser.py
... ... @@ -8,7 +8,7 @@ from typing import Dict, List, Optional, Any, Union
8 8 import numpy as np
9 9 import logging
10 10 import re
11   -from concurrent.futures import Future, ThreadPoolExecutor, as_completed
  11 +from concurrent.futures import ThreadPoolExecutor, as_completed, wait
12 12  
13 13 from embeddings.text_encoder import TextEmbeddingEncoder
14 14 from config import SearchConfig
... ... @@ -135,6 +135,7 @@ class QueryParser:
135 135 cfg = get_translation_config()
136 136 logger.info("Initializing translator at QueryParser construction (provider=%s)...", cfg.provider)
137 137 self._translator = create_translation_provider(self.config.query_config)
  138 + self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation")
138 139  
139 140 @property
140 141 def text_encoder(self) -> TextEmbeddingEncoder:
... ... @@ -265,6 +266,7 @@ class QueryParser:
265 266 # Stage 4: Translation (with async support and conditional waiting)
266 267 translations = {}
267 268 translation_futures = {}
  269 + translation_executor = None
268 270 index_langs = ["en", "zh"]
269 271 try:
270 272 # 根据租户配置的 index_languages 决定翻译目标语言
... ... @@ -287,48 +289,33 @@ class QueryParser:
287 289 target_langs = target_langs_for_translation
288 290  
289 291 if target_langs:
290   - # Use e-commerce context for better disambiguation
291   - translation_context = self.config.query_config.translation_context
292   - # For query translation, we use a general prompt (not language-specific)
293   - query_prompt = (
294   - self.config.query_config.translation_prompts.get(f"query_{detected_lang}")
295   - or self.config.query_config.translation_prompts.get("query_en")
296   - or self.config.query_config.translation_prompts.get("default_en")
297   - or self.config.query_config.translation_prompts.get("default_zh")
298   - )
299   -
300 292 # Determine if we need to wait for translation results
301 293 # If detected_lang is not in index_languages, we must wait for translation
302 294 need_wait_translation = detected_lang not in index_langs
303   -
  295 +
304 296 if need_wait_translation:
305   - # Use async method that returns Futures, so we can wait for results
306   - translation_results = self.translator.translate_multi_async(
307   - query_text,
308   - target_langs,
309   - source_lang=detected_lang,
310   - context=translation_context,
311   - prompt=query_prompt
  297 + translation_executor = ThreadPoolExecutor(
  298 + max_workers=max(1, min(len(target_langs), 4)),
  299 + thread_name_prefix="query-translation-wait",
312 300 )
313   - # Separate cached results and futures
314   - for lang, result in translation_results.items():
315   - if isinstance(result, Future):
316   - translation_futures[lang] = result
317   - else:
318   - translations[lang] = result
  301 + for lang in target_langs:
  302 + translation_futures[lang] = translation_executor.submit(
  303 + self.translator.translate,
  304 + query_text,
  305 + lang,
  306 + detected_lang,
  307 + "ecommerce_search_query",
  308 + )
319 309 else:
320   - # Use async mode: returns cached translations immediately, missing ones translated in background
321   - translations = self.translator.translate_multi(
322   - query_text,
323   - target_langs,
324   - source_lang=detected_lang,
325   - context=translation_context,
326   - async_mode=True,
327   - prompt=query_prompt
328   - )
329   - # Filter out None values (missing translations that are being processed async)
330   - translations = {k: v for k, v in translations.items() if v is not None}
331   -
  310 + for lang in target_langs:
  311 + self._translation_executor.submit(
  312 + self.translator.translate,
  313 + query_text,
  314 + lang,
  315 + detected_lang,
  316 + "ecommerce_search_query",
  317 + )
  318 +
332 319 if translations:
333 320 log_info(f"Translation completed (cache hit) | Query text: '{query_text}' | Results: {translations}")
334 321 if translation_futures:
... ... @@ -407,15 +394,18 @@ class QueryParser:
407 394 all_futures.append(embedding_future)
408 395 future_to_lang[embedding_future] = ('embedding', None)
409 396  
410   - # Wait for all futures to complete
411   - for future in as_completed(all_futures):
  397 + # Enforce a hard timeout for translation-related work (300ms budget)
  398 + done, not_done = wait(all_futures, timeout=0.3)
  399 + for future in done:
412 400 task_type, lang = future_to_lang[future]
413 401 try:
414 402 result = future.result()
415 403 if task_type == 'translation':
416 404 if result:
417 405 translations[lang] = result
418   - log_info(f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'")
  406 + log_info(
  407 + f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'"
  408 + )
419 409 if context:
420 410 context.store_intermediate_result(f'translation_{lang}', result)
421 411 elif task_type == 'embedding':
... ... @@ -434,10 +424,27 @@ class QueryParser:
434 424 log_info(error_msg)
435 425 if context:
436 426 context.add_warning(error_msg)
437   -
  427 +
  428 + # Log timeouts for any futures that did not finish within 300ms
  429 + if not_done:
  430 + for future in not_done:
  431 + task_type, lang = future_to_lang[future]
  432 + if task_type == 'translation':
  433 + timeout_msg = (
  434 + f"Translation timeout (>300ms) | Language: {lang} | "
  435 + f"Query text: '{query_text}'"
  436 + )
  437 + else:
  438 + timeout_msg = "Query vector generation timeout (>300ms), proceeding without embedding result"
  439 + log_info(timeout_msg)
  440 + if context:
  441 + context.add_warning(timeout_msg)
  442 +
438 443 # Clean up encoding executor
439 444 if encoding_executor:
440 445 encoding_executor.shutdown(wait=False)
  446 + if translation_executor:
  447 + translation_executor.shutdown(wait=False)
441 448  
442 449 # Update translations in context after all are complete
443 450 if translations and context:
... ...
query/qwen_mt_translate.py
1   -"""
2   -Translation service for multi-language query support.
  1 +"""Qwen-MT translation orchestrator with cache and async helpers."""
3 2  
4   -Supports multiple translation models:
5   -- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model
6   -- DeepL: DeepL API for high-quality translations
7   -
8   -重要说明(Qwen 机翻限速):
9   -- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)**
10   -- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流
11   -- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端
12   -
13   -使用方法 (Usage):
14   -
15   -```python
16   -from query.translator import Translator
17   -
18   -# 使用默认的 qwen 模型(推荐)
19   -translator = Translator() # 默认使用 qwen 模型
20   -
21   -# 或显式指定模型
22   -translator = Translator(model='qwen') # 使用 qwen 模型
23   -translator = Translator(model='deepl') # 使用 DeepL 模型
24   -
25   -# 翻译文本
26   -result = translator.translate(
27   - text="我看到这个视频后没有笑",
28   - target_lang="en",
29   - source_lang="auto" # 自动检测源语言
30   -)
31   -```
32   -
33   -配置说明 (Configuration):
34   -- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中)
35   -- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中)
36   -
37   -Qwen 模型参考文档:
38   -- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key
39   -- 模型:qwen-mt-flash(快速翻译模型)
40   -
41   -DeepL 官方文档:
42   -https://developers.deepl.com/api-reference/translate/request-translation
43   -"""
  3 +from __future__ import annotations
44 4  
  5 +import hashlib
  6 +import logging
45 7 import os
46   -import requests
47 8 import re
48   -import redis
49   -from concurrent.futures import ThreadPoolExecutor, Future
50   -from datetime import timedelta
51   -from typing import Dict, List, Optional, Union
52   -import logging
53 9 import time
  10 +from typing import Dict, List, Optional
54 11  
55   -logger = logging.getLogger(__name__)
56   -
57   -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG
  12 +import redis
58 13 from openai import OpenAI
59 14  
  15 +from config.env_config import DASHSCOPE_API_KEY, REDIS_CONFIG
  16 +from config.services_config import get_translation_cache_config
  17 +from config.translate_prompts import SOURCE_LANG_CODE_MAP
60 18  
61   -class Translator:
62   - """
63   - Multi-language translator supporting Qwen and DeepL APIs.
64   -
65   - Default model is 'qwen' which uses Alibaba Cloud DashScope API.
66   - """
67   -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1
68   -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1
69   -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1
  19 +logger = logging.getLogger(__name__)
70 20  
71   - DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier
72   - QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" # 北京地域
73   - # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡
74   - # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1
75   - QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型
76 21  
77   - # Language code mapping
78   - LANG_CODE_MAP = {
79   - 'zh': 'ZH',
80   - 'en': 'EN',
81   - 'ru': 'RU',
82   - 'ar': 'AR',
83   - 'ja': 'JA',
84   - 'es': 'ES',
85   - 'de': 'DE',
86   - 'fr': 'FR',
87   - 'it': 'IT',
88   - 'pt': 'PT',
89   - }
  22 +class Translator:
  23 + QWEN_DEFAULT_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
  24 + QWEN_MODEL = "qwen-mt-flash"
90 25  
91 26 def __init__(
92 27 self,
... ... @@ -95,77 +30,90 @@ class Translator:
95 30 use_cache: bool = True,
96 31 timeout: int = 10,
97 32 glossary_id: Optional[str] = None,
98   - translation_context: Optional[str] = None
  33 + translation_context: Optional[str] = None,
99 34 ):
100   - """
101   - Initialize translator.
102   -
103   - Args:
104   - model: Translation model to use. Options: 'qwen' (default) or 'deepl'
105   - api_key: API key for the selected model (or None to use from config/env)
106   - use_cache: Whether to cache translations
107   - timeout: Request timeout in seconds
108   - glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL)
109   - translation_context: Context hint for translation (e.g., "e-commerce", "product search")
110   - """
111   - self.model = model.lower()
112   - if self.model not in ['qwen', 'deepl']:
113   - raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'")
114   -
115   - # Get API key from config if not provided
116   - if api_key is None:
117   - if self.model == 'qwen':
118   - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
119   - else: # deepl
120   - api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY")
121   -
122   - self.api_key = api_key
123   - self.timeout = timeout
124   - self.use_cache = use_cache
  35 + self.model = self._normalize_model(model)
  36 + self.timeout = int(timeout)
  37 + self.use_cache = bool(use_cache)
125 38 self.glossary_id = glossary_id
126 39 self.translation_context = translation_context or "e-commerce product search"
127   -
128   - # Initialize OpenAI client for Qwen if needed
129   - self.qwen_client = None
130   - if self.model == 'qwen':
131   - if not self.api_key:
132   - logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.")
133   - else:
134   - self.qwen_client = OpenAI(
135   - api_key=self.api_key,
136   - base_url=self.QWEN_BASE_URL,
137   - )
138 40  
139   - # Initialize Redis cache if enabled
140   - if use_cache:
  41 + cache_cfg = get_translation_cache_config()
  42 + self.cache_prefix = str(cache_cfg.get("key_prefix", "trans:v2"))
  43 + self.expire_seconds = int(cache_cfg.get("ttl_seconds", 360 * 24 * 3600))
  44 + self.cache_sliding_expiration = bool(cache_cfg.get("sliding_expiration", True))
  45 + self.cache_include_context = bool(cache_cfg.get("key_include_context", True))
  46 + self.cache_include_prompt = bool(cache_cfg.get("key_include_prompt", True))
  47 + self.cache_include_source_lang = bool(cache_cfg.get("key_include_source_lang", True))
  48 +
  49 + self.qwen_model_name = self._resolve_qwen_model_name(model)
  50 + self._api_key = api_key or self._default_api_key(self.model)
  51 + self._qwen_client: Optional[OpenAI] = None
  52 + base_url = os.getenv("DASHSCOPE_BASE_URL") or self.QWEN_DEFAULT_BASE_URL
  53 + if self._api_key:
141 54 try:
142   - self.redis_client = redis.Redis(
143   - host=REDIS_CONFIG.get('host', 'localhost'),
144   - port=REDIS_CONFIG.get('port', 6479),
145   - password=REDIS_CONFIG.get('password'),
146   - decode_responses=True, # Return str instead of bytes
147   - socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),
148   - socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),
149   - retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),
150   - health_check_interval=10, # 避免复用坏连接
151   - )
152   - # Test connection
153   - self.redis_client.ping()
154   - expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360)
155   - self.expire_time = timedelta(days=expire_days)
156   - self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数
157   - self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans')
158   - logger.info("Redis cache initialized for translations")
159   - except Exception as e:
160   - logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache")
161   - self.redis_client = None
162   - self.cache = None
  55 + self._qwen_client = OpenAI(api_key=self._api_key, base_url=base_url)
  56 + except Exception as exc:
  57 + logger.warning("Failed to initialize qwen-mt client: %s", exc, exc_info=True)
163 58 else:
164   - self.redis_client = None
165   - self.cache = None
166   -
167   - # Thread pool for async translation
168   - self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")
  59 + logger.warning("DASHSCOPE_API_KEY not set; qwen-mt translation unavailable")
  60 +
  61 + self.redis_client = None
  62 + if self.use_cache and bool(cache_cfg.get("enabled", True)):
  63 + self.redis_client = self._init_redis_client()
  64 +
  65 + @staticmethod
  66 + def _normalize_model(model: str) -> str:
  67 + m = (model or "qwen").strip().lower()
  68 + if m.startswith("qwen"):
  69 + return "qwen-mt"
  70 + raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'qwen-mt', 'qwen-mt-flash'")
  71 +
  72 + @staticmethod
  73 + def _resolve_qwen_model_name(model: str) -> str:
  74 + m = (model or "qwen").strip().lower()
  75 + if m in {"qwen", "qwen-mt"}:
  76 + return "qwen-mt-flash"
  77 + return m
  78 +
  79 + @staticmethod
  80 + def _default_api_key(model: str) -> Optional[str]:
  81 + del model
  82 + return DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
  83 +
  84 + def _init_redis_client(self):
  85 + try:
  86 + client = redis.Redis(
  87 + host=REDIS_CONFIG.get("host", "localhost"),
  88 + port=REDIS_CONFIG.get("port", 6479),
  89 + password=REDIS_CONFIG.get("password"),
  90 + decode_responses=True,
  91 + socket_timeout=REDIS_CONFIG.get("socket_timeout", 1),
  92 + socket_connect_timeout=REDIS_CONFIG.get("socket_connect_timeout", 1),
  93 + retry_on_timeout=REDIS_CONFIG.get("retry_on_timeout", False),
  94 + health_check_interval=10,
  95 + )
  96 + client.ping()
  97 + return client
  98 + except Exception as exc:
  99 + logger.warning("Failed to initialize translation redis cache: %s", exc)
  100 + return None
  101 +
  102 + def _build_cache_key(
  103 + self,
  104 + text: str,
  105 + target_lang: str,
  106 + source_lang: Optional[str],
  107 + context: Optional[str],
  108 + prompt: Optional[str],
  109 + ) -> str:
  110 + src = (source_lang or "auto").strip().lower() if self.cache_include_source_lang else "-"
  111 + tgt = (target_lang or "").strip().lower()
  112 + ctx = (context or "").strip() if self.cache_include_context else ""
  113 + prm = (prompt or "").strip() if self.cache_include_prompt else ""
  114 + payload = f"model={self.model}\nsrc={src}\ntgt={tgt}\nctx={ctx}\nprm={prm}\ntext={text}"
  115 + digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()
  116 + return f"{self.cache_prefix}:{self.model}:{src}:{tgt}:{digest}"
169 117  
    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> Optional[str]:
        """Translate *text* into *target_lang* with caching and short-circuits.

        Args:
            text: Source text; blank/whitespace-only input is returned as-is.
            target_lang: Target language code (e.g. 'en', 'zh'); normalized
                to lowercase here.
            source_lang: Optional source language code; empty/None means
                auto-detect downstream.
            context: Domain hint; defaults to self.translation_context.
            prompt: Optional instruction. NOTE(review): prompt and context
                only participate in the cache key — they are not forwarded to
                _translate_qwen; confirm that is intended.

        Returns:
            Translated text, the original text when translation is skipped,
            or None when the qwen-mt call fails. Failures are never cached,
            so the original text cannot poison Redis.
        """
        if not text or not text.strip():
            return text

        tgt = (target_lang or "").strip().lower()
        # Empty source collapses to None so the cache key normalizes to 'auto'.
        src = (source_lang or "").strip().lower() or None
        # Skip the API entirely when the text already looks like the target:
        # English text for 'en', or Chinese/pure-numeric text for 'zh'.
        if tgt == "en" and self._is_english_text(text):
            return text
        if tgt == "zh" and (self._contains_chinese(text) or self._is_pure_number(text)):
            return text

        translation_context = context or self.translation_context
        # Cache check before calling out; helper returns None on miss or when
        # the Redis client is unavailable.
        cached = self._get_cached_translation_redis(text, tgt, src, translation_context, prompt)
        if cached is not None:
            return cached

        result = self._translate_qwen(text, tgt, src)

        # Cache only successful translations.
        if result is not None:
            self._set_cached_translation_redis(text, tgt, result, src, translation_context, prompt)
        return result
270 146  
271 147 def _translate_qwen(
... ... @@ -273,412 +149,63 @@ class Translator:
273 149 text: str,
274 150 target_lang: str,
275 151 source_lang: Optional[str],
276   - context: Optional[str] = None,
277   - prompt: Optional[str] = None
278 152 ) -> Optional[str]:
279   - """
280   - Translate using Qwen MT Flash model via Alibaba Cloud DashScope API.
281   -
282   - Args:
283   - text: Text to translate
284   - target_lang: Target language code ('zh', 'en', 'ru', etc.)
285   - source_lang: Source language code (optional, 'auto' if None)
286   - context: Context hint for translation (optional)
287   - prompt: Translation prompt/instruction (optional)
288   -
289   - Returns:
290   - Translated text or None if translation fails
291   - """
292   - if not self.qwen_client:
293   - logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.")
  153 + if not self._qwen_client:
294 154 return None
295   -
296   - # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping
297   - # 标准来自:你提供的“语言 / 英文名 / 代码”表
298   - qwen_lang_map = {
299   - "en": "English",
300   - "zh": "Chinese",
301   - "zh_tw": "Traditional Chinese",
302   - "ru": "Russian",
303   - "ja": "Japanese",
304   - "ko": "Korean",
305   - "es": "Spanish",
306   - "fr": "French",
307   - "pt": "Portuguese",
308   - "de": "German",
309   - "it": "Italian",
310   - "th": "Thai",
311   - "vi": "Vietnamese",
312   - "id": "Indonesian",
313   - "ms": "Malay",
314   - "ar": "Arabic",
315   - "hi": "Hindi",
316   - "he": "Hebrew",
317   - "my": "Burmese",
318   - "ta": "Tamil",
319   - "ur": "Urdu",
320   - "bn": "Bengali",
321   - "pl": "Polish",
322   - "nl": "Dutch",
323   - "ro": "Romanian",
324   - "tr": "Turkish",
325   - "km": "Khmer",
326   - "lo": "Lao",
327   - "yue": "Cantonese",
328   - "cs": "Czech",
329   - "el": "Greek",
330   - "sv": "Swedish",
331   - "hu": "Hungarian",
332   - "da": "Danish",
333   - "fi": "Finnish",
334   - "uk": "Ukrainian",
335   - "bg": "Bulgarian",
336   - }
337   -
338   - # Convert target language
339   - target_lang_normalized = target_lang.lower()
340   - target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize())
341   -
342   - # Convert source language
343   - source_lang_normalized = (source_lang or "").strip().lower()
344   - if not source_lang_normalized or source_lang_normalized == "auto":
345   - source_lang_qwen = "auto"
346   - else:
347   - source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize())
348   -
349   - # Prepare translation options
350   - translation_options = {
351   - "source_lang": source_lang_qwen,
352   - "target_lang": target_lang_qwen,
353   - }
354   -
355   - # Prepare messages
356   - messages = [
357   - {
358   - "role": "user",
359   - "content": text
360   - }
361   - ]
362   -
363   - start_time = time.time()
  155 + tgt_norm = (target_lang or "").strip().lower()
  156 + src_norm = (source_lang or "").strip().lower()
  157 + tgt_qwen = self.SOURCE_LANG_CODE_MAP.get(tgt_norm, tgt_norm.capitalize())
  158 + src_qwen = "auto" if not src_norm or src_norm == "auto" else self.SOURCE_LANG_CODE_MAP.get(src_norm, src_norm.capitalize())
  159 + start = time.time()
364 160 try:
365   - completion = self.qwen_client.chat.completions.create(
366   - model=self.QWEN_MODEL,
367   - messages=messages,
  161 + completion = self._qwen_client.chat.completions.create(
  162 + model=self.qwen_model_name,
  163 + messages=[{"role": "user", "content": text}],
368 164 extra_body={
369   - "translation_options": translation_options
370   - }
371   - )
372   -
373   - translated_text = completion.choices[0].message.content.strip()
374   - duration_ms = (time.time() - start_time) * 1000
375   -
376   - logger.info(
377   - f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | "
378   - f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms"
379   - )
380   - return translated_text
381   -
382   - except Exception as e:
383   - duration_ms = (time.time() - start_time) * 1000
384   - logger.error(
385   - f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | "
386   - f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True
387   - )
388   - return None
389   -
390   - def _translate_deepl(
391   - self,
392   - text: str,
393   - target_lang: str,
394   - source_lang: Optional[str],
395   - context: Optional[str] = None,
396   - prompt: Optional[str] = None
397   - ) -> Optional[str]:
398   - """
399   - Translate using DeepL API with context and glossary support.
400   -
401   - Args:
402   - text: Text to translate
403   - target_lang: Target language code
404   - source_lang: Source language code (optional)
405   - context: Context hint for translation (e.g., "e-commerce product search")
406   - """
407   - # Map to DeepL language codes
408   - target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())
409   -
410   - headers = {
411   - "Authorization": f"DeepL-Auth-Key {self.api_key}",
412   - "Content-Type": "application/json",
413   - }
414   -
415   - # Use prompt as context parameter for DeepL API (not as text prefix)
416   - # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"
417   - # If prompt is provided, use it as context; otherwise use the default context
418   - api_context = prompt if prompt else context
419   -
420   - # For e-commerce, add context words to help DeepL understand the domain
421   - # This is especially important for single-word ambiguous terms like "车" (car vs rook)
422   - text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)
423   -
424   - payload = {
425   - "text": [text_to_translate],
426   - "target_lang": target_code,
427   - }
428   -
429   - if source_lang:
430   - source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
431   - payload["source_lang"] = source_code
432   -
433   - # Add context parameter (prompt or default context)
434   - # Context influences translation but is not translated itself
435   - if api_context:
436   - payload["context"] = api_context
437   -
438   - # Add glossary if configured
439   - if self.glossary_id:
440   - payload["glossary_id"] = self.glossary_id
441   -
442   - # Note: DeepL API v2 supports "context" parameter for additional context
443   - # that influences translation but is not translated itself.
444   - # We use prompt as context parameter when provided.
445   -
446   - try:
447   - response = requests.post(
448   - self.DEEPL_API_URL,
449   - headers=headers,
450   - json=payload,
451   - timeout=self.timeout
  165 + "translation_options": {
  166 + "source_lang": src_qwen,
  167 + "target_lang": tgt_qwen,
  168 + }
  169 + },
  170 + timeout=self.timeout,
452 171 )
453   -
454   - if response.status_code == 200:
455   - data = response.json()
456   - if "translations" in data and len(data["translations"]) > 0:
457   - translated_text = data["translations"][0]["text"]
458   - # If we added context, extract just the term from the result
459   - if needs_extraction:
460   - translated_text = self._extract_term_from_translation(
461   - translated_text, text, target_code
462   - )
463   - logger.debug(
464   - f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "
465   - f"Translation result: '{translated_text}'"
466   - )
467   - return translated_text
468   - else:
469   - logger.error(
470   - f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "
471   - f"Status code: {response.status_code} | Error message: {response.text}"
472   - )
  172 + content = (completion.choices[0].message.content or "").strip()
  173 + if not content:
473 174 return None
474   -
475   - except requests.Timeout:
  175 + logger.info("[qwen-mt] Success | src=%s tgt=%s latency=%.1fms", src_qwen, tgt_qwen, (time.time() - start) * 1000)
  176 + return content
  177 + except Exception as exc:
476 178 logger.warning(
477   - f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "
478   - f"Timeout: {self.timeout}s"
479   - )
480   - return None
481   - except Exception as e:
482   - logger.error(
483   - f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "
484   - f"Error: {e}", exc_info=True
  179 + "[qwen-mt] Failed | src=%s tgt=%s latency=%.1fms error=%s",
  180 + src_qwen,
  181 + tgt_qwen,
  182 + (time.time() - start) * 1000,
  183 + exc,
  184 + exc_info=True,
485 185 )
486 186 return None
487 187  
488   - # NOTE: _translate_deepl_free is intentionally not implemented.
489   - # We do not support automatic fallback to the free endpoint, to avoid
490   - # mixing Pro keys with https://api-free.deepl.com and related 403 errors.
491   -
492   - def translate_multi(
493   - self,
494   - text: str,
495   - target_langs: List[str],
496   - source_lang: Optional[str] = None,
497   - context: Optional[str] = None,
498   - async_mode: bool = True,
499   - prompt: Optional[str] = None
500   - ) -> Dict[str, Optional[str]]:
501   - """
502   - Translate text to multiple target languages.
503   -
504   - In async_mode=True (default):
505   - - Returns cached translations immediately if available
506   - - For translations that can be optimized (e.g., pure numbers, already in target language),
507   - returns result immediately via synchronous call
508   - - Launches async tasks for other missing translations (non-blocking)
509   - - Returns None for missing translations that require async processing
510   -
511   - In async_mode=False:
512   - - Waits for all translations to complete (blocking)
513   -
514   - Args:
515   - text: Text to translate
516   - target_langs: List of target language codes
517   - source_lang: Source language code (optional)
518   - context: Context hint for translation (optional)
519   - async_mode: If True, return cached results immediately and translate missing ones async
520   - prompt: Translation prompt/instruction (optional)
521 188  
522   - Returns:
523   - Dictionary mapping language code to translated text (only cached results in async mode)
524   - """
525   - results = {}
526   - missing_langs = []
527   - async_langs = []
528   -
529   - # First, get cached translations
530   - for lang in target_langs:
531   - cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
532   - if cached is not None:
533   - results[lang] = cached
534   - else:
535   - missing_langs.append(lang)
536   -
537   - # If async mode and there are missing translations
538   - if async_mode and missing_langs:
539   - # Check if translation can be optimized (immediate return)
540   - for lang in missing_langs:
541   - target_lang = lang.lower()
542   - # Check optimization conditions (same as in translate method)
543   - can_optimize = False
544   - if target_lang == 'en' and self._is_english_text(text):
545   - can_optimize = True
546   - elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
547   - can_optimize = True
548   -
549   - if can_optimize:
550   - # Can be optimized, call translate synchronously for immediate result
551   - results[lang] = self.translate(text, lang, source_lang, context, prompt)
552   - else:
553   - # Requires actual translation, add to async list
554   - async_langs.append(lang)
555   -
556   - # Launch async tasks for translations that require actual API calls
557   - if async_langs:
558   - for lang in async_langs:
559   - self._translate_async(text, lang, source_lang, context, prompt)
560   - # Return None for async translations
561   - for lang in async_langs:
562   - results[lang] = None
563   - else:
564   - # Synchronous mode: wait for all translations
565   - for lang in missing_langs:
566   - results[lang] = self.translate(text, lang, source_lang, context, prompt)
567   -
568   - return results
569   -
570   - def translate_multi_async(
571   - self,
572   - text: str,
573   - target_langs: List[str],
574   - source_lang: Optional[str] = None,
575   - context: Optional[str] = None,
576   - prompt: Optional[str] = None
577   - ) -> Dict[str, Union[str, Future]]:
578   - """
579   - Translate text to multiple target languages asynchronously, returning Futures that can be awaited.
580   -
581   - This method returns a dictionary where:
582   - - If translation is cached, the value is the translation string (immediate)
583   - - If translation needs to be done, the value is a Future object that can be awaited
584   -
585   - Args:
586   - text: Text to translate
587   - target_langs: List of target language codes
588   - source_lang: Source language code (optional)
589   - context: Context hint for translation (optional)
590   - prompt: Translation prompt/instruction (optional)
591   -
592   - Returns:
593   - Dictionary mapping language code to either translation string (cached) or Future object
594   - """
595   - results = {}
596   - missing_langs = []
597   -
598   - # First, get cached translations
599   - for lang in target_langs:
600   - cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
601   - if cached is not None:
602   - results[lang] = cached
603   - else:
604   - missing_langs.append(lang)
605   -
606   - # For missing translations, submit async tasks and return Futures
607   - for lang in missing_langs:
608   - future = self.executor.submit(
609   - self.translate,
610   - text,
611   - lang,
612   - source_lang,
613   - context,
614   - prompt
615   - )
616   - results[lang] = future
617   -
618   - return results
619   -
620   - def _get_cached_translation(
621   - self,
622   - text: str,
623   - target_lang: str,
624   - source_lang: Optional[str] = None,
625   - context: Optional[str] = None,
626   - prompt: Optional[str] = None
627   - ) -> Optional[str]:
628   - """Get translation from cache if available."""
629   - if not self.redis_client:
630   - return None
631   - return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
632   -
633 189 def _get_cached_translation_redis(
634 190 self,
635 191 text: str,
636 192 target_lang: str,
637 193 source_lang: Optional[str] = None,
638 194 context: Optional[str] = None,
639   - prompt: Optional[str] = None
  195 + prompt: Optional[str] = None,
640 196 ) -> Optional[str]:
641   - """
642   - Get translation from Redis cache with sliding expiration.
643   -
644   - 滑动过期机制:每次访问缓存时,重置过期时间为配置的过期时间(默认720天)。
645   - 这样缓存会在最后一次访问后的720天才过期,而不是写入后的720天。
646   - 这确保了常用的翻译缓存不会被过早删除。
647   - """
648 197 if not self.redis_client:
649 198 return None
650   -
  199 + key = self._build_cache_key(text, target_lang, source_lang, context, prompt)
651 200 try:
652   - # Build cache key: prefix:target_lang:text
653   - # For simplicity, we use target_lang and text as key
654   - # Context and prompt are not included in key to maximize cache hits
655   - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
656   - value = self.redis_client.get(cache_key)
657   - if value:
658   - # Sliding expiration: reset expiration time on access
659   - # 每次读取缓存时,重置过期时间为配置的过期时间(最后一次访问后的N天才过期)
660   - try:
661   - self.redis_client.expire(cache_key, self.expire_seconds)
662   - except Exception as expire_error:
663   - # 即使 expire 失败,也返回缓存值(不影响功能)
664   - logger.warning(
665   - f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}"
666   - )
667   -
668   - logger.debug(
669   - f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "
670   - f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s"
671   - )
672   - return value
673   - logger.debug(
674   - f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "
675   - f"Cache key: {cache_key}"
676   - )
  201 + value = self.redis_client.get(key)
  202 + if value and self.cache_sliding_expiration:
  203 + self.redis_client.expire(key, self.expire_seconds)
  204 + return value
  205 + except Exception as exc:
  206 + logger.warning("Redis get translation cache failed: %s", exc)
677 207 return None
678   - except Exception as e:
679   - logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}")
680   - return None
681   -
  208 +
682 209 def _set_cached_translation_redis(
683 210 self,
684 211 text: str,
... ... @@ -686,128 +213,17 @@ class Translator:
686 213 translation: str,
687 214 source_lang: Optional[str] = None,
688 215 context: Optional[str] = None,
689   - prompt: Optional[str] = None
  216 + prompt: Optional[str] = None,
690 217 ) -> None:
691   - """Store translation in Redis cache."""
692 218 if not self.redis_client:
693 219 return
694   -
  220 + key = self._build_cache_key(text, target_lang, source_lang, context, prompt)
695 221 try:
696   - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
697   - self.redis_client.setex(cache_key, self.expire_seconds, translation)
698   - logger.info(
699   - f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | "
700   - f"Cache key: {cache_key} | Translation result: '{translation}'"
701   - )
702   - except Exception as e:
703   - logger.error(
704   - f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | "
705   - f"Error: {e}"
706   - )
707   -
708   - def _translate_async(
709   - self,
710   - text: str,
711   - target_lang: str,
712   - source_lang: Optional[str] = None,
713   - context: Optional[str] = None,
714   - prompt: Optional[str] = None
715   - ):
716   - """Launch async translation task."""
717   - def _do_translate():
718   - try:
719   - result = self.translate(text, target_lang, source_lang, context, prompt)
720   - if result:
721   - logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")
722   - except Exception as e:
723   - logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}")
724   -
725   - self.executor.submit(_do_translate)
726   -
727   - def _add_ecommerce_context(
728   - self,
729   - text: str,
730   - source_lang: Optional[str],
731   - context: Optional[str]
732   - ) -> tuple:
733   - """
734   - Add e-commerce context to text for better disambiguation.
735   -
736   - For single-word ambiguous Chinese terms, we add context words that help
737   - DeepL understand this is an e-commerce/product search context.
738   -
739   - Args:
740   - text: Original text to translate
741   - source_lang: Source language code
742   - context: Context hint
743   -
744   - Returns:
745   - Tuple of (text_with_context, needs_extraction)
746   - - text_with_context: Text to send to DeepL
747   - - needs_extraction: Whether we need to extract the term from the result
748   - """
749   - # Only apply for e-commerce context and Chinese source
750   - if not context or "e-commerce" not in context.lower():
751   - return text, False
752   -
753   - if not source_lang or source_lang.lower() != 'zh':
754   - return text, False
755   -
756   - # For single-word queries, add context to help disambiguation
757   - text_stripped = text.strip()
758   - if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:
759   - # Common ambiguous Chinese e-commerce terms like "车" (car vs rook)
760   - # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term])
761   - # This helps DeepL understand the e-commerce context
762   - # We'll need to extract just the term from the translation result
763   - context_phrase = f"购买 {text_stripped}"
764   - return context_phrase, True
765   -
766   - # For multi-word queries, DeepL usually has enough context
767   - return text, False
768   -
769   - def _extract_term_from_translation(
770   - self,
771   - translated_text: str,
772   - original_text: str,
773   - target_lang_code: str
774   - ) -> str:
775   - """
776   - Extract the actual term from a translation that included context.
777   -
778   - For example, if we translated "购买 车" (buy car) and got "buy car",
779   - we want to extract just "car".
780   -
781   - Args:
782   - translated_text: Full translation result
783   - original_text: Original single-word query
784   - target_lang_code: Target language code (EN, ZH, etc.)
785   -
786   - Returns:
787   - Extracted term or original translation if extraction fails
788   - """
789   - # For English target, try to extract the last word (the actual term)
790   - if target_lang_code == "EN":
791   - words = translated_text.strip().split()
792   - if len(words) > 1:
793   - # Usually the last word is the term we want
794   - # But we need to be smart - if it's "buy car", we want "car"
795   - # Common context words to skip: buy, purchase, product, item, etc.
796   - context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
797   - # Try to find the term (not a context word)
798   - for word in reversed(words):
799   - word_lower = word.lower().rstrip('.,!?;:')
800   - if word_lower not in context_words:
801   - return word_lower
802   - # If all words are context words, return the last one
803   - return words[-1].lower().rstrip('.,!?;:')
804   -
805   - # For other languages or if extraction fails, return as-is
806   - # The user can configure a glossary for better results
807   - return translated_text
  222 + self.redis_client.setex(key, self.expire_seconds, translation)
  223 + except Exception as exc:
  224 + logger.warning("Redis set translation cache failed: %s", exc)
808 225  
809 226 def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool:
810   - """True if shop language matches index language (use source, no translate)."""
811 227 if not shop_lang_lower or not lang_code:
812 228 return False
813 229 if shop_lang_lower == lang_code:
... ... @@ -818,146 +234,27 @@ class Translator:
818 234 return True
819 235 return False
820 236  
821   - def translate_for_indexing(
822   - self,
823   - text: str,
824   - shop_language: str,
825   - source_lang: Optional[str] = None,
826   - context: Optional[str] = None,
827   - prompt: Optional[str] = None,
828   - index_languages: Optional[List[str]] = None,
829   - ) -> Dict[str, Optional[str]]:
830   - """
831   - Translate text for indexing based on shop language and tenant index_languages.
832   -
833   - For each language in index_languages: use source text if shop language matches,
834   - otherwise translate to that language.
835   -
836   - Args:
837   - text: Text to translate
838   - shop_language: Shop primary language (e.g. 'zh', 'en', 'ru')
839   - source_lang: Source language code (optional)
840   - context: Additional context for translation (optional)
841   - prompt: Translation prompt (optional)
842   - index_languages: Languages to index (from tenant_config). Default ["en", "zh"].
843   -
844   - Returns:
845   - Dict keyed by each index_language with translated or source text (or None).
846   - """
847   - langs = index_languages if index_languages else ["en", "zh"]
848   - results = {lang: None for lang in langs}
849   - if not text or not text.strip():
850   - return results
851   - if re.match(r'^[\d\s_-]+$', text):
852   - logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
853   - return results
854   -
855   - shop_lang_lower = (shop_language or "").strip().lower()
856   - targets = []
857   - for lang in langs:
858   - if self._shop_lang_matches(shop_lang_lower, lang):
859   - results[lang] = text
860   - else:
861   - targets.append(lang)
862   -
863   - for target_lang in targets:
864   - cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
865   - if cached:
866   - results[target_lang] = cached
867   - logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}")
868   - continue
869   - translated = self.translate(
870   - text,
871   - target_lang=target_lang,
872   - source_lang=source_lang or shop_language,
873   - context=context,
874   - prompt=prompt,
875   - )
876   - results[target_lang] = translated
877   - return results
878   -
879   - def get_translation_needs(
880   - self,
881   - detected_lang: str,
882   - supported_langs: List[str]
883   - ) -> List[str]:
884   - """
885   - Determine which languages need translation.
886   -
887   - Args:
888   - detected_lang: Detected query language
889   - supported_langs: List of supported languages
890   -
891   - Returns:
892   - List of language codes to translate to
893   - """
894   - # If detected language is in supported list, translate to others
  237 + def get_translation_needs(self, detected_lang: str, supported_langs: List[str]) -> List[str]:
895 238 if detected_lang in supported_langs:
896   - return [lang for lang in supported_langs if detected_lang != lang]
897   -
898   - # Otherwise, translate to all supported languages
  239 + return [lang for lang in supported_langs if lang != detected_lang]
899 240 return supported_langs
900   -
  241 +
901 242 def _is_english_text(self, text: str) -> bool:
902   - """
903   - Check if text is primarily English (ASCII letters, numbers, common punctuation).
904   -
905   - Args:
906   - text: Text to check
907   -
908   - Returns:
909   - True if text appears to be English
910   - """
911 243 if not text or not text.strip():
912 244 return True
913   -
914   - # Remove whitespace and common punctuation
915   - text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text)
  245 + text_clean = re.sub(r"[\s\.,!?;:\-\'\"\(\)\[\]{}]", "", text)
916 246 if not text_clean:
917 247 return True
918   -
919   - # Check if all remaining characters are ASCII (letters, numbers)
920   - # This is a simple heuristic: if most characters are ASCII, it's likely English
921 248 ascii_count = sum(1 for c in text_clean if ord(c) < 128)
922   - ratio = ascii_count / len(text_clean) if text_clean else 0
923   -
924   - # If more than 80% are ASCII characters, consider it English
925   - return ratio > 0.8
926   -
  249 + return (ascii_count / len(text_clean)) > 0.8
  250 +
927 251 def _contains_chinese(self, text: str) -> bool:
928   - """
929   - Check if text contains Chinese characters (Han characters).
930   -
931   - Args:
932   - text: Text to check
933   -
934   - Returns:
935   - True if text contains Chinese characters
936   - """
937 252 if not text:
938 253 return False
939   -
940   - # Check for Chinese characters (Unicode range: \u4e00-\u9fff)
941   - chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
942   - return bool(chinese_pattern.search(text))
943   -
  254 + return bool(re.search(r"[\u4e00-\u9fff]", text))
  255 +
944 256 def _is_pure_number(self, text: str) -> bool:
945   - """
946   - Check if text is purely numeric (digits, possibly with spaces, dots, commas).
947   -
948   - Args:
949   - text: Text to check
950   -
951   - Returns:
952   - True if text is purely numeric
953   - """
954 257 if not text or not text.strip():
955 258 return False
956   -
957   - # Remove whitespace, dots, commas (common number separators)
958   - text_clean = re.sub(r'[\s\.,]', '', text.strip())
959   - if not text_clean:
960   - return False
961   -
962   - # Check if all remaining characters are digits
963   - return text_clean.isdigit()
  259 + text_clean = re.sub(r"[\s\.,]", "", text.strip())
  260 + return bool(text_clean) and text_clean.isdigit()
... ...
query/test_translation.py
... ... @@ -14,6 +14,7 @@ Test content:
14 14 import sys
15 15 import os
16 16 from pathlib import Path
  17 +from concurrent.futures import ThreadPoolExecutor
17 18  
18 19 # Add parent directory to path
19 20 sys.path.insert(0, str(Path(__file__).parent.parent))
... ... @@ -42,9 +43,6 @@ def test_config_loading():
42 43  
43 44 print(f"✓ Configuration loaded successfully")
44 45 print(f" Translation service: {config.query_config.translation_service}")
45   - print(f" Translation prompt configuration:")
46   - for key, value in config.query_config.translation_prompts.items():
47   - print(f" {key}: {value[:60]}..." if len(value) > 60 else f" {key}: {value}")
48 46  
49 47 return config
50 48 except Exception as e:
... ... @@ -72,34 +70,23 @@ def test_translator_sync(config):
72 70 translation_context=config.query_config.translation_context
73 71 )
74 72  
75   - # 测试商品标题翻译(使用product_title提示词)
  73 + # 测试商品标题翻译(使用sku_name提示词)
76 74 test_texts = [
77   - ("蓝牙耳机", "zh", "en", "product_title"),
78   - ("Wireless Headphones", "en", "zh", "product_title"),
  75 + ("蓝牙耳机", "zh", "en", "sku_name"),
  76 + ("Wireless Headphones", "en", "zh", "sku_name"),
79 77 ]
80 78  
81   - for text, source_lang, target_lang, prompt_type in test_texts:
82   - if prompt_type == "product_title":
83   - if target_lang == "zh":
84   - prompt = config.query_config.translation_prompts.get('product_title_zh')
85   - else:
86   - prompt = config.query_config.translation_prompts.get('product_title_en')
87   - else:
88   - if target_lang == "zh":
89   - prompt = config.query_config.translation_prompts.get('default_zh')
90   - else:
91   - prompt = config.query_config.translation_prompts.get('default_en')
92   -
  79 + for text, source_lang, target_lang, scene in test_texts:
93 80 print(f"\nTranslation test:")
94 81 print(f" Original text ({source_lang}): {text}")
95 82 print(f" Target language: {target_lang}")
96   - print(f" Prompt: {prompt[:50] if prompt else 'None'}...")
  83 + print(f" Scene: {scene}")
97 84  
98 85 result = translator.translate(
99 86 text,
100 87 target_lang=target_lang,
101 88 source_lang=source_lang,
102   - prompt=prompt
  89 + context=scene,
103 90 )
104 91  
105 92 if result:
... ... @@ -131,43 +118,25 @@ def test_translator_async(config, translator):
131 118 query_text = "手机"
132 119 target_langs = ['en']
133 120 source_lang = 'zh'
134   -
135   - query_prompt = config.query_config.translation_prompts.get('query_zh')
136   -
  121 +
137 122 print(f"Query text: {query_text}")
138 123 print(f"Target languages: {target_langs}")
139   - print(f"Prompt: {query_prompt}")
140   -
141   - # 异步模式(立即返回,后台翻译)
142   - results = translator.translate_multi(
143   - query_text,
144   - target_langs,
145   - source_lang=source_lang,
146   - context=config.query_config.translation_context,
147   - async_mode=True,
148   - prompt=query_prompt
149   - )
150   -
151   - print(f"\nAsynchronous translation results:")
152   - for lang, translation in results.items():
153   - if translation:
154   - print(f" {lang}: {translation} (cache hit)")
155   - else:
156   - print(f" {lang}: None (translating in background...)")
157   -
158   - # 同步模式(等待完成)
159   - print(f"\nSynchronous translation (waiting for completion):")
160   - results_sync = translator.translate_multi(
161   - query_text,
162   - target_langs,
163   - source_lang=source_lang,
164   - context=config.query_config.translation_context,
165   - async_mode=False,
166   - prompt=query_prompt
167   - )
  124 + print("Scene: ecommerce_search_query")
168 125  
169   - for lang, translation in results_sync.items():
170   - print(f" {lang}: {translation}")
  126 + print(f"\nConcurrent translation via generic translate():")
  127 + with ThreadPoolExecutor(max_workers=len(target_langs)) as executor:
  128 + futures = {
  129 + lang: executor.submit(
  130 + translator.translate,
  131 + query_text,
  132 + lang,
  133 + source_lang,
  134 + "ecommerce_search_query",
  135 + )
  136 + for lang in target_langs
  137 + }
  138 + for lang, future in futures.items():
  139 + print(f" {lang}: {future.result()}")
171 140  
172 141 except Exception as e:
173 142 print(f"✗ Asynchronous translation test failed: {e}")
... ... @@ -193,14 +162,13 @@ def test_cache():
193 162 test_text = "测试文本"
194 163 target_lang = "en"
195 164 source_lang = "zh"
196   - prompt = config.query_config.translation_prompts.get('default_zh')
197 165  
198 166 print(f"First translation (should call API or return mock):")
199   - result1 = translator.translate(test_text, target_lang, source_lang, prompt=prompt)
  167 + result1 = translator.translate(test_text, target_lang, source_lang, context="default")
200 168 print(f" Result: {result1}")
201 169  
202 170 print(f"\nSecond translation (should use cache):")
203   - result2 = translator.translate(test_text, target_lang, source_lang, prompt=prompt)
  171 + result2 = translator.translate(test_text, target_lang, source_lang, context="default")
204 172 print(f" Result: {result2}")
205 173  
206 174 if result1 == result2:
... ... @@ -231,17 +199,16 @@ def test_context_parameter():
231 199  
232 200 # 测试带context和不带context的翻译
233 201 text = "手机"
234   - prompt = config.query_config.translation_prompts.get('query_zh')
235 202  
236 203 print(f"Test text: {text}")
237   - print(f"Prompt (as context): {prompt}")
  204 + print("Scene: ecommerce_search_query")
238 205  
239 206 # 带context的翻译
240 207 result_with_context = translator.translate(
241 208 text,
242 209 target_lang='en',
243 210 source_lang='zh',
244   - prompt=prompt
  211 + context="ecommerce_search_query",
245 212 )
246 213 print(f"\nTranslation result with context: {result_with_context}")
247 214  
... ...
query/translator.py deleted
... ... @@ -1,963 +0,0 @@
1   -"""
2   -Translation service for multi-language query support.
3   -
4   -Supports multiple translation models:
5   -- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model
6   -- DeepL: DeepL API for high-quality translations
7   -
8   -重要说明(Qwen 机翻限速):
9   -- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)**
10   -- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流
11   -- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端
12   -
13   -使用方法 (Usage):
14   -
15   -```python
16   -from query.qwen_mt_translate import Translator
17   -
18   -# 使用默认的 qwen 模型(推荐)
19   -translator = Translator() # 默认使用 qwen 模型
20   -
21   -# 或显式指定模型
22   -translator = Translator(model='qwen') # 使用 qwen 模型
23   -translator = Translator(model='deepl') # 使用 DeepL 模型
24   -
25   -# 翻译文本
26   -result = translator.translate(
27   - text="我看到这个视频后没有笑",
28   - target_lang="en",
29   - source_lang="auto" # 自动检测源语言
30   -)
31   -```
32   -
33   -配置说明 (Configuration):
34   -- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中)
35   -- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中)
36   -
37   -Qwen 模型参考文档:
38   -- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key
39   -- 模型:qwen-mt-flash(快速翻译模型)
40   -
41   -DeepL 官方文档:
42   -https://developers.deepl.com/api-reference/translate/request-translation
43   -"""
44   -
45   -import os
46   -import requests
47   -import re
48   -import redis
49   -from concurrent.futures import ThreadPoolExecutor, Future
50   -from datetime import timedelta
51   -from typing import Dict, List, Optional, Union
52   -import logging
53   -import time
54   -
55   -logger = logging.getLogger(__name__)
56   -
57   -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG
58   -from openai import OpenAI
59   -
60   -
61   -class Translator:
62   - """
63   - Multi-language translator supporting Qwen and DeepL APIs.
64   -
65   - Default model is 'qwen' which uses Alibaba Cloud DashScope API.
66   - """
67   -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1
68   -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1
69   -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1
70   -
71   - DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier
 72   - QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" # 美国(弗吉尼亚)地域
73   - # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡
74   - # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1
75   - QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型
76   -
77   - # Language code mapping
78   - LANG_CODE_MAP = {
79   - 'zh': 'ZH',
80   - 'en': 'EN',
81   - 'ru': 'RU',
82   - 'ar': 'AR',
83   - 'ja': 'JA',
84   - 'es': 'ES',
85   - 'de': 'DE',
86   - 'fr': 'FR',
87   - 'it': 'IT',
88   - 'pt': 'PT',
89   - }
90   -
91   - def __init__(
92   - self,
93   - model: str = "qwen",
94   - api_key: Optional[str] = None,
95   - use_cache: bool = True,
96   - timeout: int = 10,
97   - glossary_id: Optional[str] = None,
98   - translation_context: Optional[str] = None
99   - ):
100   - """
101   - Initialize translator.
102   -
103   - Args:
104   - model: Translation model to use. Options: 'qwen' (default) or 'deepl'
105   - api_key: API key for the selected model (or None to use from config/env)
106   - use_cache: Whether to cache translations
107   - timeout: Request timeout in seconds
108   - glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL)
109   - translation_context: Context hint for translation (e.g., "e-commerce", "product search")
110   - """
111   - self.model = model.lower()
112   - if self.model not in ['qwen', 'deepl']:
113   - raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'")
114   -
115   - # Get API key from config if not provided
116   - if api_key is None:
117   - if self.model == 'qwen':
118   - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
119   - else: # deepl
120   - api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY")
121   -
122   - self.api_key = api_key
123   - self.timeout = timeout
124   - self.use_cache = use_cache
125   - self.glossary_id = glossary_id
126   - self.translation_context = translation_context or "e-commerce product search"
127   -
128   - # Initialize OpenAI client for Qwen if needed
129   - self.qwen_client = None
130   - if self.model == 'qwen':
131   - if not self.api_key:
132   - logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.")
133   - else:
134   - self.qwen_client = OpenAI(
135   - api_key=self.api_key,
136   - base_url=self.QWEN_BASE_URL,
137   - )
138   -
139   - # Initialize Redis cache if enabled
140   - if use_cache:
141   - try:
142   - self.redis_client = redis.Redis(
143   - host=REDIS_CONFIG.get('host', 'localhost'),
144   - port=REDIS_CONFIG.get('port', 6479),
145   - password=REDIS_CONFIG.get('password'),
146   - decode_responses=True, # Return str instead of bytes
147   - socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),
148   - socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),
149   - retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),
150   - health_check_interval=10, # 避免复用坏连接
151   - )
152   - # Test connection
153   - self.redis_client.ping()
154   - expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360)
155   - self.expire_time = timedelta(days=expire_days)
156   - self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数
157   - self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans')
158   - logger.info("Redis cache initialized for translations")
159   - except Exception as e:
160   - logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache")
161   - self.redis_client = None
162   - self.cache = None
163   - else:
164   - self.redis_client = None
165   - self.cache = None
166   -
167   - # Thread pool for async translation
168   - self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")
169   -
170   - def translate(
171   - self,
172   - text: str,
173   - target_lang: str,
174   - source_lang: Optional[str] = None,
175   - context: Optional[str] = None,
176   - prompt: Optional[str] = None
177   - ) -> Optional[str]:
178   - """
179   - Translate text to target language (synchronous mode).
180   -
181   - Args:
182   - text: Text to translate
183   - target_lang: Target language code ('zh', 'en', 'ru', etc.)
 184   -            source_lang: Source language code (optional, auto-detect if None)
185   - context: Additional context for translation (overrides default context)
186   - prompt: Translation prompt/instruction (optional, for better translation quality)
187   -
188   - Returns:
189   - Translated text or None if translation fails
190   - """
191   - if not text or not text.strip():
192   - return text
193   -
194   - # Normalize language codes
195   - target_lang = target_lang.lower()
196   - if source_lang:
197   - source_lang = source_lang.lower()
198   -
199   - # Optimization: Skip translation if not needed
200   - if target_lang == 'en' and self._is_english_text(text):
201   - logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'")
202   - return text
203   -
204   - if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
205   - logger.info(
206   - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
207   - f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)"
208   - )
209   - return text
210   -
211   - # Use provided context or default context
212   - translation_context = context or self.translation_context
213   -
214   - # Build cache key (include prompt in cache key if provided)
215   - cache_key_parts = [source_lang or 'auto', target_lang, translation_context]
216   - if prompt:
217   - cache_key_parts.append(prompt)
218   - cache_key_parts.append(text)
219   - cache_key = ':'.join(cache_key_parts)
220   -
221   - # Check cache (include context and prompt in cache key for accuracy)
222   - if self.use_cache and self.redis_client:
223   - cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt)
224   - if cached:
225   - logger.info(
226   - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
227   - f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit"
228   - )
229   - return cached
230   -
231   - # If no API key, return mock translation (for testing)
232   - if not self.api_key:
233   - logger.info(
234   - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
235   - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)"
236   - )
237   - return text
238   -
239   - # Translate using selected model
240   - logger.info(
241   - f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | "
242   - f"Source language: {source_lang or 'auto'} | Context: {translation_context} | "
243   - f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation"
244   - )
245   -
246   - if self.model == 'qwen':
247   - result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt)
248   - else: # deepl
249   - result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)
250   -
251   - # Surface translation failure to the caller instead of silently
252   - # masquerading the source text as a successful translation.
253   - if result is None:
254   - logger.warning(
255   - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
256   - f"Source language: {source_lang or 'auto'} | Status: Translation failed"
257   - )
258   - else:
259   - logger.info(
260   - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
261   - f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful"
262   - )
263   -
264   - # Cache only successful translations. Failed attempts must not poison
265   - # Redis with the original text.
266   - if result is not None and self.use_cache and self.redis_client:
267   - self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt)
268   -
269   - return result
270   -
271   - def _translate_qwen(
272   - self,
273   - text: str,
274   - target_lang: str,
275   - source_lang: Optional[str],
276   - context: Optional[str] = None,
277   - prompt: Optional[str] = None
278   - ) -> Optional[str]:
279   - """
280   - Translate using Qwen MT Flash model via Alibaba Cloud DashScope API.
281   -
282   - Args:
283   - text: Text to translate
284   - target_lang: Target language code ('zh', 'en', 'ru', etc.)
285   - source_lang: Source language code (optional, 'auto' if None)
286   - context: Context hint for translation (optional)
287   - prompt: Translation prompt/instruction (optional)
288   -
289   - Returns:
290   - Translated text or None if translation fails
291   - """
292   - if not self.qwen_client:
293   - logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.")
294   - return None
295   -
296   - # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping
297   - # 标准来自:你提供的“语言 / 英文名 / 代码”表
298   - qwen_lang_map = {
299   - "en": "English",
300   - "zh": "Chinese",
301   - "zh_tw": "Traditional Chinese",
302   - "ru": "Russian",
303   - "ja": "Japanese",
304   - "ko": "Korean",
305   - "es": "Spanish",
306   - "fr": "French",
307   - "pt": "Portuguese",
308   - "de": "German",
309   - "it": "Italian",
310   - "th": "Thai",
311   - "vi": "Vietnamese",
312   - "id": "Indonesian",
313   - "ms": "Malay",
314   - "ar": "Arabic",
315   - "hi": "Hindi",
316   - "he": "Hebrew",
317   - "my": "Burmese",
318   - "ta": "Tamil",
319   - "ur": "Urdu",
320   - "bn": "Bengali",
321   - "pl": "Polish",
322   - "nl": "Dutch",
323   - "ro": "Romanian",
324   - "tr": "Turkish",
325   - "km": "Khmer",
326   - "lo": "Lao",
327   - "yue": "Cantonese",
328   - "cs": "Czech",
329   - "el": "Greek",
330   - "sv": "Swedish",
331   - "hu": "Hungarian",
332   - "da": "Danish",
333   - "fi": "Finnish",
334   - "uk": "Ukrainian",
335   - "bg": "Bulgarian",
336   - }
337   -
338   - # Convert target language
339   - target_lang_normalized = target_lang.lower()
340   - target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize())
341   -
342   - # Convert source language
343   - source_lang_normalized = (source_lang or "").strip().lower()
344   - if not source_lang_normalized or source_lang_normalized == "auto":
345   - source_lang_qwen = "auto"
346   - else:
347   - source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize())
348   -
349   - # Prepare translation options
350   - translation_options = {
351   - "source_lang": source_lang_qwen,
352   - "target_lang": target_lang_qwen,
353   - }
354   -
355   - # Prepare messages
356   - messages = [
357   - {
358   - "role": "user",
359   - "content": text
360   - }
361   - ]
362   -
363   - start_time = time.time()
364   - try:
365   - completion = self.qwen_client.chat.completions.create(
366   - model=self.QWEN_MODEL,
367   - messages=messages,
368   - extra_body={
369   - "translation_options": translation_options
370   - }
371   - )
372   -
373   - translated_text = completion.choices[0].message.content.strip()
374   - duration_ms = (time.time() - start_time) * 1000
375   -
376   - logger.info(
377   - f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | "
378   - f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms"
379   - )
380   - return translated_text
381   -
382   - except Exception as e:
383   - duration_ms = (time.time() - start_time) * 1000
384   - logger.error(
385   - f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | "
386   - f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True
387   - )
388   - return None
389   -
390   - def _translate_deepl(
391   - self,
392   - text: str,
393   - target_lang: str,
394   - source_lang: Optional[str],
395   - context: Optional[str] = None,
396   - prompt: Optional[str] = None
397   - ) -> Optional[str]:
398   - """
399   - Translate using DeepL API with context and glossary support.
400   -
401   - Args:
402   - text: Text to translate
403   - target_lang: Target language code
404   - source_lang: Source language code (optional)
405   - context: Context hint for translation (e.g., "e-commerce product search")
406   - """
407   - # Map to DeepL language codes
408   - target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())
409   -
410   - headers = {
411   - "Authorization": f"DeepL-Auth-Key {self.api_key}",
412   - "Content-Type": "application/json",
413   - }
414   -
415   - # Use prompt as context parameter for DeepL API (not as text prefix)
416   - # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"
417   - # If prompt is provided, use it as context; otherwise use the default context
418   - api_context = prompt if prompt else context
419   -
420   - # For e-commerce, add context words to help DeepL understand the domain
421   - # This is especially important for single-word ambiguous terms like "车" (car vs rook)
422   - text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)
423   -
424   - payload = {
425   - "text": [text_to_translate],
426   - "target_lang": target_code,
427   - }
428   -
429   - if source_lang:
430   - source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
431   - payload["source_lang"] = source_code
432   -
433   - # Add context parameter (prompt or default context)
434   - # Context influences translation but is not translated itself
435   - if api_context:
436   - payload["context"] = api_context
437   -
438   - # Add glossary if configured
439   - if self.glossary_id:
440   - payload["glossary_id"] = self.glossary_id
441   -
442   - # Note: DeepL API v2 supports "context" parameter for additional context
443   - # that influences translation but is not translated itself.
444   - # We use prompt as context parameter when provided.
445   -
446   - try:
447   - response = requests.post(
448   - self.DEEPL_API_URL,
449   - headers=headers,
450   - json=payload,
451   - timeout=self.timeout
452   - )
453   -
454   - if response.status_code == 200:
455   - data = response.json()
456   - if "translations" in data and len(data["translations"]) > 0:
457   - translated_text = data["translations"][0]["text"]
458   - # If we added context, extract just the term from the result
459   - if needs_extraction:
460   - translated_text = self._extract_term_from_translation(
461   - translated_text, text, target_code
462   - )
463   - logger.debug(
464   - f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "
465   - f"Translation result: '{translated_text}'"
466   - )
467   - return translated_text
468   - else:
469   - logger.error(
470   - f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "
471   - f"Status code: {response.status_code} | Error message: {response.text}"
472   - )
473   - return None
474   -
475   - except requests.Timeout:
476   - logger.warning(
477   - f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "
478   - f"Timeout: {self.timeout}s"
479   - )
480   - return None
481   - except Exception as e:
482   - logger.error(
483   - f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "
484   - f"Error: {e}", exc_info=True
485   - )
486   - return None
487   -
488   - # NOTE: _translate_deepl_free is intentionally not implemented.
489   - # We do not support automatic fallback to the free endpoint, to avoid
490   - # mixing Pro keys with https://api-free.deepl.com and related 403 errors.
491   -
492   - def translate_multi(
493   - self,
494   - text: str,
495   - target_langs: List[str],
496   - source_lang: Optional[str] = None,
497   - context: Optional[str] = None,
498   - async_mode: bool = True,
499   - prompt: Optional[str] = None
500   - ) -> Dict[str, Optional[str]]:
501   - """
502   - Translate text to multiple target languages.
503   -
504   - In async_mode=True (default):
505   - - Returns cached translations immediately if available
506   - - For translations that can be optimized (e.g., pure numbers, already in target language),
507   - returns result immediately via synchronous call
508   - - Launches async tasks for other missing translations (non-blocking)
509   - - Returns None for missing translations that require async processing
510   -
511   - In async_mode=False:
512   - - Waits for all translations to complete (blocking)
513   -
514   - Args:
515   - text: Text to translate
516   - target_langs: List of target language codes
517   - source_lang: Source language code (optional)
518   - context: Context hint for translation (optional)
519   - async_mode: If True, return cached results immediately and translate missing ones async
520   - prompt: Translation prompt/instruction (optional)
521   -
522   - Returns:
523   - Dictionary mapping language code to translated text (only cached results in async mode)
524   - """
525   - results = {}
526   - missing_langs = []
527   - async_langs = []
528   -
529   - # First, get cached translations
530   - for lang in target_langs:
531   - cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
532   - if cached is not None:
533   - results[lang] = cached
534   - else:
535   - missing_langs.append(lang)
536   -
537   - # If async mode and there are missing translations
538   - if async_mode and missing_langs:
539   - # Check if translation can be optimized (immediate return)
540   - for lang in missing_langs:
541   - target_lang = lang.lower()
542   - # Check optimization conditions (same as in translate method)
543   - can_optimize = False
544   - if target_lang == 'en' and self._is_english_text(text):
545   - can_optimize = True
546   - elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
547   - can_optimize = True
548   -
549   - if can_optimize:
550   - # Can be optimized, call translate synchronously for immediate result
551   - results[lang] = self.translate(text, lang, source_lang, context, prompt)
552   - else:
553   - # Requires actual translation, add to async list
554   - async_langs.append(lang)
555   -
556   - # Launch async tasks for translations that require actual API calls
557   - if async_langs:
558   - for lang in async_langs:
559   - self._translate_async(text, lang, source_lang, context, prompt)
560   - # Return None for async translations
561   - for lang in async_langs:
562   - results[lang] = None
563   - else:
564   - # Synchronous mode: wait for all translations
565   - for lang in missing_langs:
566   - results[lang] = self.translate(text, lang, source_lang, context, prompt)
567   -
568   - return results
569   -
570   - def translate_multi_async(
571   - self,
572   - text: str,
573   - target_langs: List[str],
574   - source_lang: Optional[str] = None,
575   - context: Optional[str] = None,
576   - prompt: Optional[str] = None
577   - ) -> Dict[str, Union[str, Future]]:
578   - """
579   - Translate text to multiple target languages asynchronously, returning Futures that can be awaited.
580   -
581   - This method returns a dictionary where:
582   - - If translation is cached, the value is the translation string (immediate)
583   - - If translation needs to be done, the value is a Future object that can be awaited
584   -
585   - Args:
586   - text: Text to translate
587   - target_langs: List of target language codes
588   - source_lang: Source language code (optional)
589   - context: Context hint for translation (optional)
590   - prompt: Translation prompt/instruction (optional)
591   -
592   - Returns:
593   - Dictionary mapping language code to either translation string (cached) or Future object
594   - """
595   - results = {}
596   - missing_langs = []
597   -
598   - # First, get cached translations
599   - for lang in target_langs:
600   - cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
601   - if cached is not None:
602   - results[lang] = cached
603   - else:
604   - missing_langs.append(lang)
605   -
606   - # For missing translations, submit async tasks and return Futures
607   - for lang in missing_langs:
608   - future = self.executor.submit(
609   - self.translate,
610   - text,
611   - lang,
612   - source_lang,
613   - context,
614   - prompt
615   - )
616   - results[lang] = future
617   -
618   - return results
619   -
620   - def _get_cached_translation(
621   - self,
622   - text: str,
623   - target_lang: str,
624   - source_lang: Optional[str] = None,
625   - context: Optional[str] = None,
626   - prompt: Optional[str] = None
627   - ) -> Optional[str]:
628   - """Get translation from cache if available."""
629   - if not self.redis_client:
630   - return None
631   - return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
632   -
633   - def _get_cached_translation_redis(
634   - self,
635   - text: str,
636   - target_lang: str,
637   - source_lang: Optional[str] = None,
638   - context: Optional[str] = None,
639   - prompt: Optional[str] = None
640   - ) -> Optional[str]:
641   - """
642   - Get translation from Redis cache with sliding expiration.
643   -
644   - 滑动过期机制:每次访问缓存时,重置过期时间为配置的过期时间(默认720天)。
645   - 这样缓存会在最后一次访问后的720天才过期,而不是写入后的720天。
646   - 这确保了常用的翻译缓存不会被过早删除。
647   - """
648   - if not self.redis_client:
649   - return None
650   -
651   - try:
652   - # Build cache key: prefix:target_lang:text
653   - # For simplicity, we use target_lang and text as key
654   - # Context and prompt are not included in key to maximize cache hits
655   - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
656   - value = self.redis_client.get(cache_key)
657   - if value:
658   - # Sliding expiration: reset expiration time on access
659   - # 每次读取缓存时,重置过期时间为配置的过期时间(最后一次访问后的N天才过期)
660   - try:
661   - self.redis_client.expire(cache_key, self.expire_seconds)
662   - except Exception as expire_error:
663   - # 即使 expire 失败,也返回缓存值(不影响功能)
664   - logger.warning(
665   - f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}"
666   - )
667   -
668   - logger.debug(
669   - f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "
670   - f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s"
671   - )
672   - return value
673   - logger.debug(
674   - f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "
675   - f"Cache key: {cache_key}"
676   - )
677   - return None
678   - except Exception as e:
679   - logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}")
680   - return None
681   -
682   - def _set_cached_translation_redis(
683   - self,
684   - text: str,
685   - target_lang: str,
686   - translation: str,
687   - source_lang: Optional[str] = None,
688   - context: Optional[str] = None,
689   - prompt: Optional[str] = None
690   - ) -> None:
691   - """Store translation in Redis cache."""
692   - if not self.redis_client:
693   - return
694   -
695   - try:
696   - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
697   - self.redis_client.setex(cache_key, self.expire_seconds, translation)
698   - logger.info(
699   - f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | "
700   - f"Cache key: {cache_key} | Translation result: '{translation}'"
701   - )
702   - except Exception as e:
703   - logger.error(
704   - f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | "
705   - f"Error: {e}"
706   - )
707   -
708   - def _translate_async(
709   - self,
710   - text: str,
711   - target_lang: str,
712   - source_lang: Optional[str] = None,
713   - context: Optional[str] = None,
714   - prompt: Optional[str] = None
715   - ):
716   - """Launch async translation task."""
717   - def _do_translate():
718   - try:
719   - result = self.translate(text, target_lang, source_lang, context, prompt)
720   - if result:
721   - logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")
722   - except Exception as e:
723   - logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}")
724   -
725   - self.executor.submit(_do_translate)
726   -
727   - def _add_ecommerce_context(
728   - self,
729   - text: str,
730   - source_lang: Optional[str],
731   - context: Optional[str]
732   - ) -> tuple:
733   - """
734   - Add e-commerce context to text for better disambiguation.
735   -
736   - For single-word ambiguous Chinese terms, we add context words that help
737   - DeepL understand this is an e-commerce/product search context.
738   -
739   - Args:
740   - text: Original text to translate
741   - source_lang: Source language code
742   - context: Context hint
743   -
744   - Returns:
745   - Tuple of (text_with_context, needs_extraction)
746   - - text_with_context: Text to send to DeepL
747   - - needs_extraction: Whether we need to extract the term from the result
748   - """
749   - # Only apply for e-commerce context and Chinese source
750   - if not context or "e-commerce" not in context.lower():
751   - return text, False
752   -
753   - if not source_lang or source_lang.lower() != 'zh':
754   - return text, False
755   -
756   - # For single-word queries, add context to help disambiguation
757   - text_stripped = text.strip()
758   - if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:
759   - # Common ambiguous Chinese e-commerce terms like "车" (car vs rook)
760   - # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term])
761   - # This helps DeepL understand the e-commerce context
762   - # We'll need to extract just the term from the translation result
763   - context_phrase = f"购买 {text_stripped}"
764   - return context_phrase, True
765   -
766   - # For multi-word queries, DeepL usually has enough context
767   - return text, False
768   -
769   - def _extract_term_from_translation(
770   - self,
771   - translated_text: str,
772   - original_text: str,
773   - target_lang_code: str
774   - ) -> str:
775   - """
776   - Extract the actual term from a translation that included context.
777   -
778   - For example, if we translated "购买 车" (buy car) and got "buy car",
779   - we want to extract just "car".
780   -
781   - Args:
782   - translated_text: Full translation result
783   - original_text: Original single-word query
784   - target_lang_code: Target language code (EN, ZH, etc.)
785   -
786   - Returns:
787   - Extracted term or original translation if extraction fails
788   - """
789   - # For English target, try to extract the last word (the actual term)
790   - if target_lang_code == "EN":
791   - words = translated_text.strip().split()
792   - if len(words) > 1:
793   - # Usually the last word is the term we want
794   - # But we need to be smart - if it's "buy car", we want "car"
795   - # Common context words to skip: buy, purchase, product, item, etc.
796   - context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
797   - # Try to find the term (not a context word)
798   - for word in reversed(words):
799   - word_lower = word.lower().rstrip('.,!?;:')
800   - if word_lower not in context_words:
801   - return word_lower
802   - # If all words are context words, return the last one
803   - return words[-1].lower().rstrip('.,!?;:')
804   -
805   - # For other languages or if extraction fails, return as-is
806   - # The user can configure a glossary for better results
807   - return translated_text
808   -
809   - def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool:
810   - """True if shop language matches index language (use source, no translate)."""
811   - if not shop_lang_lower or not lang_code:
812   - return False
813   - if shop_lang_lower == lang_code:
814   - return True
815   - if lang_code == "zh" and "zh" in shop_lang_lower:
816   - return True
817   - if lang_code == "en" and "en" in shop_lang_lower:
818   - return True
819   - return False
820   -
821   - def translate_for_indexing(
822   - self,
823   - text: str,
824   - shop_language: str,
825   - source_lang: Optional[str] = None,
826   - context: Optional[str] = None,
827   - prompt: Optional[str] = None,
828   - index_languages: Optional[List[str]] = None,
829   - ) -> Dict[str, Optional[str]]:
830   - """
831   - Translate text for indexing based on shop language and tenant index_languages.
832   -
833   - For each language in index_languages: use source text if shop language matches,
834   - otherwise translate to that language.
835   -
836   - Args:
837   - text: Text to translate
838   - shop_language: Shop primary language (e.g. 'zh', 'en', 'ru')
839   - source_lang: Source language code (optional)
840   - context: Additional context for translation (optional)
841   - prompt: Translation prompt (optional)
842   - index_languages: Languages to index (from tenant_config). Default ["en", "zh"].
843   -
844   - Returns:
845   - Dict keyed by each index_language with translated or source text (or None).
846   - """
847   - langs = index_languages if index_languages else ["en", "zh"]
848   - results = {lang: None for lang in langs}
849   - if not text or not text.strip():
850   - return results
851   - if re.match(r'^[\d\s_-]+$', text):
852   - logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
853   - return results
854   -
855   - shop_lang_lower = (shop_language or "").strip().lower()
856   - targets = []
857   - for lang in langs:
858   - if self._shop_lang_matches(shop_lang_lower, lang):
859   - results[lang] = text
860   - else:
861   - targets.append(lang)
862   -
863   - for target_lang in targets:
864   - cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
865   - if cached:
866   - results[target_lang] = cached
867   - logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}")
868   - continue
869   - translated = self.translate(
870   - text,
871   - target_lang=target_lang,
872   - source_lang=source_lang or shop_language,
873   - context=context,
874   - prompt=prompt,
875   - )
876   - results[target_lang] = translated
877   - return results
878   -
879   - def get_translation_needs(
880   - self,
881   - detected_lang: str,
882   - supported_langs: List[str]
883   - ) -> List[str]:
884   - """
885   - Determine which languages need translation.
886   -
887   - Args:
888   - detected_lang: Detected query language
889   - supported_langs: List of supported languages
890   -
891   - Returns:
892   - List of language codes to translate to
893   - """
894   - # If detected language is in supported list, translate to others
895   - if detected_lang in supported_langs:
896   - return [lang for lang in supported_langs if detected_lang != lang]
897   -
898   - # Otherwise, translate to all supported languages
899   - return supported_langs
900   -
901   - def _is_english_text(self, text: str) -> bool:
902   - """
903   - Check if text is primarily English (ASCII letters, numbers, common punctuation).
904   -
905   - Args:
906   - text: Text to check
907   -
908   - Returns:
909   - True if text appears to be English
910   - """
911   - if not text or not text.strip():
912   - return True
913   -
914   - # Remove whitespace and common punctuation
915   - text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text)
916   - if not text_clean:
917   - return True
918   -
919   - # Check if all remaining characters are ASCII (letters, numbers)
920   - # This is a simple heuristic: if most characters are ASCII, it's likely English
921   - ascii_count = sum(1 for c in text_clean if ord(c) < 128)
922   - ratio = ascii_count / len(text_clean) if text_clean else 0
923   -
924   - # If more than 80% are ASCII characters, consider it English
925   - return ratio > 0.8
926   -
927   - def _contains_chinese(self, text: str) -> bool:
928   - """
929   - Check if text contains Chinese characters (Han characters).
930   -
931   - Args:
932   - text: Text to check
933   -
934   - Returns:
935   - True if text contains Chinese characters
936   - """
937   - if not text:
938   - return False
939   -
940   - # Check for Chinese characters (Unicode range: \u4e00-\u9fff)
941   - chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
942   - return bool(chinese_pattern.search(text))
943   -
944   - def _is_pure_number(self, text: str) -> bool:
945   - """
946   - Check if text is purely numeric (digits, possibly with spaces, dots, commas).
947   -
948   - Args:
949   - text: Text to check
950   -
951   - Returns:
952   - True if text is purely numeric
953   - """
954   - if not text or not text.strip():
955   - return False
956   -
957   - # Remove whitespace, dots, commas (common number separators)
958   - text_clean = re.sub(r'[\s\.,]', '', text.strip())
959   - if not text_clean:
960   - return False
961   -
962   - # Check if all remaining characters are digits
963   - return text_clean.isdigit()
services.translation.providers.llm 0 → 100644
tests/test_embedding_pipeline.py
... ... @@ -77,12 +77,10 @@ def _build_test_config() -> SearchConfig:
77 77 enable_text_embedding=True,
78 78 enable_query_rewrite=False,
79 79 rewrite_dictionary={},
80   - translation_prompts={"query_zh": "e-commerce domain", "query_en": "e-commerce domain"},
81 80 text_embedding_field="title_embedding",
82 81 image_embedding_field=None,
83 82 ),
84 83 function_score=FunctionScoreConfig(),
85   - function_score=FunctionScoreConfig(),
86 84 rerank=RerankConfig(),
87 85 spu_config=SPUConfig(enabled=True, spu_field="spu_id", inner_hits_size=3),
88 86 es_index_name="test_products",
... ...