Commit a0a173ae904212171b324f0976f034f6528ff749

Authored by tangwang
1 parent 985752f5

last

api/translator_app.py
... ... @@ -97,7 +97,7 @@ from pydantic import BaseModel, Field
97 97 # Add parent directory to path
98 98 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
99 99  
100   -from query.translator import Translator
  100 +from query.qwen_mt_translate import Translator
101 101 from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG
102 102  
103 103 # Configure logging
... ...
config/config.yaml
... ... @@ -119,7 +119,7 @@ rerank:
119 119 # 可扩展服务/provider 注册表(单一配置源)
120 120 services:
121 121 translation:
122   - provider: "direct" # direct | http | google(reserved)
  122 + provider: "llm" # direct | http | google(reserved); NOTE(review): "llm" is not handled by create_translation_provider (raises ValueError) — only providers.llm config is read by llm_translate; verify before deploy
123 123 base_url: "http://127.0.0.1:6006"
124 124 model: "qwen"
125 125 timeout_sec: 10.0
... ... @@ -130,6 +130,12 @@ services:
130 130 base_url: "http://127.0.0.1:6006"
131 131 model: "qwen"
132 132 timeout_sec: 10.0
  133 + llm:
  134 + model: "qwen-flash"
  135 + # 可选:覆盖 DashScope 兼容模式的 Endpoint 与超时
  136 + # base_url 留空则使用 DASHSCOPE_BASE_URL 或默认地域
  137 + base_url: ""
  138 + timeout_sec: 30.0
133 139 google:
134 140 enabled: false
135 141 project_id: ""
... ...
docs/系统设计文档.md
... ... @@ -384,6 +384,15 @@ query_config:
384 384  
385 385 实际代码中,通过通用的 translation provider 抽象来选择具体后端和模型,文档不固定绑定某一个具体翻译服务或模型名称,以保持可配置性。
386 386  
  387 +此外,为了支持**高质量、提示词可控的 LLM 翻译**(例如商品富化脚本、离线分析工具),在 `query/llm_translate.py` 中提供了一个独立的 LLM 翻译辅助模块:
  388 +
  389 +- **配置入口**:`config/config.yaml -> services.translation.providers.llm`,用于指定:
  390 + - `model`: 例如 `qwen-flash`(DashScope 兼容模式的对话模型)
  391 + - `base_url`: 可选;为空时使用环境变量 `DASHSCOPE_BASE_URL` 或默认 Endpoint
  392 + - `timeout_sec`: LLM 调用超时
  393 +- **环境变量**:仍通过 `DASHSCOPE_API_KEY` 注入 DashScope API Key。
  394 +- **使用方式**:主查询路径继续使用 machine translation(`query.qwen_mt_translate.Translator`),只在需要更强表达控制的场景(如批量标注、产品分类脚本)中显式调用 `llm_translate()`。
  395 +
387 396 #### 功能特性
388 397 1. **语言检测**:自动检测查询语言
389 398 2. **智能翻译**:
... ...
indexer/document_transformer.py
... ... @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__)
20 20  
21 21 # Try to import translator (optional dependency)
22 22 try:
23   - from query.translator import Translator
  23 + from query.qwen_mt_translate import Translator
24 24 TRANSLATOR_AVAILABLE = True
25 25 except ImportError:
26 26 TRANSLATOR_AVAILABLE = False
... ...
indexer/test_indexing.py
... ... @@ -273,7 +273,7 @@ def test_document_transformer():
273 273 tenant_config = tenant_config_loader.get_tenant_config('162')
274 274  
275 275 # 初始化翻译器(测试环境总是启用,具体翻译方向由tenant_config控制)
276   - from query.translator import Translator
  276 + from query.qwen_mt_translate import Translator
277 277 translator = Translator(
278 278 api_key=config.query_config.translation_api_key,
279 279 use_cache=True
... ...
providers/translation.py
... ... @@ -0,0 +1,169 @@
  1 +"""
  2 +Translation provider - direct (in-process) or HTTP service.
  3 +"""
  4 +from __future__ import annotations
  5 +
  6 +import logging
  7 +from typing import Any, Dict, List, Optional, Union
  8 +
  9 +from concurrent.futures import Future, ThreadPoolExecutor
  10 +import requests
  11 +
  12 +from config.services_config import get_translation_config, get_translation_base_url
  13 +
  14 +logger = logging.getLogger(__name__)
  15 +
  16 +
  17 +class HttpTranslationProvider:
  18 + """Translation via HTTP service."""
  19 +
  20 + def __init__(
  21 + self,
  22 + base_url: str,
  23 + model: str = "qwen",
  24 + timeout_sec: float = 10.0,
  25 + translation_context: Optional[str] = None,
  26 + ):
  27 + self.base_url = (base_url or "").rstrip("/")
  28 + self.model = model or "qwen"
  29 + self.timeout_sec = float(timeout_sec or 10.0)
  30 + self.translation_context = translation_context or "e-commerce product search"
  31 + self.executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="http-translator")
  32 +
  33 + def _translate_once(
  34 + self,
  35 + text: str,
  36 + target_lang: str,
  37 + source_lang: Optional[str] = None,
  38 + ) -> Optional[str]:
  39 + if not text or not str(text).strip():
  40 + return text
  41 + try:
  42 + url = f"{self.base_url}/translate"
  43 + payload = {
  44 + "text": text,
  45 + "target_lang": target_lang,
  46 + "source_lang": source_lang or "auto",
  47 + "model": self.model,
  48 + }
  49 + response = requests.post(url, json=payload, timeout=self.timeout_sec)
  50 + if response.status_code != 200:
  51 + logger.warning(
  52 + "HTTP translator failed: status=%s body=%s",
  53 + response.status_code,
  54 + (response.text or "")[:200],
  55 + )
  56 + return None
  57 + data = response.json()
  58 + translated = data.get("translated_text")
  59 + return translated if translated is not None else None
  60 + except Exception as exc:
  61 + logger.warning("HTTP translator request failed: %s", exc, exc_info=True)
  62 + return None
  63 +
  64 + def translate(
  65 + self,
  66 + text: str,
  67 + target_lang: str,
  68 + source_lang: Optional[str] = None,
  69 + context: Optional[str] = None,
  70 + prompt: Optional[str] = None,
  71 + ) -> Optional[str]:
  72 + del context, prompt
  73 + result = self._translate_once(text=text, target_lang=target_lang, source_lang=source_lang)
  74 + return result if result is not None else text
  75 +
  76 + def translate_multi(
  77 + self,
  78 + text: str,
  79 + target_langs: List[str],
  80 + source_lang: Optional[str] = None,
  81 + context: Optional[str] = None,
  82 + async_mode: bool = True,
  83 + prompt: Optional[str] = None,
  84 + ) -> Dict[str, Optional[str]]:
  85 + del context, async_mode, prompt
  86 + out: Dict[str, Optional[str]] = {}
  87 + for lang in target_langs:
  88 + out[lang] = self.translate(text, lang, source_lang=source_lang)
  89 + return out
  90 +
  91 + def translate_multi_async(
  92 + self,
  93 + text: str,
  94 + target_langs: List[str],
  95 + source_lang: Optional[str] = None,
  96 + context: Optional[str] = None,
  97 + prompt: Optional[str] = None,
  98 + ) -> Dict[str, Union[str, Future]]:
  99 + del context, prompt
  100 + out: Dict[str, Union[str, Future]] = {}
  101 + for lang in target_langs:
  102 + out[lang] = self.executor.submit(self.translate, text, lang, source_lang)
  103 + return out
  104 +
  105 + def translate_for_indexing(
  106 + self,
  107 + text: str,
  108 + shop_language: str,
  109 + source_lang: Optional[str] = None,
  110 + context: Optional[str] = None,
  111 + prompt: Optional[str] = None,
  112 + index_languages: Optional[List[str]] = None,
  113 + ) -> Dict[str, Optional[str]]:
  114 + del context, prompt
  115 + langs = index_languages if index_languages else ["en", "zh"]
  116 + source = source_lang or shop_language or "auto"
  117 + out: Dict[str, Optional[str]] = {}
  118 + for lang in langs:
  119 + if lang == shop_language:
  120 + out[lang] = text
  121 + else:
  122 + out[lang] = self.translate(text, target_lang=lang, source_lang=source)
  123 + return out
  124 +
  125 +
  126 +def create_translation_provider(query_config: Any = None) -> Any:
  127 + """
  128 + Create translation provider from services config.
  129 +
  130 + query_config: optional, for api_key/glossary_id/context (used by direct provider).
  131 + """
  132 + cfg = get_translation_config()
  133 + provider = cfg.provider
  134 + pc = cfg.get_provider_cfg()
  135 +
  136 + if provider in ("direct", "local", "inprocess"):
  137 + from query.qwen_mt_translate import Translator
  138 + model = pc.get("model") or "qwen"
  139 + qc = query_config or _empty_query_config()
  140 + return Translator(
  141 + model=model,
  142 + api_key=getattr(qc, "translation_api_key", None),
  143 + use_cache=True,
  144 + glossary_id=getattr(qc, "translation_glossary_id", None),
  145 + translation_context=getattr(qc, "translation_context", "e-commerce product search"),
  146 + )
  147 +
  148 + if provider in ("http", "service"):
  149 + base_url = get_translation_base_url()
  150 + model = pc.get("model") or "qwen"
  151 + timeout = pc.get("timeout_sec", 10.0)
  152 + qc = query_config or _empty_query_config()
  153 + return HttpTranslationProvider(
  154 + base_url=base_url,
  155 + model=model,
  156 + timeout_sec=float(timeout),
  157 + translation_context=getattr(qc, "translation_context", "e-commerce product search"),
  158 + )
  159 +
  160 + raise ValueError(f"Unsupported translation provider: {provider}")
  161 +
  162 +
  163 +def _empty_query_config() -> Any:
  164 + """Minimal object with default translation attrs."""
  165 + class _QC:
  166 + translation_api_key = None
  167 + translation_glossary_id = None
  168 + translation_context = "e-commerce product search"
  169 + return _QC()
... ...
query/__init__.py
1 1 """Query package initialization."""
2 2  
3 3 from .language_detector import LanguageDetector
4   -from .translator import Translator
  4 +from .qwen_mt_translate import Translator
5 5 from .query_rewriter import QueryRewriter, QueryNormalizer
6 6 from .query_parser import QueryParser, ParsedQuery
7 7  
... ...
query/llm_translate.py
... ... @@ -0,0 +1,238 @@
  1 +"""
  2 +LLM-based translation helper using Qwen chat model.
  3 +
  4 +This module provides a thin wrapper around DashScope's `qwen-flash` model
  5 +for high-quality, prompt-controlled translation, independent of the main
  6 +`Translator` (machine translation) pipeline.
  7 +
  8 +Usage example:
  9 +
  10 + from query.llm_translate import llm_translate
  11 +
  12 + result = llm_translate(
  13 + text="我看到这个视频后没有笑",
  14 + target_lang="en",
  15 + source_lang="zh",
  16 + source_lang_label="中文",
  17 + target_lang_label="英文",
  18 + )
  19 +"""
  20 +
  21 +from __future__ import annotations
  22 +
  23 +import logging
  24 +import os
  25 +import time
  26 +from typing import Dict, Optional
  27 +
  28 +from openai import OpenAI
  29 +
  30 +from config.env_config import DASHSCOPE_API_KEY
  31 +from config.services_config import get_translation_config
  32 +
  33 +logger = logging.getLogger(__name__)
  34 +
  35 +
  36 +# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1
  37 +# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1
  38 +# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1
  39 +#
  40 +# 默认保持与现有翻译/索引脚本相同的美国地域,可通过环境变量覆盖:
  41 +# DASHSCOPE_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
  42 +DEFAULT_QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
  43 +QWEN_MODEL_NAME = "qwen-flash"
  44 +
  45 +
  46 +# 由调用方提供的语言标签/代码填充,占位符说明:
  47 +# - source_lang: 源语言的人类可读名称(按目标语言本地化,例如 "中文", "English")
  48 +# - target_lang: 目标语言的人类可读名称
  49 +# - src_lang_code: 源语言代码,例如 "zh"
  50 +# - tgt_lang_code: 目标语言代码,例如 "en"
  51 +TRANSLATION_PROMPTS: Dict[str, str] = {
  52 + "zh": """你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译员。你的目标是在遵循 {target_lang} 的语法、词汇和文化习惯的前提下,准确传达原始 {source_lang} 文本的含义和细微差别。请只输出 {target_lang} 的翻译内容,不要包含任何额外的解释或评论。请将以下 {source_lang} 文本翻译成 {target_lang}:
  53 +
  54 +{text}""",
  55 + "en": """You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Your goal is to accurately convey the meaning and nuances of the original {source_lang} text while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities. Produce only the {target_lang} translation, without any additional explanations or commentary. Please translate the following {source_lang} text into {target_lang}:
  56 +
  57 +{text}""",
  58 + "ru": """Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Ваша задача — точно передать смысл и нюансы исходного текста на {source_lang}, соблюдая грамматику, лексику и культурные особенности {target_lang}. Выводите только перевод на {target_lang}, без каких-либо дополнительных объяснений или комментариев. Пожалуйста, переведите следующий текст с {source_lang} на {target_lang}:
  59 +
  60 +{text}""",
  61 + "ar": """أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). هدفك هو نقل المعنى والدلالات الدقيقة للنص الأصلي بلغة {source_lang} بدقة، مع الالتزام بقواعد اللغة والمفردات والحساسيات الثقافية الخاصة بلغة {target_lang}. قم بإنتاج الترجمة إلى {target_lang} فقط دون أي شروحات أو تعليقات إضافية. يرجى ترجمة النص التالي من {source_lang} إلى {target_lang}:
  62 +
  63 +{text}""",
  64 + "ja": """あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロの翻訳者です。{target_lang} の文法、語彙、文化的配慮に従いながら、元の {source_lang} テキストの意味やニュアンスを正確に伝えることが目的です。追加の説明やコメントは一切含めず、{target_lang} の翻訳のみを出力してください。次の {source_lang} テキストを {target_lang} に翻訳してください:
  65 +
  66 +{text}""",
  67 + "es": """Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Tu objetivo es transmitir con precisión el significado y los matices del texto original en {source_lang}, respetando la gramática, el vocabulario y las sensibilidades culturales de {target_lang}. Produce únicamente la traducción en {target_lang}, sin explicaciones ni comentarios adicionales. Por favor, traduce el siguiente texto de {source_lang} a {target_lang}:
  68 +
  69 +{text}""",
  70 + "de": """Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Dein Ziel ist es, die Bedeutung und Nuancen des ursprünglichen {source_lang}-Textes genau zu vermitteln und dabei die Grammatik, den Wortschatz und die kulturellen Besonderheiten von {target_lang} zu berücksichtigen. Gib ausschließlich die Übersetzung in {target_lang} aus, ohne zusätzliche Erklärungen oder Kommentare. Bitte übersetze den folgenden {source_lang}-Text in {target_lang}:
  71 +
  72 +{text}""",
  73 + "fr": """Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Votre objectif est de transmettre fidèlement le sens et les nuances du texte original en {source_lang}, tout en respectant la grammaire, le vocabulaire et les sensibilités culturelles de {target_lang}. Produisez uniquement la traduction en {target_lang}, sans explications ni commentaires supplémentaires. Veuillez traduire le texte suivant de {source_lang} vers {target_lang} :
  74 +
  75 +{text}""",
  76 + "it": """Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Il tuo obiettivo è trasmettere con precisione il significato e le sfumature del testo originale in {source_lang}, rispettando la grammatica, il vocabolario e le sensibilità culturali di {target_lang}. Produci solo la traduzione in {target_lang}, senza spiegazioni o commenti aggiuntivi. Per favore traduci il seguente testo da {source_lang} a {target_lang}:
  77 +
  78 +{text}""",
  79 + "pt": """Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Seu objetivo é transmitir com precisão o significado e as nuances do texto original em {source_lang}, respeitando a gramática, o vocabulário e as sensibilidades culturais de {target_lang}. Produza apenas a tradução em {target_lang}, sem quaisquer explicações ou comentários adicionais. Por favor, traduza o seguinte texto de {source_lang} para {target_lang}:
  80 +
  81 +{text}""",
  82 +}
  83 +
  84 +
  85 +def _get_qwen_client(base_url: Optional[str] = None) -> Optional[OpenAI]:
  86 + """
  87 + Lazily construct an OpenAI-compatible client for DashScope.
  88 +
  89 + Uses DASHSCOPE_API_KEY and base_url (provider config / env) to configure endpoint.
  90 + """
  91 + api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
  92 + if not api_key:
  93 + logger.warning("DASHSCOPE_API_KEY not set; llm-based translation will be disabled")
  94 + return None
  95 +
  96 + # 优先使用显式传入的 base_url,其次环境变量,最后默认地域。
  97 + base_url = (
  98 + (base_url or "").strip()
  99 + or os.getenv("DASHSCOPE_BASE_URL")
  100 + or DEFAULT_QWEN_BASE_URL
  101 + )
  102 +
  103 + try:
  104 + client = OpenAI(api_key=api_key, base_url=base_url)
  105 + return client
  106 + except Exception as exc:
  107 + logger.error("Failed to initialize DashScope OpenAI client: %s", exc, exc_info=True)
  108 + return None
  109 +
  110 +
  111 +def _build_prompt(
  112 + text: str,
  113 + target_lang: str,
  114 + source_lang_label: str,
  115 + target_lang_label: str,
  116 + src_lang_code: str,
  117 + tgt_lang_code: str,
  118 +) -> str:
  119 + """
  120 + Build translation prompt for given target language, defaulting to English template.
  121 + """
  122 + key = (target_lang or "").lower()
  123 + template = TRANSLATION_PROMPTS.get(key) or TRANSLATION_PROMPTS["en"]
  124 + return template.format(
  125 + source_lang=source_lang_label,
  126 + target_lang=target_lang_label,
  127 + src_lang_code=src_lang_code,
  128 + tgt_lang_code=tgt_lang_code,
  129 + text=text,
  130 + )
  131 +
  132 +
  133 +def llm_translate(
  134 + text: str,
  135 + target_lang: str,
  136 + *,
  137 + source_lang: Optional[str] = None,
  138 + source_lang_label: Optional[str] = None,
  139 + target_lang_label: Optional[str] = None,
  140 + timeout_sec: Optional[float] = None,
  141 +) -> Optional[str]:
  142 + """
  143 + Translate text with Qwen chat model using rich prompts.
  144 +
  145 + - 根据目标语言选择提示词,如果没匹配到则退回英文模板。
  146 + - 不对 text 做语言检测或缓存,调用方自行控制。
  147 +
  148 + Args:
  149 + text: 原始文本
  150 + target_lang: 目标语言代码(如 "zh", "en")
  151 + source_lang: 源语言代码(可选,不影响提示词选择,仅用于日志)
  152 + source_lang_label: 源语言展示名称,用于 prompt(默认使用 source_lang)
  153 + target_lang_label: 目标语言展示名称,用于 prompt(默认使用 target_lang)
  154 + timeout_sec: 请求超时时间(秒,可选;若未配置则从 config 读取或采用默认)
  155 +
  156 + Returns:
  157 + 翻译后的文本;如失败则返回 None。
  158 + """
  159 + if not text or not str(text).strip():
  160 + return text
  161 +
  162 + cfg = get_translation_config()
  163 + provider_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {}
  164 +
  165 + model_name = provider_cfg.get("model") or QWEN_MODEL_NAME
  166 + req_timeout = float(timeout_sec or provider_cfg.get("timeout_sec") or 30.0)
  167 + base_url = (provider_cfg.get("base_url") or "").strip() or None
  168 +
  169 + client = _get_qwen_client(base_url=base_url)
  170 + if not client:
  171 + # 无法调用云端,直接回退
  172 + logger.warning(
  173 + "[llm_translate] Client init failed; returning original text. "
  174 + "text=%r target_lang=%s source_lang=%s",
  175 + text[:80],
  176 + target_lang,
  177 + source_lang or "auto",
  178 + )
  179 + return text
  180 +
  181 + tgt = (target_lang or "").lower() or "en"
  182 + src = (source_lang or "auto").lower()
  183 + src_label = source_lang_label or src
  184 + tgt_label = target_lang_label or tgt
  185 +
  186 + prompt = _build_prompt(
  187 + text=text,
  188 + target_lang=tgt,
  189 + source_lang_label=src_label,
  190 + target_lang_label=tgt_label,
  191 + src_lang_code=src,
  192 + tgt_lang_code=tgt,
  193 + )
  194 +
  195 + start = time.time()
  196 + try:
  197 + completion = client.chat.completions.create(
  198 + model=model_name,
  199 + messages=[
  200 + {
  201 + "role": "user",
  202 + "content": prompt,
  203 + }
  204 + ],
  205 + timeout=req_timeout,
  206 + )
  207 + content = (completion.choices[0].message.content or "").strip()
  208 + duration_ms = (time.time() - start) * 1000
  209 + logger.info(
  210 + "[llm_translate] Success | model=%s src=%s tgt=%s latency=%.1fms text=%r -> %r",
  211 + model_name,
  212 + src,
  213 + tgt,
  214 + duration_ms,
  215 + text[:80],
  216 + content[:80],
  217 + )
  218 + return content or text
  219 + except Exception as exc:
  220 + duration_ms = (time.time() - start) * 1000
  221 + logger.warning(
  222 + "[llm_translate] Failed | model=%s src=%s tgt=%s latency=%.1fms error=%s",
  223 + model_name,
  224 + src,
  225 + tgt,
  226 + duration_ms,
  227 + exc,
  228 + exc_info=True,
  229 + )
  230 + # 安全回退:出错时返回原文,避免中断上游流程
  231 + return text
  232 +
  233 +
  234 +__all__ = [
  235 + "TRANSLATION_PROMPTS",
  236 + "llm_translate",
  237 +]
  238 +
... ...
query/qwen_mt_translate.py 0 → 100644
... ... @@ -0,0 +1,963 @@
  1 +"""
  2 +Translation service for multi-language query support.
  3 +
  4 +Supports multiple translation models:
  5 +- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model
  6 +- DeepL: DeepL API for high-quality translations
  7 +
  8 +重要说明(Qwen 机翻限速):
  9 +- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)**
  10 +- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流
  11 +- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端
  12 +
  13 +使用方法 (Usage):
  14 +
  15 +```python
  16 +from query.qwen_mt_translate import Translator
  17 +
  18 +# 使用默认的 qwen 模型(推荐)
  19 +translator = Translator() # 默认使用 qwen 模型
  20 +
  21 +# 或显式指定模型
  22 +translator = Translator(model='qwen') # 使用 qwen 模型
  23 +translator = Translator(model='deepl') # 使用 DeepL 模型
  24 +
  25 +# 翻译文本
  26 +result = translator.translate(
  27 + text="我看到这个视频后没有笑",
  28 + target_lang="en",
  29 + source_lang="auto" # 自动检测源语言
  30 +)
  31 +```
  32 +
  33 +配置说明 (Configuration):
  34 +- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中)
  35 +- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中)
  36 +
  37 +Qwen 模型参考文档:
  38 +- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key
  39 +- 模型:qwen-mt-flash(快速翻译模型)
  40 +
  41 +DeepL 官方文档:
  42 +https://developers.deepl.com/api-reference/translate/request-translation
  43 +"""
  44 +
  45 +import os
  46 +import requests
  47 +import re
  48 +import redis
  49 +from concurrent.futures import ThreadPoolExecutor, Future
  50 +from datetime import timedelta
  51 +from typing import Dict, List, Optional, Union
  52 +import logging
  53 +import time
  54 +
  55 +logger = logging.getLogger(__name__)
  56 +
  57 +from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG
  58 +from openai import OpenAI
  59 +
  60 +
  61 +class Translator:
  62 + """
  63 + Multi-language translator supporting Qwen and DeepL APIs.
  64 +
  65 + Default model is 'qwen' which uses Alibaba Cloud DashScope API.
  66 + """
  67 +# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1
  68 +# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1
  69 +# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1
  70 +
  71 + DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier
  72 + QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" # 美国(弗吉尼亚)地域
  73 + # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡
  74 + # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1
  75 + QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型
  76 +
  77 + # Language code mapping
  78 + LANG_CODE_MAP = {
  79 + 'zh': 'ZH',
  80 + 'en': 'EN',
  81 + 'ru': 'RU',
  82 + 'ar': 'AR',
  83 + 'ja': 'JA',
  84 + 'es': 'ES',
  85 + 'de': 'DE',
  86 + 'fr': 'FR',
  87 + 'it': 'IT',
  88 + 'pt': 'PT',
  89 + }
  90 +
  91 + def __init__(
  92 + self,
  93 + model: str = "qwen",
  94 + api_key: Optional[str] = None,
  95 + use_cache: bool = True,
  96 + timeout: int = 10,
  97 + glossary_id: Optional[str] = None,
  98 + translation_context: Optional[str] = None
  99 + ):
  100 + """
  101 + Initialize translator.
  102 +
  103 + Args:
  104 + model: Translation model to use. Options: 'qwen' (default) or 'deepl'
  105 + api_key: API key for the selected model (or None to use from config/env)
  106 + use_cache: Whether to cache translations
  107 + timeout: Request timeout in seconds
  108 + glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL)
  109 + translation_context: Context hint for translation (e.g., "e-commerce", "product search")
  110 + """
  111 + self.model = model.lower()
  112 + if self.model not in ['qwen', 'deepl']:
  113 + raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'")
  114 +
  115 + # Get API key from config if not provided
  116 + if api_key is None:
  117 + if self.model == 'qwen':
  118 + api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
  119 + else: # deepl
  120 + api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY")
  121 +
  122 + self.api_key = api_key
  123 + self.timeout = timeout
  124 + self.use_cache = use_cache
  125 + self.glossary_id = glossary_id
  126 + self.translation_context = translation_context or "e-commerce product search"
  127 +
  128 + # Initialize OpenAI client for Qwen if needed
  129 + self.qwen_client = None
  130 + if self.model == 'qwen':
  131 + if not self.api_key:
  132 + logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.")
  133 + else:
  134 + self.qwen_client = OpenAI(
  135 + api_key=self.api_key,
  136 + base_url=self.QWEN_BASE_URL,
  137 + )
  138 +
  139 + # Initialize Redis cache if enabled
  140 + if use_cache:
  141 + try:
  142 + self.redis_client = redis.Redis(
  143 + host=REDIS_CONFIG.get('host', 'localhost'),
  144 + port=REDIS_CONFIG.get('port', 6479),
  145 + password=REDIS_CONFIG.get('password'),
  146 + decode_responses=True, # Return str instead of bytes
  147 + socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),
  148 + socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),
  149 + retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),
  150 + health_check_interval=10, # 避免复用坏连接
  151 + )
  152 + # Test connection
  153 + self.redis_client.ping()
  154 + expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360)
  155 + self.expire_time = timedelta(days=expire_days)
  156 + self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数
  157 + self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans')
  158 + logger.info("Redis cache initialized for translations")
  159 + except Exception as e:
  160 + logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache")
  161 + self.redis_client = None
  162 + self.cache = None
  163 + else:
  164 + self.redis_client = None
  165 + self.cache = None
  166 +
  167 + # Thread pool for async translation
  168 + self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")
  169 +
  170 + def translate(
  171 + self,
  172 + text: str,
  173 + target_lang: str,
  174 + source_lang: Optional[str] = None,
  175 + context: Optional[str] = None,
  176 + prompt: Optional[str] = None
  177 + ) -> Optional[str]:
  178 + """
  179 + Translate text to target language (synchronous mode).
  180 +
  181 + Args:
  182 + text: Text to translate
  183 + target_lang: Target language code ('zh', 'en', 'ru', etc.)
  184 + source_lang: Source language code (optional, auto-detect if None)
  185 + context: Additional context for translation (overrides default context)
  186 + prompt: Translation prompt/instruction (optional, for better translation quality)
  187 +
  188 + Returns:
  189 + Translated text or None if translation fails
  190 + """
  191 + if not text or not text.strip():
  192 + return text
  193 +
  194 + # Normalize language codes
  195 + target_lang = target_lang.lower()
  196 + if source_lang:
  197 + source_lang = source_lang.lower()
  198 +
  199 + # Optimization: Skip translation if not needed
  200 + if target_lang == 'en' and self._is_english_text(text):
  201 + logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'")
  202 + return text
  203 +
  204 + if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
  205 + logger.info(
  206 + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
  207 + f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)"
  208 + )
  209 + return text
  210 +
  211 + # Use provided context or default context
  212 + translation_context = context or self.translation_context
  213 +
  214 + # Build cache key (include prompt in cache key if provided)
  215 + cache_key_parts = [source_lang or 'auto', target_lang, translation_context]
  216 + if prompt:
  217 + cache_key_parts.append(prompt)
  218 + cache_key_parts.append(text)
  219 + cache_key = ':'.join(cache_key_parts)
  220 +
  221 + # Check cache (include context and prompt in cache key for accuracy)
  222 + if self.use_cache and self.redis_client:
  223 + cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt)
  224 + if cached:
  225 + logger.info(
  226 + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
  227 + f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit"
  228 + )
  229 + return cached
  230 +
  231 + # If no API key, return mock translation (for testing)
  232 + if not self.api_key:
  233 + logger.info(
  234 + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
  235 + f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)"
  236 + )
  237 + return text
  238 +
  239 + # Translate using selected model
  240 + logger.info(
  241 + f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | "
  242 + f"Source language: {source_lang or 'auto'} | Context: {translation_context} | "
  243 + f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation"
  244 + )
  245 +
  246 + if self.model == 'qwen':
  247 + result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt)
  248 + else: # deepl
  249 + result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)
  250 +
  251 + # Surface translation failure to the caller instead of silently
  252 + # masquerading the source text as a successful translation.
  253 + if result is None:
  254 + logger.warning(
  255 + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
  256 + f"Source language: {source_lang or 'auto'} | Status: Translation failed"
  257 + )
  258 + else:
  259 + logger.info(
  260 + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
  261 + f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful"
  262 + )
  263 +
  264 + # Cache only successful translations. Failed attempts must not poison
  265 + # Redis with the original text.
  266 + if result is not None and self.use_cache and self.redis_client:
  267 + self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt)
  268 +
  269 + return result
  270 +
  271 + def _translate_qwen(
  272 + self,
  273 + text: str,
  274 + target_lang: str,
  275 + source_lang: Optional[str],
  276 + context: Optional[str] = None,
  277 + prompt: Optional[str] = None
  278 + ) -> Optional[str]:
  279 + """
  280 + Translate using Qwen MT Flash model via Alibaba Cloud DashScope API.
  281 +
  282 + Args:
  283 + text: Text to translate
  284 + target_lang: Target language code ('zh', 'en', 'ru', etc.)
  285 + source_lang: Source language code (optional, 'auto' if None)
  286 + context: Context hint for translation (optional)
  287 + prompt: Translation prompt/instruction (optional)
  288 +
  289 + Returns:
  290 + Translated text or None if translation fails
  291 + """
  292 + if not self.qwen_client:
  293 + logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.")
  294 + return None
  295 +
  296 + # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping
  297 + # 标准来自:你提供的“语言 / 英文名 / 代码”表
  298 + qwen_lang_map = {
  299 + "en": "English",
  300 + "zh": "Chinese",
  301 + "zh_tw": "Traditional Chinese",
  302 + "ru": "Russian",
  303 + "ja": "Japanese",
  304 + "ko": "Korean",
  305 + "es": "Spanish",
  306 + "fr": "French",
  307 + "pt": "Portuguese",
  308 + "de": "German",
  309 + "it": "Italian",
  310 + "th": "Thai",
  311 + "vi": "Vietnamese",
  312 + "id": "Indonesian",
  313 + "ms": "Malay",
  314 + "ar": "Arabic",
  315 + "hi": "Hindi",
  316 + "he": "Hebrew",
  317 + "my": "Burmese",
  318 + "ta": "Tamil",
  319 + "ur": "Urdu",
  320 + "bn": "Bengali",
  321 + "pl": "Polish",
  322 + "nl": "Dutch",
  323 + "ro": "Romanian",
  324 + "tr": "Turkish",
  325 + "km": "Khmer",
  326 + "lo": "Lao",
  327 + "yue": "Cantonese",
  328 + "cs": "Czech",
  329 + "el": "Greek",
  330 + "sv": "Swedish",
  331 + "hu": "Hungarian",
  332 + "da": "Danish",
  333 + "fi": "Finnish",
  334 + "uk": "Ukrainian",
  335 + "bg": "Bulgarian",
  336 + }
  337 +
  338 + # Convert target language
  339 + target_lang_normalized = target_lang.lower()
  340 + target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize())
  341 +
  342 + # Convert source language
  343 + source_lang_normalized = (source_lang or "").strip().lower()
  344 + if not source_lang_normalized or source_lang_normalized == "auto":
  345 + source_lang_qwen = "auto"
  346 + else:
  347 + source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize())
  348 +
  349 + # Prepare translation options
  350 + translation_options = {
  351 + "source_lang": source_lang_qwen,
  352 + "target_lang": target_lang_qwen,
  353 + }
  354 +
  355 + # Prepare messages
  356 + messages = [
  357 + {
  358 + "role": "user",
  359 + "content": text
  360 + }
  361 + ]
  362 +
  363 + start_time = time.time()
  364 + try:
  365 + completion = self.qwen_client.chat.completions.create(
  366 + model=self.QWEN_MODEL,
  367 + messages=messages,
  368 + extra_body={
  369 + "translation_options": translation_options
  370 + }
  371 + )
  372 +
  373 + translated_text = completion.choices[0].message.content.strip()
  374 + duration_ms = (time.time() - start_time) * 1000
  375 +
  376 + logger.info(
  377 + f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | "
  378 + f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms"
  379 + )
  380 + return translated_text
  381 +
  382 + except Exception as e:
  383 + duration_ms = (time.time() - start_time) * 1000
  384 + logger.error(
  385 + f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | "
  386 + f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True
  387 + )
  388 + return None
  389 +
def _translate_deepl(
    self,
    text: str,
    target_lang: str,
    source_lang: Optional[str],
    context: Optional[str] = None,
    prompt: Optional[str] = None
) -> Optional[str]:
    """
    Translate using DeepL API with context and glossary support.

    Args:
        text: Text to translate
        target_lang: Target language code
        source_lang: Source language code (optional)
        context: Context hint for translation (e.g., "e-commerce product search")
        prompt: Translation instruction; when provided it is sent as DeepL's
            "context" parameter instead of `context`

    Returns:
        Translated text, or None on HTTP error, timeout, malformed response,
        or any request exception.
    """
    # Map to DeepL language codes
    target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())

    headers = {
        "Authorization": f"DeepL-Auth-Key {self.api_key}",
        "Content-Type": "application/json",
    }

    # DeepL's "context" parameter influences a translation but is not
    # translated itself. A caller-supplied prompt takes precedence over
    # the default context hint.
    api_context = prompt if prompt else context

    # For single-word ambiguous Chinese terms (e.g. "车": car vs rook),
    # wrap the term in an e-commerce phrase so DeepL picks the right sense;
    # the term is extracted back out of the result below.
    text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)

    payload = {
        "text": [text_to_translate],
        "target_lang": target_code,
    }

    if source_lang:
        payload["source_lang"] = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())

    if api_context:
        payload["context"] = api_context

    if self.glossary_id:
        payload["glossary_id"] = self.glossary_id

    try:
        response = requests.post(
            self.DEEPL_API_URL,
            headers=headers,
            json=payload,
            timeout=self.timeout
        )

        if response.status_code != 200:
            logger.error(
                f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "
                f"Status code: {response.status_code} | Error message: {response.text}"
            )
            return None

        data = response.json()
        translations = data.get("translations") or []
        if not translations:
            # Bug fix: previously a 200 response without a non-empty
            # "translations" array fell through silently and returned None
            # with no log line. Make the failure visible.
            logger.error(
                f"[Translator] DeepL API returned no translations | Original text: '{text}' | "
                f"Target language: {target_code} | Response body: {data}"
            )
            return None

        translated_text = translations[0]["text"]
        # If we wrapped the query in a context phrase, pull the term back out.
        if needs_extraction:
            translated_text = self._extract_term_from_translation(
                translated_text, text, target_code
            )
        logger.debug(
            f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "
            f"Translation result: '{translated_text}'"
        )
        return translated_text

    except requests.Timeout:
        logger.warning(
            f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "
            f"Timeout: {self.timeout}s"
        )
        return None
    except Exception as e:
        logger.error(
            f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "
            f"Error: {e}", exc_info=True
        )
        return None
  487 +
  488 + # NOTE: _translate_deepl_free is intentionally not implemented.
  489 + # We do not support automatic fallback to the free endpoint, to avoid
  490 + # mixing Pro keys with https://api-free.deepl.com and related 403 errors.
  491 +
def translate_multi(
    self,
    text: str,
    target_langs: List[str],
    source_lang: Optional[str] = None,
    context: Optional[str] = None,
    async_mode: bool = True,
    prompt: Optional[str] = None
) -> Dict[str, Optional[str]]:
    """
    Translate text into several target languages at once.

    With async_mode=True (default): cached translations are returned
    immediately; translations that can be short-circuited (text already in
    the target language, or pure numbers for 'zh') are computed
    synchronously; everything else is scheduled on the worker pool and
    reported as None in the result.

    With async_mode=False: blocks until every translation is done.

    Args:
        text: Text to translate.
        target_langs: Target language codes.
        source_lang: Source language code (optional).
        context: Context hint for translation (optional).
        async_mode: Return immediately and translate misses in the background.
        prompt: Translation prompt/instruction (optional).

    Returns:
        Mapping of language code -> translation (None for translations that
        were deferred to background workers).
    """
    results: Dict[str, Optional[str]] = {}
    uncached: List[str] = []

    # Serve whatever the cache already holds.
    for lang in target_langs:
        hit = self._get_cached_translation(text, lang, source_lang, context, prompt)
        if hit is not None:
            results[lang] = hit
        else:
            uncached.append(lang)

    if not async_mode:
        # Blocking mode: translate every cache miss inline.
        for lang in uncached:
            results[lang] = self.translate(text, lang, source_lang, context, prompt)
        return results

    if uncached:
        deferred: List[str] = []
        for lang in uncached:
            code = lang.lower()
            # Cheap short-circuits mirror the optimizations in translate():
            # English text for 'en'; Chinese text or pure numbers for 'zh'.
            shortcut = (
                (code == 'en' and self._is_english_text(text))
                or (code == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)))
            )
            if shortcut:
                results[lang] = self.translate(text, lang, source_lang, context, prompt)
            else:
                deferred.append(lang)

        # Hand real API work to the background pool and report None for now.
        for lang in deferred:
            self._translate_async(text, lang, source_lang, context, prompt)
        for lang in deferred:
            results[lang] = None

    return results
  569 +
def translate_multi_async(
    self,
    text: str,
    target_langs: List[str],
    source_lang: Optional[str] = None,
    context: Optional[str] = None,
    prompt: Optional[str] = None
) -> Dict[str, Union[str, Future]]:
    """
    Translate into several languages, returning Futures for cache misses.

    Cached translations appear in the result as plain strings; every miss
    is submitted to the worker pool and appears as a Future the caller can
    wait on.

    Args:
        text: Text to translate.
        target_langs: Target language codes.
        source_lang: Source language code (optional).
        context: Context hint for translation (optional).
        prompt: Translation prompt/instruction (optional).

    Returns:
        Mapping of language code -> translation string (cache hit) or
        Future (pending translation).
    """
    results: Dict[str, Union[str, Future]] = {}
    pending: List[str] = []

    # Phase 1: resolve cache hits.
    for lang in target_langs:
        hit = self._get_cached_translation(text, lang, source_lang, context, prompt)
        if hit is not None:
            results[lang] = hit
        else:
            pending.append(lang)

    # Phase 2: submit one pool task per miss.
    for lang in pending:
        results[lang] = self.executor.submit(
            self.translate,
            text,
            lang,
            source_lang,
            context,
            prompt
        )

    return results
  619 +
  620 + def _get_cached_translation(
  621 + self,
  622 + text: str,
  623 + target_lang: str,
  624 + source_lang: Optional[str] = None,
  625 + context: Optional[str] = None,
  626 + prompt: Optional[str] = None
  627 + ) -> Optional[str]:
  628 + """Get translation from cache if available."""
  629 + if not self.redis_client:
  630 + return None
  631 + return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
  632 +
  633 + def _get_cached_translation_redis(
  634 + self,
  635 + text: str,
  636 + target_lang: str,
  637 + source_lang: Optional[str] = None,
  638 + context: Optional[str] = None,
  639 + prompt: Optional[str] = None
  640 + ) -> Optional[str]:
  641 + """
  642 + Get translation from Redis cache with sliding expiration.
  643 +
  644 + 滑动过期机制:每次访问缓存时,重置过期时间为配置的过期时间(默认720天)。
  645 + 这样缓存会在最后一次访问后的720天才过期,而不是写入后的720天。
  646 + 这确保了常用的翻译缓存不会被过早删除。
  647 + """
  648 + if not self.redis_client:
  649 + return None
  650 +
  651 + try:
  652 + # Build cache key: prefix:target_lang:text
  653 + # For simplicity, we use target_lang and text as key
  654 + # Context and prompt are not included in key to maximize cache hits
  655 + cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
  656 + value = self.redis_client.get(cache_key)
  657 + if value:
  658 + # Sliding expiration: reset expiration time on access
  659 + # 每次读取缓存时,重置过期时间为配置的过期时间(最后一次访问后的N天才过期)
  660 + try:
  661 + self.redis_client.expire(cache_key, self.expire_seconds)
  662 + except Exception as expire_error:
  663 + # 即使 expire 失败,也返回缓存值(不影响功能)
  664 + logger.warning(
  665 + f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}"
  666 + )
  667 +
  668 + logger.debug(
  669 + f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "
  670 + f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s"
  671 + )
  672 + return value
  673 + logger.debug(
  674 + f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "
  675 + f"Cache key: {cache_key}"
  676 + )
  677 + return None
  678 + except Exception as e:
  679 + logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}")
  680 + return None
  681 +
  682 + def _set_cached_translation_redis(
  683 + self,
  684 + text: str,
  685 + target_lang: str,
  686 + translation: str,
  687 + source_lang: Optional[str] = None,
  688 + context: Optional[str] = None,
  689 + prompt: Optional[str] = None
  690 + ) -> None:
  691 + """Store translation in Redis cache."""
  692 + if not self.redis_client:
  693 + return
  694 +
  695 + try:
  696 + cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
  697 + self.redis_client.setex(cache_key, self.expire_seconds, translation)
  698 + logger.info(
  699 + f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | "
  700 + f"Cache key: {cache_key} | Translation result: '{translation}'"
  701 + )
  702 + except Exception as e:
  703 + logger.error(
  704 + f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | "
  705 + f"Error: {e}"
  706 + )
  707 +
  708 + def _translate_async(
  709 + self,
  710 + text: str,
  711 + target_lang: str,
  712 + source_lang: Optional[str] = None,
  713 + context: Optional[str] = None,
  714 + prompt: Optional[str] = None
  715 + ):
  716 + """Launch async translation task."""
  717 + def _do_translate():
  718 + try:
  719 + result = self.translate(text, target_lang, source_lang, context, prompt)
  720 + if result:
  721 + logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")
  722 + except Exception as e:
  723 + logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}")
  724 +
  725 + self.executor.submit(_do_translate)
  726 +
  727 + def _add_ecommerce_context(
  728 + self,
  729 + text: str,
  730 + source_lang: Optional[str],
  731 + context: Optional[str]
  732 + ) -> tuple:
  733 + """
  734 + Add e-commerce context to text for better disambiguation.
  735 +
  736 + For single-word ambiguous Chinese terms, we add context words that help
  737 + DeepL understand this is an e-commerce/product search context.
  738 +
  739 + Args:
  740 + text: Original text to translate
  741 + source_lang: Source language code
  742 + context: Context hint
  743 +
  744 + Returns:
  745 + Tuple of (text_with_context, needs_extraction)
  746 + - text_with_context: Text to send to DeepL
  747 + - needs_extraction: Whether we need to extract the term from the result
  748 + """
  749 + # Only apply for e-commerce context and Chinese source
  750 + if not context or "e-commerce" not in context.lower():
  751 + return text, False
  752 +
  753 + if not source_lang or source_lang.lower() != 'zh':
  754 + return text, False
  755 +
  756 + # For single-word queries, add context to help disambiguation
  757 + text_stripped = text.strip()
  758 + if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:
  759 + # Common ambiguous Chinese e-commerce terms like "车" (car vs rook)
  760 + # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term])
  761 + # This helps DeepL understand the e-commerce context
  762 + # We'll need to extract just the term from the translation result
  763 + context_phrase = f"购买 {text_stripped}"
  764 + return context_phrase, True
  765 +
  766 + # For multi-word queries, DeepL usually has enough context
  767 + return text, False
  768 +
  769 + def _extract_term_from_translation(
  770 + self,
  771 + translated_text: str,
  772 + original_text: str,
  773 + target_lang_code: str
  774 + ) -> str:
  775 + """
  776 + Extract the actual term from a translation that included context.
  777 +
  778 + For example, if we translated "购买 车" (buy car) and got "buy car",
  779 + we want to extract just "car".
  780 +
  781 + Args:
  782 + translated_text: Full translation result
  783 + original_text: Original single-word query
  784 + target_lang_code: Target language code (EN, ZH, etc.)
  785 +
  786 + Returns:
  787 + Extracted term or original translation if extraction fails
  788 + """
  789 + # For English target, try to extract the last word (the actual term)
  790 + if target_lang_code == "EN":
  791 + words = translated_text.strip().split()
  792 + if len(words) > 1:
  793 + # Usually the last word is the term we want
  794 + # But we need to be smart - if it's "buy car", we want "car"
  795 + # Common context words to skip: buy, purchase, product, item, etc.
  796 + context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
  797 + # Try to find the term (not a context word)
  798 + for word in reversed(words):
  799 + word_lower = word.lower().rstrip('.,!?;:')
  800 + if word_lower not in context_words:
  801 + return word_lower
  802 + # If all words are context words, return the last one
  803 + return words[-1].lower().rstrip('.,!?;:')
  804 +
  805 + # For other languages or if extraction fails, return as-is
  806 + # The user can configure a glossary for better results
  807 + return translated_text
  808 +
  809 + def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool:
  810 + """True if shop language matches index language (use source, no translate)."""
  811 + if not shop_lang_lower or not lang_code:
  812 + return False
  813 + if shop_lang_lower == lang_code:
  814 + return True
  815 + if lang_code == "zh" and "zh" in shop_lang_lower:
  816 + return True
  817 + if lang_code == "en" and "en" in shop_lang_lower:
  818 + return True
  819 + return False
  820 +
def translate_for_indexing(
    self,
    text: str,
    shop_language: str,
    source_lang: Optional[str] = None,
    context: Optional[str] = None,
    prompt: Optional[str] = None,
    index_languages: Optional[List[str]] = None,
) -> Dict[str, Optional[str]]:
    """
    Produce per-language text for indexing, translating only when needed.

    For each language in index_languages, the source text is reused when
    the shop language already matches; otherwise the text is translated
    (Redis cache first, then the translation backend).

    Args:
        text: Text to translate.
        shop_language: Shop primary language (e.g. 'zh', 'en', 'ru').
        source_lang: Source language code (optional).
        context: Additional context for translation (optional).
        prompt: Translation prompt (optional).
        index_languages: Languages to index (from tenant_config). Defaults
            to ["en", "zh"].

    Returns:
        Dict keyed by each index language with translated/source text or None.
    """
    langs = index_languages if index_languages else ["en", "zh"]
    out: Dict[str, Optional[str]] = dict.fromkeys(langs)

    if not text or not text.strip():
        return out
    # Digits/whitespace/underscore/hyphen only: nothing to translate.
    if re.match(r'^[\d\s_-]+$', text):
        logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
        return out

    shop_lang = (shop_language or "").strip().lower()
    pending: List[str] = []
    for lang in langs:
        if self._shop_lang_matches(shop_lang, lang):
            out[lang] = text
        else:
            pending.append(lang)

    for lang in pending:
        hit = self._get_cached_translation_redis(text, lang, source_lang, context, prompt)
        if hit:
            out[lang] = hit
            logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {lang}: {hit}")
            continue
        out[lang] = self.translate(
            text,
            target_lang=lang,
            source_lang=source_lang or shop_language,
            context=context,
            prompt=prompt,
        )
    return out
  878 +
def get_translation_needs(
    self,
    detected_lang: str,
    supported_langs: List[str]
) -> List[str]:
    """
    Determine which languages still need translation.

    Args:
        detected_lang: Detected query language.
        supported_langs: List of supported languages.

    Returns:
        Language codes to translate into: every supported language except
        the detected one, or all of them when the detected language is
        unsupported.
    """
    if detected_lang not in supported_langs:
        # Unknown source language: translate into everything we support.
        return supported_langs
    return [lang for lang in supported_langs if lang != detected_lang]
  900 +
  901 + def _is_english_text(self, text: str) -> bool:
  902 + """
  903 + Check if text is primarily English (ASCII letters, numbers, common punctuation).
  904 +
  905 + Args:
  906 + text: Text to check
  907 +
  908 + Returns:
  909 + True if text appears to be English
  910 + """
  911 + if not text or not text.strip():
  912 + return True
  913 +
  914 + # Remove whitespace and common punctuation
  915 + text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text)
  916 + if not text_clean:
  917 + return True
  918 +
  919 + # Check if all remaining characters are ASCII (letters, numbers)
  920 + # This is a simple heuristic: if most characters are ASCII, it's likely English
  921 + ascii_count = sum(1 for c in text_clean if ord(c) < 128)
  922 + ratio = ascii_count / len(text_clean) if text_clean else 0
  923 +
  924 + # If more than 80% are ASCII characters, consider it English
  925 + return ratio > 0.8
  926 +
  927 + def _contains_chinese(self, text: str) -> bool:
  928 + """
  929 + Check if text contains Chinese characters (Han characters).
  930 +
  931 + Args:
  932 + text: Text to check
  933 +
  934 + Returns:
  935 + True if text contains Chinese characters
  936 + """
  937 + if not text:
  938 + return False
  939 +
  940 + # Check for Chinese characters (Unicode range: \u4e00-\u9fff)
  941 + chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
  942 + return bool(chinese_pattern.search(text))
  943 +
  944 + def _is_pure_number(self, text: str) -> bool:
  945 + """
  946 + Check if text is purely numeric (digits, possibly with spaces, dots, commas).
  947 +
  948 + Args:
  949 + text: Text to check
  950 +
  951 + Returns:
  952 + True if text is purely numeric
  953 + """
  954 + if not text or not text.strip():
  955 + return False
  956 +
  957 + # Remove whitespace, dots, commas (common number separators)
  958 + text_clean = re.sub(r'[\s\.,]', '', text.strip())
  959 + if not text_clean:
  960 + return False
  961 +
  962 + # Check if all remaining characters are digits
  963 + return text_clean.isdigit()
... ...
query/test_translation.py
... ... @@ -19,7 +19,7 @@ from pathlib import Path
19 19 sys.path.insert(0, str(Path(__file__).parent.parent))
20 20  
21 21 from config import ConfigLoader
22   -from query.translator import Translator
  22 +from query.qwen_mt_translate import Translator
23 23 import logging
24 24  
25 25 # Configure logging
... ...
query/translator.py
... ... @@ -13,7 +13,7 @@ Supports multiple translation models:
13 13 使用方法 (Usage):
14 14  
15 15 ```python
16   -from query.translator import Translator
  16 +from query.qwen_mt_translate import Translator
17 17  
18 18 # 使用默认的 qwen 模型(推荐)
19 19 translator = Translator() # 默认使用 qwen 模型
... ...
tests/test_translator_failure_semantics.py
1   -from query.translator import Translator
  1 +from query.qwen_mt_translate import Translator
2 2  
3 3  
4 4 class _RecordingRedis:
... ...