Commit a0a173ae904212171b324f0976f034f6528ff749
1 parent
985752f5
last
Showing
12 changed files
with
1393 additions
and
8 deletions
Show diff stats
api/translator_app.py
| @@ -97,7 +97,7 @@ from pydantic import BaseModel, Field | @@ -97,7 +97,7 @@ from pydantic import BaseModel, Field | ||
| 97 | # Add parent directory to path | 97 | # Add parent directory to path |
| 98 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | 98 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| 99 | 99 | ||
| 100 | -from query.translator import Translator | 100 | +from query.qwen_mt_translate import Translator |
| 101 | from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG | 101 | from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG |
| 102 | 102 | ||
| 103 | # Configure logging | 103 | # Configure logging |
config/config.yaml
| @@ -119,7 +119,7 @@ rerank: | @@ -119,7 +119,7 @@ rerank: | ||
| 119 | # 可扩展服务/provider 注册表(单一配置源) | 119 | # 可扩展服务/provider 注册表(单一配置源) |
| 120 | services: | 120 | services: |
| 121 | translation: | 121 | translation: |
| 122 | - provider: "direct" # direct | http | google(reserved) | 122 | + provider: "llm" # direct | http | llm | google(reserved) |
| 123 | base_url: "http://127.0.0.1:6006" | 123 | base_url: "http://127.0.0.1:6006" |
| 124 | model: "qwen" | 124 | model: "qwen" |
| 125 | timeout_sec: 10.0 | 125 | timeout_sec: 10.0 |
| @@ -130,6 +130,12 @@ services: | @@ -130,6 +130,12 @@ services: | ||
| 130 | base_url: "http://127.0.0.1:6006" | 130 | base_url: "http://127.0.0.1:6006" |
| 131 | model: "qwen" | 131 | model: "qwen" |
| 132 | timeout_sec: 10.0 | 132 | timeout_sec: 10.0 |
| 133 | + llm: | ||
| 134 | + model: "qwen-flash" | ||
| 135 | + # 可选:覆盖 DashScope 兼容模式的 Endpoint 与超时 | ||
| 136 | + # base_url 留空则使用 DASHSCOPE_BASE_URL 或默认地域 | ||
| 137 | + base_url: "" | ||
| 138 | + timeout_sec: 30.0 | ||
| 133 | google: | 139 | google: |
| 134 | enabled: false | 140 | enabled: false |
| 135 | project_id: "" | 141 | project_id: "" |
docs/系统设计文档.md
| @@ -384,6 +384,15 @@ query_config: | @@ -384,6 +384,15 @@ query_config: | ||
| 384 | 384 | ||
| 385 | 实际代码中,通过通用的 translation provider 抽象来选择具体后端和模型,文档不固定绑定某一个具体翻译服务或模型名称,以保持可配置性。 | 385 | 实际代码中,通过通用的 translation provider 抽象来选择具体后端和模型,文档不固定绑定某一个具体翻译服务或模型名称,以保持可配置性。 |
| 386 | 386 | ||
| 387 | +此外,为了支持**高质量、提示词可控的 LLM 翻译**(例如商品富化脚本、离线分析工具),在 `query/llm_translate.py` 中提供了一个独立的 LLM 翻译辅助模块: | ||
| 388 | + | ||
| 389 | +- **配置入口**:`config/config.yaml -> services.translation.providers.llm`,用于指定: | ||
| 390 | + - `model`: 例如 `qwen-flash`(DashScope 兼容模式的对话模型) | ||
| 391 | + - `base_url`: 可选;为空时使用环境变量 `DASHSCOPE_BASE_URL` 或默认 Endpoint | ||
| 392 | + - `timeout_sec`: LLM 调用超时 | ||
| 393 | +- **环境变量**:仍通过 `DASHSCOPE_API_KEY` 注入 DashScope API Key。 | ||
| 394 | +- **使用方式**:主查询路径继续使用 machine translation(`query.qwen_mt_translate.Translator`),只在需要更强表达控制的场景(如批量标注、产品分类脚本)中显式调用 `llm_translate()`。 | ||
| 395 | + | ||
| 387 | #### 功能特性 | 396 | #### 功能特性 |
| 388 | 1. **语言检测**:自动检测查询语言 | 397 | 1. **语言检测**:自动检测查询语言 |
| 389 | 2. **智能翻译**: | 398 | 2. **智能翻译**: |
indexer/document_transformer.py
| @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) | @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) | ||
| 20 | 20 | ||
| 21 | # Try to import translator (optional dependency) | 21 | # Try to import translator (optional dependency) |
| 22 | try: | 22 | try: |
| 23 | - from query.translator import Translator | 23 | + from query.qwen_mt_translate import Translator |
| 24 | TRANSLATOR_AVAILABLE = True | 24 | TRANSLATOR_AVAILABLE = True |
| 25 | except ImportError: | 25 | except ImportError: |
| 26 | TRANSLATOR_AVAILABLE = False | 26 | TRANSLATOR_AVAILABLE = False |
indexer/test_indexing.py
| @@ -273,7 +273,7 @@ def test_document_transformer(): | @@ -273,7 +273,7 @@ def test_document_transformer(): | ||
| 273 | tenant_config = tenant_config_loader.get_tenant_config('162') | 273 | tenant_config = tenant_config_loader.get_tenant_config('162') |
| 274 | 274 | ||
| 275 | # 初始化翻译器(测试环境总是启用,具体翻译方向由tenant_config控制) | 275 | # 初始化翻译器(测试环境总是启用,具体翻译方向由tenant_config控制) |
| 276 | - from query.translator import Translator | 276 | + from query.qwen_mt_translate import Translator |
| 277 | translator = Translator( | 277 | translator = Translator( |
| 278 | api_key=config.query_config.translation_api_key, | 278 | api_key=config.query_config.translation_api_key, |
| 279 | use_cache=True | 279 | use_cache=True |
providers/translation.py
| @@ -0,0 +1,169 @@ | @@ -0,0 +1,169 @@ | ||
| 1 | +""" | ||
| 2 | +Translation provider - direct (in-process) or HTTP service. | ||
| 3 | +""" | ||
| 4 | +from __future__ import annotations | ||
| 5 | + | ||
| 6 | +import logging | ||
| 7 | +from typing import Any, Dict, List, Optional, Union | ||
| 8 | + | ||
| 9 | +from concurrent.futures import Future, ThreadPoolExecutor | ||
| 10 | +import requests | ||
| 11 | + | ||
| 12 | +from config.services_config import get_translation_config, get_translation_base_url | ||
| 13 | + | ||
| 14 | +logger = logging.getLogger(__name__) | ||
| 15 | + | ||
| 16 | + | ||
| 17 | +class HttpTranslationProvider: | ||
| 18 | + """Translation via HTTP service.""" | ||
| 19 | + | ||
| 20 | + def __init__( | ||
| 21 | + self, | ||
| 22 | + base_url: str, | ||
| 23 | + model: str = "qwen", | ||
| 24 | + timeout_sec: float = 10.0, | ||
| 25 | + translation_context: Optional[str] = None, | ||
| 26 | + ): | ||
| 27 | + self.base_url = (base_url or "").rstrip("/") | ||
| 28 | + self.model = model or "qwen" | ||
| 29 | + self.timeout_sec = float(timeout_sec or 10.0) | ||
| 30 | + self.translation_context = translation_context or "e-commerce product search" | ||
| 31 | + self.executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="http-translator") | ||
| 32 | + | ||
| 33 | + def _translate_once( | ||
| 34 | + self, | ||
| 35 | + text: str, | ||
| 36 | + target_lang: str, | ||
| 37 | + source_lang: Optional[str] = None, | ||
| 38 | + ) -> Optional[str]: | ||
| 39 | + if not text or not str(text).strip(): | ||
| 40 | + return text | ||
| 41 | + try: | ||
| 42 | + url = f"{self.base_url}/translate" | ||
| 43 | + payload = { | ||
| 44 | + "text": text, | ||
| 45 | + "target_lang": target_lang, | ||
| 46 | + "source_lang": source_lang or "auto", | ||
| 47 | + "model": self.model, | ||
| 48 | + } | ||
| 49 | + response = requests.post(url, json=payload, timeout=self.timeout_sec) | ||
| 50 | + if response.status_code != 200: | ||
| 51 | + logger.warning( | ||
| 52 | + "HTTP translator failed: status=%s body=%s", | ||
| 53 | + response.status_code, | ||
| 54 | + (response.text or "")[:200], | ||
| 55 | + ) | ||
| 56 | + return None | ||
| 57 | + data = response.json() | ||
| 58 | + translated = data.get("translated_text") | ||
| 59 | + return translated if translated is not None else None | ||
| 60 | + except Exception as exc: | ||
| 61 | + logger.warning("HTTP translator request failed: %s", exc, exc_info=True) | ||
| 62 | + return None | ||
| 63 | + | ||
| 64 | + def translate( | ||
| 65 | + self, | ||
| 66 | + text: str, | ||
| 67 | + target_lang: str, | ||
| 68 | + source_lang: Optional[str] = None, | ||
| 69 | + context: Optional[str] = None, | ||
| 70 | + prompt: Optional[str] = None, | ||
| 71 | + ) -> Optional[str]: | ||
| 72 | + del context, prompt | ||
| 73 | + result = self._translate_once(text=text, target_lang=target_lang, source_lang=source_lang) | ||
| 74 | + return result if result is not None else text | ||
| 75 | + | ||
| 76 | + def translate_multi( | ||
| 77 | + self, | ||
| 78 | + text: str, | ||
| 79 | + target_langs: List[str], | ||
| 80 | + source_lang: Optional[str] = None, | ||
| 81 | + context: Optional[str] = None, | ||
| 82 | + async_mode: bool = True, | ||
| 83 | + prompt: Optional[str] = None, | ||
| 84 | + ) -> Dict[str, Optional[str]]: | ||
| 85 | + del context, async_mode, prompt | ||
| 86 | + out: Dict[str, Optional[str]] = {} | ||
| 87 | + for lang in target_langs: | ||
| 88 | + out[lang] = self.translate(text, lang, source_lang=source_lang) | ||
| 89 | + return out | ||
| 90 | + | ||
| 91 | + def translate_multi_async( | ||
| 92 | + self, | ||
| 93 | + text: str, | ||
| 94 | + target_langs: List[str], | ||
| 95 | + source_lang: Optional[str] = None, | ||
| 96 | + context: Optional[str] = None, | ||
| 97 | + prompt: Optional[str] = None, | ||
| 98 | + ) -> Dict[str, Union[str, Future]]: | ||
| 99 | + del context, prompt | ||
| 100 | + out: Dict[str, Union[str, Future]] = {} | ||
| 101 | + for lang in target_langs: | ||
| 102 | + out[lang] = self.executor.submit(self.translate, text, lang, source_lang) | ||
| 103 | + return out | ||
| 104 | + | ||
| 105 | + def translate_for_indexing( | ||
| 106 | + self, | ||
| 107 | + text: str, | ||
| 108 | + shop_language: str, | ||
| 109 | + source_lang: Optional[str] = None, | ||
| 110 | + context: Optional[str] = None, | ||
| 111 | + prompt: Optional[str] = None, | ||
| 112 | + index_languages: Optional[List[str]] = None, | ||
| 113 | + ) -> Dict[str, Optional[str]]: | ||
| 114 | + del context, prompt | ||
| 115 | + langs = index_languages if index_languages else ["en", "zh"] | ||
| 116 | + source = source_lang or shop_language or "auto" | ||
| 117 | + out: Dict[str, Optional[str]] = {} | ||
| 118 | + for lang in langs: | ||
| 119 | + if lang == shop_language: | ||
| 120 | + out[lang] = text | ||
| 121 | + else: | ||
| 122 | + out[lang] = self.translate(text, target_lang=lang, source_lang=source) | ||
| 123 | + return out | ||
| 124 | + | ||
| 125 | + | ||
| 126 | +def create_translation_provider(query_config: Any = None) -> Any: | ||
| 127 | + """ | ||
| 128 | + Create translation provider from services config. | ||
| 129 | + | ||
| 130 | + query_config: optional, for api_key/glossary_id/context (used by direct provider). | ||
| 131 | + """ | ||
| 132 | + cfg = get_translation_config() | ||
| 133 | + provider = cfg.provider | ||
| 134 | + pc = cfg.get_provider_cfg() | ||
| 135 | + | ||
| 136 | + if provider in ("direct", "local", "inprocess"): | ||
| 137 | + from query.qwen_mt_translate import Translator | ||
| 138 | + model = pc.get("model") or "qwen" | ||
| 139 | + qc = query_config or _empty_query_config() | ||
| 140 | + return Translator( | ||
| 141 | + model=model, | ||
| 142 | + api_key=getattr(qc, "translation_api_key", None), | ||
| 143 | + use_cache=True, | ||
| 144 | + glossary_id=getattr(qc, "translation_glossary_id", None), | ||
| 145 | + translation_context=getattr(qc, "translation_context", "e-commerce product search"), | ||
| 146 | + ) | ||
| 147 | + | ||
| 148 | + if provider in ("http", "service"): | ||
| 149 | + base_url = get_translation_base_url() | ||
| 150 | + model = pc.get("model") or "qwen" | ||
| 151 | + timeout = pc.get("timeout_sec", 10.0) | ||
| 152 | + qc = query_config or _empty_query_config() | ||
| 153 | + return HttpTranslationProvider( | ||
| 154 | + base_url=base_url, | ||
| 155 | + model=model, | ||
| 156 | + timeout_sec=float(timeout), | ||
| 157 | + translation_context=getattr(qc, "translation_context", "e-commerce product search"), | ||
| 158 | + ) | ||
| 159 | + | ||
| 160 | + raise ValueError(f"Unsupported translation provider: {provider}") | ||
| 161 | + | ||
| 162 | + | ||
| 163 | +def _empty_query_config() -> Any: | ||
| 164 | + """Minimal object with default translation attrs.""" | ||
| 165 | + class _QC: | ||
| 166 | + translation_api_key = None | ||
| 167 | + translation_glossary_id = None | ||
| 168 | + translation_context = "e-commerce product search" | ||
| 169 | + return _QC() |
query/__init__.py
| 1 | """Query package initialization.""" | 1 | """Query package initialization.""" |
| 2 | 2 | ||
| 3 | from .language_detector import LanguageDetector | 3 | from .language_detector import LanguageDetector |
| 4 | -from .translator import Translator | 4 | +from .qwen_mt_translate import Translator |
| 5 | from .query_rewriter import QueryRewriter, QueryNormalizer | 5 | from .query_rewriter import QueryRewriter, QueryNormalizer |
| 6 | from .query_parser import QueryParser, ParsedQuery | 6 | from .query_parser import QueryParser, ParsedQuery |
| 7 | 7 |
query/llm_translate.py
| @@ -0,0 +1,238 @@ | @@ -0,0 +1,238 @@ | ||
| 1 | +""" | ||
| 2 | +LLM-based translation helper using Qwen chat model. | ||
| 3 | + | ||
| 4 | +This module provides a thin wrapper around DashScope's `qwen-flash` model | ||
| 5 | +for high-quality, prompt-controlled translation, independent of the main | ||
| 6 | +`Translator` (machine translation) pipeline. | ||
| 7 | + | ||
| 8 | +Usage example: | ||
| 9 | + | ||
| 10 | + from query.llm_translate import llm_translate | ||
| 11 | + | ||
| 12 | + result = llm_translate( | ||
| 13 | + text="我看到这个视频后没有笑", | ||
| 14 | + target_lang="en", | ||
| 15 | + source_lang="zh", | ||
| 16 | + source_lang_label="中文", | ||
| 17 | + target_lang_label="英文", | ||
| 18 | + ) | ||
| 19 | +""" | ||
| 20 | + | ||
| 21 | +from __future__ import annotations | ||
| 22 | + | ||
| 23 | +import logging | ||
| 24 | +import os | ||
| 25 | +import time | ||
| 26 | +from typing import Dict, Optional | ||
| 27 | + | ||
| 28 | +from openai import OpenAI | ||
| 29 | + | ||
| 30 | +from config.env_config import DASHSCOPE_API_KEY | ||
| 31 | +from config.services_config import get_translation_config | ||
| 32 | + | ||
| 33 | +logger = logging.getLogger(__name__) | ||
| 34 | + | ||
| 35 | + | ||
| 36 | +# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 | ||
| 37 | +# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | ||
| 38 | +# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 | ||
| 39 | +# | ||
| 40 | +# 默认保持与现有翻译/索引脚本相同的美国地域,可通过环境变量覆盖: | ||
| 41 | +# DASHSCOPE_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1 | ||
| 42 | +DEFAULT_QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" | ||
| 43 | +QWEN_MODEL_NAME = "qwen-flash" | ||
| 44 | + | ||
| 45 | + | ||
| 46 | +# 由调用方提供的语言标签/代码填充,占位符说明: | ||
| 47 | +# - source_lang: 源语言的人类可读名称(按目标语言本地化,例如 "中文", "English") | ||
| 48 | +# - target_lang: 目标语言的人类可读名称 | ||
| 49 | +# - src_lang_code: 源语言代码,例如 "zh" | ||
| 50 | +# - tgt_lang_code: 目标语言代码,例如 "en" | ||
| 51 | +TRANSLATION_PROMPTS: Dict[str, str] = { | ||
| 52 | + "zh": """你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译员。你的目标是在遵循 {target_lang} 的语法、词汇和文化习惯的前提下,准确传达原始 {source_lang} 文本的含义和细微差别。请只输出 {target_lang} 的翻译内容,不要包含任何额外的解释或评论。请将以下 {source_lang} 文本翻译成 {target_lang}: | ||
| 53 | + | ||
| 54 | +{text}""", | ||
| 55 | + "en": """You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Your goal is to accurately convey the meaning and nuances of the original {source_lang} text while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities. Produce only the {target_lang} translation, without any additional explanations or commentary. Please translate the following {source_lang} text into {target_lang}: | ||
| 56 | + | ||
| 57 | +{text}""", | ||
| 58 | + "ru": """Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Ваша задача — точно передать смысл и нюансы исходного текста на {source_lang}, соблюдая грамматику, лексику и культурные особенности {target_lang}. Выводите только перевод на {target_lang}, без каких-либо дополнительных объяснений или комментариев. Пожалуйста, переведите следующий текст с {source_lang} на {target_lang}: | ||
| 59 | + | ||
| 60 | +{text}""", | ||
| 61 | + "ar": """أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). هدفك هو نقل المعنى والدلالات الدقيقة للنص الأصلي بلغة {source_lang} بدقة، مع الالتزام بقواعد اللغة والمفردات والحساسيات الثقافية الخاصة بلغة {target_lang}. قم بإنتاج الترجمة إلى {target_lang} فقط دون أي شروحات أو تعليقات إضافية. يرجى ترجمة النص التالي من {source_lang} إلى {target_lang}: | ||
| 62 | + | ||
| 63 | +{text}""", | ||
| 64 | + "ja": """あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロの翻訳者です。{target_lang} の文法、語彙、文化的配慮に従いながら、元の {source_lang} テキストの意味やニュアンスを正確に伝えることが目的です。追加の説明やコメントは一切含めず、{target_lang} の翻訳のみを出力してください。次の {source_lang} テキストを {target_lang} に翻訳してください: | ||
| 65 | + | ||
| 66 | +{text}""", | ||
| 67 | + "es": """Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Tu objetivo es transmitir con precisión el significado y los matices del texto original en {source_lang}, respetando la gramática, el vocabulario y las sensibilidades culturales de {target_lang}. Produce únicamente la traducción en {target_lang}, sin explicaciones ni comentarios adicionales. Por favor, traduce el siguiente texto de {source_lang} a {target_lang}: | ||
| 68 | + | ||
| 69 | +{text}""", | ||
| 70 | + "de": """Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Dein Ziel ist es, die Bedeutung und Nuancen des ursprünglichen {source_lang}-Textes genau zu vermitteln und dabei die Grammatik, den Wortschatz und die kulturellen Besonderheiten von {target_lang} zu berücksichtigen. Gib ausschließlich die Übersetzung in {target_lang} aus, ohne zusätzliche Erklärungen oder Kommentare. Bitte übersetze den folgenden {source_lang}-Text in {target_lang}: | ||
| 71 | + | ||
| 72 | +{text}""", | ||
| 73 | + "fr": """Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Votre objectif est de transmettre fidèlement le sens et les nuances du texte original en {source_lang}, tout en respectant la grammaire, le vocabulaire et les sensibilités culturelles de {target_lang}. Produisez uniquement la traduction en {target_lang}, sans explications ni commentaires supplémentaires. Veuillez traduire le texte suivant de {source_lang} vers {target_lang} : | ||
| 74 | + | ||
| 75 | +{text}""", | ||
| 76 | + "it": """Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Il tuo obiettivo è trasmettere con precisione il significato e le sfumature del testo originale in {source_lang}, rispettando la grammatica, il vocabolario e le sensibilità culturali di {target_lang}. Produci solo la traduzione in {target_lang}, senza spiegazioni o commenti aggiuntivi. Per favore traduci il seguente testo da {source_lang} a {target_lang}: | ||
| 77 | + | ||
| 78 | +{text}""", | ||
| 79 | + "pt": """Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Seu objetivo é transmitir com precisão o significado e as nuances do texto original em {source_lang}, respeitando a gramática, o vocabulário e as sensibilidades culturais de {target_lang}. Produza apenas a tradução em {target_lang}, sem quaisquer explicações ou comentários adicionais. Por favor, traduza o seguinte texto de {source_lang} para {target_lang}: | ||
| 80 | + | ||
| 81 | +{text}""", | ||
| 82 | +} | ||
| 83 | + | ||
| 84 | + | ||
| 85 | +def _get_qwen_client(base_url: Optional[str] = None) -> Optional[OpenAI]: | ||
| 86 | + """ | ||
| 87 | + Lazily construct an OpenAI-compatible client for DashScope. | ||
| 88 | + | ||
| 89 | + Uses DASHSCOPE_API_KEY and base_url (provider config / env) to configure endpoint. | ||
| 90 | + """ | ||
| 91 | + api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | ||
| 92 | + if not api_key: | ||
| 93 | + logger.warning("DASHSCOPE_API_KEY not set; llm-based translation will be disabled") | ||
| 94 | + return None | ||
| 95 | + | ||
| 96 | + # 优先使用显式传入的 base_url,其次环境变量,最后默认地域。 | ||
| 97 | + base_url = ( | ||
| 98 | + (base_url or "").strip() | ||
| 99 | + or os.getenv("DASHSCOPE_BASE_URL") | ||
| 100 | + or DEFAULT_QWEN_BASE_URL | ||
| 101 | + ) | ||
| 102 | + | ||
| 103 | + try: | ||
| 104 | + client = OpenAI(api_key=api_key, base_url=base_url) | ||
| 105 | + return client | ||
| 106 | + except Exception as exc: | ||
| 107 | + logger.error("Failed to initialize DashScope OpenAI client: %s", exc, exc_info=True) | ||
| 108 | + return None | ||
| 109 | + | ||
| 110 | + | ||
| 111 | +def _build_prompt( | ||
| 112 | + text: str, | ||
| 113 | + target_lang: str, | ||
| 114 | + source_lang_label: str, | ||
| 115 | + target_lang_label: str, | ||
| 116 | + src_lang_code: str, | ||
| 117 | + tgt_lang_code: str, | ||
| 118 | +) -> str: | ||
| 119 | + """ | ||
| 120 | + Build translation prompt for given target language, defaulting to English template. | ||
| 121 | + """ | ||
| 122 | + key = (target_lang or "").lower() | ||
| 123 | + template = TRANSLATION_PROMPTS.get(key) or TRANSLATION_PROMPTS["en"] | ||
| 124 | + return template.format( | ||
| 125 | + source_lang=source_lang_label, | ||
| 126 | + target_lang=target_lang_label, | ||
| 127 | + src_lang_code=src_lang_code, | ||
| 128 | + tgt_lang_code=tgt_lang_code, | ||
| 129 | + text=text, | ||
| 130 | + ) | ||
| 131 | + | ||
| 132 | + | ||
| 133 | +def llm_translate( | ||
| 134 | + text: str, | ||
| 135 | + target_lang: str, | ||
| 136 | + *, | ||
| 137 | + source_lang: Optional[str] = None, | ||
| 138 | + source_lang_label: Optional[str] = None, | ||
| 139 | + target_lang_label: Optional[str] = None, | ||
| 140 | + timeout_sec: Optional[float] = None, | ||
| 141 | +) -> Optional[str]: | ||
| 142 | + """ | ||
| 143 | + Translate text with Qwen chat model using rich prompts. | ||
| 144 | + | ||
| 145 | + - 根据目标语言选择提示词,如果没匹配到则退回英文模板。 | ||
| 146 | + - 不对 text 做语言检测或缓存,调用方自行控制。 | ||
| 147 | + | ||
| 148 | + Args: | ||
| 149 | + text: 原始文本 | ||
| 150 | + target_lang: 目标语言代码(如 "zh", "en") | ||
| 151 | + source_lang: 源语言代码(可选,不影响提示词选择,仅用于日志) | ||
| 152 | + source_lang_label: 源语言展示名称,用于 prompt(默认使用 source_lang) | ||
| 153 | + target_lang_label: 目标语言展示名称,用于 prompt(默认使用 target_lang) | ||
| 154 | + timeout_sec: 请求超时时间(秒,可选;当前实现中 config 的 timeout_sec 优先于本参数,两者均未配置时默认 30 秒) | ||
| 155 | + | ||
| 156 | + Returns: | ||
| 157 | + 翻译后的文本;如失败则返回 None。 | ||
| 158 | + """ | ||
| 159 | + if not text or not str(text).strip(): | ||
| 160 | + return text | ||
| 161 | + | ||
| 162 | + cfg = get_translation_config() | ||
| 163 | + provider_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {} | ||
| 164 | + | ||
| 165 | + model_name = provider_cfg.get("model") or QWEN_MODEL_NAME | ||
| 166 | + req_timeout = float(provider_cfg.get("timeout_sec") or timeout_sec or 30.0) | ||
| 167 | + base_url = (provider_cfg.get("base_url") or "").strip() or None | ||
| 168 | + | ||
| 169 | + client = _get_qwen_client(base_url=base_url) | ||
| 170 | + if not client: | ||
| 171 | + # 无法调用云端,直接回退 | ||
| 172 | + logger.warning( | ||
| 173 | + "[llm_translate] Client init failed; returning original text. " | ||
| 174 | + "text=%r target_lang=%s source_lang=%s", | ||
| 175 | + text[:80], | ||
| 176 | + target_lang, | ||
| 177 | + source_lang or "auto", | ||
| 178 | + ) | ||
| 179 | + return text | ||
| 180 | + | ||
| 181 | + tgt = (target_lang or "").lower() or "en" | ||
| 182 | + src = (source_lang or "auto").lower() | ||
| 183 | + src_label = source_lang_label or src | ||
| 184 | + tgt_label = target_lang_label or tgt | ||
| 185 | + | ||
| 186 | + prompt = _build_prompt( | ||
| 187 | + text=text, | ||
| 188 | + target_lang=tgt, | ||
| 189 | + source_lang_label=src_label, | ||
| 190 | + target_lang_label=tgt_label, | ||
| 191 | + src_lang_code=src, | ||
| 192 | + tgt_lang_code=tgt, | ||
| 193 | + ) | ||
| 194 | + | ||
| 195 | + start = time.time() | ||
| 196 | + try: | ||
| 197 | + completion = client.chat.completions.create( | ||
| 198 | + model=model_name, | ||
| 199 | + messages=[ | ||
| 200 | + { | ||
| 201 | + "role": "user", | ||
| 202 | + "content": prompt, | ||
| 203 | + } | ||
| 204 | + ], | ||
| 205 | + timeout=req_timeout, | ||
| 206 | + ) | ||
| 207 | + content = (completion.choices[0].message.content or "").strip() | ||
| 208 | + duration_ms = (time.time() - start) * 1000 | ||
| 209 | + logger.info( | ||
| 210 | + "[llm_translate] Success | model=%s src=%s tgt=%s latency=%.1fms text=%r -> %r", | ||
| 211 | + model_name, | ||
| 212 | + src, | ||
| 213 | + tgt, | ||
| 214 | + duration_ms, | ||
| 215 | + text[:80], | ||
| 216 | + content[:80], | ||
| 217 | + ) | ||
| 218 | + return content or text | ||
| 219 | + except Exception as exc: | ||
| 220 | + duration_ms = (time.time() - start) * 1000 | ||
| 221 | + logger.warning( | ||
| 222 | + "[llm_translate] Failed | model=%s src=%s tgt=%s latency=%.1fms error=%s", | ||
| 223 | + model_name, | ||
| 224 | + src, | ||
| 225 | + tgt, | ||
| 226 | + duration_ms, | ||
| 227 | + exc, | ||
| 228 | + exc_info=True, | ||
| 229 | + ) | ||
| 230 | + # 安全回退:出错时返回原文,避免中断上游流程 | ||
| 231 | + return text | ||
| 232 | + | ||
| 233 | + | ||
| 234 | +__all__ = [ | ||
| 235 | + "TRANSLATION_PROMPTS", | ||
| 236 | + "llm_translate", | ||
| 237 | +] | ||
| 238 | + |
| @@ -0,0 +1,963 @@ | @@ -0,0 +1,963 @@ | ||
| 1 | +""" | ||
| 2 | +Translation service for multi-language query support. | ||
| 3 | + | ||
| 4 | +Supports multiple translation models: | ||
| 5 | +- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model | ||
| 6 | +- DeepL: DeepL API for high-quality translations | ||
| 7 | + | ||
| 8 | +重要说明(Qwen 机翻限速): | ||
| 9 | +- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)** | ||
| 10 | +- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流 | ||
| 11 | +- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端 | ||
| 12 | + | ||
| 13 | +使用方法 (Usage): | ||
| 14 | + | ||
| 15 | +```python | ||
| 16 | +from query.qwen_mt_translate import Translator | ||
| 17 | + | ||
| 18 | +# 使用默认的 qwen 模型(推荐) | ||
| 19 | +translator = Translator() # 默认使用 qwen 模型 | ||
| 20 | + | ||
| 21 | +# 或显式指定模型 | ||
| 22 | +translator = Translator(model='qwen') # 使用 qwen 模型 | ||
| 23 | +translator = Translator(model='deepl') # 使用 DeepL 模型 | ||
| 24 | + | ||
| 25 | +# 翻译文本 | ||
| 26 | +result = translator.translate( | ||
| 27 | + text="我看到这个视频后没有笑", | ||
| 28 | + target_lang="en", | ||
| 29 | + source_lang="auto" # 自动检测源语言 | ||
| 30 | +) | ||
| 31 | +``` | ||
| 32 | + | ||
| 33 | +配置说明 (Configuration): | ||
| 34 | +- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中) | ||
| 35 | +- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中) | ||
| 36 | + | ||
| 37 | +Qwen 模型参考文档: | ||
| 38 | +- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key | ||
| 39 | +- 模型:qwen-mt-flash(快速翻译模型) | ||
| 40 | + | ||
| 41 | +DeepL 官方文档: | ||
| 42 | +https://developers.deepl.com/api-reference/translate/request-translation | ||
| 43 | +""" | ||
| 44 | + | ||
| 45 | +import os | ||
| 46 | +import requests | ||
| 47 | +import re | ||
| 48 | +import redis | ||
| 49 | +from concurrent.futures import ThreadPoolExecutor, Future | ||
| 50 | +from datetime import timedelta | ||
| 51 | +from typing import Dict, List, Optional, Union | ||
| 52 | +import logging | ||
| 53 | +import time | ||
| 54 | + | ||
| 55 | +logger = logging.getLogger(__name__) | ||
| 56 | + | ||
| 57 | +from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG | ||
| 58 | +from openai import OpenAI | ||
| 59 | + | ||
| 60 | + | ||
| 61 | +class Translator: | ||
| 62 | + """ | ||
| 63 | + Multi-language translator supporting Qwen and DeepL APIs. | ||
| 64 | + | ||
| 65 | + Default model is 'qwen' which uses Alibaba Cloud DashScope API. | ||
| 66 | + """ | ||
| 67 | +# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 | ||
| 68 | +# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | ||
| 69 | +# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 | ||
| 70 | + | ||
| 71 | + DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier | ||
| 72 | + QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" # 美国(弗吉尼亚)地域 | ||
| 73 | + # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡 | ||
| 74 | + # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | ||
| 75 | + QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型 | ||
| 76 | + | ||
| 77 | + # Language code mapping | ||
| 78 | + LANG_CODE_MAP = { | ||
| 79 | + 'zh': 'ZH', | ||
| 80 | + 'en': 'EN', | ||
| 81 | + 'ru': 'RU', | ||
| 82 | + 'ar': 'AR', | ||
| 83 | + 'ja': 'JA', | ||
| 84 | + 'es': 'ES', | ||
| 85 | + 'de': 'DE', | ||
| 86 | + 'fr': 'FR', | ||
| 87 | + 'it': 'IT', | ||
| 88 | + 'pt': 'PT', | ||
| 89 | + } | ||
| 90 | + | ||
| 91 | + def __init__( | ||
| 92 | + self, | ||
| 93 | + model: str = "qwen", | ||
| 94 | + api_key: Optional[str] = None, | ||
| 95 | + use_cache: bool = True, | ||
| 96 | + timeout: int = 10, | ||
| 97 | + glossary_id: Optional[str] = None, | ||
| 98 | + translation_context: Optional[str] = None | ||
| 99 | + ): | ||
| 100 | + """ | ||
| 101 | + Initialize translator. | ||
| 102 | + | ||
| 103 | + Args: | ||
| 104 | + model: Translation model to use. Options: 'qwen' (default) or 'deepl' | ||
| 105 | + api_key: API key for the selected model (or None to use from config/env) | ||
| 106 | + use_cache: Whether to cache translations | ||
| 107 | + timeout: Request timeout in seconds | ||
| 108 | + glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL) | ||
| 109 | + translation_context: Context hint for translation (e.g., "e-commerce", "product search") | ||
| 110 | + """ | ||
| 111 | + self.model = model.lower() | ||
| 112 | + if self.model not in ['qwen', 'deepl']: | ||
| 113 | + raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'") | ||
| 114 | + | ||
| 115 | + # Get API key from config if not provided | ||
| 116 | + if api_key is None: | ||
| 117 | + if self.model == 'qwen': | ||
| 118 | + api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | ||
| 119 | + else: # deepl | ||
| 120 | + api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY") | ||
| 121 | + | ||
| 122 | + self.api_key = api_key | ||
| 123 | + self.timeout = timeout | ||
| 124 | + self.use_cache = use_cache | ||
| 125 | + self.glossary_id = glossary_id | ||
| 126 | + self.translation_context = translation_context or "e-commerce product search" | ||
| 127 | + | ||
| 128 | + # Initialize OpenAI client for Qwen if needed | ||
| 129 | + self.qwen_client = None | ||
| 130 | + if self.model == 'qwen': | ||
| 131 | + if not self.api_key: | ||
| 132 | + logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.") | ||
| 133 | + else: | ||
| 134 | + self.qwen_client = OpenAI( | ||
| 135 | + api_key=self.api_key, | ||
| 136 | + base_url=self.QWEN_BASE_URL, | ||
| 137 | + ) | ||
| 138 | + | ||
| 139 | + # Initialize Redis cache if enabled | ||
| 140 | + if use_cache: | ||
| 141 | + try: | ||
| 142 | + self.redis_client = redis.Redis( | ||
| 143 | + host=REDIS_CONFIG.get('host', 'localhost'), | ||
| 144 | + port=REDIS_CONFIG.get('port', 6479), | ||
| 145 | + password=REDIS_CONFIG.get('password'), | ||
| 146 | + decode_responses=True, # Return str instead of bytes | ||
| 147 | + socket_timeout=REDIS_CONFIG.get('socket_timeout', 1), | ||
| 148 | + socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1), | ||
| 149 | + retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False), | ||
| 150 | + health_check_interval=10, # 避免复用坏连接 | ||
| 151 | + ) | ||
| 152 | + # Test connection | ||
| 153 | + self.redis_client.ping() | ||
| 154 | + expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360) | ||
| 155 | + self.expire_time = timedelta(days=expire_days) | ||
| 156 | + self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数 | ||
| 157 | + self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans') | ||
| 158 | + logger.info("Redis cache initialized for translations") | ||
| 159 | + except Exception as e: | ||
| 160 | + logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache") | ||
| 161 | + self.redis_client = None | ||
| 162 | + self.cache = None | ||
| 163 | + else: | ||
| 164 | + self.redis_client = None | ||
| 165 | + self.cache = None | ||
| 166 | + | ||
| 167 | + # Thread pool for async translation | ||
| 168 | + self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") | ||
| 169 | + | ||
| 170 | + def translate( | ||
| 171 | + self, | ||
| 172 | + text: str, | ||
| 173 | + target_lang: str, | ||
| 174 | + source_lang: Optional[str] = None, | ||
| 175 | + context: Optional[str] = None, | ||
| 176 | + prompt: Optional[str] = None | ||
| 177 | + ) -> Optional[str]: | ||
| 178 | + """ | ||
| 179 | + Translate text to target language (synchronous mode). | ||
| 180 | + | ||
| 181 | + Args: | ||
| 182 | + text: Text to translate | ||
| 183 | + target_lang: Target language code ('zh', 'en', 'ru', etc.) | ||
| 184 | + source_lang: Source language code (option al, auto-detect if None) | ||
| 185 | + context: Additional context for translation (overrides default context) | ||
| 186 | + prompt: Translation prompt/instruction (optional, for better translation quality) | ||
| 187 | + | ||
| 188 | + Returns: | ||
| 189 | + Translated text or None if translation fails | ||
| 190 | + """ | ||
| 191 | + if not text or not text.strip(): | ||
| 192 | + return text | ||
| 193 | + | ||
| 194 | + # Normalize language codes | ||
| 195 | + target_lang = target_lang.lower() | ||
| 196 | + if source_lang: | ||
| 197 | + source_lang = source_lang.lower() | ||
| 198 | + | ||
| 199 | + # Optimization: Skip translation if not needed | ||
| 200 | + if target_lang == 'en' and self._is_english_text(text): | ||
| 201 | + logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") | ||
| 202 | + return text | ||
| 203 | + | ||
| 204 | + if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | ||
| 205 | + logger.info( | ||
| 206 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 207 | + f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)" | ||
| 208 | + ) | ||
| 209 | + return text | ||
| 210 | + | ||
| 211 | + # Use provided context or default context | ||
| 212 | + translation_context = context or self.translation_context | ||
| 213 | + | ||
| 214 | + # Build cache key (include prompt in cache key if provided) | ||
| 215 | + cache_key_parts = [source_lang or 'auto', target_lang, translation_context] | ||
| 216 | + if prompt: | ||
| 217 | + cache_key_parts.append(prompt) | ||
| 218 | + cache_key_parts.append(text) | ||
| 219 | + cache_key = ':'.join(cache_key_parts) | ||
| 220 | + | ||
| 221 | + # Check cache (include context and prompt in cache key for accuracy) | ||
| 222 | + if self.use_cache and self.redis_client: | ||
| 223 | + cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt) | ||
| 224 | + if cached: | ||
| 225 | + logger.info( | ||
| 226 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 227 | + f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit" | ||
| 228 | + ) | ||
| 229 | + return cached | ||
| 230 | + | ||
| 231 | + # If no API key, return mock translation (for testing) | ||
| 232 | + if not self.api_key: | ||
| 233 | + logger.info( | ||
| 234 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 235 | + f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)" | ||
| 236 | + ) | ||
| 237 | + return text | ||
| 238 | + | ||
| 239 | + # Translate using selected model | ||
| 240 | + logger.info( | ||
| 241 | + f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | " | ||
| 242 | + f"Source language: {source_lang or 'auto'} | Context: {translation_context} | " | ||
| 243 | + f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation" | ||
| 244 | + ) | ||
| 245 | + | ||
| 246 | + if self.model == 'qwen': | ||
| 247 | + result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt) | ||
| 248 | + else: # deepl | ||
| 249 | + result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) | ||
| 250 | + | ||
| 251 | + # Surface translation failure to the caller instead of silently | ||
| 252 | + # masquerading the source text as a successful translation. | ||
| 253 | + if result is None: | ||
| 254 | + logger.warning( | ||
| 255 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 256 | + f"Source language: {source_lang or 'auto'} | Status: Translation failed" | ||
| 257 | + ) | ||
| 258 | + else: | ||
| 259 | + logger.info( | ||
| 260 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 261 | + f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" | ||
| 262 | + ) | ||
| 263 | + | ||
| 264 | + # Cache only successful translations. Failed attempts must not poison | ||
| 265 | + # Redis with the original text. | ||
| 266 | + if result is not None and self.use_cache and self.redis_client: | ||
| 267 | + self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) | ||
| 268 | + | ||
| 269 | + return result | ||
| 270 | + | ||
    def _translate_qwen(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using Qwen MT Flash model via Alibaba Cloud DashScope API.

        Args:
            text: Text to translate
            target_lang: Target language code ('zh', 'en', 'ru', etc.)
            source_lang: Source language code (optional, 'auto' if None)
            context: Context hint for translation (optional; accepted for
                interface symmetry with _translate_deepl but currently not
                forwarded to the API)
            prompt: Translation prompt/instruction (optional; likewise not
                forwarded — only source/target languages are sent)

        Returns:
            Translated text or None if translation fails
        """
        if not self.qwen_client:
            logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.")
            return None

        # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping:
        # ISO-style code -> English language name expected by the API.
        qwen_lang_map = {
            "en": "English",
            "zh": "Chinese",
            "zh_tw": "Traditional Chinese",
            "ru": "Russian",
            "ja": "Japanese",
            "ko": "Korean",
            "es": "Spanish",
            "fr": "French",
            "pt": "Portuguese",
            "de": "German",
            "it": "Italian",
            "th": "Thai",
            "vi": "Vietnamese",
            "id": "Indonesian",
            "ms": "Malay",
            "ar": "Arabic",
            "hi": "Hindi",
            "he": "Hebrew",
            "my": "Burmese",
            "ta": "Tamil",
            "ur": "Urdu",
            "bn": "Bengali",
            "pl": "Polish",
            "nl": "Dutch",
            "ro": "Romanian",
            "tr": "Turkish",
            "km": "Khmer",
            "lo": "Lao",
            "yue": "Cantonese",
            "cs": "Czech",
            "el": "Greek",
            "sv": "Swedish",
            "hu": "Hungarian",
            "da": "Danish",
            "fi": "Finnish",
            "uk": "Ukrainian",
            "bg": "Bulgarian",
        }

        # Convert target language; unknown codes fall back to a capitalized
        # guess (e.g. "sw" -> "Sw") — the API may reject those.
        target_lang_normalized = target_lang.lower()
        target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize())

        # Convert source language; empty/None/"auto" all mean auto-detection.
        source_lang_normalized = (source_lang or "").strip().lower()
        if not source_lang_normalized or source_lang_normalized == "auto":
            source_lang_qwen = "auto"
        else:
            source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize())

        # Prepare translation options (DashScope-specific payload)
        translation_options = {
            "source_lang": source_lang_qwen,
            "target_lang": target_lang_qwen,
        }

        # Prepare messages: the text to translate is sent as a plain user turn.
        messages = [
            {
                "role": "user",
                "content": text
            }
        ]

        start_time = time.time()
        try:
            # translation_options is not part of the standard OpenAI schema,
            # so it goes through extra_body of the OpenAI-compatible client.
            completion = self.qwen_client.chat.completions.create(
                model=self.QWEN_MODEL,
                messages=messages,
                extra_body={
                    "translation_options": translation_options
                }
            )

            translated_text = completion.choices[0].message.content.strip()
            duration_ms = (time.time() - start_time) * 1000

            logger.info(
                f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | "
                f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms"
            )
            return translated_text

        except Exception as e:
            # Any failure (network, API error, missing/None content) is logged
            # and reported as None so the caller can react.
            duration_ms = (time.time() - start_time) * 1000
            logger.error(
                f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | "
                f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True
            )
            return None
| 389 | + | ||
    def _translate_deepl(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using DeepL API with context and glossary support.

        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code (optional)
            context: Context hint for translation (e.g., "e-commerce product search")
            prompt: Optional instruction; when provided it replaces ``context``
                as DeepL's ``context`` parameter (influences the translation
                but is not translated itself)

        Returns:
            Translated text, or None on non-200 responses, timeouts or errors
        """
        # Map to DeepL language codes
        target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())

        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        # Use prompt as context parameter for DeepL API (not as text prefix)
        # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"
        # If prompt is provided, use it as context; otherwise use the default context
        api_context = prompt if prompt else context

        # For e-commerce, add context words to help DeepL understand the domain
        # This is especially important for single-word ambiguous terms like "车" (car vs rook)
        text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)

        payload = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }

        if source_lang:
            source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
            payload["source_lang"] = source_code

        # Add context parameter (prompt or default context)
        # Context influences translation but is not translated itself
        if api_context:
            payload["context"] = api_context

        # Add glossary if configured
        if self.glossary_id:
            payload["glossary_id"] = self.glossary_id

        # Note: DeepL API v2 supports "context" parameter for additional context
        # that influences translation but is not translated itself.
        # We use prompt as context parameter when provided.

        try:
            response = requests.post(
                self.DEEPL_API_URL,
                headers=headers,
                json=payload,
                timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()
                if "translations" in data and len(data["translations"]) > 0:
                    translated_text = data["translations"][0]["text"]
                    # If we added context, extract just the term from the result
                    if needs_extraction:
                        translated_text = self._extract_term_from_translation(
                            translated_text, text, target_code
                        )
                    logger.debug(
                        f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "
                        f"Translation result: '{translated_text}'"
                    )
                    return translated_text
                # NOTE(review): a 200 response without a non-empty "translations"
                # array falls through and returns None implicitly, without any
                # log line — confirm whether that case deserves an error log.
            else:
                logger.error(
                    f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "
                    f"Status code: {response.status_code} | Error message: {response.text}"
                )
                return None

        except requests.Timeout:
            logger.warning(
                f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "
                f"Timeout: {self.timeout}s"
            )
            return None
        except Exception as e:
            logger.error(
                f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "
                f"Error: {e}", exc_info=True
            )
            return None
| 487 | + | ||
| 488 | + # NOTE: _translate_deepl_free is intentionally not implemented. | ||
| 489 | + # We do not support automatic fallback to the free endpoint, to avoid | ||
| 490 | + # mixing Pro keys with https://api-free.deepl.com and related 403 errors. | ||
| 491 | + | ||
| 492 | + def translate_multi( | ||
| 493 | + self, | ||
| 494 | + text: str, | ||
| 495 | + target_langs: List[str], | ||
| 496 | + source_lang: Optional[str] = None, | ||
| 497 | + context: Optional[str] = None, | ||
| 498 | + async_mode: bool = True, | ||
| 499 | + prompt: Optional[str] = None | ||
| 500 | + ) -> Dict[str, Optional[str]]: | ||
| 501 | + """ | ||
| 502 | + Translate text to multiple target languages. | ||
| 503 | + | ||
| 504 | + In async_mode=True (default): | ||
| 505 | + - Returns cached translations immediately if available | ||
| 506 | + - For translations that can be optimized (e.g., pure numbers, already in target language), | ||
| 507 | + returns result immediately via synchronous call | ||
| 508 | + - Launches async tasks for other missing translations (non-blocking) | ||
| 509 | + - Returns None for missing translations that require async processing | ||
| 510 | + | ||
| 511 | + In async_mode=False: | ||
| 512 | + - Waits for all translations to complete (blocking) | ||
| 513 | + | ||
| 514 | + Args: | ||
| 515 | + text: Text to translate | ||
| 516 | + target_langs: List of target language codes | ||
| 517 | + source_lang: Source language code (optional) | ||
| 518 | + context: Context hint for translation (optional) | ||
| 519 | + async_mode: If True, return cached results immediately and translate missing ones async | ||
| 520 | + prompt: Translation prompt/instruction (optional) | ||
| 521 | + | ||
| 522 | + Returns: | ||
| 523 | + Dictionary mapping language code to translated text (only cached results in async mode) | ||
| 524 | + """ | ||
| 525 | + results = {} | ||
| 526 | + missing_langs = [] | ||
| 527 | + async_langs = [] | ||
| 528 | + | ||
| 529 | + # First, get cached translations | ||
| 530 | + for lang in target_langs: | ||
| 531 | + cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | ||
| 532 | + if cached is not None: | ||
| 533 | + results[lang] = cached | ||
| 534 | + else: | ||
| 535 | + missing_langs.append(lang) | ||
| 536 | + | ||
| 537 | + # If async mode and there are missing translations | ||
| 538 | + if async_mode and missing_langs: | ||
| 539 | + # Check if translation can be optimized (immediate return) | ||
| 540 | + for lang in missing_langs: | ||
| 541 | + target_lang = lang.lower() | ||
| 542 | + # Check optimization conditions (same as in translate method) | ||
| 543 | + can_optimize = False | ||
| 544 | + if target_lang == 'en' and self._is_english_text(text): | ||
| 545 | + can_optimize = True | ||
| 546 | + elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | ||
| 547 | + can_optimize = True | ||
| 548 | + | ||
| 549 | + if can_optimize: | ||
| 550 | + # Can be optimized, call translate synchronously for immediate result | ||
| 551 | + results[lang] = self.translate(text, lang, source_lang, context, prompt) | ||
| 552 | + else: | ||
| 553 | + # Requires actual translation, add to async list | ||
| 554 | + async_langs.append(lang) | ||
| 555 | + | ||
| 556 | + # Launch async tasks for translations that require actual API calls | ||
| 557 | + if async_langs: | ||
| 558 | + for lang in async_langs: | ||
| 559 | + self._translate_async(text, lang, source_lang, context, prompt) | ||
| 560 | + # Return None for async translations | ||
| 561 | + for lang in async_langs: | ||
| 562 | + results[lang] = None | ||
| 563 | + else: | ||
| 564 | + # Synchronous mode: wait for all translations | ||
| 565 | + for lang in missing_langs: | ||
| 566 | + results[lang] = self.translate(text, lang, source_lang, context, prompt) | ||
| 567 | + | ||
| 568 | + return results | ||
| 569 | + | ||
| 570 | + def translate_multi_async( | ||
| 571 | + self, | ||
| 572 | + text: str, | ||
| 573 | + target_langs: List[str], | ||
| 574 | + source_lang: Optional[str] = None, | ||
| 575 | + context: Optional[str] = None, | ||
| 576 | + prompt: Optional[str] = None | ||
| 577 | + ) -> Dict[str, Union[str, Future]]: | ||
| 578 | + """ | ||
| 579 | + Translate text to multiple target languages asynchronously, returning Futures that can be awaited. | ||
| 580 | + | ||
| 581 | + This method returns a dictionary where: | ||
| 582 | + - If translation is cached, the value is the translation string (immediate) | ||
| 583 | + - If translation needs to be done, the value is a Future object that can be awaited | ||
| 584 | + | ||
| 585 | + Args: | ||
| 586 | + text: Text to translate | ||
| 587 | + target_langs: List of target language codes | ||
| 588 | + source_lang: Source language code (optional) | ||
| 589 | + context: Context hint for translation (optional) | ||
| 590 | + prompt: Translation prompt/instruction (optional) | ||
| 591 | + | ||
| 592 | + Returns: | ||
| 593 | + Dictionary mapping language code to either translation string (cached) or Future object | ||
| 594 | + """ | ||
| 595 | + results = {} | ||
| 596 | + missing_langs = [] | ||
| 597 | + | ||
| 598 | + # First, get cached translations | ||
| 599 | + for lang in target_langs: | ||
| 600 | + cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | ||
| 601 | + if cached is not None: | ||
| 602 | + results[lang] = cached | ||
| 603 | + else: | ||
| 604 | + missing_langs.append(lang) | ||
| 605 | + | ||
| 606 | + # For missing translations, submit async tasks and return Futures | ||
| 607 | + for lang in missing_langs: | ||
| 608 | + future = self.executor.submit( | ||
| 609 | + self.translate, | ||
| 610 | + text, | ||
| 611 | + lang, | ||
| 612 | + source_lang, | ||
| 613 | + context, | ||
| 614 | + prompt | ||
| 615 | + ) | ||
| 616 | + results[lang] = future | ||
| 617 | + | ||
| 618 | + return results | ||
| 619 | + | ||
| 620 | + def _get_cached_translation( | ||
| 621 | + self, | ||
| 622 | + text: str, | ||
| 623 | + target_lang: str, | ||
| 624 | + source_lang: Optional[str] = None, | ||
| 625 | + context: Optional[str] = None, | ||
| 626 | + prompt: Optional[str] = None | ||
| 627 | + ) -> Optional[str]: | ||
| 628 | + """Get translation from cache if available.""" | ||
| 629 | + if not self.redis_client: | ||
| 630 | + return None | ||
| 631 | + return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | ||
| 632 | + | ||
    def _get_cached_translation_redis(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Get translation from Redis cache with sliding expiration.

        Sliding expiration: every cache hit resets the key's TTL to the
        configured expiration (``translation_cache_expire_days``), so an entry
        expires only after N days *without* access rather than N days after it
        was written. This keeps frequently used translations cached.

        Note: ``source_lang``, ``context`` and ``prompt`` are accepted for
        interface symmetry but deliberately excluded from the cache key to
        maximize hit rates (see key construction below).
        """
        if not self.redis_client:
            return None

        try:
            # Build cache key: prefix:target_lang:text
            # For simplicity, we use target_lang and text as key
            # Context and prompt are not included in key to maximize cache hits
            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
            value = self.redis_client.get(cache_key)
            # NOTE(review): a cached empty string is treated as a miss here —
            # confirm that empty translations are never written to the cache.
            if value:
                # Sliding expiration: reset the TTL on every read so the entry
                # only expires N days after its *last* access.
                try:
                    self.redis_client.expire(cache_key, self.expire_seconds)
                except Exception as expire_error:
                    # Even if the TTL refresh fails, still serve the cached value.
                    logger.warning(
                        f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}"
                    )

                logger.debug(
                    f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "
                    f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s"
                )
                return value
            logger.debug(
                f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "
                f"Cache key: {cache_key}"
            )
            return None
        except Exception as e:
            logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}")
            return None
| 681 | + | ||
| 682 | + def _set_cached_translation_redis( | ||
| 683 | + self, | ||
| 684 | + text: str, | ||
| 685 | + target_lang: str, | ||
| 686 | + translation: str, | ||
| 687 | + source_lang: Optional[str] = None, | ||
| 688 | + context: Optional[str] = None, | ||
| 689 | + prompt: Optional[str] = None | ||
| 690 | + ) -> None: | ||
| 691 | + """Store translation in Redis cache.""" | ||
| 692 | + if not self.redis_client: | ||
| 693 | + return | ||
| 694 | + | ||
| 695 | + try: | ||
| 696 | + cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" | ||
| 697 | + self.redis_client.setex(cache_key, self.expire_seconds, translation) | ||
| 698 | + logger.info( | ||
| 699 | + f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | " | ||
| 700 | + f"Cache key: {cache_key} | Translation result: '{translation}'" | ||
| 701 | + ) | ||
| 702 | + except Exception as e: | ||
| 703 | + logger.error( | ||
| 704 | + f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | " | ||
| 705 | + f"Error: {e}" | ||
| 706 | + ) | ||
| 707 | + | ||
| 708 | + def _translate_async( | ||
| 709 | + self, | ||
| 710 | + text: str, | ||
| 711 | + target_lang: str, | ||
| 712 | + source_lang: Optional[str] = None, | ||
| 713 | + context: Optional[str] = None, | ||
| 714 | + prompt: Optional[str] = None | ||
| 715 | + ): | ||
| 716 | + """Launch async translation task.""" | ||
| 717 | + def _do_translate(): | ||
| 718 | + try: | ||
| 719 | + result = self.translate(text, target_lang, source_lang, context, prompt) | ||
| 720 | + if result: | ||
| 721 | + logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}") | ||
| 722 | + except Exception as e: | ||
| 723 | + logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}") | ||
| 724 | + | ||
| 725 | + self.executor.submit(_do_translate) | ||
| 726 | + | ||
| 727 | + def _add_ecommerce_context( | ||
| 728 | + self, | ||
| 729 | + text: str, | ||
| 730 | + source_lang: Optional[str], | ||
| 731 | + context: Optional[str] | ||
| 732 | + ) -> tuple: | ||
| 733 | + """ | ||
| 734 | + Add e-commerce context to text for better disambiguation. | ||
| 735 | + | ||
| 736 | + For single-word ambiguous Chinese terms, we add context words that help | ||
| 737 | + DeepL understand this is an e-commerce/product search context. | ||
| 738 | + | ||
| 739 | + Args: | ||
| 740 | + text: Original text to translate | ||
| 741 | + source_lang: Source language code | ||
| 742 | + context: Context hint | ||
| 743 | + | ||
| 744 | + Returns: | ||
| 745 | + Tuple of (text_with_context, needs_extraction) | ||
| 746 | + - text_with_context: Text to send to DeepL | ||
| 747 | + - needs_extraction: Whether we need to extract the term from the result | ||
| 748 | + """ | ||
| 749 | + # Only apply for e-commerce context and Chinese source | ||
| 750 | + if not context or "e-commerce" not in context.lower(): | ||
| 751 | + return text, False | ||
| 752 | + | ||
| 753 | + if not source_lang or source_lang.lower() != 'zh': | ||
| 754 | + return text, False | ||
| 755 | + | ||
| 756 | + # For single-word queries, add context to help disambiguation | ||
| 757 | + text_stripped = text.strip() | ||
| 758 | + if len(text_stripped.split()) == 1 and len(text_stripped) <= 2: | ||
| 759 | + # Common ambiguous Chinese e-commerce terms like "车" (car vs rook) | ||
| 760 | + # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term]) | ||
| 761 | + # This helps DeepL understand the e-commerce context | ||
| 762 | + # We'll need to extract just the term from the translation result | ||
| 763 | + context_phrase = f"购买 {text_stripped}" | ||
| 764 | + return context_phrase, True | ||
| 765 | + | ||
| 766 | + # For multi-word queries, DeepL usually has enough context | ||
| 767 | + return text, False | ||
| 768 | + | ||
| 769 | + def _extract_term_from_translation( | ||
| 770 | + self, | ||
| 771 | + translated_text: str, | ||
| 772 | + original_text: str, | ||
| 773 | + target_lang_code: str | ||
| 774 | + ) -> str: | ||
| 775 | + """ | ||
| 776 | + Extract the actual term from a translation that included context. | ||
| 777 | + | ||
| 778 | + For example, if we translated "购买 车" (buy car) and got "buy car", | ||
| 779 | + we want to extract just "car". | ||
| 780 | + | ||
| 781 | + Args: | ||
| 782 | + translated_text: Full translation result | ||
| 783 | + original_text: Original single-word query | ||
| 784 | + target_lang_code: Target language code (EN, ZH, etc.) | ||
| 785 | + | ||
| 786 | + Returns: | ||
| 787 | + Extracted term or original translation if extraction fails | ||
| 788 | + """ | ||
| 789 | + # For English target, try to extract the last word (the actual term) | ||
| 790 | + if target_lang_code == "EN": | ||
| 791 | + words = translated_text.strip().split() | ||
| 792 | + if len(words) > 1: | ||
| 793 | + # Usually the last word is the term we want | ||
| 794 | + # But we need to be smart - if it's "buy car", we want "car" | ||
| 795 | + # Common context words to skip: buy, purchase, product, item, etc. | ||
| 796 | + context_words = {"buy", "purchase", "product", "item", "commodity", "goods"} | ||
| 797 | + # Try to find the term (not a context word) | ||
| 798 | + for word in reversed(words): | ||
| 799 | + word_lower = word.lower().rstrip('.,!?;:') | ||
| 800 | + if word_lower not in context_words: | ||
| 801 | + return word_lower | ||
| 802 | + # If all words are context words, return the last one | ||
| 803 | + return words[-1].lower().rstrip('.,!?;:') | ||
| 804 | + | ||
| 805 | + # For other languages or if extraction fails, return as-is | ||
| 806 | + # The user can configure a glossary for better results | ||
| 807 | + return translated_text | ||
| 808 | + | ||
| 809 | + def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool: | ||
| 810 | + """True if shop language matches index language (use source, no translate).""" | ||
| 811 | + if not shop_lang_lower or not lang_code: | ||
| 812 | + return False | ||
| 813 | + if shop_lang_lower == lang_code: | ||
| 814 | + return True | ||
| 815 | + if lang_code == "zh" and "zh" in shop_lang_lower: | ||
| 816 | + return True | ||
| 817 | + if lang_code == "en" and "en" in shop_lang_lower: | ||
| 818 | + return True | ||
| 819 | + return False | ||
| 820 | + | ||
| 821 | + def translate_for_indexing( | ||
| 822 | + self, | ||
| 823 | + text: str, | ||
| 824 | + shop_language: str, | ||
| 825 | + source_lang: Optional[str] = None, | ||
| 826 | + context: Optional[str] = None, | ||
| 827 | + prompt: Optional[str] = None, | ||
| 828 | + index_languages: Optional[List[str]] = None, | ||
| 829 | + ) -> Dict[str, Optional[str]]: | ||
| 830 | + """ | ||
| 831 | + Translate text for indexing based on shop language and tenant index_languages. | ||
| 832 | + | ||
| 833 | + For each language in index_languages: use source text if shop language matches, | ||
| 834 | + otherwise translate to that language. | ||
| 835 | + | ||
| 836 | + Args: | ||
| 837 | + text: Text to translate | ||
| 838 | + shop_language: Shop primary language (e.g. 'zh', 'en', 'ru') | ||
| 839 | + source_lang: Source language code (optional) | ||
| 840 | + context: Additional context for translation (optional) | ||
| 841 | + prompt: Translation prompt (optional) | ||
| 842 | + index_languages: Languages to index (from tenant_config). Default ["en", "zh"]. | ||
| 843 | + | ||
| 844 | + Returns: | ||
| 845 | + Dict keyed by each index_language with translated or source text (or None). | ||
| 846 | + """ | ||
| 847 | + langs = index_languages if index_languages else ["en", "zh"] | ||
| 848 | + results = {lang: None for lang in langs} | ||
| 849 | + if not text or not text.strip(): | ||
| 850 | + return results | ||
| 851 | + if re.match(r'^[\d\s_-]+$', text): | ||
| 852 | + logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'") | ||
| 853 | + return results | ||
| 854 | + | ||
| 855 | + shop_lang_lower = (shop_language or "").strip().lower() | ||
| 856 | + targets = [] | ||
| 857 | + for lang in langs: | ||
| 858 | + if self._shop_lang_matches(shop_lang_lower, lang): | ||
| 859 | + results[lang] = text | ||
| 860 | + else: | ||
| 861 | + targets.append(lang) | ||
| 862 | + | ||
| 863 | + for target_lang in targets: | ||
| 864 | + cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | ||
| 865 | + if cached: | ||
| 866 | + results[target_lang] = cached | ||
| 867 | + logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}") | ||
| 868 | + continue | ||
| 869 | + translated = self.translate( | ||
| 870 | + text, | ||
| 871 | + target_lang=target_lang, | ||
| 872 | + source_lang=source_lang or shop_language, | ||
| 873 | + context=context, | ||
| 874 | + prompt=prompt, | ||
| 875 | + ) | ||
| 876 | + results[target_lang] = translated | ||
| 877 | + return results | ||
| 878 | + | ||
| 879 | + def get_translation_needs( | ||
| 880 | + self, | ||
| 881 | + detected_lang: str, | ||
| 882 | + supported_langs: List[str] | ||
| 883 | + ) -> List[str]: | ||
| 884 | + """ | ||
| 885 | + Determine which languages need translation. | ||
| 886 | + | ||
| 887 | + Args: | ||
| 888 | + detected_lang: Detected query language | ||
| 889 | + supported_langs: List of supported languages | ||
| 890 | + | ||
| 891 | + Returns: | ||
| 892 | + List of language codes to translate to | ||
| 893 | + """ | ||
| 894 | + # If detected language is in supported list, translate to others | ||
| 895 | + if detected_lang in supported_langs: | ||
| 896 | + return [lang for lang in supported_langs if detected_lang != lang] | ||
| 897 | + | ||
| 898 | + # Otherwise, translate to all supported languages | ||
| 899 | + return supported_langs | ||
| 900 | + | ||
| 901 | + def _is_english_text(self, text: str) -> bool: | ||
| 902 | + """ | ||
| 903 | + Check if text is primarily English (ASCII letters, numbers, common punctuation). | ||
| 904 | + | ||
| 905 | + Args: | ||
| 906 | + text: Text to check | ||
| 907 | + | ||
| 908 | + Returns: | ||
| 909 | + True if text appears to be English | ||
| 910 | + """ | ||
| 911 | + if not text or not text.strip(): | ||
| 912 | + return True | ||
| 913 | + | ||
| 914 | + # Remove whitespace and common punctuation | ||
| 915 | + text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) | ||
| 916 | + if not text_clean: | ||
| 917 | + return True | ||
| 918 | + | ||
| 919 | + # Check if all remaining characters are ASCII (letters, numbers) | ||
| 920 | + # This is a simple heuristic: if most characters are ASCII, it's likely English | ||
| 921 | + ascii_count = sum(1 for c in text_clean if ord(c) < 128) | ||
| 922 | + ratio = ascii_count / len(text_clean) if text_clean else 0 | ||
| 923 | + | ||
| 924 | + # If more than 80% are ASCII characters, consider it English | ||
| 925 | + return ratio > 0.8 | ||
| 926 | + | ||
| 927 | + def _contains_chinese(self, text: str) -> bool: | ||
| 928 | + """ | ||
| 929 | + Check if text contains Chinese characters (Han characters). | ||
| 930 | + | ||
| 931 | + Args: | ||
| 932 | + text: Text to check | ||
| 933 | + | ||
| 934 | + Returns: | ||
| 935 | + True if text contains Chinese characters | ||
| 936 | + """ | ||
| 937 | + if not text: | ||
| 938 | + return False | ||
| 939 | + | ||
| 940 | + # Check for Chinese characters (Unicode range: \u4e00-\u9fff) | ||
| 941 | + chinese_pattern = re.compile(r'[\u4e00-\u9fff]') | ||
| 942 | + return bool(chinese_pattern.search(text)) | ||
| 943 | + | ||
| 944 | + def _is_pure_number(self, text: str) -> bool: | ||
| 945 | + """ | ||
| 946 | + Check if text is purely numeric (digits, possibly with spaces, dots, commas). | ||
| 947 | + | ||
| 948 | + Args: | ||
| 949 | + text: Text to check | ||
| 950 | + | ||
| 951 | + Returns: | ||
| 952 | + True if text is purely numeric | ||
| 953 | + """ | ||
| 954 | + if not text or not text.strip(): | ||
| 955 | + return False | ||
| 956 | + | ||
| 957 | + # Remove whitespace, dots, commas (common number separators) | ||
| 958 | + text_clean = re.sub(r'[\s\.,]', '', text.strip()) | ||
| 959 | + if not text_clean: | ||
| 960 | + return False | ||
| 961 | + | ||
| 962 | + # Check if all remaining characters are digits | ||
| 963 | + return text_clean.isdigit() |
query/test_translation.py
| @@ -19,7 +19,7 @@ from pathlib import Path | @@ -19,7 +19,7 @@ from pathlib import Path | ||
| 19 | sys.path.insert(0, str(Path(__file__).parent.parent)) | 19 | sys.path.insert(0, str(Path(__file__).parent.parent)) |
| 20 | 20 | ||
| 21 | from config import ConfigLoader | 21 | from config import ConfigLoader |
| 22 | -from query.translator import Translator | 22 | +from query.qwen_mt_translate import Translator |
| 23 | import logging | 23 | import logging |
| 24 | 24 | ||
| 25 | # Configure logging | 25 | # Configure logging |
query/translator.py
| @@ -13,7 +13,7 @@ Supports multiple translation models: | @@ -13,7 +13,7 @@ Supports multiple translation models: | ||
| 13 | 使用方法 (Usage): | 13 | 使用方法 (Usage): |
| 14 | 14 | ||
| 15 | ```python | 15 | ```python |
| 16 | -from query.translator import Translator | 16 | +from query.qwen_mt_translate import Translator |
| 17 | 17 | ||
| 18 | # 使用默认的 qwen 模型(推荐) | 18 | # 使用默认的 qwen 模型(推荐) |
| 19 | translator = Translator() # 默认使用 qwen 模型 | 19 | translator = Translator() # 默认使用 qwen 模型 |
tests/test_translator_failure_semantics.py