Commit a0a173ae904212171b324f0976f034f6528ff749
1 parent
985752f5
last
Showing
12 changed files
with
1393 additions
and
8 deletions
Show diff stats
api/translator_app.py
| ... | ... | @@ -97,7 +97,7 @@ from pydantic import BaseModel, Field |
| 97 | 97 | # Add parent directory to path |
| 98 | 98 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| 99 | 99 | |
| 100 | -from query.translator import Translator | |
| 100 | +from query.qwen_mt_translate import Translator | |
| 101 | 101 | from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG |
| 102 | 102 | |
| 103 | 103 | # Configure logging | ... | ... |
config/config.yaml
| ... | ... | @@ -119,7 +119,7 @@ rerank: |
| 119 | 119 | # 可扩展服务/provider 注册表(单一配置源) |
| 120 | 120 | services: |
| 121 | 121 | translation: |
| 122 | - provider: "direct" # direct | http | google(reserved) | |
| 122 | + provider: "llm" # direct | http | google(reserved) | |
| 123 | 123 | base_url: "http://127.0.0.1:6006" |
| 124 | 124 | model: "qwen" |
| 125 | 125 | timeout_sec: 10.0 |
| ... | ... | @@ -130,6 +130,12 @@ services: |
| 130 | 130 | base_url: "http://127.0.0.1:6006" |
| 131 | 131 | model: "qwen" |
| 132 | 132 | timeout_sec: 10.0 |
| 133 | + llm: | |
| 134 | + model: "qwen-flash" | |
| 135 | + # 可选:覆盖 DashScope 兼容模式的 Endpoint 与超时 | |
| 136 | + # base_url 留空则使用 DASHSCOPE_BASE_URL 或默认地域 | |
| 137 | + base_url: "" | |
| 138 | + timeout_sec: 30.0 | |
| 133 | 139 | google: |
| 134 | 140 | enabled: false |
| 135 | 141 | project_id: "" | ... | ... |
docs/系统设计文档.md
| ... | ... | @@ -384,6 +384,15 @@ query_config: |
| 384 | 384 | |
| 385 | 385 | 实际代码中,通过通用的 translation provider 抽象来选择具体后端和模型,文档不固定绑定某一个具体翻译服务或模型名称,以保持可配置性。 |
| 386 | 386 | |
| 387 | +此外,为了支持**高质量、提示词可控的 LLM 翻译**(例如商品富化脚本、离线分析工具),在 `query/llm_translate.py` 中提供了一个独立的 LLM 翻译辅助模块: | |
| 388 | + | |
| 389 | +- **配置入口**:`config/config.yaml -> services.translation.providers.llm`,用于指定: | |
| 390 | + - `model`: 例如 `qwen-flash`(DashScope 兼容模式的对话模型) | |
| 391 | + - `base_url`: 可选;为空时使用环境变量 `DASHSCOPE_BASE_URL` 或默认 Endpoint | |
| 392 | + - `timeout_sec`: LLM 调用超时 | |
| 393 | +- **环境变量**:仍通过 `DASHSCOPE_API_KEY` 注入 DashScope API Key。 | |
| 394 | +- **使用方式**:主查询路径继续使用 machine translation(`query.qwen_mt_translate.Translator`),只在需要更强表达控制的场景(如批量标注、产品分类脚本)中显式调用 `llm_translate()`。 | 
| 395 | + | |
| 387 | 396 | #### 功能特性 |
| 388 | 397 | 1. **语言检测**:自动检测查询语言 |
| 389 | 398 | 2. **智能翻译**: | ... | ... |
indexer/document_transformer.py
| ... | ... | @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) |
| 20 | 20 | |
| 21 | 21 | # Try to import translator (optional dependency) |
| 22 | 22 | try: |
| 23 | - from query.translator import Translator | |
| 23 | + from query.qwen_mt_translate import Translator | |
| 24 | 24 | TRANSLATOR_AVAILABLE = True |
| 25 | 25 | except ImportError: |
| 26 | 26 | TRANSLATOR_AVAILABLE = False | ... | ... |
indexer/test_indexing.py
| ... | ... | @@ -273,7 +273,7 @@ def test_document_transformer(): |
| 273 | 273 | tenant_config = tenant_config_loader.get_tenant_config('162') |
| 274 | 274 | |
| 275 | 275 | # 初始化翻译器(测试环境总是启用,具体翻译方向由tenant_config控制) |
| 276 | - from query.translator import Translator | |
| 276 | + from query.qwen_mt_translate import Translator | |
| 277 | 277 | translator = Translator( |
| 278 | 278 | api_key=config.query_config.translation_api_key, |
| 279 | 279 | use_cache=True | ... | ... |
providers/translation.py
| ... | ... | @@ -0,0 +1,169 @@ |
| 1 | +""" | |
| 2 | +Translation provider - direct (in-process) or HTTP service. | |
| 3 | +""" | |
| 4 | +from __future__ import annotations | |
| 5 | + | |
| 6 | +import logging | |
| 7 | +from typing import Any, Dict, List, Optional, Union | |
| 8 | + | |
| 9 | +from concurrent.futures import Future, ThreadPoolExecutor | |
| 10 | +import requests | |
| 11 | + | |
| 12 | +from config.services_config import get_translation_config, get_translation_base_url | |
| 13 | + | |
| 14 | +logger = logging.getLogger(__name__) | |
| 15 | + | |
| 16 | + | |
| 17 | +class HttpTranslationProvider: | |
| 18 | + """Translation via HTTP service.""" | |
| 19 | + | |
| 20 | + def __init__( | |
| 21 | + self, | |
| 22 | + base_url: str, | |
| 23 | + model: str = "qwen", | |
| 24 | + timeout_sec: float = 10.0, | |
| 25 | + translation_context: Optional[str] = None, | |
| 26 | + ): | |
| 27 | + self.base_url = (base_url or "").rstrip("/") | |
| 28 | + self.model = model or "qwen" | |
| 29 | + self.timeout_sec = float(timeout_sec or 10.0) | |
| 30 | + self.translation_context = translation_context or "e-commerce product search" | |
| 31 | + self.executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="http-translator") | |
| 32 | + | |
| 33 | + def _translate_once( | |
| 34 | + self, | |
| 35 | + text: str, | |
| 36 | + target_lang: str, | |
| 37 | + source_lang: Optional[str] = None, | |
| 38 | + ) -> Optional[str]: | |
| 39 | + if not text or not str(text).strip(): | |
| 40 | + return text | |
| 41 | + try: | |
| 42 | + url = f"{self.base_url}/translate" | |
| 43 | + payload = { | |
| 44 | + "text": text, | |
| 45 | + "target_lang": target_lang, | |
| 46 | + "source_lang": source_lang or "auto", | |
| 47 | + "model": self.model, | |
| 48 | + } | |
| 49 | + response = requests.post(url, json=payload, timeout=self.timeout_sec) | |
| 50 | + if response.status_code != 200: | |
| 51 | + logger.warning( | |
| 52 | + "HTTP translator failed: status=%s body=%s", | |
| 53 | + response.status_code, | |
| 54 | + (response.text or "")[:200], | |
| 55 | + ) | |
| 56 | + return None | |
| 57 | + data = response.json() | |
| 58 | + translated = data.get("translated_text") | |
| 59 | + return translated if translated is not None else None | |
| 60 | + except Exception as exc: | |
| 61 | + logger.warning("HTTP translator request failed: %s", exc, exc_info=True) | |
| 62 | + return None | |
| 63 | + | |
| 64 | + def translate( | |
| 65 | + self, | |
| 66 | + text: str, | |
| 67 | + target_lang: str, | |
| 68 | + source_lang: Optional[str] = None, | |
| 69 | + context: Optional[str] = None, | |
| 70 | + prompt: Optional[str] = None, | |
| 71 | + ) -> Optional[str]: | |
| 72 | + del context, prompt | |
| 73 | + result = self._translate_once(text=text, target_lang=target_lang, source_lang=source_lang) | |
| 74 | + return result if result is not None else text | |
| 75 | + | |
| 76 | + def translate_multi( | |
| 77 | + self, | |
| 78 | + text: str, | |
| 79 | + target_langs: List[str], | |
| 80 | + source_lang: Optional[str] = None, | |
| 81 | + context: Optional[str] = None, | |
| 82 | + async_mode: bool = True, | |
| 83 | + prompt: Optional[str] = None, | |
| 84 | + ) -> Dict[str, Optional[str]]: | |
| 85 | + del context, async_mode, prompt | |
| 86 | + out: Dict[str, Optional[str]] = {} | |
| 87 | + for lang in target_langs: | |
| 88 | + out[lang] = self.translate(text, lang, source_lang=source_lang) | |
| 89 | + return out | |
| 90 | + | |
| 91 | + def translate_multi_async( | |
| 92 | + self, | |
| 93 | + text: str, | |
| 94 | + target_langs: List[str], | |
| 95 | + source_lang: Optional[str] = None, | |
| 96 | + context: Optional[str] = None, | |
| 97 | + prompt: Optional[str] = None, | |
| 98 | + ) -> Dict[str, Union[str, Future]]: | |
| 99 | + del context, prompt | |
| 100 | + out: Dict[str, Union[str, Future]] = {} | |
| 101 | + for lang in target_langs: | |
| 102 | + out[lang] = self.executor.submit(self.translate, text, lang, source_lang) | |
| 103 | + return out | |
| 104 | + | |
| 105 | + def translate_for_indexing( | |
| 106 | + self, | |
| 107 | + text: str, | |
| 108 | + shop_language: str, | |
| 109 | + source_lang: Optional[str] = None, | |
| 110 | + context: Optional[str] = None, | |
| 111 | + prompt: Optional[str] = None, | |
| 112 | + index_languages: Optional[List[str]] = None, | |
| 113 | + ) -> Dict[str, Optional[str]]: | |
| 114 | + del context, prompt | |
| 115 | + langs = index_languages if index_languages else ["en", "zh"] | |
| 116 | + source = source_lang or shop_language or "auto" | |
| 117 | + out: Dict[str, Optional[str]] = {} | |
| 118 | + for lang in langs: | |
| 119 | + if lang == shop_language: | |
| 120 | + out[lang] = text | |
| 121 | + else: | |
| 122 | + out[lang] = self.translate(text, target_lang=lang, source_lang=source) | |
| 123 | + return out | |
| 124 | + | |
| 125 | + | |
| 126 | +def create_translation_provider(query_config: Any = None) -> Any: | |
| 127 | + """ | |
| 128 | + Create translation provider from services config. | |
| 129 | + | |
| 130 | + query_config: optional, for api_key/glossary_id/context (used by direct provider). | |
| 131 | + """ | |
| 132 | + cfg = get_translation_config() | |
| 133 | + provider = cfg.provider | |
| 134 | + pc = cfg.get_provider_cfg() | |
| 135 | + | |
| 136 | + if provider in ("direct", "local", "inprocess"): | |
| 137 | + from query.qwen_mt_translate import Translator | |
| 138 | + model = pc.get("model") or "qwen" | |
| 139 | + qc = query_config or _empty_query_config() | |
| 140 | + return Translator( | |
| 141 | + model=model, | |
| 142 | + api_key=getattr(qc, "translation_api_key", None), | |
| 143 | + use_cache=True, | |
| 144 | + glossary_id=getattr(qc, "translation_glossary_id", None), | |
| 145 | + translation_context=getattr(qc, "translation_context", "e-commerce product search"), | |
| 146 | + ) | |
| 147 | + | |
| 148 | + if provider in ("http", "service"): | |
| 149 | + base_url = get_translation_base_url() | |
| 150 | + model = pc.get("model") or "qwen" | |
| 151 | + timeout = pc.get("timeout_sec", 10.0) | |
| 152 | + qc = query_config or _empty_query_config() | |
| 153 | + return HttpTranslationProvider( | |
| 154 | + base_url=base_url, | |
| 155 | + model=model, | |
| 156 | + timeout_sec=float(timeout), | |
| 157 | + translation_context=getattr(qc, "translation_context", "e-commerce product search"), | |
| 158 | + ) | |
| 159 | + | |
| 160 | + raise ValueError(f"Unsupported translation provider: {provider}") | |
| 161 | + | |
| 162 | + | |
| 163 | +def _empty_query_config() -> Any: | |
| 164 | + """Minimal object with default translation attrs.""" | |
| 165 | + class _QC: | |
| 166 | + translation_api_key = None | |
| 167 | + translation_glossary_id = None | |
| 168 | + translation_context = "e-commerce product search" | |
| 169 | + return _QC() | ... | ... |
query/__init__.py
| 1 | 1 | """Query package initialization.""" |
| 2 | 2 | |
| 3 | 3 | from .language_detector import LanguageDetector |
| 4 | -from .translator import Translator | |
| 4 | +from .qwen_mt_translate import Translator | |
| 5 | 5 | from .query_rewriter import QueryRewriter, QueryNormalizer |
| 6 | 6 | from .query_parser import QueryParser, ParsedQuery |
| 7 | 7 | ... | ... |
query/llm_translate.py
| ... | ... | @@ -0,0 +1,238 @@ |
| 1 | +""" | |
| 2 | +LLM-based translation helper using Qwen chat model. | |
| 3 | + | |
| 4 | +This module provides a thin wrapper around DashScope's `qwen-flash` model | |
| 5 | +for high-quality, prompt-controlled translation, independent of the main | |
| 6 | +`Translator` (machine translation) pipeline. | |
| 7 | + | |
| 8 | +Usage example: | |
| 9 | + | |
| 10 | + from query.llm_translate import llm_translate | |
| 11 | + | |
| 12 | + result = llm_translate( | |
| 13 | + text="我看到这个视频后没有笑", | |
| 14 | + target_lang="en", | |
| 15 | + source_lang="zh", | |
| 16 | + source_lang_label="中文", | |
| 17 | + target_lang_label="英文", | |
| 18 | + ) | |
| 19 | +""" | |
| 20 | + | |
| 21 | +from __future__ import annotations | |
| 22 | + | |
| 23 | +import logging | |
| 24 | +import os | |
| 25 | +import time | |
| 26 | +from typing import Dict, Optional | |
| 27 | + | |
| 28 | +from openai import OpenAI | |
| 29 | + | |
| 30 | +from config.env_config import DASHSCOPE_API_KEY | |
| 31 | +from config.services_config import get_translation_config | |
| 32 | + | |
| 33 | +logger = logging.getLogger(__name__) | |
| 34 | + | |
| 35 | + | |
| 36 | +# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 | |
| 37 | +# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | |
| 38 | +# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 | |
| 39 | +# | |
| 40 | +# 默认保持与现有翻译/索引脚本相同的美国地域,可通过环境变量覆盖: | |
| 41 | +# DASHSCOPE_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1 | |
| 42 | +DEFAULT_QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" | |
| 43 | +QWEN_MODEL_NAME = "qwen-flash" | |
| 44 | + | |
| 45 | + | |
| 46 | +# 由调用方提供的语言标签/代码填充,占位符说明: | |
| 47 | +# - source_lang: 源语言的人类可读名称(按目标语言本地化,例如 "中文", "English") | |
| 48 | +# - target_lang: 目标语言的人类可读名称 | |
| 49 | +# - src_lang_code: 源语言代码,例如 "zh" | |
| 50 | +# - tgt_lang_code: 目标语言代码,例如 "en" | |
| 51 | +TRANSLATION_PROMPTS: Dict[str, str] = { | |
| 52 | + "zh": """你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译员。你的目标是在遵循 {target_lang} 的语法、词汇和文化习惯的前提下,准确传达原始 {source_lang} 文本的含义和细微差别。请只输出 {target_lang} 的翻译内容,不要包含任何额外的解释或评论。请将以下 {source_lang} 文本翻译成 {target_lang}: | |
| 53 | + | |
| 54 | +{text}""", | |
| 55 | + "en": """You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Your goal is to accurately convey the meaning and nuances of the original {source_lang} text while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities. Produce only the {target_lang} translation, without any additional explanations or commentary. Please translate the following {source_lang} text into {target_lang}: | |
| 56 | + | |
| 57 | +{text}""", | |
| 58 | + "ru": """Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Ваша задача — точно передать смысл и нюансы исходного текста на {source_lang}, соблюдая грамматику, лексику и культурные особенности {target_lang}. Выводите только перевод на {target_lang}, без каких-либо дополнительных объяснений или комментариев. Пожалуйста, переведите следующий текст с {source_lang} на {target_lang}: | |
| 59 | + | |
| 60 | +{text}""", | |
| 61 | + "ar": """أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). هدفك هو نقل المعنى والدلالات الدقيقة للنص الأصلي بلغة {source_lang} بدقة، مع الالتزام بقواعد اللغة والمفردات والحساسيات الثقافية الخاصة بلغة {target_lang}. قم بإنتاج الترجمة إلى {target_lang} فقط دون أي شروحات أو تعليقات إضافية. يرجى ترجمة النص التالي من {source_lang} إلى {target_lang}: | |
| 62 | + | |
| 63 | +{text}""", | |
| 64 | + "ja": """あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロの翻訳者です。{target_lang} の文法、語彙、文化的配慮に従いながら、元の {source_lang} テキストの意味やニュアンスを正確に伝えることが目的です。追加の説明やコメントは一切含めず、{target_lang} の翻訳のみを出力してください。次の {source_lang} テキストを {target_lang} に翻訳してください: | |
| 65 | + | |
| 66 | +{text}""", | |
| 67 | + "es": """Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Tu objetivo es transmitir con precisión el significado y los matices del texto original en {source_lang}, respetando la gramática, el vocabulario y las sensibilidades culturales de {target_lang}. Produce únicamente la traducción en {target_lang}, sin explicaciones ni comentarios adicionales. Por favor, traduce el siguiente texto de {source_lang} a {target_lang}: | |
| 68 | + | |
| 69 | +{text}""", | |
| 70 | + "de": """Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Dein Ziel ist es, die Bedeutung und Nuancen des ursprünglichen {source_lang}-Textes genau zu vermitteln und dabei die Grammatik, den Wortschatz und die kulturellen Besonderheiten von {target_lang} zu berücksichtigen. Gib ausschließlich die Übersetzung in {target_lang} aus, ohne zusätzliche Erklärungen oder Kommentare. Bitte übersetze den folgenden {source_lang}-Text in {target_lang}: | |
| 71 | + | |
| 72 | +{text}""", | |
| 73 | + "fr": """Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Votre objectif est de transmettre fidèlement le sens et les nuances du texte original en {source_lang}, tout en respectant la grammaire, le vocabulaire et les sensibilités culturelles de {target_lang}. Produisez uniquement la traduction en {target_lang}, sans explications ni commentaires supplémentaires. Veuillez traduire le texte suivant de {source_lang} vers {target_lang} : | |
| 74 | + | |
| 75 | +{text}""", | |
| 76 | + "it": """Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Il tuo obiettivo è trasmettere con precisione il significato e le sfumature del testo originale in {source_lang}, rispettando la grammatica, il vocabolario e le sensibilità culturali di {target_lang}. Produci solo la traduzione in {target_lang}, senza spiegazioni o commenti aggiuntivi. Per favore traduci il seguente testo da {source_lang} a {target_lang}: | |
| 77 | + | |
| 78 | +{text}""", | |
| 79 | + "pt": """Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Seu objetivo é transmitir com precisão o significado e as nuances do texto original em {source_lang}, respeitando a gramática, o vocabulário e as sensibilidades culturais de {target_lang}. Produza apenas a tradução em {target_lang}, sem quaisquer explicações ou comentários adicionais. Por favor, traduza o seguinte texto de {source_lang} para {target_lang}: | |
| 80 | + | |
| 81 | +{text}""", | |
| 82 | +} | |
| 83 | + | |
| 84 | + | |
| 85 | +def _get_qwen_client(base_url: Optional[str] = None) -> Optional[OpenAI]: | |
| 86 | + """ | |
| 87 | + Lazily construct an OpenAI-compatible client for DashScope. | |
| 88 | + | |
| 89 | + Uses DASHSCOPE_API_KEY and base_url (provider config / env) to configure endpoint. | |
| 90 | + """ | |
| 91 | + api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | |
| 92 | + if not api_key: | |
| 93 | + logger.warning("DASHSCOPE_API_KEY not set; llm-based translation will be disabled") | |
| 94 | + return None | |
| 95 | + | |
| 96 | + # 优先使用显式传入的 base_url,其次环境变量,最后默认地域。 | |
| 97 | + base_url = ( | |
| 98 | + (base_url or "").strip() | |
| 99 | + or os.getenv("DASHSCOPE_BASE_URL") | |
| 100 | + or DEFAULT_QWEN_BASE_URL | |
| 101 | + ) | |
| 102 | + | |
| 103 | + try: | |
| 104 | + client = OpenAI(api_key=api_key, base_url=base_url) | |
| 105 | + return client | |
| 106 | + except Exception as exc: | |
| 107 | + logger.error("Failed to initialize DashScope OpenAI client: %s", exc, exc_info=True) | |
| 108 | + return None | |
| 109 | + | |
| 110 | + | |
| 111 | +def _build_prompt( | |
| 112 | + text: str, | |
| 113 | + target_lang: str, | |
| 114 | + source_lang_label: str, | |
| 115 | + target_lang_label: str, | |
| 116 | + src_lang_code: str, | |
| 117 | + tgt_lang_code: str, | |
| 118 | +) -> str: | |
| 119 | + """ | |
| 120 | + Build translation prompt for given target language, defaulting to English template. | |
| 121 | + """ | |
| 122 | + key = (target_lang or "").lower() | |
| 123 | + template = TRANSLATION_PROMPTS.get(key) or TRANSLATION_PROMPTS["en"] | |
| 124 | + return template.format( | |
| 125 | + source_lang=source_lang_label, | |
| 126 | + target_lang=target_lang_label, | |
| 127 | + src_lang_code=src_lang_code, | |
| 128 | + tgt_lang_code=tgt_lang_code, | |
| 129 | + text=text, | |
| 130 | + ) | |
| 131 | + | |
| 132 | + | |
| 133 | +def llm_translate( | |
| 134 | + text: str, | |
| 135 | + target_lang: str, | |
| 136 | + *, | |
| 137 | + source_lang: Optional[str] = None, | |
| 138 | + source_lang_label: Optional[str] = None, | |
| 139 | + target_lang_label: Optional[str] = None, | |
| 140 | + timeout_sec: Optional[float] = None, | |
| 141 | +) -> Optional[str]: | |
| 142 | + """ | |
| 143 | + Translate text with Qwen chat model using rich prompts. | |
| 144 | + | |
| 145 | + - 根据目标语言选择提示词,如果没匹配到则退回英文模板。 | |
| 146 | + - 不对 text 做语言检测或缓存,调用方自行控制。 | |
| 147 | + | |
| 148 | + Args: | |
| 149 | + text: 原始文本 | |
| 150 | + target_lang: 目标语言代码(如 "zh", "en") | |
| 151 | + source_lang: 源语言代码(可选,不影响提示词选择,仅用于日志) | |
| 152 | + source_lang_label: 源语言展示名称,用于 prompt(默认使用 source_lang) | |
| 153 | + target_lang_label: 目标语言展示名称,用于 prompt(默认使用 target_lang) | |
| 154 | + timeout_sec: 请求超时时间(秒,可选;若未配置则从 config 读取或采用默认) | |
| 155 | + | |
| 156 | + Returns: | |
| 157 | + 翻译后的文本;如失败则返回 None。 | |
| 158 | + """ | |
| 159 | + if not text or not str(text).strip(): | |
| 160 | + return text | |
| 161 | + | |
| 162 | + cfg = get_translation_config() | |
| 163 | + provider_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {} | |
| 164 | + | |
| 165 | + model_name = provider_cfg.get("model") or QWEN_MODEL_NAME | |
| 166 | + req_timeout = float(provider_cfg.get("timeout_sec") or timeout_sec or 30.0) | |
| 167 | + base_url = (provider_cfg.get("base_url") or "").strip() or None | |
| 168 | + | |
| 169 | + client = _get_qwen_client(base_url=base_url) | |
| 170 | + if not client: | |
| 171 | + # 无法调用云端,直接回退 | |
| 172 | + logger.warning( | |
| 173 | + "[llm_translate] Client init failed; returning original text. " | |
| 174 | + "text=%r target_lang=%s source_lang=%s", | |
| 175 | + text[:80], | |
| 176 | + target_lang, | |
| 177 | + source_lang or "auto", | |
| 178 | + ) | |
| 179 | + return text | |
| 180 | + | |
| 181 | + tgt = (target_lang or "").lower() or "en" | |
| 182 | + src = (source_lang or "auto").lower() | |
| 183 | + src_label = source_lang_label or src | |
| 184 | + tgt_label = target_lang_label or tgt | |
| 185 | + | |
| 186 | + prompt = _build_prompt( | |
| 187 | + text=text, | |
| 188 | + target_lang=tgt, | |
| 189 | + source_lang_label=src_label, | |
| 190 | + target_lang_label=tgt_label, | |
| 191 | + src_lang_code=src, | |
| 192 | + tgt_lang_code=tgt, | |
| 193 | + ) | |
| 194 | + | |
| 195 | + start = time.time() | |
| 196 | + try: | |
| 197 | + completion = client.chat.completions.create( | |
| 198 | + model=model_name, | |
| 199 | + messages=[ | |
| 200 | + { | |
| 201 | + "role": "user", | |
| 202 | + "content": prompt, | |
| 203 | + } | |
| 204 | + ], | |
| 205 | + timeout=req_timeout, | |
| 206 | + ) | |
| 207 | + content = (completion.choices[0].message.content or "").strip() | |
| 208 | + duration_ms = (time.time() - start) * 1000 | |
| 209 | + logger.info( | |
| 210 | + "[llm_translate] Success | model=%s src=%s tgt=%s latency=%.1fms text=%r -> %r", | |
| 211 | + model_name, | |
| 212 | + src, | |
| 213 | + tgt, | |
| 214 | + duration_ms, | |
| 215 | + text[:80], | |
| 216 | + content[:80], | |
| 217 | + ) | |
| 218 | + return content or text | |
| 219 | + except Exception as exc: | |
| 220 | + duration_ms = (time.time() - start) * 1000 | |
| 221 | + logger.warning( | |
| 222 | + "[llm_translate] Failed | model=%s src=%s tgt=%s latency=%.1fms error=%s", | |
| 223 | + model_name, | |
| 224 | + src, | |
| 225 | + tgt, | |
| 226 | + duration_ms, | |
| 227 | + exc, | |
| 228 | + exc_info=True, | |
| 229 | + ) | |
| 230 | + # 安全回退:出错时返回原文,避免中断上游流程 | |
| 231 | + return text | |
| 232 | + | |
| 233 | + | |
| 234 | +__all__ = [ | |
| 235 | + "TRANSLATION_PROMPTS", | |
| 236 | + "llm_translate", | |
| 237 | +] | |
| 238 | + | ... | ... |
| ... | ... | @@ -0,0 +1,963 @@ |
| 1 | +""" | |
| 2 | +Translation service for multi-language query support. | |
| 3 | + | |
| 4 | +Supports multiple translation models: | |
| 5 | +- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model | |
| 6 | +- DeepL: DeepL API for high-quality translations | |
| 7 | + | |
| 8 | +重要说明(Qwen 机翻限速): | |
| 9 | +- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)** | |
| 10 | +- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流 | |
| 11 | +- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端 | |
| 12 | + | |
| 13 | +使用方法 (Usage): | |
| 14 | + | |
| 15 | +```python | |
| 16 | +from query.qwen_mt_translate import Translator | 
| 17 | + | |
| 18 | +# 使用默认的 qwen 模型(推荐) | |
| 19 | +translator = Translator() # 默认使用 qwen 模型 | |
| 20 | + | |
| 21 | +# 或显式指定模型 | |
| 22 | +translator = Translator(model='qwen') # 使用 qwen 模型 | |
| 23 | +translator = Translator(model='deepl') # 使用 DeepL 模型 | |
| 24 | + | |
| 25 | +# 翻译文本 | |
| 26 | +result = translator.translate( | |
| 27 | + text="我看到这个视频后没有笑", | |
| 28 | + target_lang="en", | |
| 29 | + source_lang="auto" # 自动检测源语言 | |
| 30 | +) | |
| 31 | +``` | |
| 32 | + | |
| 33 | +配置说明 (Configuration): | |
| 34 | +- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中) | |
| 35 | +- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中) | |
| 36 | + | |
| 37 | +Qwen 模型参考文档: | |
| 38 | +- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key | |
| 39 | +- 模型:qwen-mt-flash(快速翻译模型) | |
| 40 | + | |
| 41 | +DeepL 官方文档: | |
| 42 | +https://developers.deepl.com/api-reference/translate/request-translation | |
| 43 | +""" | |
| 44 | + | |
| 45 | +import os | |
| 46 | +import requests | |
| 47 | +import re | |
| 48 | +import redis | |
| 49 | +from concurrent.futures import ThreadPoolExecutor, Future | |
| 50 | +from datetime import timedelta | |
| 51 | +from typing import Dict, List, Optional, Union | |
| 52 | +import logging | |
| 53 | +import time | |
| 54 | + | |
| 55 | +logger = logging.getLogger(__name__) | |
| 56 | + | |
| 57 | +from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG | |
| 58 | +from openai import OpenAI | |
| 59 | + | |
| 60 | + | |
| 61 | +class Translator: | |
| 62 | + """ | |
| 63 | + Multi-language translator supporting Qwen and DeepL APIs. | |
| 64 | + | |
| 65 | + Default model is 'qwen' which uses Alibaba Cloud DashScope API. | |
| 66 | + """ | |
| 67 | +# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 | |
| 68 | +# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | |
| 69 | +# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 | |
| 70 | + | |
| 71 | + DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier | |
| 72 | +    QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"  # 美国(弗吉尼亚)地域 | 
| 73 | + # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡 | |
| 74 | + # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | |
| 75 | + QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型 | |
| 76 | + | |
| 77 | + # Language code mapping | |
| 78 | + LANG_CODE_MAP = { | |
| 79 | + 'zh': 'ZH', | |
| 80 | + 'en': 'EN', | |
| 81 | + 'ru': 'RU', | |
| 82 | + 'ar': 'AR', | |
| 83 | + 'ja': 'JA', | |
| 84 | + 'es': 'ES', | |
| 85 | + 'de': 'DE', | |
| 86 | + 'fr': 'FR', | |
| 87 | + 'it': 'IT', | |
| 88 | + 'pt': 'PT', | |
| 89 | + } | |
| 90 | + | |
| 91 | + def __init__( | |
| 92 | + self, | |
| 93 | + model: str = "qwen", | |
| 94 | + api_key: Optional[str] = None, | |
| 95 | + use_cache: bool = True, | |
| 96 | + timeout: int = 10, | |
| 97 | + glossary_id: Optional[str] = None, | |
| 98 | + translation_context: Optional[str] = None | |
| 99 | + ): | |
| 100 | + """ | |
| 101 | + Initialize translator. | |
| 102 | + | |
| 103 | + Args: | |
| 104 | + model: Translation model to use. Options: 'qwen' (default) or 'deepl' | |
| 105 | + api_key: API key for the selected model (or None to use from config/env) | |
| 106 | + use_cache: Whether to cache translations | |
| 107 | + timeout: Request timeout in seconds | |
| 108 | + glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL) | |
| 109 | + translation_context: Context hint for translation (e.g., "e-commerce", "product search") | |
| 110 | + """ | |
| 111 | + self.model = model.lower() | |
| 112 | + if self.model not in ['qwen', 'deepl']: | |
| 113 | + raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'") | |
| 114 | + | |
| 115 | + # Get API key from config if not provided | |
| 116 | + if api_key is None: | |
| 117 | + if self.model == 'qwen': | |
| 118 | + api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | |
| 119 | + else: # deepl | |
| 120 | + api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY") | |
| 121 | + | |
| 122 | + self.api_key = api_key | |
| 123 | + self.timeout = timeout | |
| 124 | + self.use_cache = use_cache | |
| 125 | + self.glossary_id = glossary_id | |
| 126 | + self.translation_context = translation_context or "e-commerce product search" | |
| 127 | + | |
| 128 | + # Initialize OpenAI client for Qwen if needed | |
| 129 | + self.qwen_client = None | |
| 130 | + if self.model == 'qwen': | |
| 131 | + if not self.api_key: | |
| 132 | + logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.") | |
| 133 | + else: | |
| 134 | + self.qwen_client = OpenAI( | |
| 135 | + api_key=self.api_key, | |
| 136 | + base_url=self.QWEN_BASE_URL, | |
| 137 | + ) | |
| 138 | + | |
| 139 | + # Initialize Redis cache if enabled | |
| 140 | + if use_cache: | |
| 141 | + try: | |
| 142 | + self.redis_client = redis.Redis( | |
| 143 | + host=REDIS_CONFIG.get('host', 'localhost'), | |
| 144 | + port=REDIS_CONFIG.get('port', 6479), | |
| 145 | + password=REDIS_CONFIG.get('password'), | |
| 146 | + decode_responses=True, # Return str instead of bytes | |
| 147 | + socket_timeout=REDIS_CONFIG.get('socket_timeout', 1), | |
| 148 | + socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1), | |
| 149 | + retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False), | |
| 150 | + health_check_interval=10, # 避免复用坏连接 | |
| 151 | + ) | |
| 152 | + # Test connection | |
| 153 | + self.redis_client.ping() | |
| 154 | + expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360) | |
| 155 | + self.expire_time = timedelta(days=expire_days) | |
| 156 | + self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数 | |
| 157 | + self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans') | |
| 158 | + logger.info("Redis cache initialized for translations") | |
| 159 | + except Exception as e: | |
| 160 | + logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache") | |
| 161 | + self.redis_client = None | |
| 162 | + self.cache = None | |
| 163 | + else: | |
| 164 | + self.redis_client = None | |
| 165 | + self.cache = None | |
| 166 | + | |
| 167 | + # Thread pool for async translation | |
| 168 | + self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") | |
| 169 | + | |
| 170 | + def translate( | |
| 171 | + self, | |
| 172 | + text: str, | |
| 173 | + target_lang: str, | |
| 174 | + source_lang: Optional[str] = None, | |
| 175 | + context: Optional[str] = None, | |
| 176 | + prompt: Optional[str] = None | |
| 177 | + ) -> Optional[str]: | |
| 178 | + """ | |
| 179 | + Translate text to target language (synchronous mode). | |
| 180 | + | |
| 181 | + Args: | |
| 182 | + text: Text to translate | |
| 183 | + target_lang: Target language code ('zh', 'en', 'ru', etc.) | |
| 184 | +        source_lang: Source language code (optional, auto-detect if None) | 
| 185 | + context: Additional context for translation (overrides default context) | |
| 186 | + prompt: Translation prompt/instruction (optional, for better translation quality) | |
| 187 | + | |
| 188 | + Returns: | |
| 189 | + Translated text or None if translation fails | |
| 190 | + """ | |
| 191 | + if not text or not text.strip(): | |
| 192 | + return text | |
| 193 | + | |
| 194 | + # Normalize language codes | |
| 195 | + target_lang = target_lang.lower() | |
| 196 | + if source_lang: | |
| 197 | + source_lang = source_lang.lower() | |
| 198 | + | |
| 199 | + # Optimization: Skip translation if not needed | |
| 200 | + if target_lang == 'en' and self._is_english_text(text): | |
| 201 | + logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") | |
| 202 | + return text | |
| 203 | + | |
| 204 | + if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | |
| 205 | + logger.info( | |
| 206 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 207 | + f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)" | |
| 208 | + ) | |
| 209 | + return text | |
| 210 | + | |
| 211 | + # Use provided context or default context | |
| 212 | + translation_context = context or self.translation_context | |
| 213 | + | |
| 214 | + # Build cache key (include prompt in cache key if provided) | |
| 215 | + cache_key_parts = [source_lang or 'auto', target_lang, translation_context] | |
| 216 | + if prompt: | |
| 217 | + cache_key_parts.append(prompt) | |
| 218 | + cache_key_parts.append(text) | |
| 219 | + cache_key = ':'.join(cache_key_parts) | |
| 220 | + | |
| 221 | + # Check cache (include context and prompt in cache key for accuracy) | |
| 222 | + if self.use_cache and self.redis_client: | |
| 223 | + cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt) | |
| 224 | + if cached: | |
| 225 | + logger.info( | |
| 226 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 227 | + f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit" | |
| 228 | + ) | |
| 229 | + return cached | |
| 230 | + | |
| 231 | + # If no API key, return mock translation (for testing) | |
| 232 | + if not self.api_key: | |
| 233 | + logger.info( | |
| 234 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 235 | + f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)" | |
| 236 | + ) | |
| 237 | + return text | |
| 238 | + | |
| 239 | + # Translate using selected model | |
| 240 | + logger.info( | |
| 241 | + f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | " | |
| 242 | + f"Source language: {source_lang or 'auto'} | Context: {translation_context} | " | |
| 243 | + f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation" | |
| 244 | + ) | |
| 245 | + | |
| 246 | + if self.model == 'qwen': | |
| 247 | + result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt) | |
| 248 | + else: # deepl | |
| 249 | + result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) | |
| 250 | + | |
| 251 | + # Surface translation failure to the caller instead of silently | |
| 252 | + # masquerading the source text as a successful translation. | |
| 253 | + if result is None: | |
| 254 | + logger.warning( | |
| 255 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 256 | + f"Source language: {source_lang or 'auto'} | Status: Translation failed" | |
| 257 | + ) | |
| 258 | + else: | |
| 259 | + logger.info( | |
| 260 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 261 | + f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" | |
| 262 | + ) | |
| 263 | + | |
| 264 | + # Cache only successful translations. Failed attempts must not poison | |
| 265 | + # Redis with the original text. | |
| 266 | + if result is not None and self.use_cache and self.redis_client: | |
| 267 | + self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) | |
| 268 | + | |
| 269 | + return result | |
| 270 | + | |
    def _translate_qwen(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using Qwen MT Flash model via Alibaba Cloud DashScope API.

        Args:
            text: Text to translate
            target_lang: Target language code ('zh', 'en', 'ru', etc.)
            source_lang: Source language code (optional, 'auto' if None)
            context: Context hint for translation (optional; accepted for
                interface symmetry with _translate_deepl but not currently
                forwarded to the Qwen MT endpoint)
            prompt: Translation prompt/instruction (optional; accepted for
                interface symmetry but not currently forwarded to the endpoint)

        Returns:
            Translated text or None if translation fails
        """
        if not self.qwen_client:
            logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.")
            return None

        # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping.
        # Names follow the official "Language / English name / Code" table.
        qwen_lang_map = {
            "en": "English",
            "zh": "Chinese",
            "zh_tw": "Traditional Chinese",
            "ru": "Russian",
            "ja": "Japanese",
            "ko": "Korean",
            "es": "Spanish",
            "fr": "French",
            "pt": "Portuguese",
            "de": "German",
            "it": "Italian",
            "th": "Thai",
            "vi": "Vietnamese",
            "id": "Indonesian",
            "ms": "Malay",
            "ar": "Arabic",
            "hi": "Hindi",
            "he": "Hebrew",
            "my": "Burmese",
            "ta": "Tamil",
            "ur": "Urdu",
            "bn": "Bengali",
            "pl": "Polish",
            "nl": "Dutch",
            "ro": "Romanian",
            "tr": "Turkish",
            "km": "Khmer",
            "lo": "Lao",
            "yue": "Cantonese",
            "cs": "Czech",
            "el": "Greek",
            "sv": "Swedish",
            "hu": "Hungarian",
            "da": "Danish",
            "fi": "Finnish",
            "uk": "Ukrainian",
            "bg": "Bulgarian",
        }

        # Convert target language.
        # Unknown codes fall back to a capitalized form of the raw code.
        target_lang_normalized = target_lang.lower()
        target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize())

        # Convert source language; empty/None/"auto" all mean auto-detect.
        source_lang_normalized = (source_lang or "").strip().lower()
        if not source_lang_normalized or source_lang_normalized == "auto":
            source_lang_qwen = "auto"
        else:
            source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize())

        # Prepare translation options
        translation_options = {
            "source_lang": source_lang_qwen,
            "target_lang": target_lang_qwen,
        }

        # Prepare messages
        messages = [
            {
                "role": "user",
                "content": text
            }
        ]

        start_time = time.time()
        try:
            # translation_options is a DashScope-specific extension of the
            # OpenAI-compatible API, hence it goes through extra_body.
            completion = self.qwen_client.chat.completions.create(
                model=self.QWEN_MODEL,
                messages=messages,
                extra_body={
                    "translation_options": translation_options
                }
            )

            # NOTE(review): assumes message.content is non-None; a None content
            # raises here and is handled by the except below — TODO confirm.
            translated_text = completion.choices[0].message.content.strip()
            duration_ms = (time.time() - start_time) * 1000

            logger.info(
                f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | "
                f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms"
            )
            return translated_text

        except Exception as e:
            duration_ms = (time.time() - start_time) * 1000
            logger.error(
                f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | "
                f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True
            )
            return None
| 389 | + | |
    def _translate_deepl(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using DeepL API with context and glossary support.

        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code (optional)
            context: Context hint for translation (e.g., "e-commerce product search")
            prompt: Translation instruction; when provided it is sent as the
                DeepL ``context`` parameter (overriding ``context``), not
                prepended to the text.

        Returns:
            Translated text, or None on timeout, non-200 response, or error.
        """
        # Map to DeepL language codes
        target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())

        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        # Use prompt as context parameter for DeepL API (not as text prefix)
        # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"
        # If prompt is provided, use it as context; otherwise use the default context
        api_context = prompt if prompt else context

        # For e-commerce, add context words to help DeepL understand the domain
        # This is especially important for single-word ambiguous terms like "车" (car vs rook)
        text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)

        payload = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }

        if source_lang:
            source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
            payload["source_lang"] = source_code

        # Add context parameter (prompt or default context)
        # Context influences translation but is not translated itself
        if api_context:
            payload["context"] = api_context

        # Add glossary if configured
        if self.glossary_id:
            payload["glossary_id"] = self.glossary_id

        # Note: DeepL API v2 supports "context" parameter for additional context
        # that influences translation but is not translated itself.
        # We use prompt as context parameter when provided.

        try:
            response = requests.post(
                self.DEEPL_API_URL,
                headers=headers,
                json=payload,
                timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()
                if "translations" in data and len(data["translations"]) > 0:
                    translated_text = data["translations"][0]["text"]
                    # If we added context, extract just the term from the result
                    if needs_extraction:
                        translated_text = self._extract_term_from_translation(
                            translated_text, text, target_code
                        )
                    logger.debug(
                        f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "
                        f"Translation result: '{translated_text}'"
                    )
                    return translated_text
            else:
                logger.error(
                    f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "
                    f"Status code: {response.status_code} | Error message: {response.text}"
                )
            return None

        except requests.Timeout:
            logger.warning(
                f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "
                f"Timeout: {self.timeout}s"
            )
            return None
        except Exception as e:
            logger.error(
                f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "
                f"Error: {e}", exc_info=True
            )
            return None
| 487 | + | |
| 488 | + # NOTE: _translate_deepl_free is intentionally not implemented. | |
| 489 | + # We do not support automatic fallback to the free endpoint, to avoid | |
| 490 | + # mixing Pro keys with https://api-free.deepl.com and related 403 errors. | |
| 491 | + | |
    def translate_multi(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        async_mode: bool = True,
        prompt: Optional[str] = None
    ) -> Dict[str, Optional[str]]:
        """
        Translate text to multiple target languages.

        In async_mode=True (default):
            - Returns cached translations immediately if available
            - For translations that can be optimized (e.g., pure numbers, already in target language),
              returns result immediately via synchronous call
            - Launches async tasks for other missing translations (non-blocking)
            - Returns None for missing translations that require async processing
              (the background tasks warm the cache for subsequent calls)

        In async_mode=False:
            - Waits for all translations to complete (blocking)

        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)
            async_mode: If True, return cached results immediately and translate missing ones async
            prompt: Translation prompt/instruction (optional)

        Returns:
            Dictionary mapping language code to translated text (only cached results in async mode)
        """
        results = {}
        missing_langs = []
        async_langs = []

        # First, get cached translations
        for lang in target_langs:
            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
            if cached is not None:
                results[lang] = cached
            else:
                missing_langs.append(lang)

        # If async mode and there are missing translations
        if async_mode and missing_langs:
            # Check if translation can be optimized (immediate return)
            for lang in missing_langs:
                target_lang = lang.lower()
                # Check optimization conditions (same as in translate method):
                # these cases short-circuit without any API call.
                can_optimize = False
                if target_lang == 'en' and self._is_english_text(text):
                    can_optimize = True
                elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
                    can_optimize = True

                if can_optimize:
                    # Can be optimized, call translate synchronously for immediate result
                    results[lang] = self.translate(text, lang, source_lang, context, prompt)
                else:
                    # Requires actual translation, add to async list
                    async_langs.append(lang)

            # Launch async tasks for translations that require actual API calls
            if async_langs:
                for lang in async_langs:
                    self._translate_async(text, lang, source_lang, context, prompt)
                # Return None for async translations (fire-and-forget; results
                # land in the cache, not in this return value)
                for lang in async_langs:
                    results[lang] = None
        else:
            # Synchronous mode: wait for all translations
            for lang in missing_langs:
                results[lang] = self.translate(text, lang, source_lang, context, prompt)

        return results
| 569 | + | |
| 570 | + def translate_multi_async( | |
| 571 | + self, | |
| 572 | + text: str, | |
| 573 | + target_langs: List[str], | |
| 574 | + source_lang: Optional[str] = None, | |
| 575 | + context: Optional[str] = None, | |
| 576 | + prompt: Optional[str] = None | |
| 577 | + ) -> Dict[str, Union[str, Future]]: | |
| 578 | + """ | |
| 579 | + Translate text to multiple target languages asynchronously, returning Futures that can be awaited. | |
| 580 | + | |
| 581 | + This method returns a dictionary where: | |
| 582 | + - If translation is cached, the value is the translation string (immediate) | |
| 583 | + - If translation needs to be done, the value is a Future object that can be awaited | |
| 584 | + | |
| 585 | + Args: | |
| 586 | + text: Text to translate | |
| 587 | + target_langs: List of target language codes | |
| 588 | + source_lang: Source language code (optional) | |
| 589 | + context: Context hint for translation (optional) | |
| 590 | + prompt: Translation prompt/instruction (optional) | |
| 591 | + | |
| 592 | + Returns: | |
| 593 | + Dictionary mapping language code to either translation string (cached) or Future object | |
| 594 | + """ | |
| 595 | + results = {} | |
| 596 | + missing_langs = [] | |
| 597 | + | |
| 598 | + # First, get cached translations | |
| 599 | + for lang in target_langs: | |
| 600 | + cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | |
| 601 | + if cached is not None: | |
| 602 | + results[lang] = cached | |
| 603 | + else: | |
| 604 | + missing_langs.append(lang) | |
| 605 | + | |
| 606 | + # For missing translations, submit async tasks and return Futures | |
| 607 | + for lang in missing_langs: | |
| 608 | + future = self.executor.submit( | |
| 609 | + self.translate, | |
| 610 | + text, | |
| 611 | + lang, | |
| 612 | + source_lang, | |
| 613 | + context, | |
| 614 | + prompt | |
| 615 | + ) | |
| 616 | + results[lang] = future | |
| 617 | + | |
| 618 | + return results | |
| 619 | + | |
| 620 | + def _get_cached_translation( | |
| 621 | + self, | |
| 622 | + text: str, | |
| 623 | + target_lang: str, | |
| 624 | + source_lang: Optional[str] = None, | |
| 625 | + context: Optional[str] = None, | |
| 626 | + prompt: Optional[str] = None | |
| 627 | + ) -> Optional[str]: | |
| 628 | + """Get translation from cache if available.""" | |
| 629 | + if not self.redis_client: | |
| 630 | + return None | |
| 631 | + return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | |
| 632 | + | |
    def _get_cached_translation_redis(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Get translation from Redis cache with sliding expiration.

        Sliding expiration: every cache hit resets the TTL to the configured
        expiry (default 720 days), so an entry expires that long after its
        LAST access rather than after it was written. This keeps frequently
        used translations from being evicted prematurely.

        Note: source_lang, context and prompt are accepted but deliberately
        excluded from the cache key to maximize hit rate (see key comment
        inside).
        """
        if not self.redis_client:
            return None

        try:
            # Build cache key: prefix:target_lang:text
            # For simplicity, we use target_lang and text as key
            # Context and prompt are not included in key to maximize cache hits
            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
            value = self.redis_client.get(cache_key)
            if value:
                # Sliding expiration: reset expiration time on access so the
                # entry expires N days after the most recent read.
                try:
                    self.redis_client.expire(cache_key, self.expire_seconds)
                except Exception as expire_error:
                    # A failed expire must not mask the hit; still return the
                    # cached value (functionality is unaffected).
                    logger.warning(
                        f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}"
                    )

                logger.debug(
                    f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "
                    f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s"
                )
                return value
            logger.debug(
                f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "
                f"Cache key: {cache_key}"
            )
            return None
        except Exception as e:
            logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}")
            return None
| 681 | + | |
| 682 | + def _set_cached_translation_redis( | |
| 683 | + self, | |
| 684 | + text: str, | |
| 685 | + target_lang: str, | |
| 686 | + translation: str, | |
| 687 | + source_lang: Optional[str] = None, | |
| 688 | + context: Optional[str] = None, | |
| 689 | + prompt: Optional[str] = None | |
| 690 | + ) -> None: | |
| 691 | + """Store translation in Redis cache.""" | |
| 692 | + if not self.redis_client: | |
| 693 | + return | |
| 694 | + | |
| 695 | + try: | |
| 696 | + cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" | |
| 697 | + self.redis_client.setex(cache_key, self.expire_seconds, translation) | |
| 698 | + logger.info( | |
| 699 | + f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | " | |
| 700 | + f"Cache key: {cache_key} | Translation result: '{translation}'" | |
| 701 | + ) | |
| 702 | + except Exception as e: | |
| 703 | + logger.error( | |
| 704 | + f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | " | |
| 705 | + f"Error: {e}" | |
| 706 | + ) | |
| 707 | + | |
| 708 | + def _translate_async( | |
| 709 | + self, | |
| 710 | + text: str, | |
| 711 | + target_lang: str, | |
| 712 | + source_lang: Optional[str] = None, | |
| 713 | + context: Optional[str] = None, | |
| 714 | + prompt: Optional[str] = None | |
| 715 | + ): | |
| 716 | + """Launch async translation task.""" | |
| 717 | + def _do_translate(): | |
| 718 | + try: | |
| 719 | + result = self.translate(text, target_lang, source_lang, context, prompt) | |
| 720 | + if result: | |
| 721 | + logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}") | |
| 722 | + except Exception as e: | |
| 723 | + logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}") | |
| 724 | + | |
| 725 | + self.executor.submit(_do_translate) | |
| 726 | + | |
| 727 | + def _add_ecommerce_context( | |
| 728 | + self, | |
| 729 | + text: str, | |
| 730 | + source_lang: Optional[str], | |
| 731 | + context: Optional[str] | |
| 732 | + ) -> tuple: | |
| 733 | + """ | |
| 734 | + Add e-commerce context to text for better disambiguation. | |
| 735 | + | |
| 736 | + For single-word ambiguous Chinese terms, we add context words that help | |
| 737 | + DeepL understand this is an e-commerce/product search context. | |
| 738 | + | |
| 739 | + Args: | |
| 740 | + text: Original text to translate | |
| 741 | + source_lang: Source language code | |
| 742 | + context: Context hint | |
| 743 | + | |
| 744 | + Returns: | |
| 745 | + Tuple of (text_with_context, needs_extraction) | |
| 746 | + - text_with_context: Text to send to DeepL | |
| 747 | + - needs_extraction: Whether we need to extract the term from the result | |
| 748 | + """ | |
| 749 | + # Only apply for e-commerce context and Chinese source | |
| 750 | + if not context or "e-commerce" not in context.lower(): | |
| 751 | + return text, False | |
| 752 | + | |
| 753 | + if not source_lang or source_lang.lower() != 'zh': | |
| 754 | + return text, False | |
| 755 | + | |
| 756 | + # For single-word queries, add context to help disambiguation | |
| 757 | + text_stripped = text.strip() | |
| 758 | + if len(text_stripped.split()) == 1 and len(text_stripped) <= 2: | |
| 759 | + # Common ambiguous Chinese e-commerce terms like "车" (car vs rook) | |
| 760 | + # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term]) | |
| 761 | + # This helps DeepL understand the e-commerce context | |
| 762 | + # We'll need to extract just the term from the translation result | |
| 763 | + context_phrase = f"购买 {text_stripped}" | |
| 764 | + return context_phrase, True | |
| 765 | + | |
| 766 | + # For multi-word queries, DeepL usually has enough context | |
| 767 | + return text, False | |
| 768 | + | |
| 769 | + def _extract_term_from_translation( | |
| 770 | + self, | |
| 771 | + translated_text: str, | |
| 772 | + original_text: str, | |
| 773 | + target_lang_code: str | |
| 774 | + ) -> str: | |
| 775 | + """ | |
| 776 | + Extract the actual term from a translation that included context. | |
| 777 | + | |
| 778 | + For example, if we translated "购买 车" (buy car) and got "buy car", | |
| 779 | + we want to extract just "car". | |
| 780 | + | |
| 781 | + Args: | |
| 782 | + translated_text: Full translation result | |
| 783 | + original_text: Original single-word query | |
| 784 | + target_lang_code: Target language code (EN, ZH, etc.) | |
| 785 | + | |
| 786 | + Returns: | |
| 787 | + Extracted term or original translation if extraction fails | |
| 788 | + """ | |
| 789 | + # For English target, try to extract the last word (the actual term) | |
| 790 | + if target_lang_code == "EN": | |
| 791 | + words = translated_text.strip().split() | |
| 792 | + if len(words) > 1: | |
| 793 | + # Usually the last word is the term we want | |
| 794 | + # But we need to be smart - if it's "buy car", we want "car" | |
| 795 | + # Common context words to skip: buy, purchase, product, item, etc. | |
| 796 | + context_words = {"buy", "purchase", "product", "item", "commodity", "goods"} | |
| 797 | + # Try to find the term (not a context word) | |
| 798 | + for word in reversed(words): | |
| 799 | + word_lower = word.lower().rstrip('.,!?;:') | |
| 800 | + if word_lower not in context_words: | |
| 801 | + return word_lower | |
| 802 | + # If all words are context words, return the last one | |
| 803 | + return words[-1].lower().rstrip('.,!?;:') | |
| 804 | + | |
| 805 | + # For other languages or if extraction fails, return as-is | |
| 806 | + # The user can configure a glossary for better results | |
| 807 | + return translated_text | |
| 808 | + | |
| 809 | + def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool: | |
| 810 | + """True if shop language matches index language (use source, no translate).""" | |
| 811 | + if not shop_lang_lower or not lang_code: | |
| 812 | + return False | |
| 813 | + if shop_lang_lower == lang_code: | |
| 814 | + return True | |
| 815 | + if lang_code == "zh" and "zh" in shop_lang_lower: | |
| 816 | + return True | |
| 817 | + if lang_code == "en" and "en" in shop_lang_lower: | |
| 818 | + return True | |
| 819 | + return False | |
| 820 | + | |
    def translate_for_indexing(
        self,
        text: str,
        shop_language: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
        index_languages: Optional[List[str]] = None,
    ) -> Dict[str, Optional[str]]:
        """
        Translate text for indexing based on shop language and tenant index_languages.

        For each language in index_languages: use source text if shop language
        matches, otherwise translate to that language.

        Args:
            text: Text to translate
            shop_language: Shop primary language (e.g. 'zh', 'en', 'ru')
            source_lang: Source language code (optional)
            context: Additional context for translation (optional)
            prompt: Translation prompt (optional)
            index_languages: Languages to index (from tenant_config). Default ["en", "zh"].

        Returns:
            Dict keyed by each index_language with translated or source text
            (or None for symbol-only input or failed translations).
        """
        langs = index_languages if index_languages else ["en", "zh"]
        results = {lang: None for lang in langs}
        if not text or not text.strip():
            return results
        # Only digits / whitespace / underscores / hyphens: nothing translatable.
        if re.match(r'^[\d\s_-]+$', text):
            logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
            return results

        shop_lang_lower = (shop_language or "").strip().lower()
        targets = []
        for lang in langs:
            if self._shop_lang_matches(shop_lang_lower, lang):
                # Shop already speaks this language — index the source text as-is.
                results[lang] = text
            else:
                targets.append(lang)

        for target_lang in targets:
            # Fast path: reuse an existing cached translation before calling out.
            cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
            if cached:
                results[target_lang] = cached
                logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}")
                continue
            # Fall back to the shop language as source when none was supplied.
            translated = self.translate(
                text,
                target_lang=target_lang,
                source_lang=source_lang or shop_language,
                context=context,
                prompt=prompt,
            )
            results[target_lang] = translated
        return results
| 878 | + | |
| 879 | + def get_translation_needs( | |
| 880 | + self, | |
| 881 | + detected_lang: str, | |
| 882 | + supported_langs: List[str] | |
| 883 | + ) -> List[str]: | |
| 884 | + """ | |
| 885 | + Determine which languages need translation. | |
| 886 | + | |
| 887 | + Args: | |
| 888 | + detected_lang: Detected query language | |
| 889 | + supported_langs: List of supported languages | |
| 890 | + | |
| 891 | + Returns: | |
| 892 | + List of language codes to translate to | |
| 893 | + """ | |
| 894 | + # If detected language is in supported list, translate to others | |
| 895 | + if detected_lang in supported_langs: | |
| 896 | + return [lang for lang in supported_langs if detected_lang != lang] | |
| 897 | + | |
| 898 | + # Otherwise, translate to all supported languages | |
| 899 | + return supported_langs | |
| 900 | + | |
| 901 | + def _is_english_text(self, text: str) -> bool: | |
| 902 | + """ | |
| 903 | + Check if text is primarily English (ASCII letters, numbers, common punctuation). | |
| 904 | + | |
| 905 | + Args: | |
| 906 | + text: Text to check | |
| 907 | + | |
| 908 | + Returns: | |
| 909 | + True if text appears to be English | |
| 910 | + """ | |
| 911 | + if not text or not text.strip(): | |
| 912 | + return True | |
| 913 | + | |
| 914 | + # Remove whitespace and common punctuation | |
| 915 | + text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) | |
| 916 | + if not text_clean: | |
| 917 | + return True | |
| 918 | + | |
| 919 | + # Check if all remaining characters are ASCII (letters, numbers) | |
| 920 | + # This is a simple heuristic: if most characters are ASCII, it's likely English | |
| 921 | + ascii_count = sum(1 for c in text_clean if ord(c) < 128) | |
| 922 | + ratio = ascii_count / len(text_clean) if text_clean else 0 | |
| 923 | + | |
| 924 | + # If more than 80% are ASCII characters, consider it English | |
| 925 | + return ratio > 0.8 | |
| 926 | + | |
| 927 | + def _contains_chinese(self, text: str) -> bool: | |
| 928 | + """ | |
| 929 | + Check if text contains Chinese characters (Han characters). | |
| 930 | + | |
| 931 | + Args: | |
| 932 | + text: Text to check | |
| 933 | + | |
| 934 | + Returns: | |
| 935 | + True if text contains Chinese characters | |
| 936 | + """ | |
| 937 | + if not text: | |
| 938 | + return False | |
| 939 | + | |
| 940 | + # Check for Chinese characters (Unicode range: \u4e00-\u9fff) | |
| 941 | + chinese_pattern = re.compile(r'[\u4e00-\u9fff]') | |
| 942 | + return bool(chinese_pattern.search(text)) | |
| 943 | + | |
| 944 | + def _is_pure_number(self, text: str) -> bool: | |
| 945 | + """ | |
| 946 | + Check if text is purely numeric (digits, possibly with spaces, dots, commas). | |
| 947 | + | |
| 948 | + Args: | |
| 949 | + text: Text to check | |
| 950 | + | |
| 951 | + Returns: | |
| 952 | + True if text is purely numeric | |
| 953 | + """ | |
| 954 | + if not text or not text.strip(): | |
| 955 | + return False | |
| 956 | + | |
| 957 | + # Remove whitespace, dots, commas (common number separators) | |
| 958 | + text_clean = re.sub(r'[\s\.,]', '', text.strip()) | |
| 959 | + if not text_clean: | |
| 960 | + return False | |
| 961 | + | |
| 962 | + # Check if all remaining characters are digits | |
| 963 | + return text_clean.isdigit() | ... | ... |
query/test_translation.py
| ... | ... | @@ -19,7 +19,7 @@ from pathlib import Path |
| 19 | 19 | sys.path.insert(0, str(Path(__file__).parent.parent)) |
| 20 | 20 | |
| 21 | 21 | from config import ConfigLoader |
| 22 | -from query.translator import Translator | |
| 22 | +from query.qwen_mt_translate import Translator | |
| 23 | 23 | import logging |
| 24 | 24 | |
| 25 | 25 | # Configure logging | ... | ... |
query/translator.py
tests/test_translator_failure_semantics.py