From a0a173ae904212171b324f0976f034f6528ff749 Mon Sep 17 00:00:00 2001 From: tangwang Date: Fri, 13 Mar 2026 16:56:44 +0800 Subject: [PATCH] last --- api/translator_app.py | 2 +- config/config.yaml | 8 +++++++- docs/系统设计文档.md | 9 +++++++++ indexer/document_transformer.py | 2 +- indexer/test_indexing.py | 2 +- providers/translation.py | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ query/__init__.py | 2 +- query/llm_translate.py | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ query/qwen_mt_translate.py | 963 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ query/test_translation.py | 2 +- query/translator.py | 2 +- tests/test_translator_failure_semantics.py | 2 +- 12 files changed, 1393 insertions(+), 8 deletions(-) create mode 100644 
query/qwen_mt_translate.py diff --git a/api/translator_app.py b/api/translator_app.py index cbc1d36..df6a0a9 100644 --- a/api/translator_app.py +++ b/api/translator_app.py @@ -97,7 +97,7 @@ from pydantic import BaseModel, Field # Add parent directory to path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from query.translator import Translator +from query.qwen_mt_translate import Translator from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG # Configure logging diff --git a/config/config.yaml b/config/config.yaml index e48bf88..d824b3c 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -119,7 +119,7 @@ rerank: # 可扩展服务/provider 注册表(单一配置源) services: translation: - provider: "direct" # direct | http | google(reserved) + provider: "llm" # direct | http | google(reserved) base_url: "http://127.0.0.1:6006" model: "qwen" timeout_sec: 10.0 @@ -130,6 +130,12 @@ services: base_url: "http://127.0.0.1:6006" model: "qwen" timeout_sec: 10.0 + llm: + model: "qwen-flash" + # 可选:覆盖 DashScope 兼容模式的 Endpoint 与超时 + # base_url 留空则使用 DASHSCOPE_BASE_URL 或默认地域 + base_url: "" + timeout_sec: 30.0 google: enabled: false project_id: "" diff --git a/docs/系统设计文档.md b/docs/系统设计文档.md index 639dd15..e7b6001 100644 --- a/docs/系统设计文档.md +++ b/docs/系统设计文档.md @@ -384,6 +384,15 @@ query_config: 实际代码中,通过通用的 translation provider 抽象来选择具体后端和模型,文档不固定绑定某一个具体翻译服务或模型名称,以保持可配置性。 +此外,为了支持**高质量、提示词可控的 LLM 翻译**(例如商品富化脚本、离线分析工具),在 `query/llm_translate.py` 中提供了一个独立的 LLM 翻译辅助模块: + +- **配置入口**:`config/config.yaml -> services.translation.providers.llm`,用于指定: + - `model`: 例如 `qwen-flash`(DashScope 兼容模式的对话模型) + - `base_url`: 可选;为空时使用环境变量 `DASHSCOPE_BASE_URL` 或默认 Endpoint + - `timeout_sec`: LLM 调用超时 +- **环境变量**:仍通过 `DASHSCOPE_API_KEY` 注入 DashScope API Key。 +- **使用方式**:主查询路径继续使用 machine translation(`query.translator.Translator`),只在需要更强表达控制的场景(如批量标注、产品分类脚本)中显式调用 `llm_translate()`。 + #### 功能特性 1. **语言检测**:自动检测查询语言 2. 
class HttpTranslationProvider:
    """Translation backend that delegates to a remote HTTP service.

    The service is expected to expose ``POST {base_url}/translate`` taking a
    JSON payload with ``text``, ``target_lang``, ``source_lang`` and
    ``model`` keys, and to answer with ``{"translated_text": ...}``.
    On any failure the public ``translate`` falls back to the original text.
    """

    def __init__(
        self,
        base_url: str,
        model: str = "qwen",
        timeout_sec: float = 10.0,
        translation_context: Optional[str] = None,
    ):
        # Normalise the endpoint once so URL building below stays trivial.
        self.base_url = (base_url or "").rstrip("/")
        self.model = model or "qwen"
        self.timeout_sec = float(timeout_sec or 10.0)
        self.translation_context = translation_context or "e-commerce product search"
        self.executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="http-translator")

    def _translate_once(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
    ) -> Optional[str]:
        """Single request to the service; returns None on any failure."""
        if not text or not str(text).strip():
            return text
        request_body = {
            "text": text,
            "target_lang": target_lang,
            "source_lang": source_lang or "auto",
            "model": self.model,
        }
        try:
            resp = requests.post(
                f"{self.base_url}/translate",
                json=request_body,
                timeout=self.timeout_sec,
            )
            if resp.status_code != 200:
                logger.warning(
                    "HTTP translator failed: status=%s body=%s",
                    resp.status_code,
                    (resp.text or "")[:200],
                )
                return None
            return resp.json().get("translated_text")
        except Exception as exc:
            logger.warning("HTTP translator request failed: %s", exc, exc_info=True)
            return None

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> Optional[str]:
        """Translate ``text``; on failure the original text is returned.

        ``context`` and ``prompt`` exist only for interface parity with the
        direct provider and are ignored by the HTTP backend.
        """
        translated = self._translate_once(text=text, target_lang=target_lang, source_lang=source_lang)
        return text if translated is None else translated

    def translate_multi(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        async_mode: bool = True,
        prompt: Optional[str] = None,
    ) -> Dict[str, Optional[str]]:
        """Translate sequentially into every language in ``target_langs``."""
        return {
            lang: self.translate(text, lang, source_lang=source_lang)
            for lang in target_langs
        }

    def translate_multi_async(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> Dict[str, Union[str, Future]]:
        """Kick off one background translation per target language."""
        return {
            lang: self.executor.submit(self.translate, text, lang, source_lang)
            for lang in target_langs
        }

    def translate_for_indexing(
        self,
        text: str,
        shop_language: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
        index_languages: Optional[List[str]] = None,
    ) -> Dict[str, Optional[str]]:
        """Build one entry per index language.

        The shop language keeps the original text; every other language is
        translated through the service.
        """
        languages = index_languages or ["en", "zh"]
        origin = source_lang or shop_language or "auto"
        results: Dict[str, Optional[str]] = {}
        for lang in languages:
            if lang == shop_language:
                results[lang] = text
            else:
                results[lang] = self.translate(text, target_lang=lang, source_lang=origin)
        return results
def create_translation_provider(query_config: Any = None) -> Any:
    """
    Create a translation provider from the services config.

    Args:
        query_config: optional object carrying ``translation_api_key``,
            ``translation_glossary_id`` and ``translation_context`` (used by
            the direct provider); missing attributes fall back to defaults.

    Returns:
        A provider exposing translate()/translate_multi()/... methods.

    Raises:
        ValueError: if ``services.translation.provider`` names an unknown backend.
    """
    cfg = get_translation_config()
    provider = cfg.provider
    pc = cfg.get_provider_cfg()
    qc = query_config or _empty_query_config()

    # Fix: config.yaml now sets services.translation.provider to "llm", but
    # query/llm_translate.py is a per-call helper, not a request-path
    # provider, so this factory used to raise ValueError and break startup.
    # Per the design doc the main query path keeps using machine translation,
    # so "llm" falls back to the direct Translator with a warning.
    if provider in ("direct", "local", "inprocess", "llm"):
        from query.qwen_mt_translate import Translator

        if provider == "llm":
            logger.warning(
                "Translation provider 'llm' has no request-path backend; "
                "falling back to the direct Translator"
            )
            # pc["model"] would be a chat model (e.g. "qwen-flash"), which
            # Translator rejects; use its MT default instead.
            model = "qwen"
        else:
            model = pc.get("model") or "qwen"
        return Translator(
            model=model,
            api_key=getattr(qc, "translation_api_key", None),
            use_cache=True,
            glossary_id=getattr(qc, "translation_glossary_id", None),
            translation_context=getattr(qc, "translation_context", "e-commerce product search"),
        )

    if provider in ("http", "service"):
        return HttpTranslationProvider(
            base_url=get_translation_base_url(),
            model=pc.get("model") or "qwen",
            timeout_sec=float(pc.get("timeout_sec", 10.0)),
            translation_context=getattr(qc, "translation_context", "e-commerce product search"),
        )

    raise ValueError(f"Unsupported translation provider: {provider}")


def _empty_query_config() -> Any:
    """Minimal stand-in object exposing the translation attrs with defaults."""
    class _QC:
        translation_api_key = None
        translation_glossary_id = None
        translation_context = "e-commerce product search"
    return _QC()
+ +This module provides a thin wrapper around DashScope's `qwen-flash` model +for high-quality, prompt-controlled translation, independent of the main +`Translator` (machine translation) pipeline. + +Usage example: + + from query.llm_translate import llm_translate + + result = llm_translate( + text="我看到这个视频后没有笑", + target_lang="en", + source_lang="zh", + source_lang_label="中文", + target_lang_label="英文", + ) +""" + +from __future__ import annotations + +import logging +import os +import time +from typing import Dict, Optional + +from openai import OpenAI + +from config.env_config import DASHSCOPE_API_KEY +from config.services_config import get_translation_config + +logger = logging.getLogger(__name__) + + +# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 +# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 +# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 +# +# 默认保持与现有翻译/索引脚本相同的美国地域,可通过环境变量覆盖: +# DASHSCOPE_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1 +DEFAULT_QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" +QWEN_MODEL_NAME = "qwen-flash" + + +# 由调用方提供的语言标签/代码填充,占位符说明: +# - source_lang: 源语言的人类可读名称(按目标语言本地化,例如 "中文", "English") +# - target_lang: 目标语言的人类可读名称 +# - src_lang_code: 源语言代码,例如 "zh" +# - tgt_lang_code: 目标语言代码,例如 "en" +TRANSLATION_PROMPTS: Dict[str, str] = { + "zh": """你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译员。你的目标是在遵循 {target_lang} 的语法、词汇和文化习惯的前提下,准确传达原始 {source_lang} 文本的含义和细微差别。请只输出 {target_lang} 的翻译内容,不要包含任何额外的解释或评论。请将以下 {source_lang} 文本翻译成 {target_lang}: + +{text}""", + "en": """You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Your goal is to accurately convey the meaning and nuances of the original {source_lang} text while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities. Produce only the {target_lang} translation, without any additional explanations or commentary. 
Please translate the following {source_lang} text into {target_lang}: + +{text}""", + "ru": """Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Ваша задача — точно передать смысл и нюансы исходного текста на {source_lang}, соблюдая грамматику, лексику и культурные особенности {target_lang}. Выводите только перевод на {target_lang}, без каких-либо дополнительных объяснений или комментариев. Пожалуйста, переведите следующий текст с {source_lang} на {target_lang}: + +{text}""", + "ar": """أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). هدفك هو نقل المعنى والدلالات الدقيقة للنص الأصلي بلغة {source_lang} بدقة، مع الالتزام بقواعد اللغة والمفردات والحساسيات الثقافية الخاصة بلغة {target_lang}. قم بإنتاج الترجمة إلى {target_lang} فقط دون أي شروحات أو تعليقات إضافية. يرجى ترجمة النص التالي من {source_lang} إلى {target_lang}: + +{text}""", + "ja": """あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロの翻訳者です。{target_lang} の文法、語彙、文化的配慮に従いながら、元の {source_lang} テキストの意味やニュアンスを正確に伝えることが目的です。追加の説明やコメントは一切含めず、{target_lang} の翻訳のみを出力してください。次の {source_lang} テキストを {target_lang} に翻訳してください: + +{text}""", + "es": """Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Tu objetivo es transmitir con precisión el significado y los matices del texto original en {source_lang}, respetando la gramática, el vocabulario y las sensibilidades culturales de {target_lang}. Produce únicamente la traducción en {target_lang}, sin explicaciones ni comentarios adicionales. Por favor, traduce el siguiente texto de {source_lang} a {target_lang}: + +{text}""", + "de": """Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). 
Dein Ziel ist es, die Bedeutung und Nuancen des ursprünglichen {source_lang}-Textes genau zu vermitteln und dabei die Grammatik, den Wortschatz und die kulturellen Besonderheiten von {target_lang} zu berücksichtigen. Gib ausschließlich die Übersetzung in {target_lang} aus, ohne zusätzliche Erklärungen oder Kommentare. Bitte übersetze den folgenden {source_lang}-Text in {target_lang}: + +{text}""", + "fr": """Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Votre objectif est de transmettre fidèlement le sens et les nuances du texte original en {source_lang}, tout en respectant la grammaire, le vocabulaire et les sensibilités culturelles de {target_lang}. Produisez uniquement la traduction en {target_lang}, sans explications ni commentaires supplémentaires. Veuillez traduire le texte suivant de {source_lang} vers {target_lang} : + +{text}""", + "it": """Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Il tuo obiettivo è trasmettere con precisione il significato e le sfumature del testo originale in {source_lang}, rispettando la grammatica, il vocabolario e le sensibilità culturali di {target_lang}. Produci solo la traduzione in {target_lang}, senza spiegazioni o commenti aggiuntivi. Per favore traduci il seguente testo da {source_lang} a {target_lang}: + +{text}""", + "pt": """Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Seu objetivo é transmitir com precisão o significado e as nuances do texto original em {source_lang}, respeitando a gramática, o vocabulário e as sensibilidades culturais de {target_lang}. Produza apenas a tradução em {target_lang}, sem quaisquer explicações ou comentários adicionais. 
Por favor, traduza o seguinte texto de {source_lang} para {target_lang}: + +{text}""", +} + + +def _get_qwen_client(base_url: Optional[str] = None) -> Optional[OpenAI]: + """ + Lazily construct an OpenAI-compatible client for DashScope. + + Uses DASHSCOPE_API_KEY and base_url (provider config / env) to configure endpoint. + """ + api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") + if not api_key: + logger.warning("DASHSCOPE_API_KEY not set; llm-based translation will be disabled") + return None + + # 优先使用显式传入的 base_url,其次环境变量,最后默认地域。 + base_url = ( + (base_url or "").strip() + or os.getenv("DASHSCOPE_BASE_URL") + or DEFAULT_QWEN_BASE_URL + ) + + try: + client = OpenAI(api_key=api_key, base_url=base_url) + return client + except Exception as exc: + logger.error("Failed to initialize DashScope OpenAI client: %s", exc, exc_info=True) + return None + + +def _build_prompt( + text: str, + target_lang: str, + source_lang_label: str, + target_lang_label: str, + src_lang_code: str, + tgt_lang_code: str, +) -> str: + """ + Build translation prompt for given target language, defaulting to English template. + """ + key = (target_lang or "").lower() + template = TRANSLATION_PROMPTS.get(key) or TRANSLATION_PROMPTS["en"] + return template.format( + source_lang=source_lang_label, + target_lang=target_lang_label, + src_lang_code=src_lang_code, + tgt_lang_code=tgt_lang_code, + text=text, + ) + + +def llm_translate( + text: str, + target_lang: str, + *, + source_lang: Optional[str] = None, + source_lang_label: Optional[str] = None, + target_lang_label: Optional[str] = None, + timeout_sec: Optional[float] = None, +) -> Optional[str]: + """ + Translate text with Qwen chat model using rich prompts. 
+ + - 根据目标语言选择提示词,如果没匹配到则退回英文模板。 + - 不对 text 做语言检测或缓存,调用方自行控制。 + + Args: + text: 原始文本 + target_lang: 目标语言代码(如 "zh", "en") + source_lang: 源语言代码(可选,不影响提示词选择,仅用于日志) + source_lang_label: 源语言展示名称,用于 prompt(默认使用 source_lang) + target_lang_label: 目标语言展示名称,用于 prompt(默认使用 target_lang) + timeout_sec: 请求超时时间(秒,可选;若未配置则从 config 读取或采用默认) + + Returns: + 翻译后的文本;如失败则返回 None。 + """ + if not text or not str(text).strip(): + return text + + cfg = get_translation_config() + provider_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {} + + model_name = provider_cfg.get("model") or QWEN_MODEL_NAME + req_timeout = float(provider_cfg.get("timeout_sec") or timeout_sec or 30.0) + base_url = (provider_cfg.get("base_url") or "").strip() or None + + client = _get_qwen_client(base_url=base_url) + if not client: + # 无法调用云端,直接回退 + logger.warning( + "[llm_translate] Client init failed; returning original text. " + "text=%r target_lang=%s source_lang=%s", + text[:80], + target_lang, + source_lang or "auto", + ) + return text + + tgt = (target_lang or "").lower() or "en" + src = (source_lang or "auto").lower() + src_label = source_lang_label or src + tgt_label = target_lang_label or tgt + + prompt = _build_prompt( + text=text, + target_lang=tgt, + source_lang_label=src_label, + target_lang_label=tgt_label, + src_lang_code=src, + tgt_lang_code=tgt, + ) + + start = time.time() + try: + completion = client.chat.completions.create( + model=model_name, + messages=[ + { + "role": "user", + "content": prompt, + } + ], + timeout=req_timeout, + ) + content = (completion.choices[0].message.content or "").strip() + duration_ms = (time.time() - start) * 1000 + logger.info( + "[llm_translate] Success | model=%s src=%s tgt=%s latency=%.1fms text=%r -> %r", + model_name, + src, + tgt, + duration_ms, + text[:80], + content[:80], + ) + return content or text + except Exception as exc: + duration_ms = (time.time() - start) * 1000 + logger.warning( + "[llm_translate] Failed | model=%s src=%s 
tgt=%s latency=%.1fms error=%s", + model_name, + src, + tgt, + duration_ms, + exc, + exc_info=True, + ) + # 安全回退:出错时返回原文,避免中断上游流程 + return text + + +__all__ = [ + "TRANSLATION_PROMPTS", + "llm_translate", +] + diff --git a/query/qwen_mt_translate.py b/query/qwen_mt_translate.py new file mode 100644 index 0000000..ee39071 --- /dev/null +++ b/query/qwen_mt_translate.py @@ -0,0 +1,963 @@ +""" +Translation service for multi-language query support. + +Supports multiple translation models: +- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model +- DeepL: DeepL API for high-quality translations + +重要说明(Qwen 机翻限速): +- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)** +- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流 +- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端 + +使用方法 (Usage): + +```python +from query.translator import Translator + +# 使用默认的 qwen 模型(推荐) +translator = Translator() # 默认使用 qwen 模型 + +# 或显式指定模型 +translator = Translator(model='qwen') # 使用 qwen 模型 +translator = Translator(model='deepl') # 使用 DeepL 模型 + +# 翻译文本 +result = translator.translate( + text="我看到这个视频后没有笑", + target_lang="en", + source_lang="auto" # 自动检测源语言 +) +``` + +配置说明 (Configuration): +- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中) +- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中) + +Qwen 模型参考文档: +- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key +- 模型:qwen-mt-flash(快速翻译模型) + +DeepL 官方文档: +https://developers.deepl.com/api-reference/translate/request-translation +""" + +import os +import requests +import re +import redis +from concurrent.futures import ThreadPoolExecutor, Future +from datetime import timedelta +from typing import Dict, List, Optional, Union +import logging +import time + +logger = logging.getLogger(__name__) + +from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG +from openai import OpenAI + + +class Translator: + """ + Multi-language translator supporting Qwen and DeepL APIs. 
+ + Default model is 'qwen' which uses Alibaba Cloud DashScope API. + """ +# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 +# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 +# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 + + DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier + QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" # 北京地域 + # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡 + # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 + QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型 + + # Language code mapping + LANG_CODE_MAP = { + 'zh': 'ZH', + 'en': 'EN', + 'ru': 'RU', + 'ar': 'AR', + 'ja': 'JA', + 'es': 'ES', + 'de': 'DE', + 'fr': 'FR', + 'it': 'IT', + 'pt': 'PT', + } + + def __init__( + self, + model: str = "qwen", + api_key: Optional[str] = None, + use_cache: bool = True, + timeout: int = 10, + glossary_id: Optional[str] = None, + translation_context: Optional[str] = None + ): + """ + Initialize translator. + + Args: + model: Translation model to use. Options: 'qwen' (default) or 'deepl' + api_key: API key for the selected model (or None to use from config/env) + use_cache: Whether to cache translations + timeout: Request timeout in seconds + glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL) + translation_context: Context hint for translation (e.g., "e-commerce", "product search") + """ + self.model = model.lower() + if self.model not in ['qwen', 'deepl']: + raise ValueError(f"Unsupported model: {model}. 
Supported models: 'qwen', 'deepl'") + + # Get API key from config if not provided + if api_key is None: + if self.model == 'qwen': + api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") + else: # deepl + api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY") + + self.api_key = api_key + self.timeout = timeout + self.use_cache = use_cache + self.glossary_id = glossary_id + self.translation_context = translation_context or "e-commerce product search" + + # Initialize OpenAI client for Qwen if needed + self.qwen_client = None + if self.model == 'qwen': + if not self.api_key: + logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.") + else: + self.qwen_client = OpenAI( + api_key=self.api_key, + base_url=self.QWEN_BASE_URL, + ) + + # Initialize Redis cache if enabled + if use_cache: + try: + self.redis_client = redis.Redis( + host=REDIS_CONFIG.get('host', 'localhost'), + port=REDIS_CONFIG.get('port', 6479), + password=REDIS_CONFIG.get('password'), + decode_responses=True, # Return str instead of bytes + socket_timeout=REDIS_CONFIG.get('socket_timeout', 1), + socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1), + retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False), + health_check_interval=10, # 避免复用坏连接 + ) + # Test connection + self.redis_client.ping() + expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360) + self.expire_time = timedelta(days=expire_days) + self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数 + self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans') + logger.info("Redis cache initialized for translations") + except Exception as e: + logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache") + self.redis_client = None + self.cache = None + else: + self.redis_client = None + self.cache = None + + # Thread pool for async translation + self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") + + def 
translate( + self, + text: str, + target_lang: str, + source_lang: Optional[str] = None, + context: Optional[str] = None, + prompt: Optional[str] = None + ) -> Optional[str]: + """ + Translate text to target language (synchronous mode). + + Args: + text: Text to translate + target_lang: Target language code ('zh', 'en', 'ru', etc.) + source_lang: Source language code (option al, auto-detect if None) + context: Additional context for translation (overrides default context) + prompt: Translation prompt/instruction (optional, for better translation quality) + + Returns: + Translated text or None if translation fails + """ + if not text or not text.strip(): + return text + + # Normalize language codes + target_lang = target_lang.lower() + if source_lang: + source_lang = source_lang.lower() + + # Optimization: Skip translation if not needed + if target_lang == 'en' and self._is_english_text(text): + logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") + return text + + if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): + logger.info( + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)" + ) + return text + + # Use provided context or default context + translation_context = context or self.translation_context + + # Build cache key (include prompt in cache key if provided) + cache_key_parts = [source_lang or 'auto', target_lang, translation_context] + if prompt: + cache_key_parts.append(prompt) + cache_key_parts.append(text) + cache_key = ':'.join(cache_key_parts) + + # Check cache (include context and prompt in cache key for accuracy) + if self.use_cache and self.redis_client: + cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt) + if cached: + logger.info( + f"[Translator] Translation 
request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit" + ) + return cached + + # If no API key, return mock translation (for testing) + if not self.api_key: + logger.info( + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)" + ) + return text + + # Translate using selected model + logger.info( + f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Context: {translation_context} | " + f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation" + ) + + if self.model == 'qwen': + result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt) + else: # deepl + result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) + + # Surface translation failure to the caller instead of silently + # masquerading the source text as a successful translation. + if result is None: + logger.warning( + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Status: Translation failed" + ) + else: + logger.info( + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" + ) + + # Cache only successful translations. Failed attempts must not poison + # Redis with the original text. 
+        if result is not None and self.use_cache and self.redis_client:
+            self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt)
+
+        return result
+
+    def _translate_qwen(
+        self,
+        text: str,
+        target_lang: str,
+        source_lang: Optional[str],
+        context: Optional[str] = None,
+        prompt: Optional[str] = None
+    ) -> Optional[str]:
+        """
+        Translate using Qwen MT Flash model via Alibaba Cloud DashScope API.
+
+        Args:
+            text: Text to translate
+            target_lang: Target language code ('zh', 'en', 'ru', etc.)
+            source_lang: Source language code (optional, 'auto' if None)
+            context: Context hint for translation (optional)
+            prompt: Translation prompt/instruction (optional)
+
+        Returns:
+            Translated text or None if translation fails
+        """
+        if not self.qwen_client:
+            logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.")
+            return None
+
+        # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping
+        # Standard taken from the provided "Language / English name / Code" reference table
+        qwen_lang_map = {
+            "en": "English",
+            "zh": "Chinese",
+            "zh_tw": "Traditional Chinese",
+            "ru": "Russian",
+            "ja": "Japanese",
+            "ko": "Korean",
+            "es": "Spanish",
+            "fr": "French",
+            "pt": "Portuguese",
+            "de": "German",
+            "it": "Italian",
+            "th": "Thai",
+            "vi": "Vietnamese",
+            "id": "Indonesian",
+            "ms": "Malay",
+            "ar": "Arabic",
+            "hi": "Hindi",
+            "he": "Hebrew",
+            "my": "Burmese",
+            "ta": "Tamil",
+            "ur": "Urdu",
+            "bn": "Bengali",
+            "pl": "Polish",
+            "nl": "Dutch",
+            "ro": "Romanian",
+            "tr": "Turkish",
+            "km": "Khmer",
+            "lo": "Lao",
+            "yue": "Cantonese",
+            "cs": "Czech",
+            "el": "Greek",
+            "sv": "Swedish",
+            "hu": "Hungarian",
+            "da": "Danish",
+            "fi": "Finnish",
+            "uk": "Ukrainian",
+            "bg": "Bulgarian",
+        }
+
+        # Convert target language
+        target_lang_normalized = target_lang.lower()
+        target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize())
+
+        # Convert source language
+        source_lang_normalized = (source_lang or "").strip().lower()
+        if not source_lang_normalized or source_lang_normalized == "auto":
+            source_lang_qwen = "auto"
+        else:
+            source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize())
+
+        # Prepare translation options
+        translation_options = {
+            "source_lang": source_lang_qwen,
+            "target_lang": target_lang_qwen,
+        }
+
+        # Prepare messages
+        messages = [
+            {
+                "role": "user",
+                "content": text
+            }
+        ]
+
+        start_time = time.time()
+        try:
+            completion = self.qwen_client.chat.completions.create(
+                model=self.QWEN_MODEL,
+                messages=messages,
+                extra_body={
+                    "translation_options": translation_options
+                }
+            )
+
+            translated_text = completion.choices[0].message.content.strip()
+            duration_ms = (time.time() - start_time) * 1000
+
+            logger.info(
+                f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | "
+                f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms"
+            )
+            return translated_text
+
+        except Exception as e:
+            duration_ms = (time.time() - start_time) * 1000
+            logger.error(
+                f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | "
+                f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True
+            )
+            return None
+
+    def _translate_deepl(
+        self,
+        text: str,
+        target_lang: str,
+        source_lang: Optional[str],
+        context: Optional[str] = None,
+        prompt: Optional[str] = None
+    ) -> Optional[str]:
+        """
+        Translate using DeepL API with context and glossary support.
+
+        Args:
+            text: Text to translate
+            target_lang: Target language code
+            source_lang: Source language code (optional)
+            context: Context hint for translation (e.g., "e-commerce product search")
+        """
+        # Map to DeepL language codes
+        target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())
+
+        headers = {
+            "Authorization": f"DeepL-Auth-Key {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+        # Use prompt as context parameter for DeepL API (not as text prefix)
+        # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"
+        # If prompt is provided, use it as context; otherwise use the default context
+        api_context = prompt if prompt else context
+
+        # For e-commerce, add context words to help DeepL understand the domain
+        # This is especially important for single-word ambiguous terms like "车" (car vs rook)
+        text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)
+
+        payload = {
+            "text": [text_to_translate],
+            "target_lang": target_code,
+        }
+
+        if source_lang:
+            source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
+            payload["source_lang"] = source_code
+
+        # Add context parameter (prompt or default context)
+        # Context influences translation but is not translated itself
+        if api_context:
+            payload["context"] = api_context
+
+        # Add glossary if configured
+        if self.glossary_id:
+            payload["glossary_id"] = self.glossary_id
+
+        # Note: DeepL API v2 supports "context" parameter for additional context
+        # that influences translation but is not translated itself.
+        # We use prompt as context parameter when provided.
+
+        try:
+            response = requests.post(
+                self.DEEPL_API_URL,
+                headers=headers,
+                json=payload,
+                timeout=self.timeout
+            )
+
+            if response.status_code == 200:
+                data = response.json()
+                if "translations" in data and len(data["translations"]) > 0:
+                    translated_text = data["translations"][0]["text"]
+                    # If we added context, extract just the term from the result
+                    if needs_extraction:
+                        translated_text = self._extract_term_from_translation(
+                            translated_text, text, target_code
+                        )
+                    logger.debug(
+                        f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "
+                        f"Translation result: '{translated_text}'"
+                    )
+                    return translated_text
+            else:
+                logger.error(
+                    f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "
+                    f"Status code: {response.status_code} | Error message: {response.text}"
+                )
+                return None
+
+        except requests.Timeout:
+            logger.warning(
+                f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "
+                f"Timeout: {self.timeout}s"
+            )
+            return None
+        except Exception as e:
+            logger.error(
+                f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "
+                f"Error: {e}", exc_info=True
+            )
+            return None
+
+    # NOTE: _translate_deepl_free is intentionally not implemented.
+    # We do not support automatic fallback to the free endpoint, to avoid
+    # mixing Pro keys with https://api-free.deepl.com and related 403 errors.
+
+    def translate_multi(
+        self,
+        text: str,
+        target_langs: List[str],
+        source_lang: Optional[str] = None,
+        context: Optional[str] = None,
+        async_mode: bool = True,
+        prompt: Optional[str] = None
+    ) -> Dict[str, Optional[str]]:
+        """
+        Translate text to multiple target languages.
+
+        In async_mode=True (default):
+        - Returns cached translations immediately if available
+        - For translations that can be optimized (e.g., pure numbers, already in target language),
+          returns result immediately via synchronous call
+        - Launches async tasks for other missing translations (non-blocking)
+        - Returns None for missing translations that require async processing
+
+        In async_mode=False:
+        - Waits for all translations to complete (blocking)
+
+        Args:
+            text: Text to translate
+            target_langs: List of target language codes
+            source_lang: Source language code (optional)
+            context: Context hint for translation (optional)
+            async_mode: If True, return cached results immediately and translate missing ones async
+            prompt: Translation prompt/instruction (optional)
+
+        Returns:
+            Dictionary mapping language code to translated text (only cached results in async mode)
+        """
+        results = {}
+        missing_langs = []
+        async_langs = []
+
+        # First, get cached translations
+        for lang in target_langs:
+            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
+            if cached is not None:
+                results[lang] = cached
+            else:
+                missing_langs.append(lang)
+
+        # If async mode and there are missing translations
+        if async_mode and missing_langs:
+            # Check if translation can be optimized (immediate return)
+            for lang in missing_langs:
+                target_lang = lang.lower()
+                # Check optimization conditions (same as in translate method)
+                can_optimize = False
+                if target_lang == 'en' and self._is_english_text(text):
+                    can_optimize = True
+                elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
+                    can_optimize = True
+
+                if can_optimize:
+                    # Can be optimized, call translate synchronously for immediate result
+                    results[lang] = self.translate(text, lang, source_lang, context, prompt)
+                else:
+                    # Requires actual translation, add to async list
+                    async_langs.append(lang)
+
+            # Launch async tasks for translations that require actual API calls
+            if async_langs:
+                for lang in async_langs:
+                    self._translate_async(text, lang, source_lang, context, prompt)
+                # Return None for async translations
+                for lang in async_langs:
+                    results[lang] = None
+        else:
+            # Synchronous mode: wait for all translations
+            for lang in missing_langs:
+                results[lang] = self.translate(text, lang, source_lang, context, prompt)
+
+        return results
+
+    def translate_multi_async(
+        self,
+        text: str,
+        target_langs: List[str],
+        source_lang: Optional[str] = None,
+        context: Optional[str] = None,
+        prompt: Optional[str] = None
+    ) -> Dict[str, Union[str, Future]]:
+        """
+        Translate text to multiple target languages asynchronously, returning Futures that can be awaited.
+
+        This method returns a dictionary where:
+        - If translation is cached, the value is the translation string (immediate)
+        - If translation needs to be done, the value is a Future object that can be awaited
+
+        Args:
+            text: Text to translate
+            target_langs: List of target language codes
+            source_lang: Source language code (optional)
+            context: Context hint for translation (optional)
+            prompt: Translation prompt/instruction (optional)
+
+        Returns:
+            Dictionary mapping language code to either translation string (cached) or Future object
+        """
+        results = {}
+        missing_langs = []
+
+        # First, get cached translations
+        for lang in target_langs:
+            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
+            if cached is not None:
+                results[lang] = cached
+            else:
+                missing_langs.append(lang)
+
+        # For missing translations, submit async tasks and return Futures
+        for lang in missing_langs:
+            future = self.executor.submit(
+                self.translate,
+                text,
+                lang,
+                source_lang,
+                context,
+                prompt
+            )
+            results[lang] = future
+
+        return results
+
+    def _get_cached_translation(
+        self,
+        text: str,
+        target_lang: str,
+        source_lang: Optional[str] = None,
+        context: Optional[str] = None,
+        prompt: Optional[str] = None
+    ) -> Optional[str]:
+        """Get translation from cache if available."""
+        if not self.redis_client:
+            return None
+        return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
+
+    def _get_cached_translation_redis(
+        self,
+        text: str,
+        target_lang: str,
+        source_lang: Optional[str] = None,
+        context: Optional[str] = None,
+        prompt: Optional[str] = None
+    ) -> Optional[str]:
+        """
+        Get translation from Redis cache with sliding expiration.
+
+        Sliding expiration: every cache access resets the TTL to the configured
+        expiration (default 720 days), so an entry expires 720 days after its
+        last access rather than 720 days after the write. This keeps
+        frequently used translation cache entries from being evicted too early.
+        """
+        if not self.redis_client:
+            return None
+
+        try:
+            # Build cache key: prefix:target_lang:text
+            # For simplicity, we use target_lang and text as key
+            # Context and prompt are not included in key to maximize cache hits
+            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
+            value = self.redis_client.get(cache_key)
+            if value:
+                # Sliding expiration: reset expiration time on access
+                # (on every read the TTL is reset, so the entry expires N days after the last access)
+                try:
+                    self.redis_client.expire(cache_key, self.expire_seconds)
+                except Exception as expire_error:
+                    # Even if expire fails, still return the cached value (does not affect functionality)
+                    logger.warning(
+                        f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}"
+                    )
+
+                logger.debug(
+                    f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "
+                    f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s"
+                )
+                return value
+            logger.debug(
+                f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "
+                f"Cache key: {cache_key}"
+            )
+            return None
+        except Exception as e:
+            logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}")
+            return None
+
+    def _set_cached_translation_redis(
+        self,
+        text: str,
+        target_lang: str,
+        translation: str,
+        source_lang: Optional[str] = None,
+        context: Optional[str] = None,
+        prompt: Optional[str] = None
+    ) -> None:
+        """Store translation in Redis cache."""
+        if not self.redis_client:
+            return
+
+        try:
+            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
+            self.redis_client.setex(cache_key, self.expire_seconds, translation)
+            logger.info(
+                f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | "
+                f"Cache key: {cache_key} | Translation result: '{translation}'"
+            )
+        except Exception as e:
+            logger.error(
+                f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | "
+                f"Error: {e}"
+            )
+
+    def _translate_async(
+        self,
+        text: str,
+        target_lang: str,
+        source_lang: Optional[str] = None,
+        context: Optional[str] = None,
+        prompt: Optional[str] = None
+    ):
+        """Launch async translation task."""
+        def _do_translate():
+            try:
+                result = self.translate(text, target_lang, source_lang, context, prompt)
+                if result:
+                    logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")
+            except Exception as e:
+                logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}")
+
+        self.executor.submit(_do_translate)
+
+    def _add_ecommerce_context(
+        self,
+        text: str,
+        source_lang: Optional[str],
+        context: Optional[str]
+    ) -> tuple:
+        """
+        Add e-commerce context to text for better disambiguation.
+
+        For single-word ambiguous Chinese terms, we add context words that help
+        DeepL understand this is an e-commerce/product search context.
+
+        Args:
+            text: Original text to translate
+            source_lang: Source language code
+            context: Context hint
+
+        Returns:
+            Tuple of (text_with_context, needs_extraction)
+            - text_with_context: Text to send to DeepL
+            - needs_extraction: Whether we need to extract the term from the result
+        """
+        # Only apply for e-commerce context and Chinese source
+        if not context or "e-commerce" not in context.lower():
+            return text, False
+
+        if not source_lang or source_lang.lower() != 'zh':
+            return text, False
+
+        # For single-word queries, add context to help disambiguation
+        text_stripped = text.strip()
+        if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:
+            # Common ambiguous Chinese e-commerce terms like "车" (car vs rook)
+            # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term])
+            # This helps DeepL understand the e-commerce context
+            # We'll need to extract just the term from the translation result
+            context_phrase = f"购买 {text_stripped}"
+            return context_phrase, True
+
+        # For multi-word queries, DeepL usually has enough context
+        return text, False
+
+    def _extract_term_from_translation(
+        self,
+        translated_text: str,
+        original_text: str,
+        target_lang_code: str
+    ) -> str:
+        """
+        Extract the actual term from a translation that included context.
+
+        For example, if we translated "购买 车" (buy car) and got "buy car",
+        we want to extract just "car".
+
+        Args:
+            translated_text: Full translation result
+            original_text: Original single-word query
+            target_lang_code: Target language code (EN, ZH, etc.)
+
+        Returns:
+            Extracted term or original translation if extraction fails
+        """
+        # For English target, try to extract the last word (the actual term)
+        if target_lang_code == "EN":
+            words = translated_text.strip().split()
+            if len(words) > 1:
+                # Usually the last word is the term we want
+                # But we need to be smart - if it's "buy car", we want "car"
+                # Common context words to skip: buy, purchase, product, item, etc.
+                context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
+                # Try to find the term (not a context word)
+                for word in reversed(words):
+                    word_lower = word.lower().rstrip('.,!?;:')
+                    if word_lower not in context_words:
+                        return word_lower
+                # If all words are context words, return the last one
+                return words[-1].lower().rstrip('.,!?;:')
+
+        # For other languages or if extraction fails, return as-is
+        # The user can configure a glossary for better results
+        return translated_text
+
+    def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool:
+        """True if shop language matches index language (use source, no translate)."""
+        if not shop_lang_lower or not lang_code:
+            return False
+        if shop_lang_lower == lang_code:
+            return True
+        if lang_code == "zh" and "zh" in shop_lang_lower:
+            return True
+        if lang_code == "en" and "en" in shop_lang_lower:
+            return True
+        return False
+
+    def translate_for_indexing(
+        self,
+        text: str,
+        shop_language: str,
+        source_lang: Optional[str] = None,
+        context: Optional[str] = None,
+        prompt: Optional[str] = None,
+        index_languages: Optional[List[str]] = None,
+    ) -> Dict[str, Optional[str]]:
+        """
+        Translate text for indexing based on shop language and tenant index_languages.
+
+        For each language in index_languages: use source text if shop language matches,
+        otherwise translate to that language.
+
+        Args:
+            text: Text to translate
+            shop_language: Shop primary language (e.g. 'zh', 'en', 'ru')
+            source_lang: Source language code (optional)
+            context: Additional context for translation (optional)
+            prompt: Translation prompt (optional)
+            index_languages: Languages to index (from tenant_config). Default ["en", "zh"].
+
+        Returns:
+            Dict keyed by each index_language with translated or source text (or None).
+        """
+        langs = index_languages if index_languages else ["en", "zh"]
+        results = {lang: None for lang in langs}
+        if not text or not text.strip():
+            return results
+        if re.match(r'^[\d\s_-]+$', text):
+            logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
+            return results
+
+        shop_lang_lower = (shop_language or "").strip().lower()
+        targets = []
+        for lang in langs:
+            if self._shop_lang_matches(shop_lang_lower, lang):
+                results[lang] = text
+            else:
+                targets.append(lang)
+
+        for target_lang in targets:
+            cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
+            if cached:
+                results[target_lang] = cached
+                logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}")
+                continue
+            translated = self.translate(
+                text,
+                target_lang=target_lang,
+                source_lang=source_lang or shop_language,
+                context=context,
+                prompt=prompt,
+            )
+            results[target_lang] = translated
+        return results
+
+    def get_translation_needs(
+        self,
+        detected_lang: str,
+        supported_langs: List[str]
+    ) -> List[str]:
+        """
+        Determine which languages need translation.
+
+        Args:
+            detected_lang: Detected query language
+            supported_langs: List of supported languages
+
+        Returns:
+            List of language codes to translate to
+        """
+        # If detected language is in supported list, translate to others
+        if detected_lang in supported_langs:
+            return [lang for lang in supported_langs if detected_lang != lang]
+
+        # Otherwise, translate to all supported languages
+        return supported_langs
+
+    def _is_english_text(self, text: str) -> bool:
+        """
+        Check if text is primarily English (ASCII letters, numbers, common punctuation).
+
+        Args:
+            text: Text to check
+
+        Returns:
+            True if text appears to be English
+        """
+        if not text or not text.strip():
+            return True
+
+        # Remove whitespace and common punctuation
+        text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text)
+        if not text_clean:
+            return True
+
+        # Check if all remaining characters are ASCII (letters, numbers)
+        # This is a simple heuristic: if most characters are ASCII, it's likely English
+        ascii_count = sum(1 for c in text_clean if ord(c) < 128)
+        ratio = ascii_count / len(text_clean) if text_clean else 0
+
+        # If more than 80% are ASCII characters, consider it English
+        return ratio > 0.8
+
+    def _contains_chinese(self, text: str) -> bool:
+        """
+        Check if text contains Chinese characters (Han characters).
+
+        Args:
+            text: Text to check
+
+        Returns:
+            True if text contains Chinese characters
+        """
+        if not text:
+            return False
+
+        # Check for Chinese characters (Unicode range: \u4e00-\u9fff)
+        chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
+        return bool(chinese_pattern.search(text))
+
+    def _is_pure_number(self, text: str) -> bool:
+        """
+        Check if text is purely numeric (digits, possibly with spaces, dots, commas).
+
+        Args:
+            text: Text to check
+
+        Returns:
+            True if text is purely numeric
+        """
+        if not text or not text.strip():
+            return False
+
+        # Remove whitespace, dots, commas (common number separators)
+        text_clean = re.sub(r'[\s\.,]', '', text.strip())
+        if not text_clean:
+            return False
+
+        # Check if all remaining characters are digits
+        return text_clean.isdigit()
diff --git a/query/test_translation.py b/query/test_translation.py
index ff38e61..1ce00f5 100755
--- a/query/test_translation.py
+++ b/query/test_translation.py
@@ -19,7 +19,7 @@ from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
 from config import ConfigLoader
-from query.translator import Translator
+from query.qwen_mt_translate import Translator
 import logging
 
 # Configure logging
diff --git a/query/translator.py b/query/translator.py
index ee39071..77b829f 100644
--- a/query/translator.py
+++ b/query/translator.py
@@ -13,7 +13,7 @@ Supports multiple translation models:
 使用方法 (Usage):
 ```python
-from query.translator import Translator
+from query.qwen_mt_translate import Translator
 
 # 使用默认的 qwen 模型(推荐)
 translator = Translator()  # 默认使用 qwen 模型
diff --git a/tests/test_translator_failure_semantics.py b/tests/test_translator_failure_semantics.py
index 286468c..5f8fde8 100644
--- a/tests/test_translator_failure_semantics.py
+++ b/tests/test_translator_failure_semantics.py
@@ -1,4 +1,4 @@
-from query.translator import Translator
+from query.qwen_mt_translate import Translator
 
 
 class _RecordingRedis:
-- 
libgit2 0.21.2