Commit d4cadc13bd22491045c3580a54d0aaa1d4f625e6
1 parent
a0a173ae
翻译重构
Showing
21 changed files
with
832 additions
and
2263 deletions
Show diff stats
api/routes/search.py
api/translator_app.py
| ... | ... | @@ -98,7 +98,9 @@ from pydantic import BaseModel, Field |
| 98 | 98 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| 99 | 99 | |
| 100 | 100 | from query.qwen_mt_translate import Translator |
| 101 | -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG | |
| 101 | +from query.llm_translate import LLMTranslatorProvider | |
| 102 | +from query.deepl_provider import DeepLProvider | |
| 103 | +from config.services_config import get_translation_config | |
| 102 | 104 | |
| 103 | 105 | # Configure logging |
| 104 | 106 | logging.basicConfig( |
| ... | ... | @@ -107,23 +109,52 @@ logging.basicConfig( |
| 107 | 109 | ) |
| 108 | 110 | logger = logging.getLogger(__name__) |
| 109 | 111 | |
| 110 | -# Fixed translation prompt | |
| 111 | -TRANSLATION_PROMPT = "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language." | |
| 112 | - | |
| 113 | 112 | # Global translator instances cache (keyed by model) |
| 114 | -_translators: Dict[str, Translator] = {} | |
| 113 | +_translators: Dict[str, object] = {} | |
| 114 | + | |
| 115 | 115 | |
| 116 | +def _resolve_default_model() -> str: | |
| 117 | + """ | |
| 118 | + Resolve translator model from services.translation config first. | |
| 116 | 119 | |
| 117 | -def get_translator(model: str = "qwen") -> Translator: | |
| 120 | + Priority: | |
| 121 | + 1) TRANSLATION_MODEL env (explicit runtime override) | |
| 122 | + 2) services.translation.provider + providers.<provider>.model | |
| 123 | + 3) qwen-mt | |
| 124 | + """ | |
| 125 | + env_model = (os.getenv("TRANSLATION_MODEL") or "").strip() | |
| 126 | + if env_model: | |
| 127 | + return env_model | |
| 128 | + try: | |
| 129 | + cfg = get_translation_config() | |
| 130 | + provider = (cfg.provider or "").strip().lower() | |
| 131 | + provider_cfg = cfg.get_provider_cfg() if hasattr(cfg, "get_provider_cfg") else {} | |
| 132 | + model = (provider_cfg.get("model") or "").strip().lower() if isinstance(provider_cfg, dict) else "" | |
| 133 | + if provider == "llm": | |
| 134 | + return "llm" | |
| 135 | + if provider in {"qwen-mt", "direct", "http"}: | |
| 136 | + return model or "qwen-mt" | |
| 137 | + if provider == "deepl": | |
| 138 | + return "deepl" | |
| 139 | + except Exception: | |
| 140 | + pass | |
| 141 | + return "qwen-mt" | |
| 142 | + | |
| 143 | + | |
| 144 | +def get_translator(model: str = "qwen") -> object: | |
| 118 | 145 | """Get or create translator instance for the specified model.""" |
| 119 | 146 | global _translators |
| 120 | 147 | if model not in _translators: |
| 121 | 148 | logger.info(f"Initializing translator with model: {model}...") |
| 122 | - _translators[model] = Translator( | |
| 123 | - model=model, | |
| 124 | - use_cache=True, | |
| 125 | - timeout=10 | |
| 126 | - ) | |
| 149 | + normalized = (model or "qwen").strip().lower() | |
| 150 | + if normalized in {"qwen", "qwen-mt", "qwen-mt-flash", "qwen-mt-flush"}: | |
| 151 | + _translators[model] = Translator(model=normalized, use_cache=True, timeout=10) | |
| 152 | + elif normalized == "deepl": | |
| 153 | + _translators[model] = DeepLProvider(api_key=None, timeout=10.0) | |
| 154 | + elif normalized == "llm": | |
| 155 | + _translators[model] = LLMTranslatorProvider() | |
| 156 | + else: | |
| 157 | + raise ValueError(f"Unsupported model: {model}") | |
| 127 | 158 | logger.info(f"Translator initialized with model: {model}") |
| 128 | 159 | return _translators[model] |
| 129 | 160 | |
| ... | ... | @@ -134,7 +165,9 @@ class TranslationRequest(BaseModel): |
| 134 | 165 | text: str = Field(..., description="Text to translate") |
| 135 | 166 | target_lang: str = Field(..., description="Target language code (zh, en, ru, etc.)") |
| 136 | 167 | source_lang: Optional[str] = Field(None, description="Source language code (optional, auto-detect if not provided)") |
| 137 | - model: Optional[str] = Field("qwen", description="Translation model: 'qwen' (default) or 'deepl'") | |
| 168 | + model: Optional[str] = Field(None, description="Translation model: qwen-mt | deepl | llm") | |
| 169 | + context: Optional[str] = Field(None, description="Optional translation scene or context") | |
| 170 | + prompt: Optional[str] = Field(None, description="Optional prompt override") | |
| 138 | 171 | |
| 139 | 172 | class Config: |
| 140 | 173 | json_schema_extra = { |
| ... | ... | @@ -142,7 +175,8 @@ class TranslationRequest(BaseModel): |
| 142 | 175 | "text": "商品名称", |
| 143 | 176 | "target_lang": "en", |
| 144 | 177 | "source_lang": "zh", |
| 145 | - "model": "qwen" | |
| 178 | + "model": "llm", | |
| 179 | + "context": "sku_name" | |
| 146 | 180 | } |
| 147 | 181 | } |
| 148 | 182 | |
| ... | ... | @@ -180,8 +214,7 @@ app.add_middleware( |
| 180 | 214 | async def startup_event(): |
| 181 | 215 | """Initialize translator on startup.""" |
| 182 | 216 | logger.info("Starting Translation Service API on port 6006") |
| 183 | - # Get default model from environment variable or use 'qwen' | |
| 184 | - default_model = os.getenv("TRANSLATION_MODEL", "qwen") | |
| 217 | + default_model = _resolve_default_model() | |
| 185 | 218 | try: |
| 186 | 219 | get_translator(model=default_model) |
| 187 | 220 | logger.info(f"Translation service ready with default model: {default_model}") |
| ... | ... | @@ -194,15 +227,17 @@ async def startup_event(): |
| 194 | 227 | async def health_check(): |
| 195 | 228 | """Health check endpoint.""" |
| 196 | 229 | try: |
| 197 | - default_model = os.getenv("TRANSLATION_MODEL", "qwen") | |
| 198 | - translator = get_translator(model=default_model) | |
| 230 | + # 仅做轻量级本地检查,避免在健康检查中触发潜在的阻塞初始化或外部依赖 | |
| 231 | + default_model = _resolve_default_model() | |
| 232 | + # 如果启动事件成功,默认模型通常会已经初始化到缓存中 | |
| 233 | + translator = _translators.get(default_model) or next(iter(_translators.values()), None) | |
| 199 | 234 | return { |
| 200 | 235 | "status": "healthy", |
| 201 | 236 | "service": "translation", |
| 202 | 237 | "default_model": default_model, |
| 203 | 238 | "available_models": list(_translators.keys()), |
| 204 | 239 | "translator_initialized": translator is not None, |
| 205 | - "cache_enabled": translator.use_cache if translator else False | |
| 240 | + "cache_enabled": bool(getattr(translator, "use_cache", False)) | |
| 206 | 241 | } |
| 207 | 242 | except Exception as e: |
| 208 | 243 | logger.error(f"Health check failed: {e}") |
| ... | ... | @@ -238,11 +273,11 @@ async def translate(request: TranslationRequest): |
| 238 | 273 | ) |
| 239 | 274 | |
| 240 | 275 | # Validate model parameter |
| 241 | - model = request.model.lower() if request.model else "qwen" | |
| 242 | - if model not in ['qwen', 'deepl']: | |
| 276 | + model = request.model.lower() if request.model else _resolve_default_model().lower() | |
| 277 | + if model not in ["qwen", "qwen-mt", "deepl", "llm"]: | |
| 243 | 278 | raise HTTPException( |
| 244 | 279 | status_code=400, |
| 245 | - detail=f"Invalid model: {model}. Supported models: 'qwen', 'deepl'" | |
| 280 | + detail="Invalid model. Supported models: 'qwen-mt', 'deepl', 'llm'" | |
| 246 | 281 | ) |
| 247 | 282 | |
| 248 | 283 | try: |
| ... | ... | @@ -254,7 +289,8 @@ async def translate(request: TranslationRequest): |
| 254 | 289 | text=request.text, |
| 255 | 290 | target_lang=request.target_lang, |
| 256 | 291 | source_lang=request.source_lang, |
| 257 | - prompt=TRANSLATION_PROMPT | |
| 292 | + context=request.context, | |
| 293 | + prompt=request.prompt, | |
| 258 | 294 | ) |
| 259 | 295 | |
| 260 | 296 | if translated_text is None: |
| ... | ... | @@ -269,7 +305,7 @@ async def translate(request: TranslationRequest): |
| 269 | 305 | source_lang=request.source_lang, |
| 270 | 306 | translated_text=translated_text, |
| 271 | 307 | status="success", |
| 272 | - model=translator.model | |
| 308 | + model=str(getattr(translator, "model", model)) | |
| 273 | 309 | ) |
| 274 | 310 | |
| 275 | 311 | except HTTPException: | ... | ... |
config/__init__.py
| ... | ... | @@ -28,6 +28,7 @@ from .services_config import ( |
| 28 | 28 | get_translation_base_url, |
| 29 | 29 | get_embedding_base_url, |
| 30 | 30 | get_rerank_service_url, |
| 31 | + get_translation_cache_config, | |
| 31 | 32 | ServiceConfig, |
| 32 | 33 | ) |
| 33 | 34 | |
| ... | ... | @@ -53,5 +54,6 @@ __all__ = [ |
| 53 | 54 | 'get_translation_base_url', |
| 54 | 55 | 'get_embedding_base_url', |
| 55 | 56 | 'get_rerank_service_url', |
| 57 | + 'get_translation_cache_config', | |
| 56 | 58 | 'ServiceConfig', |
| 57 | 59 | ] | ... | ... |
config/config.yaml
| ... | ... | @@ -81,18 +81,6 @@ query_config: |
| 81 | 81 | translation_service: "deepl" |
| 82 | 82 | translation_api_key: null # 通过环境变量设置 |
| 83 | 83 | |
| 84 | - # 翻译提示词配置(用于提高翻译质量,作为DeepL API的context参数) | |
| 85 | - translation_prompts: | |
| 86 | - # 商品标题翻译提示词 | |
| 87 | - product_title_zh: "请将原文翻译成中文商品SKU名称,要求:确保精确、完整地传达原文信息的基础上,语言简洁清晰、地道、专业。" | |
| 88 | - product_title_en: "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language." | |
| 89 | - # query翻译提示词 | |
| 90 | - query_zh: "电商领域" | |
| 91 | - query_en: "e-commerce domain" | |
| 92 | - # 默认翻译用词 | |
| 93 | - default_zh: "电商领域" | |
| 94 | - default_en: "e-commerce domain" | |
| 95 | - | |
| 96 | 84 | # 返回字段配置(_source includes) |
| 97 | 85 | # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 |
| 98 | 86 | source_fields: null |
| ... | ... | @@ -119,16 +107,24 @@ rerank: |
| 119 | 107 | # 可扩展服务/provider 注册表(单一配置源) |
| 120 | 108 | services: |
| 121 | 109 | translation: |
| 122 | - provider: "llm" # direct | http | google(reserved) | |
| 110 | + provider: "llm" # qwen-mt | deepl | http | llm | |
| 123 | 111 | base_url: "http://127.0.0.1:6006" |
| 124 | - model: "qwen" | |
| 112 | + model: "qwen-flash" | |
| 125 | 113 | timeout_sec: 10.0 |
| 114 | + cache: | |
| 115 | + enabled: true | |
| 116 | + key_prefix: "trans:v2" | |
| 117 | + ttl_seconds: 62208000 | |
| 118 | + sliding_expiration: true | |
| 119 | + key_include_context: true | |
| 120 | + key_include_prompt: true | |
| 121 | + key_include_source_lang: true | |
| 126 | 122 | providers: |
| 127 | - direct: | |
| 128 | - model: "qwen" | |
| 123 | + qwen-mt: | |
| 124 | +      model: "qwen-mt-flash" | 
| 129 | 125 | http: |
| 130 | 126 | base_url: "http://127.0.0.1:6006" |
| 131 | - model: "qwen" | |
| 127 | +      model: "qwen-mt-flash" | 
| 132 | 128 | timeout_sec: 10.0 |
| 133 | 129 | llm: |
| 134 | 130 | model: "qwen-flash" |
| ... | ... | @@ -136,6 +132,11 @@ services: |
| 136 | 132 | # base_url 留空则使用 DASHSCOPE_BASE_URL 或默认地域 |
| 137 | 133 | base_url: "" |
| 138 | 134 | timeout_sec: 30.0 |
| 135 | + deepl: | |
| 136 | + model: "deepl" | |
| 137 | + timeout_sec: 10.0 | |
| 138 | + # 可选:用于术语表翻译(由 query_config.translation_glossary_id 衔接) | |
| 139 | + glossary_id: "" | |
| 139 | 140 | google: |
| 140 | 141 | enabled: false |
| 141 | 142 | project_id: "" | ... | ... |
config/config_loader.py
| ... | ... | @@ -42,7 +42,6 @@ class QueryConfig: |
| 42 | 42 | translation_api_key: Optional[str] = None |
| 43 | 43 | translation_glossary_id: Optional[str] = None |
| 44 | 44 | translation_context: str = "e-commerce product search" |
| 45 | - translation_prompts: Dict[str, str] = field(default_factory=dict) | |
| 46 | 45 | |
| 47 | 46 | # Embedding field names |
| 48 | 47 | text_embedding_field: Optional[str] = "title_embedding" |
| ... | ... | @@ -250,7 +249,6 @@ class ConfigLoader: |
| 250 | 249 | translation_service=query_config_data.get("translation_service") or "deepl", |
| 251 | 250 | translation_glossary_id=query_config_data.get("translation_glossary_id"), |
| 252 | 251 | translation_context=query_config_data.get("translation_context") or "e-commerce product search", |
| 253 | - translation_prompts=query_config_data.get("translation_prompts", {}), | |
| 254 | 252 | text_embedding_field=query_config_data.get("text_embedding_field"), |
| 255 | 253 | image_embedding_field=query_config_data.get("image_embedding_field"), |
| 256 | 254 | source_fields=query_config_data.get("source_fields"), | ... | ... |
config/services_config.py
| ... | ... | @@ -72,12 +72,12 @@ def _resolve_translation() -> ServiceConfig: |
| 72 | 72 | config_provider=cfg.get("provider"), |
| 73 | 73 | capability="translation", |
| 74 | 74 | ) |
| 75 | - if provider not in ("direct", "local", "inprocess", "http", "service"): | |
| 75 | + if provider not in ("qwen-mt", "deepl", "direct", "local", "inprocess", "http", "service", "llm"): | |
| 76 | 76 | raise ValueError(f"Unsupported translation provider: {provider}") |
| 77 | 77 | |
| 78 | 78 | # Env override for http base_url |
| 79 | 79 | env_url = os.getenv("TRANSLATION_SERVICE_URL") |
| 80 | - if env_url and provider == "http": | |
| 80 | + if env_url and provider in ("http", "service"): | |
| 81 | 81 | providers = dict(providers) |
| 82 | 82 | providers["http"] = dict(providers.get("http", {})) |
| 83 | 83 | providers["http"]["base_url"] = env_url.rstrip("/") |
| ... | ... | @@ -206,6 +206,27 @@ def get_translation_base_url() -> str: |
| 206 | 206 | return str(base).rstrip("/") |
| 207 | 207 | |
| 208 | 208 | |
| 209 | +def get_translation_cache_config() -> Dict[str, Any]: | |
| 210 | + """ | |
| 211 | + Resolve translation cache policy from services.translation.cache. | |
| 212 | + | |
| 213 | + All translation cache key/TTL behavior should be configured in config.yaml, | |
| 214 | + not hardcoded in code. | |
| 215 | + """ | |
| 216 | + raw = _load_services_raw() | |
| 217 | + cfg = raw.get("translation", {}) if isinstance(raw.get("translation"), dict) else {} | |
| 218 | + cache_cfg = cfg.get("cache", {}) if isinstance(cfg.get("cache"), dict) else {} | |
| 219 | + return { | |
| 220 | + "enabled": bool(cache_cfg.get("enabled", True)), | |
| 221 | + "key_prefix": str(cache_cfg.get("key_prefix", "trans:v2")), | |
| 222 | + "ttl_seconds": int(cache_cfg.get("ttl_seconds", 360 * 24 * 3600)), | |
| 223 | + "sliding_expiration": bool(cache_cfg.get("sliding_expiration", True)), | |
| 224 | + "key_include_context": bool(cache_cfg.get("key_include_context", True)), | |
| 225 | + "key_include_prompt": bool(cache_cfg.get("key_include_prompt", True)), | |
| 226 | + "key_include_source_lang": bool(cache_cfg.get("key_include_source_lang", True)), | |
| 227 | + } | |
| 228 | + | |
| 229 | + | |
| 209 | 230 | def get_embedding_base_url() -> str: |
| 210 | 231 | """Resolve embedding HTTP base URL.""" |
| 211 | 232 | base = ( | ... | ... |
| ... | ... | @@ -0,0 +1,82 @@ |
| 1 | +SOURCE_LANG_CODE_MAP = { | |
| 2 | + "en": "English", | |
| 3 | + "zh": "Chinese", | |
| 4 | + "zh_tw": "Traditional Chinese", | |
| 5 | + "ru": "Russian", | |
| 6 | + "ja": "Japanese", | |
| 7 | + "ko": "Korean", | |
| 8 | + "es": "Spanish", | |
| 9 | + "fr": "French", | |
| 10 | + "pt": "Portuguese", | |
| 11 | + "de": "German", | |
| 12 | + "it": "Italian", | |
| 13 | + "th": "Thai", | |
| 14 | + "vi": "Vietnamese", | |
| 15 | + "id": "Indonesian", | |
| 16 | + "ms": "Malay", | |
| 17 | + "ar": "Arabic", | |
| 18 | + "hi": "Hindi", | |
| 19 | + "he": "Hebrew", | |
| 20 | + "my": "Burmese", | |
| 21 | + "ta": "Tamil", | |
| 22 | + "ur": "Urdu", | |
| 23 | + "bn": "Bengali", | |
| 24 | + "pl": "Polish", | |
| 25 | + "nl": "Dutch", | |
| 26 | + "ro": "Romanian", | |
| 27 | + "tr": "Turkish", | |
| 28 | + "km": "Khmer", | |
| 29 | + "lo": "Lao", | |
| 30 | + "yue": "Cantonese", | |
| 31 | + "cs": "Czech", | |
| 32 | + "el": "Greek", | |
| 33 | + "sv": "Swedish", | |
| 34 | + "hu": "Hungarian", | |
| 35 | + "da": "Danish", | |
| 36 | + "fi": "Finnish", | |
| 37 | + "uk": "Ukrainian", | |
| 38 | + "bg": "Bulgarian", | |
| 39 | +} | |
| 40 | + | |
| 41 | +TARGET_LANG_CODE_MAP = {v: k for k, v in SOURCE_LANG_CODE_MAP.items()} | |
| 42 | + | |
| 43 | +TRANSLATION_PROMPTS = { | |
| 44 | + "general": { | |
| 45 | + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译专家,请准确传达原文含义并符合{target_lang}语言习惯,只输出翻译结果:{text}", | |
| 46 | + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Accurately convey the meaning following {target_lang} grammar and usage, output only the translation: {text}", | |
| 47 | + "ru": "Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Точно передайте смысл текста, соблюдая нормы {target_lang}, выводите только перевод: {text}", | |
| 48 | + "ar": "أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). انقل المعنى بدقة وفق قواعد {target_lang} وأخرج الترجمة فقط: {text}", | |
| 49 | + "ja": "あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロ翻訳者です。意味を正確に伝え、{target_lang}の表現に従い、翻訳のみ出力してください:{text}", | |
| 50 | + "es": "Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Transmite con precisión el significado y devuelve solo la traducción: {text}", | |
| 51 | + "de": "Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Gib die Bedeutung korrekt wieder und gib nur die Übersetzung aus: {text}", | |
| 52 | + "fr": "Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Transmettez fidèlement le sens et produisez uniquement la traduction : {text}", | |
| 53 | + "it": "Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Trasmetti accuratamente il significato e restituisci solo la traduzione: {text}", | |
| 54 | + "pt": "Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Transmita o significado com precisão e produza apenas a tradução: {text}" | |
| 55 | + }, | |
| 56 | + | |
| 57 | + "sku_name": { | |
| 58 | + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})电商翻译专家,请将原文翻译为{target_lang}商品SKU名称,要求准确完整、简洁专业,只输出结果:{text}", | |
| 59 | + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) ecommerce translator. Translate into a concise and accurate {target_lang} product SKU name, output only the result: {text}", | |
| 60 | + "ru": "Вы переводчик e-commerce с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Переведите в краткое и точное название SKU товара на {target_lang}, выводите только результат: {text}", | |
| 61 | + "ar": "أنت مترجم تجارة إلكترونية من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). ترجم إلى اسم SKU للمنتج بلغة {target_lang} بدقة واختصار، وأخرج النتيجة فقط: {text}", | |
| 62 | + "ja": "{source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのEC翻訳者として、簡潔で正確な{target_lang}の商品SKU名に翻訳し、結果のみ出力してください:{text}", | |
| 63 | + "es": "Eres un traductor ecommerce de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce a un nombre SKU de producto en {target_lang}, preciso y conciso, devuelve solo el resultado: {text}", | |
| 64 | + "de": "Du bist ein E-Commerce-Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Übersetze in einen präzisen und kurzen {target_lang} Produkt-SKU-Namen, nur Ergebnis ausgeben: {text}", | |
| 65 | + "fr": "Vous êtes un traducteur e-commerce de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Traduisez en un nom SKU produit {target_lang} précis et concis, sortie uniquement : {text}", | |
| 66 | +        "it": "Sei un traduttore ecommerce da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduci in un nome SKU prodotto {target_lang} conciso e accurato, restituisci solo il risultato: {text}", | 
| 67 | + "pt": "Você é um tradutor de e-commerce de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Traduza para um nome SKU de produto {target_lang} conciso e preciso, produza apenas o resultado: {text}" | |
| 68 | + }, | |
| 69 | + | |
| 70 | + "ecommerce_search_query": { | |
| 71 | + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译助手,请将电商搜索词准确翻译为{target_lang}并符合搜索习惯,只输出结果:{text}", | |
| 72 | + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Translate the ecommerce search query accurately following {target_lang} search habits, output only the result: {text}", | |
| 73 | + "ru": "Вы переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Переведите поисковый запрос e-commerce с учётом привычек поиска, выводите только результат: {text}", | |
| 74 | + "ar": "أنت مترجم من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). ترجم عبارة البحث للتجارة الإلكترونية بما يناسب عادات البحث وأخرج النتيجة فقط: {text}", | |
| 75 | + "ja": "{source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})への翻訳者として、EC検索キーワードを{target_lang}の検索習慣に合わせて翻訳し、結果のみ出力してください:{text}", | |
| 76 | + "es": "Eres un traductor de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce la consulta de búsqueda ecommerce según los hábitos de búsqueda y devuelve solo el resultado: {text}", | |
| 77 | + "de": "Du bist ein Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Übersetze die E-Commerce-Suchanfrage entsprechend den Suchgewohnheiten, nur Ergebnis ausgeben: {text}", | |
| 78 | + "fr": "Vous êtes un traducteur de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Traduisez la requête de recherche e-commerce selon les habitudes de recherche, sortie uniquement : {text}", | |
| 79 | +        "it": "Sei un traduttore da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduci la query di ricerca ecommerce secondo le abitudini di ricerca e restituisci solo il risultato: {text}", | 
| 80 | + "pt": "Você é um tradutor de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Traduza a consulta de busca de ecommerce conforme os hábitos de busca e produza apenas o resultado: {text}" | |
| 81 | + } | |
| 82 | +} | ... | ... |
docs/搜索API对接指南.md
| ... | ... | @@ -1814,7 +1814,8 @@ curl "http://localhost:6007/health" |
| 1814 | 1814 | "text": "商品名称", |
| 1815 | 1815 | "target_lang": "en", |
| 1816 | 1816 | "source_lang": "zh", |
| 1817 | - "model": "qwen" | |
| 1817 | + "model": "qwen", | |
| 1818 | + "context": "sku_name" | |
| 1818 | 1819 | } |
| 1819 | 1820 | ``` |
| 1820 | 1821 | |
| ... | ... | @@ -1823,7 +1824,8 @@ curl "http://localhost:6007/health" |
| 1823 | 1824 | | `text` | string | Y | 待翻译文本 | |
| 1824 | 1825 | | `target_lang` | string | Y | 目标语言:`zh`、`en`、`ru` 等 | |
| 1825 | 1826 | | `source_lang` | string | N | 源语言,不传则自动检测 | |
| 1826 | -| `model` | string | N | `qwen`(默认)或 `deepl` | | |
| 1827 | +| `model` | string | N | `qwen`(默认)、`deepl` 或 `llm` | | |
| 1828 | +| `context` | string | N | 翻译场景参数:商品标题翻译使用 `sku_name`,搜索请求中的 query 翻译使用 `ecommerce_search_query`,其它通用场景可不传或使用 `general` | | |
| 1827 | 1829 | |
| 1828 | 1830 | **响应**: |
| 1829 | 1831 | ```json | ... | ... |
indexer/document_transformer.py
| ... | ... | @@ -36,7 +36,6 @@ class SPUDocumentTransformer: |
| 36 | 36 | searchable_option_dimensions: List[str], |
| 37 | 37 | tenant_config: Optional[Dict[str, Any]] = None, |
| 38 | 38 | translator: Optional[Any] = None, |
| 39 | - translation_prompts: Optional[Dict[str, str]] = None, | |
| 40 | 39 | encoder: Optional[Any] = None, |
| 41 | 40 | enable_title_embedding: bool = True, |
| 42 | 41 | image_encoder: Optional[Any] = None, |
| ... | ... | @@ -50,7 +49,6 @@ class SPUDocumentTransformer: |
| 50 | 49 | searchable_option_dimensions: 可搜索的option维度列表 |
| 51 | 50 | tenant_config: 租户配置(包含主语言和翻译配置) |
| 52 | 51 | translator: 翻译器实例(可选,如果提供则启用翻译功能) |
| 53 | - translation_prompts: 翻译提示词配置(可选) | |
| 54 | 52 | encoder: 文本编码器实例(可选,用于生成title_embedding) |
| 55 | 53 | enable_title_embedding: 是否启用标题向量化(默认True) |
| 56 | 54 | image_encoder: 图片编码器实例(可选,需实现 encode_image_urls(urls) -> List[Optional[np.ndarray]]) |
| ... | ... | @@ -60,12 +58,33 @@ class SPUDocumentTransformer: |
| 60 | 58 | self.searchable_option_dimensions = searchable_option_dimensions |
| 61 | 59 | self.tenant_config = tenant_config or {} |
| 62 | 60 | self.translator = translator |
| 63 | - self.translation_prompts = translation_prompts or {} | |
| 64 | 61 | self.encoder = encoder |
| 65 | 62 | self.enable_title_embedding = enable_title_embedding |
| 66 | 63 | self.image_encoder = image_encoder |
| 67 | 64 | self.enable_image_embedding = bool(enable_image_embedding and image_encoder is not None) |
| 68 | 65 | |
| 66 | + def _translate_index_languages( | |
| 67 | + self, | |
| 68 | + text: str, | |
| 69 | + source_lang: str, | |
| 70 | + index_languages: List[str], | |
| 71 | + scene: str, | |
| 72 | + ) -> Dict[str, Optional[str]]: | |
| 73 | + translations: Dict[str, Optional[str]] = {} | |
| 74 | + if not self.translator or not text or not str(text).strip(): | |
| 75 | + return translations | |
| 76 | + for lang in index_languages: | |
| 77 | + if lang == source_lang: | |
| 78 | + translations[lang] = text | |
| 79 | + continue | |
| 80 | + translations[lang] = self.translator.translate( | |
| 81 | + text=text, | |
| 82 | + target_lang=lang, | |
| 83 | + source_lang=source_lang, | |
| 84 | + context=scene, | |
| 85 | + ) | |
| 86 | + return translations | |
| 87 | + | |
| 69 | 88 | def transform_spu_to_doc( |
| 70 | 89 | self, |
| 71 | 90 | tenant_id: str, |
| ... | ... | @@ -322,15 +341,12 @@ class SPUDocumentTransformer: |
| 322 | 341 | title_text = str(spu_row['title']) |
| 323 | 342 | translations: Dict[str, Optional[str]] = {} |
| 324 | 343 | if self.translator: |
| 325 | - prompt_zh = self.translation_prompts.get('product_title_zh') or self.translation_prompts.get('default_zh') | |
| 326 | - prompt_en = self.translation_prompts.get('product_title_en') or self.translation_prompts.get('default_en') | |
| 327 | - translations = self.translator.translate_for_indexing( | |
| 328 | - title_text, | |
| 329 | - shop_language=primary_lang, | |
| 344 | + translations = self._translate_index_languages( | |
| 345 | + text=title_text, | |
| 330 | 346 | source_lang=primary_lang, |
| 331 | - prompt=prompt_zh if primary_lang == 'zh' else prompt_en, | |
| 332 | 347 | index_languages=index_langs, |
| 333 | - ) or {} | |
| 348 | + scene="product_title", | |
| 349 | + ) | |
| 334 | 350 | _set_lang_obj("title", title_text, translations) |
| 335 | 351 | |
| 336 | 352 | # Brief |
| ... | ... | @@ -338,14 +354,12 @@ class SPUDocumentTransformer: |
| 338 | 354 | brief_text = str(spu_row['brief']) |
| 339 | 355 | translations = {} |
| 340 | 356 | if self.translator: |
| 341 | - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') | |
| 342 | - translations = self.translator.translate_for_indexing( | |
| 343 | - brief_text, | |
| 344 | - shop_language=primary_lang, | |
| 357 | + translations = self._translate_index_languages( | |
| 358 | + text=brief_text, | |
| 345 | 359 | source_lang=primary_lang, |
| 346 | - prompt=prompt, | |
| 347 | 360 | index_languages=index_langs, |
| 348 | - ) or {} | |
| 361 | + scene="default", | |
| 362 | + ) | |
| 349 | 363 | _set_lang_obj("brief", brief_text, translations) |
| 350 | 364 | |
| 351 | 365 | # Description |
| ... | ... | @@ -353,14 +367,12 @@ class SPUDocumentTransformer: |
| 353 | 367 | desc_text = str(spu_row['description']) |
| 354 | 368 | translations = {} |
| 355 | 369 | if self.translator: |
| 356 | - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') | |
| 357 | - translations = self.translator.translate_for_indexing( | |
| 358 | - desc_text, | |
| 359 | - shop_language=primary_lang, | |
| 370 | + translations = self._translate_index_languages( | |
| 371 | + text=desc_text, | |
| 360 | 372 | source_lang=primary_lang, |
| 361 | - prompt=prompt, | |
| 362 | 373 | index_languages=index_langs, |
| 363 | - ) or {} | |
| 374 | + scene="default", | |
| 375 | + ) | |
| 364 | 376 | _set_lang_obj("description", desc_text, translations) |
| 365 | 377 | |
| 366 | 378 | # Vendor |
| ... | ... | @@ -368,14 +380,12 @@ class SPUDocumentTransformer: |
| 368 | 380 | vendor_text = str(spu_row['vendor']) |
| 369 | 381 | translations = {} |
| 370 | 382 | if self.translator: |
| 371 | - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') | |
| 372 | - translations = self.translator.translate_for_indexing( | |
| 373 | - vendor_text, | |
| 374 | - shop_language=primary_lang, | |
| 383 | + translations = self._translate_index_languages( | |
| 384 | + text=vendor_text, | |
| 375 | 385 | source_lang=primary_lang, |
| 376 | - prompt=prompt, | |
| 377 | 386 | index_languages=index_langs, |
| 378 | - ) or {} | |
| 387 | + scene="default", | |
| 388 | + ) | |
| 379 | 389 | _set_lang_obj("vendor", vendor_text, translations) |
| 380 | 390 | |
| 381 | 391 | def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series): | ... | ... |
indexer/incremental_service.py
| ... | ... | @@ -39,7 +39,6 @@ class IncrementalIndexerService: |
| 39 | 39 | self._config: Optional[Any] = None |
| 40 | 40 | self._config_lock = threading.Lock() |
| 41 | 41 | self._translator: Optional[Any] = None |
| 42 | - self._translation_prompts: Optional[Dict[str, Any]] = None | |
| 43 | 42 | self._searchable_option_dimensions: Optional[List[str]] = None |
| 44 | 43 | self._shared_text_encoder: Optional[Any] = None |
| 45 | 44 | self._shared_image_encoder: Optional[Any] = None |
| ... | ... | @@ -52,7 +51,6 @@ class IncrementalIndexerService: |
| 52 | 51 | def _eager_init(self) -> None: |
| 53 | 52 | """Strict eager initialization. Any dependency failure should fail fast.""" |
| 54 | 53 | self._config = ConfigLoader("config/config.yaml").load_config() |
| 55 | - self._translation_prompts = getattr(self._config.query_config, "translation_prompts", {}) or {} | |
| 56 | 54 | self._searchable_option_dimensions = ( |
| 57 | 55 | getattr(self._config.spu_config, "searchable_option_dimensions", None) |
| 58 | 56 | or ["option1", "option2", "option3"] |
| ... | ... | @@ -110,7 +108,6 @@ class IncrementalIndexerService: |
| 110 | 108 | tenant_id=tenant_id, |
| 111 | 109 | searchable_option_dimensions=self._searchable_option_dimensions, |
| 112 | 110 | translator=self._translator, |
| 113 | - translation_prompts=self._translation_prompts, | |
| 114 | 111 | encoder=encoder, |
| 115 | 112 | enable_title_embedding=False, # batch fill later |
| 116 | 113 | image_encoder=image_encoder, | ... | ... |
indexer/indexing_utils.py
| ... | ... | @@ -57,7 +57,6 @@ def create_document_transformer( |
| 57 | 57 | tenant_id: str, |
| 58 | 58 | searchable_option_dimensions: Optional[list] = None, |
| 59 | 59 | translator: Optional[Any] = None, |
| 60 | - translation_prompts: Optional[Dict[str, str]] = None, | |
| 61 | 60 | encoder: Optional[Any] = None, |
| 62 | 61 | enable_title_embedding: bool = True, |
| 63 | 62 | image_encoder: Optional[Any] = None, |
| ... | ... | @@ -72,7 +71,6 @@ def create_document_transformer( |
| 72 | 71 | tenant_id: 租户ID |
| 73 | 72 | searchable_option_dimensions: 可搜索的option维度列表(如果为None则从配置加载) |
| 74 | 73 | translator: 翻译器实例(如果为None则根据配置初始化) |
| 75 | - translation_prompts: 翻译提示词配置(如果为None则从配置加载) | |
| 76 | 74 | encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化) |
| 77 | 75 | enable_title_embedding: 是否启用标题向量化(默认True) |
| 78 | 76 | image_encoder: 图片编码器(可选,需实现 encode_image_urls(urls)) |
| ... | ... | @@ -89,7 +87,6 @@ def create_document_transformer( |
| 89 | 87 | if ( |
| 90 | 88 | searchable_option_dimensions is None |
| 91 | 89 | or translator is None |
| 92 | - or translation_prompts is None | |
| 93 | 90 | or (encoder is None and enable_title_embedding) |
| 94 | 91 | or config is None |
| 95 | 92 | ): |
| ... | ... | @@ -107,9 +104,6 @@ def create_document_transformer( |
| 107 | 104 | |
| 108 | 105 | translator = create_translation_provider(config.query_config) |
| 109 | 106 | |
| 110 | - if translation_prompts is None: | |
| 111 | - translation_prompts = config.query_config.translation_prompts | |
| 112 | - | |
| 113 | 107 | # 初始化encoder(如果启用标题向量化且未提供encoder) |
| 114 | 108 | if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding: |
| 115 | 109 | from embeddings.text_encoder import TextEmbeddingEncoder |
| ... | ... | @@ -122,7 +116,6 @@ def create_document_transformer( |
| 122 | 116 | searchable_option_dimensions=searchable_option_dimensions, |
| 123 | 117 | tenant_config=tenant_config, |
| 124 | 118 | translator=translator, |
| 125 | - translation_prompts=translation_prompts, | |
| 126 | 119 | encoder=encoder, |
| 127 | 120 | enable_title_embedding=enable_title_embedding, |
| 128 | 121 | image_encoder=image_encoder, | ... | ... |
indexer/test_indexing.py
| ... | ... | @@ -285,7 +285,6 @@ def test_document_transformer(): |
| 285 | 285 | searchable_option_dimensions=['option1', 'option2', 'option3'], |
| 286 | 286 | tenant_config=tenant_config, |
| 287 | 287 | translator=translator, |
| 288 | - translation_prompts=config.query_config.translation_prompts | |
| 289 | 288 | ) |
| 290 | 289 | |
| 291 | 290 | # 转换文档 | ... | ... |
providers/translation.py
| 1 | -""" | |
| 2 | -Translation provider - direct (in-process) or HTTP service. | |
| 3 | -""" | |
| 1 | +"""Translation provider factory and HTTP provider implementation.""" | |
| 4 | 2 | from __future__ import annotations |
| 5 | 3 | |
| 6 | 4 | import logging |
| 7 | -from typing import Any, Dict, List, Optional, Union | |
| 8 | - | |
| 9 | -from concurrent.futures import Future, ThreadPoolExecutor | |
| 5 | +from typing import Any, Dict, Optional | |
| 10 | 6 | import requests |
| 11 | 7 | |
| 12 | 8 | from config.services_config import get_translation_config, get_translation_base_url |
| ... | ... | @@ -22,19 +18,18 @@ class HttpTranslationProvider: |
| 22 | 18 | base_url: str, |
| 23 | 19 | model: str = "qwen", |
| 24 | 20 | timeout_sec: float = 10.0, |
| 25 | - translation_context: Optional[str] = None, | |
| 26 | 21 | ): |
| 27 | 22 | self.base_url = (base_url or "").rstrip("/") |
| 28 | 23 | self.model = model or "qwen" |
| 29 | 24 | self.timeout_sec = float(timeout_sec or 10.0) |
| 30 | - self.translation_context = translation_context or "e-commerce product search" | |
| 31 | - self.executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="http-translator") | |
| 32 | 25 | |
| 33 | 26 | def _translate_once( |
| 34 | 27 | self, |
| 35 | 28 | text: str, |
| 36 | 29 | target_lang: str, |
| 37 | 30 | source_lang: Optional[str] = None, |
| 31 | + context: Optional[str] = None, | |
| 32 | + prompt: Optional[str] = None, | |
| 38 | 33 | ) -> Optional[str]: |
| 39 | 34 | if not text or not str(text).strip(): |
| 40 | 35 | return text |
| ... | ... | @@ -46,6 +41,10 @@ class HttpTranslationProvider: |
| 46 | 41 | "source_lang": source_lang or "auto", |
| 47 | 42 | "model": self.model, |
| 48 | 43 | } |
| 44 | + if context: | |
| 45 | + payload["context"] = context | |
| 46 | + if prompt: | |
| 47 | + payload["prompt"] = prompt | |
| 49 | 48 | response = requests.post(url, json=payload, timeout=self.timeout_sec) |
| 50 | 49 | if response.status_code != 200: |
| 51 | 50 | logger.warning( |
| ... | ... | @@ -69,58 +68,13 @@ class HttpTranslationProvider: |
| 69 | 68 | context: Optional[str] = None, |
| 70 | 69 | prompt: Optional[str] = None, |
| 71 | 70 | ) -> Optional[str]: |
| 72 | - del context, prompt | |
| 73 | - result = self._translate_once(text=text, target_lang=target_lang, source_lang=source_lang) | |
| 74 | - return result if result is not None else text | |
| 75 | - | |
| 76 | - def translate_multi( | |
| 77 | - self, | |
| 78 | - text: str, | |
| 79 | - target_langs: List[str], | |
| 80 | - source_lang: Optional[str] = None, | |
| 81 | - context: Optional[str] = None, | |
| 82 | - async_mode: bool = True, | |
| 83 | - prompt: Optional[str] = None, | |
| 84 | - ) -> Dict[str, Optional[str]]: | |
| 85 | - del context, async_mode, prompt | |
| 86 | - out: Dict[str, Optional[str]] = {} | |
| 87 | - for lang in target_langs: | |
| 88 | - out[lang] = self.translate(text, lang, source_lang=source_lang) | |
| 89 | - return out | |
| 90 | - | |
| 91 | - def translate_multi_async( | |
| 92 | - self, | |
| 93 | - text: str, | |
| 94 | - target_langs: List[str], | |
| 95 | - source_lang: Optional[str] = None, | |
| 96 | - context: Optional[str] = None, | |
| 97 | - prompt: Optional[str] = None, | |
| 98 | - ) -> Dict[str, Union[str, Future]]: | |
| 99 | - del context, prompt | |
| 100 | - out: Dict[str, Union[str, Future]] = {} | |
| 101 | - for lang in target_langs: | |
| 102 | - out[lang] = self.executor.submit(self.translate, text, lang, source_lang) | |
| 103 | - return out | |
| 104 | - | |
| 105 | - def translate_for_indexing( | |
| 106 | - self, | |
| 107 | - text: str, | |
| 108 | - shop_language: str, | |
| 109 | - source_lang: Optional[str] = None, | |
| 110 | - context: Optional[str] = None, | |
| 111 | - prompt: Optional[str] = None, | |
| 112 | - index_languages: Optional[List[str]] = None, | |
| 113 | - ) -> Dict[str, Optional[str]]: | |
| 114 | - del context, prompt | |
| 115 | - langs = index_languages if index_languages else ["en", "zh"] | |
| 116 | - source = source_lang or shop_language or "auto" | |
| 117 | - out: Dict[str, Optional[str]] = {} | |
| 118 | - for lang in langs: | |
| 119 | - if lang == shop_language: | |
| 120 | - out[lang] = text | |
| 121 | - else: | |
| 122 | - out[lang] = self.translate(text, target_lang=lang, source_lang=source) | |
| 123 | - return out | |
| 71 | + return self._translate_once( | |
| 72 | + text=text, | |
| 73 | + target_lang=target_lang, | |
| 74 | + source_lang=source_lang, | |
| 75 | + context=context, | |
| 76 | + prompt=prompt, | |
| 77 | + ) | |
| 124 | 78 | |
| 125 | 79 | |
| 126 | 80 | def create_translation_provider(query_config: Any = None) -> Any: |
| ... | ... | @@ -133,9 +87,9 @@ def create_translation_provider(query_config: Any = None) -> Any: |
| 133 | 87 | provider = cfg.provider |
| 134 | 88 | pc = cfg.get_provider_cfg() |
| 135 | 89 | |
| 136 | - if provider in ("direct", "local", "inprocess"): | |
| 90 | + if provider in ("qwen-mt", "direct", "local", "inprocess"): | |
| 137 | 91 | from query.qwen_mt_translate import Translator |
| 138 | - model = pc.get("model") or "qwen" | |
| 92 | + model = pc.get("model") or "qwen-mt-flash" | |
| 139 | 93 | qc = query_config or _empty_query_config() |
| 140 | 94 | return Translator( |
| 141 | 95 | model=model, |
| ... | ... | @@ -145,7 +99,7 @@ def create_translation_provider(query_config: Any = None) -> Any: |
| 145 | 99 | translation_context=getattr(qc, "translation_context", "e-commerce product search"), |
| 146 | 100 | ) |
| 147 | 101 | |
| 148 | - if provider in ("http", "service"): | |
| 102 | + elif provider in ("http", "service"): | |
| 149 | 103 | base_url = get_translation_base_url() |
| 150 | 104 | model = pc.get("model") or "qwen" |
| 151 | 105 | timeout = pc.get("timeout_sec", 10.0) |
| ... | ... | @@ -154,7 +108,26 @@ def create_translation_provider(query_config: Any = None) -> Any: |
| 154 | 108 | base_url=base_url, |
| 155 | 109 | model=model, |
| 156 | 110 | timeout_sec=float(timeout), |
| 157 | - translation_context=getattr(qc, "translation_context", "e-commerce product search"), | |
| 111 | + ) | |
| 112 | + | |
| 113 | + elif provider == "llm": | |
| 114 | + from query.llm_translate import LLMTranslatorProvider | |
| 115 | + model = pc.get("model") | |
| 116 | + timeout = float(pc.get("timeout_sec", 30.0)) | |
| 117 | + base_url = (pc.get("base_url") or "").strip() or None | |
| 118 | + return LLMTranslatorProvider( | |
| 119 | + model=model, | |
| 120 | + timeout_sec=timeout, | |
| 121 | + base_url=base_url, | |
| 122 | + ) | |
| 123 | + | |
| 124 | + elif provider == "deepl": | |
| 125 | + from query.deepl_provider import DeepLProvider | |
| 126 | + qc = query_config or _empty_query_config() | |
| 127 | + return DeepLProvider( | |
| 128 | + api_key=getattr(qc, "translation_api_key", None), | |
| 129 | + timeout=float(pc.get("timeout_sec", 10.0)), | |
| 130 | + glossary_id=pc.get("glossary_id") or getattr(qc, "translation_glossary_id", None), | |
| 158 | 131 | ) |
| 159 | 132 | |
| 160 | 133 | raise ValueError(f"Unsupported translation provider: {provider}") | ... | ... |
| ... | ... | @@ -0,0 +1,203 @@ |
| 1 | +""" | |
| 2 | +DeepL backend provider. | |
| 3 | + | |
| 4 | +This module only handles network calls to DeepL. | |
| 5 | +It does not handle cache, async fanout, or fallback semantics. | |
| 6 | +""" | |
| 7 | + | |
| 8 | +from __future__ import annotations | |
| 9 | + | |
| 10 | +import logging | |
| 11 | +import os | |
| 12 | +import re | |
| 13 | +from typing import Dict, Optional, Tuple | |
| 14 | + | |
| 15 | +import requests | |
| 16 | +from config.services_config import get_translation_config | |
| 17 | + | |
| 18 | + | |
| 19 | +logger = logging.getLogger(__name__) | |
| 20 | + | |
# Built-in context presets, keyed by scene name and then by target-language
# code. The selected value is passed to DeepL's `context` request parameter
# (see DeepLProvider.translate); empty strings for "general" mean no preset
# context is sent for that scene.
DEFAULT_CONTEXTS: Dict[str, Dict[str, str]] = {
    "sku_name": {
        "zh": "商品SKU名称",
        "en": "product SKU name",
    },
    "ecommerce_search_query": {
        "zh": "电商",
        "en": "e-commerce",
    },
    "general": {
        "zh": "",
        "en": "",
    },
}
# Scene names recognised as preset keys; any other non-empty `context`
# argument is treated as free-form context text.
SCENE_NAMES = frozenset(DEFAULT_CONTEXTS.keys())
| 36 | + | |
| 37 | + | |
def _merge_contexts(raw: object) -> Dict[str, Dict[str, str]]:
    """Overlay user-configured context presets on top of DEFAULT_CONTEXTS.

    Non-dict input is ignored entirely. Within a valid mapping, blank scene
    names, blank language codes and blank values are skipped; language codes
    are normalised to lowercase. Defaults are copied so the module constant
    is never mutated.
    """
    result: Dict[str, Dict[str, str]] = {
        name: dict(langs) for name, langs in DEFAULT_CONTEXTS.items()
    }
    if not isinstance(raw, dict):
        return result
    for raw_scene, raw_langs in raw.items():
        if not isinstance(raw_langs, dict):
            continue
        scene = str(raw_scene or "").strip()
        if not scene:
            continue
        bucket = result.setdefault(scene, {})
        for raw_lang, raw_value in raw_langs.items():
            lang = str(raw_lang or "").strip().lower()
            value = str(raw_value or "").strip()
            if lang and value:
                bucket[lang] = value
    return result
| 57 | + | |
| 58 | + | |
class DeepLProvider:
    """DeepL translation backend.

    Pure network adapter: performs one synchronous call to the DeepL REST
    API per translate(). Caching, async fan-out and fallback semantics are
    the caller's responsibility. Failure semantics: returns the translated
    string on success, None on any failure.
    """

    API_URL = "https://api.deepl.com/v2/translate"  # Pro tier
    # Lowercase language code -> DeepL language code.
    LANG_CODE_MAP = {
        "zh": "ZH",
        "en": "EN",
        "ru": "RU",
        "ar": "AR",
        "ja": "JA",
        "es": "ES",
        "de": "DE",
        "fr": "FR",
        "it": "IT",
        "pt": "PT",
    }

    def __init__(
        self,
        api_key: Optional[str],
        *,
        timeout: float = 10.0,
        glossary_id: Optional[str] = None,
    ) -> None:
        """
        Args:
            api_key: DeepL auth key; falls back to the DEEPL_AUTH_KEY env var.
            timeout: request timeout in seconds (provider config wins if set).
            glossary_id: optional DeepL glossary id (argument wins over config).
        """
        cfg = get_translation_config()
        provider_cfg = cfg.providers.get("deepl", {}) if isinstance(cfg.providers, dict) else {}
        self.api_key = api_key or os.getenv("DEEPL_AUTH_KEY")
        self.timeout = float(provider_cfg.get("timeout_sec") or timeout or 10.0)
        self.glossary_id = glossary_id or provider_cfg.get("glossary_id")
        self.model = "deepl"
        # Defaults merged with any `contexts` mapping from provider config.
        self.context_presets = _merge_contexts(provider_cfg.get("contexts"))
        if not self.api_key:
            logger.warning("DEEPL_AUTH_KEY not set; DeepL translation is unavailable")

    def _resolve_request_context(
        self,
        target_lang: str,
        context: Optional[str],
        prompt: Optional[str],
    ) -> Optional[str]:
        """Pick the context string sent to DeepL.

        Priority: explicit prompt > preset scene (when `context` names one) >
        free-form context text > the "general" preset for the target language.
        """
        if prompt:
            return prompt
        if context in SCENE_NAMES:
            # BUGFIX: the preset dict built from DEFAULT_CONTEXTS is keyed
            # "general", not "default" — the old .get("default") lookup always
            # missed, silently dropping any configured general fallback.
            scene_map = self.context_presets.get(context) or self.context_presets.get("general") or {}
            tgt = (target_lang or "").strip().lower()
            return scene_map.get(tgt) or scene_map.get("en")
        if context:
            return context
        scene_map = self.context_presets.get("general") or {}
        tgt = (target_lang or "").strip().lower()
        return scene_map.get(tgt) or scene_map.get("en")

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> Optional[str]:
        """Translate `text` into `target_lang` via DeepL.

        Returns:
            The translated string, or None when the API key is missing, the
            request fails/times out, or the response carries no translation.
        """
        if not self.api_key:
            return None

        # Unknown codes fall through as upper-cased best effort.
        target_code = self.LANG_CODE_MAP.get((target_lang or "").lower(), (target_lang or "").upper())
        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        api_context = self._resolve_request_context(target_lang, context, prompt)
        # Very short zh e-commerce terms get a "购买" prefix so DeepL sees a
        # phrase; the bare term is extracted back out of the translation below.
        text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)

        payload = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }
        if source_lang:
            payload["source_lang"] = self.LANG_CODE_MAP.get(source_lang.lower(), source_lang.upper())
        if api_context:
            payload["context"] = api_context
        if self.glossary_id:
            payload["glossary_id"] = self.glossary_id

        try:
            response = requests.post(self.API_URL, headers=headers, json=payload, timeout=self.timeout)
            if response.status_code != 200:
                logger.warning(
                    "[deepl] Failed | status=%s tgt=%s body=%s",
                    response.status_code,
                    target_code,
                    (response.text or "")[:200],
                )
                return None

            data = response.json()
            translations = data.get("translations") or []
            if not translations:
                return None
            translated = translations[0].get("text")
            if not translated:
                return None
            if needs_extraction:
                translated = self._extract_term_from_translation(translated, text, target_code)
            return translated
        except requests.Timeout:
            logger.warning("[deepl] Timeout | tgt=%s timeout=%.1fs", target_code, self.timeout)
            return None
        except Exception as exc:
            logger.warning("[deepl] Exception | tgt=%s error=%s", target_code, exc, exc_info=True)
            return None

    def _add_ecommerce_context(
        self,
        text: str,
        source_lang: Optional[str],
        context: Optional[str],
    ) -> Tuple[str, bool]:
        """Wrap a 1-2 char Chinese e-commerce term as "购买 <term>".

        Returns (text_to_send, needs_extraction). Only applies when the
        resolved context mentions "e-commerce" and the source is zh.
        """
        if not context or "e-commerce" not in context.lower():
            return text, False
        if (source_lang or "").lower() != "zh":
            return text, False

        term = (text or "").strip()
        if len(term.split()) == 1 and len(term) <= 2:
            return f"购买 {term}", True
        return text, False

    def _extract_term_from_translation(
        self,
        translated_text: str,
        original_text: str,
        target_lang_code: str,
    ) -> str:
        """Recover the bare (lowercased) term from a wrapped EN translation.

        Scans from the end, skipping filler words introduced by the "购买"
        wrapper ("buy", "purchase", ...). Non-EN targets pass through as-is.
        """
        del original_text
        if target_lang_code != "EN":
            return translated_text

        words = translated_text.strip().split()
        if len(words) <= 1:
            return translated_text
        context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
        for word in reversed(words):
            normalized = re.sub(r"[.,!?;:]+$", "", word.lower())
            if normalized not in context_words:
                return normalized
        # Every word was filler: fall back to the last word, punctuation-stripped.
        return re.sub(r"[.,!?;:]+$", "", words[-1].lower())
| 203 | + | ... | ... |
query/llm_translate.py
| 1 | 1 | """ |
| 2 | -LLM-based translation helper using Qwen chat model. | |
| 2 | +LLM-based translation backend (DashScope-compatible OpenAI API). | |
| 3 | 3 | |
| 4 | -This module provides a thin wrapper around DashScope's `qwen-flash` model | |
| 5 | -for high-quality, prompt-controlled translation, independent of the main | |
| 6 | -`Translator` (machine translation) pipeline. | |
| 7 | - | |
| 8 | -Usage example: | |
| 9 | - | |
| 10 | - from query.llm_translate import llm_translate | |
| 11 | - | |
| 12 | - result = llm_translate( | |
| 13 | - text="我看到这个视频后没有笑", | |
| 14 | - target_lang="en", | |
| 15 | - source_lang="zh", | |
| 16 | - source_lang_label="中文", | |
| 17 | - target_lang_label="英文", | |
| 18 | - ) | |
| 4 | +Failure semantics are strict: | |
| 5 | +- success: translated string | |
| 6 | +- failure: None | |
| 19 | 7 | """ |
| 20 | 8 | |
| 21 | 9 | from __future__ import annotations |
| ... | ... | @@ -23,113 +11,159 @@ from __future__ import annotations |
| 23 | 11 | import logging |
| 24 | 12 | import os |
| 25 | 13 | import time |
| 26 | -from typing import Dict, Optional | |
| 14 | +from typing import Optional | |
| 27 | 15 | |
| 28 | 16 | from openai import OpenAI |
| 29 | 17 | |
| 30 | 18 | from config.env_config import DASHSCOPE_API_KEY |
| 31 | 19 | from config.services_config import get_translation_config |
| 20 | +from config.translate_prompts import TRANSLATION_PROMPTS, SOURCE_LANG_CODE_MAP | |
| 21 | + | |
| 32 | 22 | |
| 33 | 23 | logger = logging.getLogger(__name__) |
| 34 | 24 | |
| 35 | 25 | |
| 36 | -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 | |
| 37 | -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | |
| 38 | -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 | |
| 39 | -# | |
| 40 | -# 默认保持与现有翻译/索引脚本相同的美国地域,可通过环境变量覆盖: | |
| 41 | -# DASHSCOPE_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1 | |
| 42 | 26 | DEFAULT_QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" |
| 43 | -QWEN_MODEL_NAME = "qwen-flash" | |
| 44 | - | |
| 45 | - | |
| 46 | -# 由调用方提供的语言标签/代码填充,占位符说明: | |
| 47 | -# - source_lang: 源语言的人类可读名称(按目标语言本地化,例如 "中文", "English") | |
| 48 | -# - target_lang: 目标语言的人类可读名称 | |
| 49 | -# - src_lang_code: 源语言代码,例如 "zh" | |
| 50 | -# - tgt_lang_code: 目标语言代码,例如 "en" | |
| 51 | -TRANSLATION_PROMPTS: Dict[str, str] = { | |
| 52 | - "zh": """你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译员。你的目标是在遵循 {target_lang} 的语法、词汇和文化习惯的前提下,准确传达原始 {source_lang} 文本的含义和细微差别。请只输出 {target_lang} 的翻译内容,不要包含任何额外的解释或评论。请将以下 {source_lang} 文本翻译成 {target_lang}: | |
| 53 | - | |
| 54 | -{text}""", | |
| 55 | - "en": """You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Your goal is to accurately convey the meaning and nuances of the original {source_lang} text while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities. Produce only the {target_lang} translation, without any additional explanations or commentary. Please translate the following {source_lang} text into {target_lang}: | |
| 56 | - | |
| 57 | -{text}""", | |
| 58 | - "ru": """Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Ваша задача — точно передать смысл и нюансы исходного текста на {source_lang}, соблюдая грамматику, лексику и культурные особенности {target_lang}. Выводите только перевод на {target_lang}, без каких-либо дополнительных объяснений или комментариев. Пожалуйста, переведите следующий текст с {source_lang} на {target_lang}: | |
| 59 | - | |
| 60 | -{text}""", | |
| 61 | - "ar": """أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). هدفك هو نقل المعنى والدلالات الدقيقة للنص الأصلي بلغة {source_lang} بدقة، مع الالتزام بقواعد اللغة والمفردات والحساسيات الثقافية الخاصة بلغة {target_lang}. قم بإنتاج الترجمة إلى {target_lang} فقط دون أي شروحات أو تعليقات إضافية. يرجى ترجمة النص التالي من {source_lang} إلى {target_lang}: | |
| 62 | - | |
| 63 | -{text}""", | |
| 64 | - "ja": """あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロの翻訳者です。{target_lang} の文法、語彙、文化的配慮に従いながら、元の {source_lang} テキストの意味やニュアンスを正確に伝えることが目的です。追加の説明やコメントは一切含めず、{target_lang} の翻訳のみを出力してください。次の {source_lang} テキストを {target_lang} に翻訳してください: | |
| 65 | - | |
| 66 | -{text}""", | |
| 67 | - "es": """Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Tu objetivo es transmitir con precisión el significado y los matices del texto original en {source_lang}, respetando la gramática, el vocabulario y las sensibilidades culturales de {target_lang}. Produce únicamente la traducción en {target_lang}, sin explicaciones ni comentarios adicionales. Por favor, traduce el siguiente texto de {source_lang} a {target_lang}: | |
| 68 | - | |
| 69 | -{text}""", | |
| 70 | - "de": """Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Dein Ziel ist es, die Bedeutung und Nuancen des ursprünglichen {source_lang}-Textes genau zu vermitteln und dabei die Grammatik, den Wortschatz und die kulturellen Besonderheiten von {target_lang} zu berücksichtigen. Gib ausschließlich die Übersetzung in {target_lang} aus, ohne zusätzliche Erklärungen oder Kommentare. Bitte übersetze den folgenden {source_lang}-Text in {target_lang}: | |
| 71 | - | |
| 72 | -{text}""", | |
| 73 | - "fr": """Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Votre objectif est de transmettre fidèlement le sens et les nuances du texte original en {source_lang}, tout en respectant la grammaire, le vocabulaire et les sensibilités culturelles de {target_lang}. Produisez uniquement la traduction en {target_lang}, sans explications ni commentaires supplémentaires. Veuillez traduire le texte suivant de {source_lang} vers {target_lang} : | |
| 74 | - | |
| 75 | -{text}""", | |
| 76 | - "it": """Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Il tuo obiettivo è trasmettere con precisione il significato e le sfumature del testo originale in {source_lang}, rispettando la grammatica, il vocabolario e le sensibilità culturali di {target_lang}. Produci solo la traduzione in {target_lang}, senza spiegazioni o commenti aggiuntivi. Per favore traduci il seguente testo da {source_lang} a {target_lang}: | |
| 77 | - | |
| 78 | -{text}""", | |
| 79 | - "pt": """Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Seu objetivo é transmitir com precisão o significado e as nuances do texto original em {source_lang}, respeitando a gramática, o vocabulário e as sensibilidades culturais de {target_lang}. Produza apenas a tradução em {target_lang}, sem quaisquer explicações ou comentários adicionais. Por favor, traduza o seguinte texto de {source_lang} para {target_lang}: | |
| 80 | - | |
| 81 | -{text}""", | |
| 82 | -} | |
| 83 | - | |
| 84 | - | |
| 85 | -def _get_qwen_client(base_url: Optional[str] = None) -> Optional[OpenAI]: | |
| 86 | - """ | |
| 87 | - Lazily construct an OpenAI-compatible client for DashScope. | |
| 88 | - | |
| 89 | - Uses DASHSCOPE_API_KEY and base_url (provider config / env) to configure endpoint. | |
| 90 | - """ | |
| 91 | - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | |
| 92 | - if not api_key: | |
| 93 | - logger.warning("DASHSCOPE_API_KEY not set; llm-based translation will be disabled") | |
| 94 | - return None | |
| 95 | - | |
| 96 | - # 优先使用显式传入的 base_url,其次环境变量,最后默认地域。 | |
| 97 | - base_url = ( | |
| 98 | - (base_url or "").strip() | |
| 99 | - or os.getenv("DASHSCOPE_BASE_URL") | |
| 100 | - or DEFAULT_QWEN_BASE_URL | |
| 101 | - ) | |
| 102 | - | |
| 103 | - try: | |
| 104 | - client = OpenAI(api_key=api_key, base_url=base_url) | |
| 105 | - return client | |
| 106 | - except Exception as exc: | |
| 107 | - logger.error("Failed to initialize DashScope OpenAI client: %s", exc, exc_info=True) | |
| 108 | - return None | |
| 27 | +DEFAULT_LLM_MODEL = "qwen-flash" | |
| 109 | 28 | |
| 110 | 29 | |
def _build_prompt(
    text: str,
    *,
    source_lang: Optional[str],
    target_lang: str,
    scene: Optional[str],
) -> str:
    """Render a translation prompt from config.translate_prompts.

    Templates must accept the placeholders {source_lang}, {src_lang_code},
    {target_lang}, {tgt_lang_code} and {text}. Language codes double as the
    display labels unless SOURCE_LANG_CODE_MAP supplies a nicer name; the
    public interface only ever takes language codes.
    """
    tgt = (target_lang or "").lower() or "en"
    src = (source_lang or "auto").lower()

    # Map legacy scene aliases onto the canonical template group names;
    # anything unrecognised is looked up verbatim and falls back to "general".
    aliases = {
        "query": "ecommerce_search_query",
        "ecommerce_search": "ecommerce_search_query",
        "ecommerce_search_query": "ecommerce_search_query",
        "product_title": "sku_name",
        "sku_name": "sku_name",
    }
    requested = (scene or "").strip() or "general"
    group_key = aliases.get(requested, requested)
    group = TRANSLATION_PROMPTS.get(group_key) or TRANSLATION_PROMPTS["general"]

    # Prefer a template written for the target language, then English, then a
    # minimal hard-coded template (should not happen with a complete config).
    template = (
        group.get(tgt)
        or group.get("en")
        or (
            "You are a professional {source_lang} ({src_lang_code}) to "
            "{target_lang} ({tgt_lang_code}) translator, output only the translation: {text}"
        )
    )

    return template.format(
        source_lang=SOURCE_LANG_CODE_MAP.get(src, src),
        src_lang_code=src,
        target_lang=SOURCE_LANG_CODE_MAP.get(tgt, tgt),
        tgt_lang_code=tgt,
        text=text,
    )
| 131 | 77 | |
| 132 | 78 | |
class LLMTranslatorProvider:
    """Chat-LLM translation backend on DashScope's OpenAI-compatible API.

    Strict failure semantics: a successful call returns the translated
    string; every failure path returns None.
    """

    def __init__(
        self,
        *,
        model: Optional[str] = None,
        timeout_sec: float = 30.0,
        base_url: Optional[str] = None,
    ) -> None:
        cfg = get_translation_config()
        llm_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {}
        # Explicit argument wins, then provider config, then the default.
        self.model = model or llm_cfg.get("model") or DEFAULT_LLM_MODEL
        self.timeout_sec = float(llm_cfg.get("timeout_sec") or timeout_sec or 30.0)
        # base_url priority: argument > provider config > env var > default region.
        configured_url = (base_url or "").strip() or (llm_cfg.get("base_url") or "").strip()
        self.base_url = configured_url or os.getenv("DASHSCOPE_BASE_URL") or DEFAULT_QWEN_BASE_URL
        self.client = self._create_client()

    def _create_client(self) -> Optional[OpenAI]:
        """Build the OpenAI-compatible client; None when no API key is set."""
        api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
        if not api_key:
            logger.warning("DASHSCOPE_API_KEY not set; llm translation unavailable")
            return None
        try:
            return OpenAI(api_key=api_key, base_url=self.base_url)
        except Exception as exc:
            logger.error("Failed to initialize llm translation client: %s", exc, exc_info=True)
            return None

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> Optional[str]:
        """Translate `text` into `target_lang`; returns None on any failure."""
        if not text or not str(text).strip():
            # Nothing to translate: echo the (empty/whitespace) input back.
            return text
        if self.client is None:
            return None

        tgt = (target_lang or "").lower() or "en"
        src = (source_lang or "auto").lower()
        # A caller-supplied prompt overrides the scene-derived template.
        user_prompt = prompt if prompt else _build_prompt(
            text=text,
            source_lang=src,
            target_lang=tgt,
            scene=context or "default",
        )
        started_at = time.time()
        try:
            logger.info(
                "[llm] Request | src=%s tgt=%s model=%s prompt=%s",
                src,
                tgt,
                self.model,
                user_prompt,
            )
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": user_prompt}],
                timeout=self.timeout_sec,
            )
            content = (completion.choices[0].message.content or "").strip()
            latency_ms = (time.time() - started_at) * 1000
            if content:
                logger.info("[llm] Response | src=%s tgt=%s response=%s", src, tgt, content)
                logger.info("[llm] Success | src=%s tgt=%s latency=%.1fms", src, tgt, latency_ms)
                return content
            logger.warning("[llm] Empty result | src=%s tgt=%s latency=%.1fms", src, tgt, latency_ms)
            return None
        except Exception as exc:
            latency_ms = (time.time() - started_at) * 1000
            logger.warning(
                "[llm] Failed | src=%s tgt=%s latency=%.1fms error=%s",
                src,
                tgt,
                latency_ms,
                exc,
                exc_info=True,
            )
            return None
| 165 | + | |
| 166 | + | |
| 133 | 167 | def llm_translate( |
| 134 | 168 | text: str, |
| 135 | 169 | target_lang: str, |
| ... | ... | @@ -139,100 +173,13 @@ def llm_translate( |
| 139 | 173 | target_lang_label: Optional[str] = None, |
| 140 | 174 | timeout_sec: Optional[float] = None, |
| 141 | 175 | ) -> Optional[str]: |
| 142 | - """ | |
| 143 | - Translate text with Qwen chat model using rich prompts. | |
| 144 | - | |
| 145 | - - 根据目标语言选择提示词,如果没匹配到则退回英文模板。 | |
| 146 | - - 不对 text 做语言检测或缓存,调用方自行控制。 | |
| 147 | - | |
| 148 | - Args: | |
| 149 | - text: 原始文本 | |
| 150 | - target_lang: 目标语言代码(如 "zh", "en") | |
| 151 | - source_lang: 源语言代码(可选,不影响提示词选择,仅用于日志) | |
| 152 | - source_lang_label: 源语言展示名称,用于 prompt(默认使用 source_lang) | |
| 153 | - target_lang_label: 目标语言展示名称,用于 prompt(默认使用 target_lang) | |
| 154 | - timeout_sec: 请求超时时间(秒,可选;若未配置则从 config 读取或采用默认) | |
| 155 | - | |
| 156 | - Returns: | |
| 157 | - 翻译后的文本;如失败则返回 None。 | |
| 158 | - """ | |
| 159 | - if not text or not str(text).strip(): | |
| 160 | - return text | |
| 161 | - | |
| 162 | - cfg = get_translation_config() | |
| 163 | - provider_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {} | |
| 164 | - | |
| 165 | - model_name = provider_cfg.get("model") or QWEN_MODEL_NAME | |
| 166 | - req_timeout = float(provider_cfg.get("timeout_sec") or timeout_sec or 30.0) | |
| 167 | - base_url = (provider_cfg.get("base_url") or "").strip() or None | |
| 168 | - | |
| 169 | - client = _get_qwen_client(base_url=base_url) | |
| 170 | - if not client: | |
| 171 | - # 无法调用云端,直接回退 | |
| 172 | - logger.warning( | |
| 173 | - "[llm_translate] Client init failed; returning original text. " | |
| 174 | - "text=%r target_lang=%s source_lang=%s", | |
| 175 | - text[:80], | |
| 176 | - target_lang, | |
| 177 | - source_lang or "auto", | |
| 178 | - ) | |
| 179 | - return text | |
| 180 | - | |
| 181 | - tgt = (target_lang or "").lower() or "en" | |
| 182 | - src = (source_lang or "auto").lower() | |
| 183 | - src_label = source_lang_label or src | |
| 184 | - tgt_label = target_lang_label or tgt | |
| 185 | - | |
| 186 | - prompt = _build_prompt( | |
| 176 | + provider = LLMTranslatorProvider(timeout_sec=timeout_sec or 30.0) | |
| 177 | + return provider.translate( | |
| 187 | 178 | text=text, |
| 188 | - target_lang=tgt, | |
| 189 | - source_lang_label=src_label, | |
| 190 | - target_lang_label=tgt_label, | |
| 191 | - src_lang_code=src, | |
| 192 | - tgt_lang_code=tgt, | |
| 179 | + target_lang=target_lang, | |
| 180 | + source_lang=source_lang, | |
| 181 | + context=None, | |
| 193 | 182 | ) |
| 194 | 183 | |
| 195 | - start = time.time() | |
| 196 | - try: | |
| 197 | - completion = client.chat.completions.create( | |
| 198 | - model=model_name, | |
| 199 | - messages=[ | |
| 200 | - { | |
| 201 | - "role": "user", | |
| 202 | - "content": prompt, | |
| 203 | - } | |
| 204 | - ], | |
| 205 | - timeout=req_timeout, | |
| 206 | - ) | |
| 207 | - content = (completion.choices[0].message.content or "").strip() | |
| 208 | - duration_ms = (time.time() - start) * 1000 | |
| 209 | - logger.info( | |
| 210 | - "[llm_translate] Success | model=%s src=%s tgt=%s latency=%.1fms text=%r -> %r", | |
| 211 | - model_name, | |
| 212 | - src, | |
| 213 | - tgt, | |
| 214 | - duration_ms, | |
| 215 | - text[:80], | |
| 216 | - content[:80], | |
| 217 | - ) | |
| 218 | - return content or text | |
| 219 | - except Exception as exc: | |
| 220 | - duration_ms = (time.time() - start) * 1000 | |
| 221 | - logger.warning( | |
| 222 | - "[llm_translate] Failed | model=%s src=%s tgt=%s latency=%.1fms error=%s", | |
| 223 | - model_name, | |
| 224 | - src, | |
| 225 | - tgt, | |
| 226 | - duration_ms, | |
| 227 | - exc, | |
| 228 | - exc_info=True, | |
| 229 | - ) | |
| 230 | - # 安全回退:出错时返回原文,避免中断上游流程 | |
| 231 | - return text | |
| 232 | - | |
| 233 | - | |
| 234 | -__all__ = [ | |
| 235 | - "TRANSLATION_PROMPTS", | |
| 236 | - "llm_translate", | |
| 237 | -] | |
| 238 | 184 | |
| 185 | +__all__ = ["LLMTranslatorProvider", "llm_translate"] | ... | ... |
query/query_parser.py
| ... | ... | @@ -8,7 +8,7 @@ from typing import Dict, List, Optional, Any, Union |
| 8 | 8 | import numpy as np |
| 9 | 9 | import logging |
| 10 | 10 | import re |
| 11 | -from concurrent.futures import Future, ThreadPoolExecutor, as_completed | |
| 11 | +from concurrent.futures import ThreadPoolExecutor, as_completed, wait | |
| 12 | 12 | |
| 13 | 13 | from embeddings.text_encoder import TextEmbeddingEncoder |
| 14 | 14 | from config import SearchConfig |
| ... | ... | @@ -135,6 +135,7 @@ class QueryParser: |
| 135 | 135 | cfg = get_translation_config() |
| 136 | 136 | logger.info("Initializing translator at QueryParser construction (provider=%s)...", cfg.provider) |
| 137 | 137 | self._translator = create_translation_provider(self.config.query_config) |
| 138 | + self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation") | |
| 138 | 139 | |
| 139 | 140 | @property |
| 140 | 141 | def text_encoder(self) -> TextEmbeddingEncoder: |
| ... | ... | @@ -265,6 +266,7 @@ class QueryParser: |
| 265 | 266 | # Stage 4: Translation (with async support and conditional waiting) |
| 266 | 267 | translations = {} |
| 267 | 268 | translation_futures = {} |
| 269 | + translation_executor = None | |
| 268 | 270 | index_langs = ["en", "zh"] |
| 269 | 271 | try: |
| 270 | 272 | # 根据租户配置的 index_languages 决定翻译目标语言 |
| ... | ... | @@ -287,48 +289,33 @@ class QueryParser: |
| 287 | 289 | target_langs = target_langs_for_translation |
| 288 | 290 | |
| 289 | 291 | if target_langs: |
| 290 | - # Use e-commerce context for better disambiguation | |
| 291 | - translation_context = self.config.query_config.translation_context | |
| 292 | - # For query translation, we use a general prompt (not language-specific) | |
| 293 | - query_prompt = ( | |
| 294 | - self.config.query_config.translation_prompts.get(f"query_{detected_lang}") | |
| 295 | - or self.config.query_config.translation_prompts.get("query_en") | |
| 296 | - or self.config.query_config.translation_prompts.get("default_en") | |
| 297 | - or self.config.query_config.translation_prompts.get("default_zh") | |
| 298 | - ) | |
| 299 | - | |
| 300 | 292 | # Determine if we need to wait for translation results |
| 301 | 293 | # If detected_lang is not in index_languages, we must wait for translation |
| 302 | 294 | need_wait_translation = detected_lang not in index_langs |
| 303 | - | |
| 295 | + | |
| 304 | 296 | if need_wait_translation: |
| 305 | - # Use async method that returns Futures, so we can wait for results | |
| 306 | - translation_results = self.translator.translate_multi_async( | |
| 307 | - query_text, | |
| 308 | - target_langs, | |
| 309 | - source_lang=detected_lang, | |
| 310 | - context=translation_context, | |
| 311 | - prompt=query_prompt | |
| 297 | + translation_executor = ThreadPoolExecutor( | |
| 298 | + max_workers=max(1, min(len(target_langs), 4)), | |
| 299 | + thread_name_prefix="query-translation-wait", | |
| 312 | 300 | ) |
| 313 | - # Separate cached results and futures | |
| 314 | - for lang, result in translation_results.items(): | |
| 315 | - if isinstance(result, Future): | |
| 316 | - translation_futures[lang] = result | |
| 317 | - else: | |
| 318 | - translations[lang] = result | |
| 301 | + for lang in target_langs: | |
| 302 | + translation_futures[lang] = translation_executor.submit( | |
| 303 | + self.translator.translate, | |
| 304 | + query_text, | |
| 305 | + lang, | |
| 306 | + detected_lang, | |
| 307 | + "ecommerce_search_query", | |
| 308 | + ) | |
| 319 | 309 | else: |
| 320 | - # Use async mode: returns cached translations immediately, missing ones translated in background | |
| 321 | - translations = self.translator.translate_multi( | |
| 322 | - query_text, | |
| 323 | - target_langs, | |
| 324 | - source_lang=detected_lang, | |
| 325 | - context=translation_context, | |
| 326 | - async_mode=True, | |
| 327 | - prompt=query_prompt | |
| 328 | - ) | |
| 329 | - # Filter out None values (missing translations that are being processed async) | |
| 330 | - translations = {k: v for k, v in translations.items() if v is not None} | |
| 331 | - | |
| 310 | + for lang in target_langs: | |
| 311 | + self._translation_executor.submit( | |
| 312 | + self.translator.translate, | |
| 313 | + query_text, | |
| 314 | + lang, | |
| 315 | + detected_lang, | |
| 316 | + "ecommerce_search_query", | |
| 317 | + ) | |
| 318 | + | |
| 332 | 319 | if translations: |
| 333 | 320 | log_info(f"Translation completed (cache hit) | Query text: '{query_text}' | Results: {translations}") |
| 334 | 321 | if translation_futures: |
| ... | ... | @@ -407,15 +394,18 @@ class QueryParser: |
| 407 | 394 | all_futures.append(embedding_future) |
| 408 | 395 | future_to_lang[embedding_future] = ('embedding', None) |
| 409 | 396 | |
| 410 | - # Wait for all futures to complete | |
| 411 | - for future in as_completed(all_futures): | |
| 397 | + # Enforce a hard timeout for translation-related work (300ms budget) | |
| 398 | + done, not_done = wait(all_futures, timeout=0.3) | |
| 399 | + for future in done: | |
| 412 | 400 | task_type, lang = future_to_lang[future] |
| 413 | 401 | try: |
| 414 | 402 | result = future.result() |
| 415 | 403 | if task_type == 'translation': |
| 416 | 404 | if result: |
| 417 | 405 | translations[lang] = result |
| 418 | - log_info(f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'") | |
| 406 | + log_info( | |
| 407 | + f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'" | |
| 408 | + ) | |
| 419 | 409 | if context: |
| 420 | 410 | context.store_intermediate_result(f'translation_{lang}', result) |
| 421 | 411 | elif task_type == 'embedding': |
| ... | ... | @@ -434,10 +424,27 @@ class QueryParser: |
| 434 | 424 | log_info(error_msg) |
| 435 | 425 | if context: |
| 436 | 426 | context.add_warning(error_msg) |
| 437 | - | |
| 427 | + | |
| 428 | + # Log timeouts for any futures that did not finish within 300ms | |
| 429 | + if not_done: | |
| 430 | + for future in not_done: | |
| 431 | + task_type, lang = future_to_lang[future] | |
| 432 | + if task_type == 'translation': | |
| 433 | + timeout_msg = ( | |
| 434 | + f"Translation timeout (>300ms) | Language: {lang} | " | |
| 435 | + f"Query text: '{query_text}'" | |
| 436 | + ) | |
| 437 | + else: | |
| 438 | + timeout_msg = "Query vector generation timeout (>300ms), proceeding without embedding result" | |
| 439 | + log_info(timeout_msg) | |
| 440 | + if context: | |
| 441 | + context.add_warning(timeout_msg) | |
| 442 | + | |
| 438 | 443 | # Clean up encoding executor |
| 439 | 444 | if encoding_executor: |
| 440 | 445 | encoding_executor.shutdown(wait=False) |
| 446 | + if translation_executor: | |
| 447 | + translation_executor.shutdown(wait=False) | |
| 441 | 448 | |
| 442 | 449 | # Update translations in context after all are complete |
| 443 | 450 | if translations and context: | ... | ... |
query/qwen_mt_translate.py
| 1 | -""" | |
| 2 | -Translation service for multi-language query support. | |
| 1 | +"""Qwen-MT translation orchestrator with cache and async helpers.""" | |
| 3 | 2 | |
| 4 | -Supports multiple translation models: | |
| 5 | -- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model | |
| 6 | -- DeepL: DeepL API for high-quality translations | |
| 7 | - | |
| 8 | -重要说明(Qwen 机翻限速): | |
| 9 | -- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)** | |
| 10 | -- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流 | |
| 11 | -- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端 | |
| 12 | - | |
| 13 | -使用方法 (Usage): | |
| 14 | - | |
| 15 | -```python | |
| 16 | -from query.translator import Translator | |
| 17 | - | |
| 18 | -# 使用默认的 qwen 模型(推荐) | |
| 19 | -translator = Translator() # 默认使用 qwen 模型 | |
| 20 | - | |
| 21 | -# 或显式指定模型 | |
| 22 | -translator = Translator(model='qwen') # 使用 qwen 模型 | |
| 23 | -translator = Translator(model='deepl') # 使用 DeepL 模型 | |
| 24 | - | |
| 25 | -# 翻译文本 | |
| 26 | -result = translator.translate( | |
| 27 | - text="我看到这个视频后没有笑", | |
| 28 | - target_lang="en", | |
| 29 | - source_lang="auto" # 自动检测源语言 | |
| 30 | -) | |
| 31 | -``` | |
| 32 | - | |
| 33 | -配置说明 (Configuration): | |
| 34 | -- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中) | |
| 35 | -- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中) | |
| 36 | - | |
| 37 | -Qwen 模型参考文档: | |
| 38 | -- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key | |
| 39 | -- 模型:qwen-mt-flash(快速翻译模型) | |
| 40 | - | |
| 41 | -DeepL 官方文档: | |
| 42 | -https://developers.deepl.com/api-reference/translate/request-translation | |
| 43 | -""" | |
| 3 | +from __future__ import annotations | |
| 44 | 4 | |
| 5 | +import hashlib | |
| 6 | +import logging | |
| 45 | 7 | import os |
| 46 | -import requests | |
| 47 | 8 | import re |
| 48 | -import redis | |
| 49 | -from concurrent.futures import ThreadPoolExecutor, Future | |
| 50 | -from datetime import timedelta | |
| 51 | -from typing import Dict, List, Optional, Union | |
| 52 | -import logging | |
| 53 | 9 | import time |
| 10 | +from typing import Dict, List, Optional | |
| 54 | 11 | |
| 55 | -logger = logging.getLogger(__name__) | |
| 56 | - | |
| 57 | -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG | |
| 12 | +import redis | |
| 58 | 13 | from openai import OpenAI |
| 59 | 14 | |
| 15 | +from config.env_config import DASHSCOPE_API_KEY, REDIS_CONFIG | |
| 16 | +from config.services_config import get_translation_cache_config | |
| 17 | +from config.translate_prompts import SOURCE_LANG_CODE_MAP | |
| 60 | 18 | |
| 61 | -class Translator: | |
| 62 | - """ | |
| 63 | - Multi-language translator supporting Qwen and DeepL APIs. | |
| 64 | - | |
| 65 | - Default model is 'qwen' which uses Alibaba Cloud DashScope API. | |
| 66 | - """ | |
| 67 | -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 | |
| 68 | -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | |
| 69 | -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 | |
| 19 | +logger = logging.getLogger(__name__) | |
| 70 | 20 | |
| 71 | - DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier | |
| 72 | - QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" # 北京地域 | |
| 73 | - # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡 | |
| 74 | - # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | |
| 75 | - QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型 | |
| 76 | 21 | |
| 77 | - # Language code mapping | |
| 78 | - LANG_CODE_MAP = { | |
| 79 | - 'zh': 'ZH', | |
| 80 | - 'en': 'EN', | |
| 81 | - 'ru': 'RU', | |
| 82 | - 'ar': 'AR', | |
| 83 | - 'ja': 'JA', | |
| 84 | - 'es': 'ES', | |
| 85 | - 'de': 'DE', | |
| 86 | - 'fr': 'FR', | |
| 87 | - 'it': 'IT', | |
| 88 | - 'pt': 'PT', | |
| 89 | - } | |
| 22 | +class Translator: | |
| 23 | + QWEN_DEFAULT_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" | |
| 24 | + QWEN_MODEL = "qwen-mt-flash" | |
| 90 | 25 | |
| 91 | 26 | def __init__( |
| 92 | 27 | self, |
| ... | ... | @@ -95,77 +30,90 @@ class Translator: |
| 95 | 30 | use_cache: bool = True, |
| 96 | 31 | timeout: int = 10, |
| 97 | 32 | glossary_id: Optional[str] = None, |
| 98 | - translation_context: Optional[str] = None | |
| 33 | + translation_context: Optional[str] = None, | |
| 99 | 34 | ): |
| 100 | - """ | |
| 101 | - Initialize translator. | |
| 102 | - | |
| 103 | - Args: | |
| 104 | - model: Translation model to use. Options: 'qwen' (default) or 'deepl' | |
| 105 | - api_key: API key for the selected model (or None to use from config/env) | |
| 106 | - use_cache: Whether to cache translations | |
| 107 | - timeout: Request timeout in seconds | |
| 108 | - glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL) | |
| 109 | - translation_context: Context hint for translation (e.g., "e-commerce", "product search") | |
| 110 | - """ | |
| 111 | - self.model = model.lower() | |
| 112 | - if self.model not in ['qwen', 'deepl']: | |
| 113 | - raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'") | |
| 114 | - | |
| 115 | - # Get API key from config if not provided | |
| 116 | - if api_key is None: | |
| 117 | - if self.model == 'qwen': | |
| 118 | - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | |
| 119 | - else: # deepl | |
| 120 | - api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY") | |
| 121 | - | |
| 122 | - self.api_key = api_key | |
| 123 | - self.timeout = timeout | |
| 124 | - self.use_cache = use_cache | |
| 35 | + self.model = self._normalize_model(model) | |
| 36 | + self.timeout = int(timeout) | |
| 37 | + self.use_cache = bool(use_cache) | |
| 125 | 38 | self.glossary_id = glossary_id |
| 126 | 39 | self.translation_context = translation_context or "e-commerce product search" |
| 127 | - | |
| 128 | - # Initialize OpenAI client for Qwen if needed | |
| 129 | - self.qwen_client = None | |
| 130 | - if self.model == 'qwen': | |
| 131 | - if not self.api_key: | |
| 132 | - logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.") | |
| 133 | - else: | |
| 134 | - self.qwen_client = OpenAI( | |
| 135 | - api_key=self.api_key, | |
| 136 | - base_url=self.QWEN_BASE_URL, | |
| 137 | - ) | |
| 138 | 40 | |
| 139 | - # Initialize Redis cache if enabled | |
| 140 | - if use_cache: | |
| 41 | + cache_cfg = get_translation_cache_config() | |
| 42 | + self.cache_prefix = str(cache_cfg.get("key_prefix", "trans:v2")) | |
| 43 | + self.expire_seconds = int(cache_cfg.get("ttl_seconds", 360 * 24 * 3600)) | |
| 44 | + self.cache_sliding_expiration = bool(cache_cfg.get("sliding_expiration", True)) | |
| 45 | + self.cache_include_context = bool(cache_cfg.get("key_include_context", True)) | |
| 46 | + self.cache_include_prompt = bool(cache_cfg.get("key_include_prompt", True)) | |
| 47 | + self.cache_include_source_lang = bool(cache_cfg.get("key_include_source_lang", True)) | |
| 48 | + | |
| 49 | + self.qwen_model_name = self._resolve_qwen_model_name(model) | |
| 50 | + self._api_key = api_key or self._default_api_key(self.model) | |
| 51 | + self._qwen_client: Optional[OpenAI] = None | |
| 52 | + base_url = os.getenv("DASHSCOPE_BASE_URL") or self.QWEN_DEFAULT_BASE_URL | |
| 53 | + if self._api_key: | |
| 141 | 54 | try: |
| 142 | - self.redis_client = redis.Redis( | |
| 143 | - host=REDIS_CONFIG.get('host', 'localhost'), | |
| 144 | - port=REDIS_CONFIG.get('port', 6479), | |
| 145 | - password=REDIS_CONFIG.get('password'), | |
| 146 | - decode_responses=True, # Return str instead of bytes | |
| 147 | - socket_timeout=REDIS_CONFIG.get('socket_timeout', 1), | |
| 148 | - socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1), | |
| 149 | - retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False), | |
| 150 | - health_check_interval=10, # 避免复用坏连接 | |
| 151 | - ) | |
| 152 | - # Test connection | |
| 153 | - self.redis_client.ping() | |
| 154 | - expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360) | |
| 155 | - self.expire_time = timedelta(days=expire_days) | |
| 156 | - self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数 | |
| 157 | - self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans') | |
| 158 | - logger.info("Redis cache initialized for translations") | |
| 159 | - except Exception as e: | |
| 160 | - logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache") | |
| 161 | - self.redis_client = None | |
| 162 | - self.cache = None | |
| 55 | + self._qwen_client = OpenAI(api_key=self._api_key, base_url=base_url) | |
| 56 | + except Exception as exc: | |
| 57 | + logger.warning("Failed to initialize qwen-mt client: %s", exc, exc_info=True) | |
| 163 | 58 | else: |
| 164 | - self.redis_client = None | |
| 165 | - self.cache = None | |
| 166 | - | |
| 167 | - # Thread pool for async translation | |
| 168 | - self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") | |
| 59 | + logger.warning("DASHSCOPE_API_KEY not set; qwen-mt translation unavailable") | |
| 60 | + | |
| 61 | + self.redis_client = None | |
| 62 | + if self.use_cache and bool(cache_cfg.get("enabled", True)): | |
| 63 | + self.redis_client = self._init_redis_client() | |
| 64 | + | |
| 65 | + @staticmethod | |
| 66 | + def _normalize_model(model: str) -> str: | |
| 67 | + m = (model or "qwen").strip().lower() | |
| 68 | + if m.startswith("qwen"): | |
| 69 | + return "qwen-mt" | |
| 70 | + raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'qwen-mt', 'qwen-mt-flash'") | |
| 71 | + | |
| 72 | + @staticmethod | |
| 73 | + def _resolve_qwen_model_name(model: str) -> str: | |
| 74 | + m = (model or "qwen").strip().lower() | |
| 75 | + if m in {"qwen", "qwen-mt"}: | |
| 76 | + return "qwen-mt-flash" | |
| 77 | + return m | |
| 78 | + | |
| 79 | + @staticmethod | |
| 80 | + def _default_api_key(model: str) -> Optional[str]: | |
| 81 | + del model | |
| 82 | + return DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | |
| 83 | + | |
| 84 | + def _init_redis_client(self): | |
| 85 | + try: | |
| 86 | + client = redis.Redis( | |
| 87 | + host=REDIS_CONFIG.get("host", "localhost"), | |
| 88 | + port=REDIS_CONFIG.get("port", 6479), | |
| 89 | + password=REDIS_CONFIG.get("password"), | |
| 90 | + decode_responses=True, | |
| 91 | + socket_timeout=REDIS_CONFIG.get("socket_timeout", 1), | |
| 92 | + socket_connect_timeout=REDIS_CONFIG.get("socket_connect_timeout", 1), | |
| 93 | + retry_on_timeout=REDIS_CONFIG.get("retry_on_timeout", False), | |
| 94 | + health_check_interval=10, | |
| 95 | + ) | |
| 96 | + client.ping() | |
| 97 | + return client | |
| 98 | + except Exception as exc: | |
| 99 | + logger.warning("Failed to initialize translation redis cache: %s", exc) | |
| 100 | + return None | |
| 101 | + | |
| 102 | + def _build_cache_key( | |
| 103 | + self, | |
| 104 | + text: str, | |
| 105 | + target_lang: str, | |
| 106 | + source_lang: Optional[str], | |
| 107 | + context: Optional[str], | |
| 108 | + prompt: Optional[str], | |
| 109 | + ) -> str: | |
| 110 | + src = (source_lang or "auto").strip().lower() if self.cache_include_source_lang else "-" | |
| 111 | + tgt = (target_lang or "").strip().lower() | |
| 112 | + ctx = (context or "").strip() if self.cache_include_context else "" | |
| 113 | + prm = (prompt or "").strip() if self.cache_include_prompt else "" | |
| 114 | + payload = f"model={self.model}\nsrc={src}\ntgt={tgt}\nctx={ctx}\nprm={prm}\ntext={text}" | |
| 115 | + digest = hashlib.sha256(payload.encode("utf-8")).hexdigest() | |
| 116 | + return f"{self.cache_prefix}:{self.model}:{src}:{tgt}:{digest}" | |
| 169 | 117 | |
| 170 | 118 | def translate( |
| 171 | 119 | self, |
| ... | ... | @@ -173,99 +121,27 @@ class Translator: |
| 173 | 121 | target_lang: str, |
| 174 | 122 | source_lang: Optional[str] = None, |
| 175 | 123 | context: Optional[str] = None, |
| 176 | - prompt: Optional[str] = None | |
| 124 | + prompt: Optional[str] = None, | |
| 177 | 125 | ) -> Optional[str]: |
| 178 | - """ | |
| 179 | - Translate text to target language (synchronous mode). | |
| 180 | - | |
| 181 | - Args: | |
| 182 | - text: Text to translate | |
| 183 | - target_lang: Target language code ('zh', 'en', 'ru', etc.) | |
| 184 | - source_lang: Source language code (option al, auto-detect if None) | |
| 185 | - context: Additional context for translation (overrides default context) | |
| 186 | - prompt: Translation prompt/instruction (optional, for better translation quality) | |
| 187 | - | |
| 188 | - Returns: | |
| 189 | - Translated text or None if translation fails | |
| 190 | - """ | |
| 191 | 126 | if not text or not text.strip(): |
| 192 | 127 | return text |
| 193 | 128 | |
| 194 | - # Normalize language codes | |
| 195 | - target_lang = target_lang.lower() | |
| 196 | - if source_lang: | |
| 197 | - source_lang = source_lang.lower() | |
| 198 | - | |
| 199 | - # Optimization: Skip translation if not needed | |
| 200 | - if target_lang == 'en' and self._is_english_text(text): | |
| 201 | - logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") | |
| 129 | + tgt = (target_lang or "").strip().lower() | |
| 130 | + src = (source_lang or "").strip().lower() or None | |
| 131 | + if tgt == "en" and self._is_english_text(text): | |
| 202 | 132 | return text |
| 203 | - | |
| 204 | - if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | |
| 205 | - logger.info( | |
| 206 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 207 | - f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)" | |
| 208 | - ) | |
| 133 | + if tgt == "zh" and (self._contains_chinese(text) or self._is_pure_number(text)): | |
| 209 | 134 | return text |
| 210 | 135 | |
| 211 | - # Use provided context or default context | |
| 212 | 136 | translation_context = context or self.translation_context |
| 213 | - | |
| 214 | - # Build cache key (include prompt in cache key if provided) | |
| 215 | - cache_key_parts = [source_lang or 'auto', target_lang, translation_context] | |
| 216 | - if prompt: | |
| 217 | - cache_key_parts.append(prompt) | |
| 218 | - cache_key_parts.append(text) | |
| 219 | - cache_key = ':'.join(cache_key_parts) | |
| 137 | + cached = self._get_cached_translation_redis(text, tgt, src, translation_context, prompt) | |
| 138 | + if cached is not None: | |
| 139 | + return cached | |
| 220 | 140 | |
| 221 | - # Check cache (include context and prompt in cache key for accuracy) | |
| 222 | - if self.use_cache and self.redis_client: | |
| 223 | - cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt) | |
| 224 | - if cached: | |
| 225 | - logger.info( | |
| 226 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 227 | - f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit" | |
| 228 | - ) | |
| 229 | - return cached | |
| 230 | - | |
| 231 | - # If no API key, return mock translation (for testing) | |
| 232 | - if not self.api_key: | |
| 233 | - logger.info( | |
| 234 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 235 | - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)" | |
| 236 | - ) | |
| 237 | - return text | |
| 238 | - | |
| 239 | - # Translate using selected model | |
| 240 | - logger.info( | |
| 241 | - f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | " | |
| 242 | - f"Source language: {source_lang or 'auto'} | Context: {translation_context} | " | |
| 243 | - f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation" | |
| 244 | - ) | |
| 245 | - | |
| 246 | - if self.model == 'qwen': | |
| 247 | - result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt) | |
| 248 | - else: # deepl | |
| 249 | - result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) | |
| 250 | - | |
| 251 | - # Surface translation failure to the caller instead of silently | |
| 252 | - # masquerading the source text as a successful translation. | |
| 253 | - if result is None: | |
| 254 | - logger.warning( | |
| 255 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 256 | - f"Source language: {source_lang or 'auto'} | Status: Translation failed" | |
| 257 | - ) | |
| 258 | - else: | |
| 259 | - logger.info( | |
| 260 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 261 | - f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" | |
| 262 | - ) | |
| 263 | - | |
| 264 | - # Cache only successful translations. Failed attempts must not poison | |
| 265 | - # Redis with the original text. | |
| 266 | - if result is not None and self.use_cache and self.redis_client: | |
| 267 | - self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) | |
| 141 | + result = self._translate_qwen(text, tgt, src) | |
| 268 | 142 | |
| 143 | + if result is not None: | |
| 144 | + self._set_cached_translation_redis(text, tgt, result, src, translation_context, prompt) | |
| 269 | 145 | return result |
| 270 | 146 | |
| 271 | 147 | def _translate_qwen( |
| ... | ... | @@ -273,412 +149,63 @@ class Translator: |
| 273 | 149 | text: str, |
| 274 | 150 | target_lang: str, |
| 275 | 151 | source_lang: Optional[str], |
| 276 | - context: Optional[str] = None, | |
| 277 | - prompt: Optional[str] = None | |
| 278 | 152 | ) -> Optional[str]: |
| 279 | - """ | |
| 280 | - Translate using Qwen MT Flash model via Alibaba Cloud DashScope API. | |
| 281 | - | |
| 282 | - Args: | |
| 283 | - text: Text to translate | |
| 284 | - target_lang: Target language code ('zh', 'en', 'ru', etc.) | |
| 285 | - source_lang: Source language code (optional, 'auto' if None) | |
| 286 | - context: Context hint for translation (optional) | |
| 287 | - prompt: Translation prompt/instruction (optional) | |
| 288 | - | |
| 289 | - Returns: | |
| 290 | - Translated text or None if translation fails | |
| 291 | - """ | |
| 292 | - if not self.qwen_client: | |
| 293 | - logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.") | |
| 153 | + if not self._qwen_client: | |
| 294 | 154 | return None |
| 295 | - | |
| 296 | - # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping | |
| 297 | - # 标准来自:你提供的“语言 / 英文名 / 代码”表 | |
| 298 | - qwen_lang_map = { | |
| 299 | - "en": "English", | |
| 300 | - "zh": "Chinese", | |
| 301 | - "zh_tw": "Traditional Chinese", | |
| 302 | - "ru": "Russian", | |
| 303 | - "ja": "Japanese", | |
| 304 | - "ko": "Korean", | |
| 305 | - "es": "Spanish", | |
| 306 | - "fr": "French", | |
| 307 | - "pt": "Portuguese", | |
| 308 | - "de": "German", | |
| 309 | - "it": "Italian", | |
| 310 | - "th": "Thai", | |
| 311 | - "vi": "Vietnamese", | |
| 312 | - "id": "Indonesian", | |
| 313 | - "ms": "Malay", | |
| 314 | - "ar": "Arabic", | |
| 315 | - "hi": "Hindi", | |
| 316 | - "he": "Hebrew", | |
| 317 | - "my": "Burmese", | |
| 318 | - "ta": "Tamil", | |
| 319 | - "ur": "Urdu", | |
| 320 | - "bn": "Bengali", | |
| 321 | - "pl": "Polish", | |
| 322 | - "nl": "Dutch", | |
| 323 | - "ro": "Romanian", | |
| 324 | - "tr": "Turkish", | |
| 325 | - "km": "Khmer", | |
| 326 | - "lo": "Lao", | |
| 327 | - "yue": "Cantonese", | |
| 328 | - "cs": "Czech", | |
| 329 | - "el": "Greek", | |
| 330 | - "sv": "Swedish", | |
| 331 | - "hu": "Hungarian", | |
| 332 | - "da": "Danish", | |
| 333 | - "fi": "Finnish", | |
| 334 | - "uk": "Ukrainian", | |
| 335 | - "bg": "Bulgarian", | |
| 336 | - } | |
| 337 | - | |
| 338 | - # Convert target language | |
| 339 | - target_lang_normalized = target_lang.lower() | |
| 340 | - target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize()) | |
| 341 | - | |
| 342 | - # Convert source language | |
| 343 | - source_lang_normalized = (source_lang or "").strip().lower() | |
| 344 | - if not source_lang_normalized or source_lang_normalized == "auto": | |
| 345 | - source_lang_qwen = "auto" | |
| 346 | - else: | |
| 347 | - source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize()) | |
| 348 | - | |
| 349 | - # Prepare translation options | |
| 350 | - translation_options = { | |
| 351 | - "source_lang": source_lang_qwen, | |
| 352 | - "target_lang": target_lang_qwen, | |
| 353 | - } | |
| 354 | - | |
| 355 | - # Prepare messages | |
| 356 | - messages = [ | |
| 357 | - { | |
| 358 | - "role": "user", | |
| 359 | - "content": text | |
| 360 | - } | |
| 361 | - ] | |
| 362 | - | |
| 363 | - start_time = time.time() | |
| 155 | + tgt_norm = (target_lang or "").strip().lower() | |
| 156 | + src_norm = (source_lang or "").strip().lower() | |
| 157 | + tgt_qwen = SOURCE_LANG_CODE_MAP.get(tgt_norm, tgt_norm.capitalize()) | 
| 158 | + src_qwen = "auto" if not src_norm or src_norm == "auto" else SOURCE_LANG_CODE_MAP.get(src_norm, src_norm.capitalize()) | 
| 159 | + start = time.time() | |
| 364 | 160 | try: |
| 365 | - completion = self.qwen_client.chat.completions.create( | |
| 366 | - model=self.QWEN_MODEL, | |
| 367 | - messages=messages, | |
| 161 | + completion = self._qwen_client.chat.completions.create( | |
| 162 | + model=self.qwen_model_name, | |
| 163 | + messages=[{"role": "user", "content": text}], | |
| 368 | 164 | extra_body={ |
| 369 | - "translation_options": translation_options | |
| 370 | - } | |
| 371 | - ) | |
| 372 | - | |
| 373 | - translated_text = completion.choices[0].message.content.strip() | |
| 374 | - duration_ms = (time.time() - start_time) * 1000 | |
| 375 | - | |
| 376 | - logger.info( | |
| 377 | - f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | " | |
| 378 | - f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms" | |
| 379 | - ) | |
| 380 | - return translated_text | |
| 381 | - | |
| 382 | - except Exception as e: | |
| 383 | - duration_ms = (time.time() - start_time) * 1000 | |
| 384 | - logger.error( | |
| 385 | - f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | " | |
| 386 | - f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True | |
| 387 | - ) | |
| 388 | - return None | |
| 389 | - | |
| 390 | - def _translate_deepl( | |
| 391 | - self, | |
| 392 | - text: str, | |
| 393 | - target_lang: str, | |
| 394 | - source_lang: Optional[str], | |
| 395 | - context: Optional[str] = None, | |
| 396 | - prompt: Optional[str] = None | |
| 397 | - ) -> Optional[str]: | |
| 398 | - """ | |
| 399 | - Translate using DeepL API with context and glossary support. | |
| 400 | - | |
| 401 | - Args: | |
| 402 | - text: Text to translate | |
| 403 | - target_lang: Target language code | |
| 404 | - source_lang: Source language code (optional) | |
| 405 | - context: Context hint for translation (e.g., "e-commerce product search") | |
| 406 | - """ | |
| 407 | - # Map to DeepL language codes | |
| 408 | - target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper()) | |
| 409 | - | |
| 410 | - headers = { | |
| 411 | - "Authorization": f"DeepL-Auth-Key {self.api_key}", | |
| 412 | - "Content-Type": "application/json", | |
| 413 | - } | |
| 414 | - | |
| 415 | - # Use prompt as context parameter for DeepL API (not as text prefix) | |
| 416 | - # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself" | |
| 417 | - # If prompt is provided, use it as context; otherwise use the default context | |
| 418 | - api_context = prompt if prompt else context | |
| 419 | - | |
| 420 | - # For e-commerce, add context words to help DeepL understand the domain | |
| 421 | - # This is especially important for single-word ambiguous terms like "车" (car vs rook) | |
| 422 | - text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context) | |
| 423 | - | |
| 424 | - payload = { | |
| 425 | - "text": [text_to_translate], | |
| 426 | - "target_lang": target_code, | |
| 427 | - } | |
| 428 | - | |
| 429 | - if source_lang: | |
| 430 | - source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) | |
| 431 | - payload["source_lang"] = source_code | |
| 432 | - | |
| 433 | - # Add context parameter (prompt or default context) | |
| 434 | - # Context influences translation but is not translated itself | |
| 435 | - if api_context: | |
| 436 | - payload["context"] = api_context | |
| 437 | - | |
| 438 | - # Add glossary if configured | |
| 439 | - if self.glossary_id: | |
| 440 | - payload["glossary_id"] = self.glossary_id | |
| 441 | - | |
| 442 | - # Note: DeepL API v2 supports "context" parameter for additional context | |
| 443 | - # that influences translation but is not translated itself. | |
| 444 | - # We use prompt as context parameter when provided. | |
| 445 | - | |
| 446 | - try: | |
| 447 | - response = requests.post( | |
| 448 | - self.DEEPL_API_URL, | |
| 449 | - headers=headers, | |
| 450 | - json=payload, | |
| 451 | - timeout=self.timeout | |
| 165 | + "translation_options": { | |
| 166 | + "source_lang": src_qwen, | |
| 167 | + "target_lang": tgt_qwen, | |
| 168 | + } | |
| 169 | + }, | |
| 170 | + timeout=self.timeout, | |
| 452 | 171 | ) |
| 453 | - | |
| 454 | - if response.status_code == 200: | |
| 455 | - data = response.json() | |
| 456 | - if "translations" in data and len(data["translations"]) > 0: | |
| 457 | - translated_text = data["translations"][0]["text"] | |
| 458 | - # If we added context, extract just the term from the result | |
| 459 | - if needs_extraction: | |
| 460 | - translated_text = self._extract_term_from_translation( | |
| 461 | - translated_text, text, target_code | |
| 462 | - ) | |
| 463 | - logger.debug( | |
| 464 | - f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | " | |
| 465 | - f"Translation result: '{translated_text}'" | |
| 466 | - ) | |
| 467 | - return translated_text | |
| 468 | - else: | |
| 469 | - logger.error( | |
| 470 | - f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | " | |
| 471 | - f"Status code: {response.status_code} | Error message: {response.text}" | |
| 472 | - ) | |
| 172 | + content = (completion.choices[0].message.content or "").strip() | |
| 173 | + if not content: | |
| 473 | 174 | return None |
| 474 | - | |
| 475 | - except requests.Timeout: | |
| 175 | + logger.info("[qwen-mt] Success | src=%s tgt=%s latency=%.1fms", src_qwen, tgt_qwen, (time.time() - start) * 1000) | |
| 176 | + return content | |
| 177 | + except Exception as exc: | |
| 476 | 178 | logger.warning( |
| 477 | - f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | " | |
| 478 | - f"Timeout: {self.timeout}s" | |
| 479 | - ) | |
| 480 | - return None | |
| 481 | - except Exception as e: | |
| 482 | - logger.error( | |
| 483 | - f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | " | |
| 484 | - f"Error: {e}", exc_info=True | |
| 179 | + "[qwen-mt] Failed | src=%s tgt=%s latency=%.1fms error=%s", | |
| 180 | + src_qwen, | |
| 181 | + tgt_qwen, | |
| 182 | + (time.time() - start) * 1000, | |
| 183 | + exc, | |
| 184 | + exc_info=True, | |
| 485 | 185 | ) |
| 486 | 186 | return None |
| 487 | 187 | |
| 488 | - # NOTE: _translate_deepl_free is intentionally not implemented. | |
| 489 | - # We do not support automatic fallback to the free endpoint, to avoid | |
| 490 | - # mixing Pro keys with https://api-free.deepl.com and related 403 errors. | |
| 491 | - | |
| 492 | - def translate_multi( | |
| 493 | - self, | |
| 494 | - text: str, | |
| 495 | - target_langs: List[str], | |
| 496 | - source_lang: Optional[str] = None, | |
| 497 | - context: Optional[str] = None, | |
| 498 | - async_mode: bool = True, | |
| 499 | - prompt: Optional[str] = None | |
| 500 | - ) -> Dict[str, Optional[str]]: | |
| 501 | - """ | |
| 502 | - Translate text to multiple target languages. | |
| 503 | - | |
| 504 | - In async_mode=True (default): | |
| 505 | - - Returns cached translations immediately if available | |
| 506 | - - For translations that can be optimized (e.g., pure numbers, already in target language), | |
| 507 | - returns result immediately via synchronous call | |
| 508 | - - Launches async tasks for other missing translations (non-blocking) | |
| 509 | - - Returns None for missing translations that require async processing | |
| 510 | - | |
| 511 | - In async_mode=False: | |
| 512 | - - Waits for all translations to complete (blocking) | |
| 513 | - | |
| 514 | - Args: | |
| 515 | - text: Text to translate | |
| 516 | - target_langs: List of target language codes | |
| 517 | - source_lang: Source language code (optional) | |
| 518 | - context: Context hint for translation (optional) | |
| 519 | - async_mode: If True, return cached results immediately and translate missing ones async | |
| 520 | - prompt: Translation prompt/instruction (optional) | |
| 521 | 188 | |
| 522 | - Returns: | |
| 523 | - Dictionary mapping language code to translated text (only cached results in async mode) | |
| 524 | - """ | |
| 525 | - results = {} | |
| 526 | - missing_langs = [] | |
| 527 | - async_langs = [] | |
| 528 | - | |
| 529 | - # First, get cached translations | |
| 530 | - for lang in target_langs: | |
| 531 | - cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | |
| 532 | - if cached is not None: | |
| 533 | - results[lang] = cached | |
| 534 | - else: | |
| 535 | - missing_langs.append(lang) | |
| 536 | - | |
| 537 | - # If async mode and there are missing translations | |
| 538 | - if async_mode and missing_langs: | |
| 539 | - # Check if translation can be optimized (immediate return) | |
| 540 | - for lang in missing_langs: | |
| 541 | - target_lang = lang.lower() | |
| 542 | - # Check optimization conditions (same as in translate method) | |
| 543 | - can_optimize = False | |
| 544 | - if target_lang == 'en' and self._is_english_text(text): | |
| 545 | - can_optimize = True | |
| 546 | - elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | |
| 547 | - can_optimize = True | |
| 548 | - | |
| 549 | - if can_optimize: | |
| 550 | - # Can be optimized, call translate synchronously for immediate result | |
| 551 | - results[lang] = self.translate(text, lang, source_lang, context, prompt) | |
| 552 | - else: | |
| 553 | - # Requires actual translation, add to async list | |
| 554 | - async_langs.append(lang) | |
| 555 | - | |
| 556 | - # Launch async tasks for translations that require actual API calls | |
| 557 | - if async_langs: | |
| 558 | - for lang in async_langs: | |
| 559 | - self._translate_async(text, lang, source_lang, context, prompt) | |
| 560 | - # Return None for async translations | |
| 561 | - for lang in async_langs: | |
| 562 | - results[lang] = None | |
| 563 | - else: | |
| 564 | - # Synchronous mode: wait for all translations | |
| 565 | - for lang in missing_langs: | |
| 566 | - results[lang] = self.translate(text, lang, source_lang, context, prompt) | |
| 567 | - | |
| 568 | - return results | |
| 569 | - | |
| 570 | - def translate_multi_async( | |
| 571 | - self, | |
| 572 | - text: str, | |
| 573 | - target_langs: List[str], | |
| 574 | - source_lang: Optional[str] = None, | |
| 575 | - context: Optional[str] = None, | |
| 576 | - prompt: Optional[str] = None | |
| 577 | - ) -> Dict[str, Union[str, Future]]: | |
| 578 | - """ | |
| 579 | - Translate text to multiple target languages asynchronously, returning Futures that can be awaited. | |
| 580 | - | |
| 581 | - This method returns a dictionary where: | |
| 582 | - - If translation is cached, the value is the translation string (immediate) | |
| 583 | - - If translation needs to be done, the value is a Future object that can be awaited | |
| 584 | - | |
| 585 | - Args: | |
| 586 | - text: Text to translate | |
| 587 | - target_langs: List of target language codes | |
| 588 | - source_lang: Source language code (optional) | |
| 589 | - context: Context hint for translation (optional) | |
| 590 | - prompt: Translation prompt/instruction (optional) | |
| 591 | - | |
| 592 | - Returns: | |
| 593 | - Dictionary mapping language code to either translation string (cached) or Future object | |
| 594 | - """ | |
| 595 | - results = {} | |
| 596 | - missing_langs = [] | |
| 597 | - | |
| 598 | - # First, get cached translations | |
| 599 | - for lang in target_langs: | |
| 600 | - cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | |
| 601 | - if cached is not None: | |
| 602 | - results[lang] = cached | |
| 603 | - else: | |
| 604 | - missing_langs.append(lang) | |
| 605 | - | |
| 606 | - # For missing translations, submit async tasks and return Futures | |
| 607 | - for lang in missing_langs: | |
| 608 | - future = self.executor.submit( | |
| 609 | - self.translate, | |
| 610 | - text, | |
| 611 | - lang, | |
| 612 | - source_lang, | |
| 613 | - context, | |
| 614 | - prompt | |
| 615 | - ) | |
| 616 | - results[lang] = future | |
| 617 | - | |
| 618 | - return results | |
| 619 | - | |
| 620 | - def _get_cached_translation( | |
| 621 | - self, | |
| 622 | - text: str, | |
| 623 | - target_lang: str, | |
| 624 | - source_lang: Optional[str] = None, | |
| 625 | - context: Optional[str] = None, | |
| 626 | - prompt: Optional[str] = None | |
| 627 | - ) -> Optional[str]: | |
| 628 | - """Get translation from cache if available.""" | |
| 629 | - if not self.redis_client: | |
| 630 | - return None | |
| 631 | - return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | |
| 632 | - | |
| 633 | 189 | def _get_cached_translation_redis( |
| 634 | 190 | self, |
| 635 | 191 | text: str, |
| 636 | 192 | target_lang: str, |
| 637 | 193 | source_lang: Optional[str] = None, |
| 638 | 194 | context: Optional[str] = None, |
| 639 | - prompt: Optional[str] = None | |
| 195 | + prompt: Optional[str] = None, | |
| 640 | 196 | ) -> Optional[str]: |
| 641 | - """ | |
| 642 | - Get translation from Redis cache with sliding expiration. | |
| 643 | - | |
| 644 | - 滑动过期机制:每次访问缓存时,重置过期时间为配置的过期时间(默认720天)。 | |
| 645 | - 这样缓存会在最后一次访问后的720天才过期,而不是写入后的720天。 | |
| 646 | - 这确保了常用的翻译缓存不会被过早删除。 | |
| 647 | - """ | |
| 648 | 197 | if not self.redis_client: |
| 649 | 198 | return None |
| 650 | - | |
| 199 | + key = self._build_cache_key(text, target_lang, source_lang, context, prompt) | |
| 651 | 200 | try: |
| 652 | - # Build cache key: prefix:target_lang:text | |
| 653 | - # For simplicity, we use target_lang and text as key | |
| 654 | - # Context and prompt are not included in key to maximize cache hits | |
| 655 | - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" | |
| 656 | - value = self.redis_client.get(cache_key) | |
| 657 | - if value: | |
| 658 | - # Sliding expiration: reset expiration time on access | |
| 659 | - # 每次读取缓存时,重置过期时间为配置的过期时间(最后一次访问后的N天才过期) | |
| 660 | - try: | |
| 661 | - self.redis_client.expire(cache_key, self.expire_seconds) | |
| 662 | - except Exception as expire_error: | |
| 663 | - # 即使 expire 失败,也返回缓存值(不影响功能) | |
| 664 | - logger.warning( | |
| 665 | - f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}" | |
| 666 | - ) | |
| 667 | - | |
| 668 | - logger.debug( | |
| 669 | - f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | " | |
| 670 | - f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s" | |
| 671 | - ) | |
| 672 | - return value | |
| 673 | - logger.debug( | |
| 674 | - f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | " | |
| 675 | - f"Cache key: {cache_key}" | |
| 676 | - ) | |
| 201 | + value = self.redis_client.get(key) | |
| 202 | + if value and self.cache_sliding_expiration: | |
| 203 | + self.redis_client.expire(key, self.expire_seconds) | |
| 204 | + return value | |
| 205 | + except Exception as exc: | |
| 206 | + logger.warning("Redis get translation cache failed: %s", exc) | |
| 677 | 207 | return None |
| 678 | - except Exception as e: | |
| 679 | - logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}") | |
| 680 | - return None | |
| 681 | - | |
| 208 | + | |
| 682 | 209 | def _set_cached_translation_redis( |
| 683 | 210 | self, |
| 684 | 211 | text: str, |
| ... | ... | @@ -686,128 +213,17 @@ class Translator: |
| 686 | 213 | translation: str, |
| 687 | 214 | source_lang: Optional[str] = None, |
| 688 | 215 | context: Optional[str] = None, |
| 689 | - prompt: Optional[str] = None | |
| 216 | + prompt: Optional[str] = None, | |
| 690 | 217 | ) -> None: |
| 691 | - """Store translation in Redis cache.""" | |
| 692 | 218 | if not self.redis_client: |
| 693 | 219 | return |
| 694 | - | |
| 220 | + key = self._build_cache_key(text, target_lang, source_lang, context, prompt) | |
| 695 | 221 | try: |
| 696 | - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" | |
| 697 | - self.redis_client.setex(cache_key, self.expire_seconds, translation) | |
| 698 | - logger.info( | |
| 699 | - f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | " | |
| 700 | - f"Cache key: {cache_key} | Translation result: '{translation}'" | |
| 701 | - ) | |
| 702 | - except Exception as e: | |
| 703 | - logger.error( | |
| 704 | - f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | " | |
| 705 | - f"Error: {e}" | |
| 706 | - ) | |
| 707 | - | |
| 708 | - def _translate_async( | |
| 709 | - self, | |
| 710 | - text: str, | |
| 711 | - target_lang: str, | |
| 712 | - source_lang: Optional[str] = None, | |
| 713 | - context: Optional[str] = None, | |
| 714 | - prompt: Optional[str] = None | |
| 715 | - ): | |
| 716 | - """Launch async translation task.""" | |
| 717 | - def _do_translate(): | |
| 718 | - try: | |
| 719 | - result = self.translate(text, target_lang, source_lang, context, prompt) | |
| 720 | - if result: | |
| 721 | - logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}") | |
| 722 | - except Exception as e: | |
| 723 | - logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}") | |
| 724 | - | |
| 725 | - self.executor.submit(_do_translate) | |
| 726 | - | |
| 727 | - def _add_ecommerce_context( | |
| 728 | - self, | |
| 729 | - text: str, | |
| 730 | - source_lang: Optional[str], | |
| 731 | - context: Optional[str] | |
| 732 | - ) -> tuple: | |
| 733 | - """ | |
| 734 | - Add e-commerce context to text for better disambiguation. | |
| 735 | - | |
| 736 | - For single-word ambiguous Chinese terms, we add context words that help | |
| 737 | - DeepL understand this is an e-commerce/product search context. | |
| 738 | - | |
| 739 | - Args: | |
| 740 | - text: Original text to translate | |
| 741 | - source_lang: Source language code | |
| 742 | - context: Context hint | |
| 743 | - | |
| 744 | - Returns: | |
| 745 | - Tuple of (text_with_context, needs_extraction) | |
| 746 | - - text_with_context: Text to send to DeepL | |
| 747 | - - needs_extraction: Whether we need to extract the term from the result | |
| 748 | - """ | |
| 749 | - # Only apply for e-commerce context and Chinese source | |
| 750 | - if not context or "e-commerce" not in context.lower(): | |
| 751 | - return text, False | |
| 752 | - | |
| 753 | - if not source_lang or source_lang.lower() != 'zh': | |
| 754 | - return text, False | |
| 755 | - | |
| 756 | - # For single-word queries, add context to help disambiguation | |
| 757 | - text_stripped = text.strip() | |
| 758 | - if len(text_stripped.split()) == 1 and len(text_stripped) <= 2: | |
| 759 | - # Common ambiguous Chinese e-commerce terms like "车" (car vs rook) | |
| 760 | - # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term]) | |
| 761 | - # This helps DeepL understand the e-commerce context | |
| 762 | - # We'll need to extract just the term from the translation result | |
| 763 | - context_phrase = f"购买 {text_stripped}" | |
| 764 | - return context_phrase, True | |
| 765 | - | |
| 766 | - # For multi-word queries, DeepL usually has enough context | |
| 767 | - return text, False | |
| 768 | - | |
| 769 | - def _extract_term_from_translation( | |
| 770 | - self, | |
| 771 | - translated_text: str, | |
| 772 | - original_text: str, | |
| 773 | - target_lang_code: str | |
| 774 | - ) -> str: | |
| 775 | - """ | |
| 776 | - Extract the actual term from a translation that included context. | |
| 777 | - | |
| 778 | - For example, if we translated "购买 车" (buy car) and got "buy car", | |
| 779 | - we want to extract just "car". | |
| 780 | - | |
| 781 | - Args: | |
| 782 | - translated_text: Full translation result | |
| 783 | - original_text: Original single-word query | |
| 784 | - target_lang_code: Target language code (EN, ZH, etc.) | |
| 785 | - | |
| 786 | - Returns: | |
| 787 | - Extracted term or original translation if extraction fails | |
| 788 | - """ | |
| 789 | - # For English target, try to extract the last word (the actual term) | |
| 790 | - if target_lang_code == "EN": | |
| 791 | - words = translated_text.strip().split() | |
| 792 | - if len(words) > 1: | |
| 793 | - # Usually the last word is the term we want | |
| 794 | - # But we need to be smart - if it's "buy car", we want "car" | |
| 795 | - # Common context words to skip: buy, purchase, product, item, etc. | |
| 796 | - context_words = {"buy", "purchase", "product", "item", "commodity", "goods"} | |
| 797 | - # Try to find the term (not a context word) | |
| 798 | - for word in reversed(words): | |
| 799 | - word_lower = word.lower().rstrip('.,!?;:') | |
| 800 | - if word_lower not in context_words: | |
| 801 | - return word_lower | |
| 802 | - # If all words are context words, return the last one | |
| 803 | - return words[-1].lower().rstrip('.,!?;:') | |
| 804 | - | |
| 805 | - # For other languages or if extraction fails, return as-is | |
| 806 | - # The user can configure a glossary for better results | |
| 807 | - return translated_text | |
| 222 | + self.redis_client.setex(key, self.expire_seconds, translation) | |
| 223 | + except Exception as exc: | |
| 224 | + logger.warning("Redis set translation cache failed: %s", exc) | |
| 808 | 225 | |
| 809 | 226 | def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool: |
| 810 | - """True if shop language matches index language (use source, no translate).""" | |
| 811 | 227 | if not shop_lang_lower or not lang_code: |
| 812 | 228 | return False |
| 813 | 229 | if shop_lang_lower == lang_code: |
| ... | ... | @@ -818,146 +234,27 @@ class Translator: |
| 818 | 234 | return True |
| 819 | 235 | return False |
| 820 | 236 | |
| 821 | - def translate_for_indexing( | |
| 822 | - self, | |
| 823 | - text: str, | |
| 824 | - shop_language: str, | |
| 825 | - source_lang: Optional[str] = None, | |
| 826 | - context: Optional[str] = None, | |
| 827 | - prompt: Optional[str] = None, | |
| 828 | - index_languages: Optional[List[str]] = None, | |
| 829 | - ) -> Dict[str, Optional[str]]: | |
| 830 | - """ | |
| 831 | - Translate text for indexing based on shop language and tenant index_languages. | |
| 832 | - | |
| 833 | - For each language in index_languages: use source text if shop language matches, | |
| 834 | - otherwise translate to that language. | |
| 835 | - | |
| 836 | - Args: | |
| 837 | - text: Text to translate | |
| 838 | - shop_language: Shop primary language (e.g. 'zh', 'en', 'ru') | |
| 839 | - source_lang: Source language code (optional) | |
| 840 | - context: Additional context for translation (optional) | |
| 841 | - prompt: Translation prompt (optional) | |
| 842 | - index_languages: Languages to index (from tenant_config). Default ["en", "zh"]. | |
| 843 | - | |
| 844 | - Returns: | |
| 845 | - Dict keyed by each index_language with translated or source text (or None). | |
| 846 | - """ | |
| 847 | - langs = index_languages if index_languages else ["en", "zh"] | |
| 848 | - results = {lang: None for lang in langs} | |
| 849 | - if not text or not text.strip(): | |
| 850 | - return results | |
| 851 | - if re.match(r'^[\d\s_-]+$', text): | |
| 852 | - logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'") | |
| 853 | - return results | |
| 854 | - | |
| 855 | - shop_lang_lower = (shop_language or "").strip().lower() | |
| 856 | - targets = [] | |
| 857 | - for lang in langs: | |
| 858 | - if self._shop_lang_matches(shop_lang_lower, lang): | |
| 859 | - results[lang] = text | |
| 860 | - else: | |
| 861 | - targets.append(lang) | |
| 862 | - | |
| 863 | - for target_lang in targets: | |
| 864 | - cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | |
| 865 | - if cached: | |
| 866 | - results[target_lang] = cached | |
| 867 | - logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}") | |
| 868 | - continue | |
| 869 | - translated = self.translate( | |
| 870 | - text, | |
| 871 | - target_lang=target_lang, | |
| 872 | - source_lang=source_lang or shop_language, | |
| 873 | - context=context, | |
| 874 | - prompt=prompt, | |
| 875 | - ) | |
| 876 | - results[target_lang] = translated | |
| 877 | - return results | |
| 878 | - | |
| 879 | - def get_translation_needs( | |
| 880 | - self, | |
| 881 | - detected_lang: str, | |
| 882 | - supported_langs: List[str] | |
| 883 | - ) -> List[str]: | |
| 884 | - """ | |
| 885 | - Determine which languages need translation. | |
| 886 | - | |
| 887 | - Args: | |
| 888 | - detected_lang: Detected query language | |
| 889 | - supported_langs: List of supported languages | |
| 890 | - | |
| 891 | - Returns: | |
| 892 | - List of language codes to translate to | |
| 893 | - """ | |
| 894 | - # If detected language is in supported list, translate to others | |
| 237 | + def get_translation_needs(self, detected_lang: str, supported_langs: List[str]) -> List[str]: | |
| 895 | 238 | if detected_lang in supported_langs: |
| 896 | - return [lang for lang in supported_langs if detected_lang != lang] | |
| 897 | - | |
| 898 | - # Otherwise, translate to all supported languages | |
| 239 | + return [lang for lang in supported_langs if lang != detected_lang] | |
| 899 | 240 | return supported_langs |
| 900 | - | |
| 241 | + | |
| 901 | 242 | def _is_english_text(self, text: str) -> bool: |
| 902 | - """ | |
| 903 | - Check if text is primarily English (ASCII letters, numbers, common punctuation). | |
| 904 | - | |
| 905 | - Args: | |
| 906 | - text: Text to check | |
| 907 | - | |
| 908 | - Returns: | |
| 909 | - True if text appears to be English | |
| 910 | - """ | |
| 911 | 243 | if not text or not text.strip(): |
| 912 | 244 | return True |
| 913 | - | |
| 914 | - # Remove whitespace and common punctuation | |
| 915 | - text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) | |
| 245 | + text_clean = re.sub(r"[\s\.,!?;:\-\'\"\(\)\[\]{}]", "", text) | |
| 916 | 246 | if not text_clean: |
| 917 | 247 | return True |
| 918 | - | |
| 919 | - # Check if all remaining characters are ASCII (letters, numbers) | |
| 920 | - # This is a simple heuristic: if most characters are ASCII, it's likely English | |
| 921 | 248 | ascii_count = sum(1 for c in text_clean if ord(c) < 128) |
| 922 | - ratio = ascii_count / len(text_clean) if text_clean else 0 | |
| 923 | - | |
| 924 | - # If more than 80% are ASCII characters, consider it English | |
| 925 | - return ratio > 0.8 | |
| 926 | - | |
| 249 | + return (ascii_count / len(text_clean)) > 0.8 | |
| 250 | + | |
| 927 | 251 | def _contains_chinese(self, text: str) -> bool: |
| 928 | - """ | |
| 929 | - Check if text contains Chinese characters (Han characters). | |
| 930 | - | |
| 931 | - Args: | |
| 932 | - text: Text to check | |
| 933 | - | |
| 934 | - Returns: | |
| 935 | - True if text contains Chinese characters | |
| 936 | - """ | |
| 937 | 252 | if not text: |
| 938 | 253 | return False |
| 939 | - | |
| 940 | - # Check for Chinese characters (Unicode range: \u4e00-\u9fff) | |
| 941 | - chinese_pattern = re.compile(r'[\u4e00-\u9fff]') | |
| 942 | - return bool(chinese_pattern.search(text)) | |
| 943 | - | |
| 254 | + return bool(re.search(r"[\u4e00-\u9fff]", text)) | |
| 255 | + | |
| 944 | 256 | def _is_pure_number(self, text: str) -> bool: |
| 945 | - """ | |
| 946 | - Check if text is purely numeric (digits, possibly with spaces, dots, commas). | |
| 947 | - | |
| 948 | - Args: | |
| 949 | - text: Text to check | |
| 950 | - | |
| 951 | - Returns: | |
| 952 | - True if text is purely numeric | |
| 953 | - """ | |
| 954 | 257 | if not text or not text.strip(): |
| 955 | 258 | return False |
| 956 | - | |
| 957 | - # Remove whitespace, dots, commas (common number separators) | |
| 958 | - text_clean = re.sub(r'[\s\.,]', '', text.strip()) | |
| 959 | - if not text_clean: | |
| 960 | - return False | |
| 961 | - | |
| 962 | - # Check if all remaining characters are digits | |
| 963 | - return text_clean.isdigit() | |
| 259 | + text_clean = re.sub(r"[\s\.,]", "", text.strip()) | |
| 260 | + return bool(text_clean) and text_clean.isdigit() | ... | ... |
query/test_translation.py
| ... | ... | @@ -14,6 +14,7 @@ Test content: |
| 14 | 14 | import sys |
| 15 | 15 | import os |
| 16 | 16 | from pathlib import Path |
| 17 | +from concurrent.futures import ThreadPoolExecutor | |
| 17 | 18 | |
| 18 | 19 | # Add parent directory to path |
| 19 | 20 | sys.path.insert(0, str(Path(__file__).parent.parent)) |
| ... | ... | @@ -42,9 +43,6 @@ def test_config_loading(): |
| 42 | 43 | |
| 43 | 44 | print(f"✓ Configuration loaded successfully") |
| 44 | 45 | print(f" Translation service: {config.query_config.translation_service}") |
| 45 | - print(f" Translation prompt configuration:") | |
| 46 | - for key, value in config.query_config.translation_prompts.items(): | |
| 47 | - print(f" {key}: {value[:60]}..." if len(value) > 60 else f" {key}: {value}") | |
| 48 | 46 | |
| 49 | 47 | return config |
| 50 | 48 | except Exception as e: |
| ... | ... | @@ -72,34 +70,23 @@ def test_translator_sync(config): |
| 72 | 70 | translation_context=config.query_config.translation_context |
| 73 | 71 | ) |
| 74 | 72 | |
| 75 | - # 测试商品标题翻译(使用product_title提示词) | |
| 73 | + # 测试商品标题翻译(使用sku_name提示词) | |
| 76 | 74 | test_texts = [ |
| 77 | - ("蓝牙耳机", "zh", "en", "product_title"), | |
| 78 | - ("Wireless Headphones", "en", "zh", "product_title"), | |
| 75 | + ("蓝牙耳机", "zh", "en", "sku_name"), | |
| 76 | + ("Wireless Headphones", "en", "zh", "sku_name"), | |
| 79 | 77 | ] |
| 80 | 78 | |
| 81 | - for text, source_lang, target_lang, prompt_type in test_texts: | |
| 82 | - if prompt_type == "product_title": | |
| 83 | - if target_lang == "zh": | |
| 84 | - prompt = config.query_config.translation_prompts.get('product_title_zh') | |
| 85 | - else: | |
| 86 | - prompt = config.query_config.translation_prompts.get('product_title_en') | |
| 87 | - else: | |
| 88 | - if target_lang == "zh": | |
| 89 | - prompt = config.query_config.translation_prompts.get('default_zh') | |
| 90 | - else: | |
| 91 | - prompt = config.query_config.translation_prompts.get('default_en') | |
| 92 | - | |
| 79 | + for text, source_lang, target_lang, scene in test_texts: | |
| 93 | 80 | print(f"\nTranslation test:") |
| 94 | 81 | print(f" Original text ({source_lang}): {text}") |
| 95 | 82 | print(f" Target language: {target_lang}") |
| 96 | - print(f" Prompt: {prompt[:50] if prompt else 'None'}...") | |
| 83 | + print(f" Scene: {scene}") | |
| 97 | 84 | |
| 98 | 85 | result = translator.translate( |
| 99 | 86 | text, |
| 100 | 87 | target_lang=target_lang, |
| 101 | 88 | source_lang=source_lang, |
| 102 | - prompt=prompt | |
| 89 | + context=scene, | |
| 103 | 90 | ) |
| 104 | 91 | |
| 105 | 92 | if result: |
| ... | ... | @@ -131,43 +118,25 @@ def test_translator_async(config, translator): |
| 131 | 118 | query_text = "手机" |
| 132 | 119 | target_langs = ['en'] |
| 133 | 120 | source_lang = 'zh' |
| 134 | - | |
| 135 | - query_prompt = config.query_config.translation_prompts.get('query_zh') | |
| 136 | - | |
| 121 | + | |
| 137 | 122 | print(f"Query text: {query_text}") |
| 138 | 123 | print(f"Target languages: {target_langs}") |
| 139 | - print(f"Prompt: {query_prompt}") | |
| 140 | - | |
| 141 | - # 异步模式(立即返回,后台翻译) | |
| 142 | - results = translator.translate_multi( | |
| 143 | - query_text, | |
| 144 | - target_langs, | |
| 145 | - source_lang=source_lang, | |
| 146 | - context=config.query_config.translation_context, | |
| 147 | - async_mode=True, | |
| 148 | - prompt=query_prompt | |
| 149 | - ) | |
| 150 | - | |
| 151 | - print(f"\nAsynchronous translation results:") | |
| 152 | - for lang, translation in results.items(): | |
| 153 | - if translation: | |
| 154 | - print(f" {lang}: {translation} (cache hit)") | |
| 155 | - else: | |
| 156 | - print(f" {lang}: None (translating in background...)") | |
| 157 | - | |
| 158 | - # 同步模式(等待完成) | |
| 159 | - print(f"\nSynchronous translation (waiting for completion):") | |
| 160 | - results_sync = translator.translate_multi( | |
| 161 | - query_text, | |
| 162 | - target_langs, | |
| 163 | - source_lang=source_lang, | |
| 164 | - context=config.query_config.translation_context, | |
| 165 | - async_mode=False, | |
| 166 | - prompt=query_prompt | |
| 167 | - ) | |
| 124 | + print("Scene: ecommerce_search_query") | |
| 168 | 125 | |
| 169 | - for lang, translation in results_sync.items(): | |
| 170 | - print(f" {lang}: {translation}") | |
| 126 | + print(f"\nConcurrent translation via generic translate():") | |
| 127 | + with ThreadPoolExecutor(max_workers=len(target_langs)) as executor: | |
| 128 | + futures = { | |
| 129 | + lang: executor.submit( | |
| 130 | + translator.translate, | |
| 131 | + query_text, | |
| 132 | + lang, | |
| 133 | + source_lang, | |
| 134 | + "ecommerce_search_query", | |
| 135 | + ) | |
| 136 | + for lang in target_langs | |
| 137 | + } | |
| 138 | + for lang, future in futures.items(): | |
| 139 | + print(f" {lang}: {future.result()}") | |
| 171 | 140 | |
| 172 | 141 | except Exception as e: |
| 173 | 142 | print(f"✗ Asynchronous translation test failed: {e}") |
| ... | ... | @@ -193,14 +162,13 @@ def test_cache(): |
| 193 | 162 | test_text = "测试文本" |
| 194 | 163 | target_lang = "en" |
| 195 | 164 | source_lang = "zh" |
| 196 | - prompt = config.query_config.translation_prompts.get('default_zh') | |
| 197 | 165 | |
| 198 | 166 | print(f"First translation (should call API or return mock):") |
| 199 | - result1 = translator.translate(test_text, target_lang, source_lang, prompt=prompt) | |
| 167 | + result1 = translator.translate(test_text, target_lang, source_lang, context="default") | |
| 200 | 168 | print(f" Result: {result1}") |
| 201 | 169 | |
| 202 | 170 | print(f"\nSecond translation (should use cache):") |
| 203 | - result2 = translator.translate(test_text, target_lang, source_lang, prompt=prompt) | |
| 171 | + result2 = translator.translate(test_text, target_lang, source_lang, context="default") | |
| 204 | 172 | print(f" Result: {result2}") |
| 205 | 173 | |
| 206 | 174 | if result1 == result2: |
| ... | ... | @@ -231,17 +199,16 @@ def test_context_parameter(): |
| 231 | 199 | |
| 232 | 200 | # 测试带context和不带context的翻译 |
| 233 | 201 | text = "手机" |
| 234 | - prompt = config.query_config.translation_prompts.get('query_zh') | |
| 235 | 202 | |
| 236 | 203 | print(f"Test text: {text}") |
| 237 | - print(f"Prompt (as context): {prompt}") | |
| 204 | + print("Scene: ecommerce_search_query") | |
| 238 | 205 | |
| 239 | 206 | # 带context的翻译 |
| 240 | 207 | result_with_context = translator.translate( |
| 241 | 208 | text, |
| 242 | 209 | target_lang='en', |
| 243 | 210 | source_lang='zh', |
| 244 | - prompt=prompt | |
| 211 | + context="ecommerce_search_query", | |
| 245 | 212 | ) |
| 246 | 213 | print(f"\nTranslation result with context: {result_with_context}") |
| 247 | 214 | ... | ... |
query/translator.py deleted
| ... | ... | @@ -1,963 +0,0 @@ |
| 1 | -""" | |
| 2 | -Translation service for multi-language query support. | |
| 3 | - | |
| 4 | -Supports multiple translation models: | |
| 5 | -- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model | |
| 6 | -- DeepL: DeepL API for high-quality translations | |
| 7 | - | |
| 8 | -重要说明(Qwen 机翻限速): | |
| 9 | -- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)** | |
| 10 | -- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流 | |
| 11 | -- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端 | |
| 12 | - | |
| 13 | -使用方法 (Usage): | |
| 14 | - | |
| 15 | -```python | |
| 16 | -from query.qwen_mt_translate import Translator | |
| 17 | - | |
| 18 | -# 使用默认的 qwen 模型(推荐) | |
| 19 | -translator = Translator() # 默认使用 qwen 模型 | |
| 20 | - | |
| 21 | -# 或显式指定模型 | |
| 22 | -translator = Translator(model='qwen') # 使用 qwen 模型 | |
| 23 | -translator = Translator(model='deepl') # 使用 DeepL 模型 | |
| 24 | - | |
| 25 | -# 翻译文本 | |
| 26 | -result = translator.translate( | |
| 27 | - text="我看到这个视频后没有笑", | |
| 28 | - target_lang="en", | |
| 29 | - source_lang="auto" # 自动检测源语言 | |
| 30 | -) | |
| 31 | -``` | |
| 32 | - | |
| 33 | -配置说明 (Configuration): | |
| 34 | -- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中) | |
| 35 | -- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中) | |
| 36 | - | |
| 37 | -Qwen 模型参考文档: | |
| 38 | -- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key | |
| 39 | -- 模型:qwen-mt-flash(快速翻译模型) | |
| 40 | - | |
| 41 | -DeepL 官方文档: | |
| 42 | -https://developers.deepl.com/api-reference/translate/request-translation | |
| 43 | -""" | |
| 44 | - | |
| 45 | -import os | |
| 46 | -import requests | |
| 47 | -import re | |
| 48 | -import redis | |
| 49 | -from concurrent.futures import ThreadPoolExecutor, Future | |
| 50 | -from datetime import timedelta | |
| 51 | -from typing import Dict, List, Optional, Union | |
| 52 | -import logging | |
| 53 | -import time | |
| 54 | - | |
| 55 | -logger = logging.getLogger(__name__) | |
| 56 | - | |
| 57 | -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG | |
| 58 | -from openai import OpenAI | |
| 59 | - | |
| 60 | - | |
| 61 | -class Translator: | |
| 62 | - """ | |
| 63 | - Multi-language translator supporting Qwen and DeepL APIs. | |
| 64 | - | |
| 65 | - Default model is 'qwen' which uses Alibaba Cloud DashScope API. | |
| 66 | - """ | |
| 67 | -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 | |
| 68 | -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | |
| 69 | -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 | |
| 70 | - | |
| 71 | - DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier | |
| 72 | - QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" # 美国(弗吉尼亚)地域 | 
| 73 | - # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡 | |
| 74 | - # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | |
| 75 | - QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型 | |
| 76 | - | |
| 77 | - # Language code mapping | |
| 78 | - LANG_CODE_MAP = { | |
| 79 | - 'zh': 'ZH', | |
| 80 | - 'en': 'EN', | |
| 81 | - 'ru': 'RU', | |
| 82 | - 'ar': 'AR', | |
| 83 | - 'ja': 'JA', | |
| 84 | - 'es': 'ES', | |
| 85 | - 'de': 'DE', | |
| 86 | - 'fr': 'FR', | |
| 87 | - 'it': 'IT', | |
| 88 | - 'pt': 'PT', | |
| 89 | - } | |
| 90 | - | |
| 91 | - def __init__( | |
| 92 | - self, | |
| 93 | - model: str = "qwen", | |
| 94 | - api_key: Optional[str] = None, | |
| 95 | - use_cache: bool = True, | |
| 96 | - timeout: int = 10, | |
| 97 | - glossary_id: Optional[str] = None, | |
| 98 | - translation_context: Optional[str] = None | |
| 99 | - ): | |
| 100 | - """ | |
| 101 | - Initialize translator. | |
| 102 | - | |
| 103 | - Args: | |
| 104 | - model: Translation model to use. Options: 'qwen' (default) or 'deepl' | |
| 105 | - api_key: API key for the selected model (or None to use from config/env) | |
| 106 | - use_cache: Whether to cache translations | |
| 107 | - timeout: Request timeout in seconds | |
| 108 | - glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL) | |
| 109 | - translation_context: Context hint for translation (e.g., "e-commerce", "product search") | |
| 110 | - """ | |
| 111 | - self.model = model.lower() | |
| 112 | - if self.model not in ['qwen', 'deepl']: | |
| 113 | - raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'") | |
| 114 | - | |
| 115 | - # Get API key from config if not provided | |
| 116 | - if api_key is None: | |
| 117 | - if self.model == 'qwen': | |
| 118 | - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | |
| 119 | - else: # deepl | |
| 120 | - api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY") | |
| 121 | - | |
| 122 | - self.api_key = api_key | |
| 123 | - self.timeout = timeout | |
| 124 | - self.use_cache = use_cache | |
| 125 | - self.glossary_id = glossary_id | |
| 126 | - self.translation_context = translation_context or "e-commerce product search" | |
| 127 | - | |
| 128 | - # Initialize OpenAI client for Qwen if needed | |
| 129 | - self.qwen_client = None | |
| 130 | - if self.model == 'qwen': | |
| 131 | - if not self.api_key: | |
| 132 | - logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.") | |
| 133 | - else: | |
| 134 | - self.qwen_client = OpenAI( | |
| 135 | - api_key=self.api_key, | |
| 136 | - base_url=self.QWEN_BASE_URL, | |
| 137 | - ) | |
| 138 | - | |
| 139 | - # Initialize Redis cache if enabled | |
| 140 | - if use_cache: | |
| 141 | - try: | |
| 142 | - self.redis_client = redis.Redis( | |
| 143 | - host=REDIS_CONFIG.get('host', 'localhost'), | |
| 144 | - port=REDIS_CONFIG.get('port', 6479), | |
| 145 | - password=REDIS_CONFIG.get('password'), | |
| 146 | - decode_responses=True, # Return str instead of bytes | |
| 147 | - socket_timeout=REDIS_CONFIG.get('socket_timeout', 1), | |
| 148 | - socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1), | |
| 149 | - retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False), | |
| 150 | - health_check_interval=10, # 避免复用坏连接 | |
| 151 | - ) | |
| 152 | - # Test connection | |
| 153 | - self.redis_client.ping() | |
| 154 | - expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360) | |
| 155 | - self.expire_time = timedelta(days=expire_days) | |
| 156 | - self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数 | |
| 157 | - self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans') | |
| 158 | - logger.info("Redis cache initialized for translations") | |
| 159 | - except Exception as e: | |
| 160 | - logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache") | |
| 161 | - self.redis_client = None | |
| 162 | - self.cache = None | |
| 163 | - else: | |
| 164 | - self.redis_client = None | |
| 165 | - self.cache = None | |
| 166 | - | |
| 167 | - # Thread pool for async translation | |
| 168 | - self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") | |
| 169 | - | |
| 170 | - def translate( | |
| 171 | - self, | |
| 172 | - text: str, | |
| 173 | - target_lang: str, | |
| 174 | - source_lang: Optional[str] = None, | |
| 175 | - context: Optional[str] = None, | |
| 176 | - prompt: Optional[str] = None | |
| 177 | - ) -> Optional[str]: | |
| 178 | - """ | |
| 179 | - Translate text to target language (synchronous mode). | |
| 180 | - | |
| 181 | - Args: | |
| 182 | - text: Text to translate | |
| 183 | - target_lang: Target language code ('zh', 'en', 'ru', etc.) | |
| 184 | - source_lang: Source language code (optional, auto-detect if None) | 
| 185 | - context: Additional context for translation (overrides default context) | |
| 186 | - prompt: Translation prompt/instruction (optional, for better translation quality) | |
| 187 | - | |
| 188 | - Returns: | |
| 189 | - Translated text or None if translation fails | |
| 190 | - """ | |
| 191 | - if not text or not text.strip(): | |
| 192 | - return text | |
| 193 | - | |
| 194 | - # Normalize language codes | |
| 195 | - target_lang = target_lang.lower() | |
| 196 | - if source_lang: | |
| 197 | - source_lang = source_lang.lower() | |
| 198 | - | |
| 199 | - # Optimization: Skip translation if not needed | |
| 200 | - if target_lang == 'en' and self._is_english_text(text): | |
| 201 | - logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") | |
| 202 | - return text | |
| 203 | - | |
| 204 | - if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | |
| 205 | - logger.info( | |
| 206 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 207 | - f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)" | |
| 208 | - ) | |
| 209 | - return text | |
| 210 | - | |
| 211 | - # Use provided context or default context | |
| 212 | - translation_context = context or self.translation_context | |
| 213 | - | |
| 214 | - # Build cache key (include prompt in cache key if provided) | |
| 215 | - cache_key_parts = [source_lang or 'auto', target_lang, translation_context] | |
| 216 | - if prompt: | |
| 217 | - cache_key_parts.append(prompt) | |
| 218 | - cache_key_parts.append(text) | |
| 219 | - cache_key = ':'.join(cache_key_parts) | |
| 220 | - | |
| 221 | - # Check cache (include context and prompt in cache key for accuracy) | |
| 222 | - if self.use_cache and self.redis_client: | |
| 223 | - cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt) | |
| 224 | - if cached: | |
| 225 | - logger.info( | |
| 226 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 227 | - f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit" | |
| 228 | - ) | |
| 229 | - return cached | |
| 230 | - | |
| 231 | - # If no API key, return mock translation (for testing) | |
| 232 | - if not self.api_key: | |
| 233 | - logger.info( | |
| 234 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 235 | - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)" | |
| 236 | - ) | |
| 237 | - return text | |
| 238 | - | |
| 239 | - # Translate using selected model | |
| 240 | - logger.info( | |
| 241 | - f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | " | |
| 242 | - f"Source language: {source_lang or 'auto'} | Context: {translation_context} | " | |
| 243 | - f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation" | |
| 244 | - ) | |
| 245 | - | |
| 246 | - if self.model == 'qwen': | |
| 247 | - result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt) | |
| 248 | - else: # deepl | |
| 249 | - result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) | |
| 250 | - | |
| 251 | - # Surface translation failure to the caller instead of silently | |
| 252 | - # masquerading the source text as a successful translation. | |
| 253 | - if result is None: | |
| 254 | - logger.warning( | |
| 255 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 256 | - f"Source language: {source_lang or 'auto'} | Status: Translation failed" | |
| 257 | - ) | |
| 258 | - else: | |
| 259 | - logger.info( | |
| 260 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 261 | - f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" | |
| 262 | - ) | |
| 263 | - | |
| 264 | - # Cache only successful translations. Failed attempts must not poison | |
| 265 | - # Redis with the original text. | |
| 266 | - if result is not None and self.use_cache and self.redis_client: | |
| 267 | - self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) | |
| 268 | - | |
| 269 | - return result | |
| 270 | - | |
| 271 | - def _translate_qwen( | |
| 272 | - self, | |
| 273 | - text: str, | |
| 274 | - target_lang: str, | |
| 275 | - source_lang: Optional[str], | |
| 276 | - context: Optional[str] = None, | |
| 277 | - prompt: Optional[str] = None | |
| 278 | - ) -> Optional[str]: | |
| 279 | - """ | |
| 280 | - Translate using Qwen MT Flash model via Alibaba Cloud DashScope API. | |
| 281 | - | |
| 282 | - Args: | |
| 283 | - text: Text to translate | |
| 284 | - target_lang: Target language code ('zh', 'en', 'ru', etc.) | |
| 285 | - source_lang: Source language code (optional, 'auto' if None) | |
| 286 | - context: Context hint for translation (optional) | |
| 287 | - prompt: Translation prompt/instruction (optional) | |
| 288 | - | |
| 289 | - Returns: | |
| 290 | - Translated text or None if translation fails | |
| 291 | - """ | |
| 292 | - if not self.qwen_client: | |
| 293 | - logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.") | |
| 294 | - return None | |
| 295 | - | |
| 296 | - # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping | |
| 297 | - # 标准来自:你提供的“语言 / 英文名 / 代码”表 | |
| 298 | - qwen_lang_map = { | |
| 299 | - "en": "English", | |
| 300 | - "zh": "Chinese", | |
| 301 | - "zh_tw": "Traditional Chinese", | |
| 302 | - "ru": "Russian", | |
| 303 | - "ja": "Japanese", | |
| 304 | - "ko": "Korean", | |
| 305 | - "es": "Spanish", | |
| 306 | - "fr": "French", | |
| 307 | - "pt": "Portuguese", | |
| 308 | - "de": "German", | |
| 309 | - "it": "Italian", | |
| 310 | - "th": "Thai", | |
| 311 | - "vi": "Vietnamese", | |
| 312 | - "id": "Indonesian", | |
| 313 | - "ms": "Malay", | |
| 314 | - "ar": "Arabic", | |
| 315 | - "hi": "Hindi", | |
| 316 | - "he": "Hebrew", | |
| 317 | - "my": "Burmese", | |
| 318 | - "ta": "Tamil", | |
| 319 | - "ur": "Urdu", | |
| 320 | - "bn": "Bengali", | |
| 321 | - "pl": "Polish", | |
| 322 | - "nl": "Dutch", | |
| 323 | - "ro": "Romanian", | |
| 324 | - "tr": "Turkish", | |
| 325 | - "km": "Khmer", | |
| 326 | - "lo": "Lao", | |
| 327 | - "yue": "Cantonese", | |
| 328 | - "cs": "Czech", | |
| 329 | - "el": "Greek", | |
| 330 | - "sv": "Swedish", | |
| 331 | - "hu": "Hungarian", | |
| 332 | - "da": "Danish", | |
| 333 | - "fi": "Finnish", | |
| 334 | - "uk": "Ukrainian", | |
| 335 | - "bg": "Bulgarian", | |
| 336 | - } | |
| 337 | - | |
| 338 | - # Convert target language | |
| 339 | - target_lang_normalized = target_lang.lower() | |
| 340 | - target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize()) | |
| 341 | - | |
| 342 | - # Convert source language | |
| 343 | - source_lang_normalized = (source_lang or "").strip().lower() | |
| 344 | - if not source_lang_normalized or source_lang_normalized == "auto": | |
| 345 | - source_lang_qwen = "auto" | |
| 346 | - else: | |
| 347 | - source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize()) | |
| 348 | - | |
| 349 | - # Prepare translation options | |
| 350 | - translation_options = { | |
| 351 | - "source_lang": source_lang_qwen, | |
| 352 | - "target_lang": target_lang_qwen, | |
| 353 | - } | |
| 354 | - | |
| 355 | - # Prepare messages | |
| 356 | - messages = [ | |
| 357 | - { | |
| 358 | - "role": "user", | |
| 359 | - "content": text | |
| 360 | - } | |
| 361 | - ] | |
| 362 | - | |
| 363 | - start_time = time.time() | |
| 364 | - try: | |
| 365 | - completion = self.qwen_client.chat.completions.create( | |
| 366 | - model=self.QWEN_MODEL, | |
| 367 | - messages=messages, | |
| 368 | - extra_body={ | |
| 369 | - "translation_options": translation_options | |
| 370 | - } | |
| 371 | - ) | |
| 372 | - | |
| 373 | - translated_text = completion.choices[0].message.content.strip() | |
| 374 | - duration_ms = (time.time() - start_time) * 1000 | |
| 375 | - | |
| 376 | - logger.info( | |
| 377 | - f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | " | |
| 378 | - f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms" | |
| 379 | - ) | |
| 380 | - return translated_text | |
| 381 | - | |
| 382 | - except Exception as e: | |
| 383 | - duration_ms = (time.time() - start_time) * 1000 | |
| 384 | - logger.error( | |
| 385 | - f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | " | |
| 386 | - f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True | |
| 387 | - ) | |
| 388 | - return None | |
| 389 | - | |
| 390 | - def _translate_deepl( | |
| 391 | - self, | |
| 392 | - text: str, | |
| 393 | - target_lang: str, | |
| 394 | - source_lang: Optional[str], | |
| 395 | - context: Optional[str] = None, | |
| 396 | - prompt: Optional[str] = None | |
| 397 | - ) -> Optional[str]: | |
| 398 | - """ | |
| 399 | - Translate using DeepL API with context and glossary support. | |
| 400 | - | |
| 401 | - Args: | |
| 402 | - text: Text to translate | |
| 403 | - target_lang: Target language code | |
| 404 | - source_lang: Source language code (optional) | |
| 405 | - context: Context hint for translation (e.g., "e-commerce product search") | |
| 406 | - """ | |
| 407 | - # Map to DeepL language codes | |
| 408 | - target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper()) | |
| 409 | - | |
| 410 | - headers = { | |
| 411 | - "Authorization": f"DeepL-Auth-Key {self.api_key}", | |
| 412 | - "Content-Type": "application/json", | |
| 413 | - } | |
| 414 | - | |
| 415 | - # Use prompt as context parameter for DeepL API (not as text prefix) | |
| 416 | - # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself" | |
| 417 | - # If prompt is provided, use it as context; otherwise use the default context | |
| 418 | - api_context = prompt if prompt else context | |
| 419 | - | |
| 420 | - # For e-commerce, add context words to help DeepL understand the domain | |
| 421 | - # This is especially important for single-word ambiguous terms like "车" (car vs rook) | |
| 422 | - text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context) | |
| 423 | - | |
| 424 | - payload = { | |
| 425 | - "text": [text_to_translate], | |
| 426 | - "target_lang": target_code, | |
| 427 | - } | |
| 428 | - | |
| 429 | - if source_lang: | |
| 430 | - source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) | |
| 431 | - payload["source_lang"] = source_code | |
| 432 | - | |
| 433 | - # Add context parameter (prompt or default context) | |
| 434 | - # Context influences translation but is not translated itself | |
| 435 | - if api_context: | |
| 436 | - payload["context"] = api_context | |
| 437 | - | |
| 438 | - # Add glossary if configured | |
| 439 | - if self.glossary_id: | |
| 440 | - payload["glossary_id"] = self.glossary_id | |
| 441 | - | |
| 442 | - # Note: DeepL API v2 supports "context" parameter for additional context | |
| 443 | - # that influences translation but is not translated itself. | |
| 444 | - # We use prompt as context parameter when provided. | |
| 445 | - | |
| 446 | - try: | |
| 447 | - response = requests.post( | |
| 448 | - self.DEEPL_API_URL, | |
| 449 | - headers=headers, | |
| 450 | - json=payload, | |
| 451 | - timeout=self.timeout | |
| 452 | - ) | |
| 453 | - | |
| 454 | - if response.status_code == 200: | |
| 455 | - data = response.json() | |
| 456 | - if "translations" in data and len(data["translations"]) > 0: | |
| 457 | - translated_text = data["translations"][0]["text"] | |
| 458 | - # If we added context, extract just the term from the result | |
| 459 | - if needs_extraction: | |
| 460 | - translated_text = self._extract_term_from_translation( | |
| 461 | - translated_text, text, target_code | |
| 462 | - ) | |
| 463 | - logger.debug( | |
| 464 | - f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | " | |
| 465 | - f"Translation result: '{translated_text}'" | |
| 466 | - ) | |
| 467 | - return translated_text | |
| 468 | - else: | |
| 469 | - logger.error( | |
| 470 | - f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | " | |
| 471 | - f"Status code: {response.status_code} | Error message: {response.text}" | |
| 472 | - ) | |
| 473 | - return None | |
| 474 | - | |
| 475 | - except requests.Timeout: | |
| 476 | - logger.warning( | |
| 477 | - f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | " | |
| 478 | - f"Timeout: {self.timeout}s" | |
| 479 | - ) | |
| 480 | - return None | |
| 481 | - except Exception as e: | |
| 482 | - logger.error( | |
| 483 | - f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | " | |
| 484 | - f"Error: {e}", exc_info=True | |
| 485 | - ) | |
| 486 | - return None | |
| 487 | - | |
| 488 | - # NOTE: _translate_deepl_free is intentionally not implemented. | |
| 489 | - # We do not support automatic fallback to the free endpoint, to avoid | |
| 490 | - # mixing Pro keys with https://api-free.deepl.com and related 403 errors. | |
| 491 | - | |
| 492 | - def translate_multi( | |
| 493 | - self, | |
| 494 | - text: str, | |
| 495 | - target_langs: List[str], | |
| 496 | - source_lang: Optional[str] = None, | |
| 497 | - context: Optional[str] = None, | |
| 498 | - async_mode: bool = True, | |
| 499 | - prompt: Optional[str] = None | |
| 500 | - ) -> Dict[str, Optional[str]]: | |
| 501 | - """ | |
| 502 | - Translate text to multiple target languages. | |
| 503 | - | |
| 504 | - In async_mode=True (default): | |
| 505 | - - Returns cached translations immediately if available | |
| 506 | - - For translations that can be optimized (e.g., pure numbers, already in target language), | |
| 507 | - returns result immediately via synchronous call | |
| 508 | - - Launches async tasks for other missing translations (non-blocking) | |
| 509 | - - Returns None for missing translations that require async processing | |
| 510 | - | |
| 511 | - In async_mode=False: | |
| 512 | - - Waits for all translations to complete (blocking) | |
| 513 | - | |
| 514 | - Args: | |
| 515 | - text: Text to translate | |
| 516 | - target_langs: List of target language codes | |
| 517 | - source_lang: Source language code (optional) | |
| 518 | - context: Context hint for translation (optional) | |
| 519 | - async_mode: If True, return cached results immediately and translate missing ones async | |
| 520 | - prompt: Translation prompt/instruction (optional) | |
| 521 | - | |
| 522 | - Returns: | |
| 523 | - Dictionary mapping language code to translated text (only cached results in async mode) | |
| 524 | - """ | |
| 525 | - results = {} | |
| 526 | - missing_langs = [] | |
| 527 | - async_langs = [] | |
| 528 | - | |
| 529 | - # First, get cached translations | |
| 530 | - for lang in target_langs: | |
| 531 | - cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | |
| 532 | - if cached is not None: | |
| 533 | - results[lang] = cached | |
| 534 | - else: | |
| 535 | - missing_langs.append(lang) | |
| 536 | - | |
| 537 | - # If async mode and there are missing translations | |
| 538 | - if async_mode and missing_langs: | |
| 539 | - # Check if translation can be optimized (immediate return) | |
| 540 | - for lang in missing_langs: | |
| 541 | - target_lang = lang.lower() | |
| 542 | - # Check optimization conditions (same as in translate method) | |
| 543 | - can_optimize = False | |
| 544 | - if target_lang == 'en' and self._is_english_text(text): | |
| 545 | - can_optimize = True | |
| 546 | - elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | |
| 547 | - can_optimize = True | |
| 548 | - | |
| 549 | - if can_optimize: | |
| 550 | - # Can be optimized, call translate synchronously for immediate result | |
| 551 | - results[lang] = self.translate(text, lang, source_lang, context, prompt) | |
| 552 | - else: | |
| 553 | - # Requires actual translation, add to async list | |
| 554 | - async_langs.append(lang) | |
| 555 | - | |
| 556 | - # Launch async tasks for translations that require actual API calls | |
| 557 | - if async_langs: | |
| 558 | - for lang in async_langs: | |
| 559 | - self._translate_async(text, lang, source_lang, context, prompt) | |
| 560 | - # Return None for async translations | |
| 561 | - for lang in async_langs: | |
| 562 | - results[lang] = None | |
| 563 | - else: | |
| 564 | - # Synchronous mode: wait for all translations | |
| 565 | - for lang in missing_langs: | |
| 566 | - results[lang] = self.translate(text, lang, source_lang, context, prompt) | |
| 567 | - | |
| 568 | - return results | |
| 569 | - | |
| 570 | - def translate_multi_async( | |
| 571 | - self, | |
| 572 | - text: str, | |
| 573 | - target_langs: List[str], | |
| 574 | - source_lang: Optional[str] = None, | |
| 575 | - context: Optional[str] = None, | |
| 576 | - prompt: Optional[str] = None | |
| 577 | - ) -> Dict[str, Union[str, Future]]: | |
| 578 | - """ | |
| 579 | - Translate text to multiple target languages asynchronously, returning Futures that can be awaited. | |
| 580 | - | |
| 581 | - This method returns a dictionary where: | |
| 582 | - - If translation is cached, the value is the translation string (immediate) | |
| 583 | - - If translation needs to be done, the value is a Future object that can be awaited | |
| 584 | - | |
| 585 | - Args: | |
| 586 | - text: Text to translate | |
| 587 | - target_langs: List of target language codes | |
| 588 | - source_lang: Source language code (optional) | |
| 589 | - context: Context hint for translation (optional) | |
| 590 | - prompt: Translation prompt/instruction (optional) | |
| 591 | - | |
| 592 | - Returns: | |
| 593 | - Dictionary mapping language code to either translation string (cached) or Future object | |
| 594 | - """ | |
| 595 | - results = {} | |
| 596 | - missing_langs = [] | |
| 597 | - | |
| 598 | - # First, get cached translations | |
| 599 | - for lang in target_langs: | |
| 600 | - cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | |
| 601 | - if cached is not None: | |
| 602 | - results[lang] = cached | |
| 603 | - else: | |
| 604 | - missing_langs.append(lang) | |
| 605 | - | |
| 606 | - # For missing translations, submit async tasks and return Futures | |
| 607 | - for lang in missing_langs: | |
| 608 | - future = self.executor.submit( | |
| 609 | - self.translate, | |
| 610 | - text, | |
| 611 | - lang, | |
| 612 | - source_lang, | |
| 613 | - context, | |
| 614 | - prompt | |
| 615 | - ) | |
| 616 | - results[lang] = future | |
| 617 | - | |
| 618 | - return results | |
| 619 | - | |
| 620 | - def _get_cached_translation( | |
| 621 | - self, | |
| 622 | - text: str, | |
| 623 | - target_lang: str, | |
| 624 | - source_lang: Optional[str] = None, | |
| 625 | - context: Optional[str] = None, | |
| 626 | - prompt: Optional[str] = None | |
| 627 | - ) -> Optional[str]: | |
| 628 | - """Get translation from cache if available.""" | |
| 629 | - if not self.redis_client: | |
| 630 | - return None | |
| 631 | - return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | |
| 632 | - | |
| 633 | - def _get_cached_translation_redis( | |
| 634 | - self, | |
| 635 | - text: str, | |
| 636 | - target_lang: str, | |
| 637 | - source_lang: Optional[str] = None, | |
| 638 | - context: Optional[str] = None, | |
| 639 | - prompt: Optional[str] = None | |
| 640 | - ) -> Optional[str]: | |
| 641 | - """ | |
| 642 | - Get translation from Redis cache with sliding expiration. | |
| 643 | - | |
| 644 | - 滑动过期机制:每次访问缓存时,重置过期时间为配置的过期时间(默认360天)。 | 
| 645 | - 这样缓存会在最后一次访问后的360天才过期,而不是写入后的360天。 | 
| 646 | - 这确保了常用的翻译缓存不会被过早删除。 | |
| 647 | - """ | |
| 648 | - if not self.redis_client: | |
| 649 | - return None | |
| 650 | - | |
| 651 | - try: | |
| 652 | - # Build cache key: prefix:target_lang:text | |
| 653 | - # For simplicity, we use target_lang and text as key | |
| 654 | - # Context and prompt are not included in key to maximize cache hits | |
| 655 | - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" | |
| 656 | - value = self.redis_client.get(cache_key) | |
| 657 | - if value: | |
| 658 | - # Sliding expiration: reset expiration time on access | |
| 659 | - # 每次读取缓存时,重置过期时间为配置的过期时间(最后一次访问后的N天才过期) | |
| 660 | - try: | |
| 661 | - self.redis_client.expire(cache_key, self.expire_seconds) | |
| 662 | - except Exception as expire_error: | |
| 663 | - # 即使 expire 失败,也返回缓存值(不影响功能) | |
| 664 | - logger.warning( | |
| 665 | - f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}" | |
| 666 | - ) | |
| 667 | - | |
| 668 | - logger.debug( | |
| 669 | - f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | " | |
| 670 | - f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s" | |
| 671 | - ) | |
| 672 | - return value | |
| 673 | - logger.debug( | |
| 674 | - f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | " | |
| 675 | - f"Cache key: {cache_key}" | |
| 676 | - ) | |
| 677 | - return None | |
| 678 | - except Exception as e: | |
| 679 | - logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}") | |
| 680 | - return None | |
| 681 | - | |
| 682 | - def _set_cached_translation_redis( | |
| 683 | - self, | |
| 684 | - text: str, | |
| 685 | - target_lang: str, | |
| 686 | - translation: str, | |
| 687 | - source_lang: Optional[str] = None, | |
| 688 | - context: Optional[str] = None, | |
| 689 | - prompt: Optional[str] = None | |
| 690 | - ) -> None: | |
| 691 | - """Store translation in Redis cache.""" | |
| 692 | - if not self.redis_client: | |
| 693 | - return | |
| 694 | - | |
| 695 | - try: | |
| 696 | - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" | |
| 697 | - self.redis_client.setex(cache_key, self.expire_seconds, translation) | |
| 698 | - logger.info( | |
| 699 | - f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | " | |
| 700 | - f"Cache key: {cache_key} | Translation result: '{translation}'" | |
| 701 | - ) | |
| 702 | - except Exception as e: | |
| 703 | - logger.error( | |
| 704 | - f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | " | |
| 705 | - f"Error: {e}" | |
| 706 | - ) | |
| 707 | - | |
| 708 | - def _translate_async( | |
| 709 | - self, | |
| 710 | - text: str, | |
| 711 | - target_lang: str, | |
| 712 | - source_lang: Optional[str] = None, | |
| 713 | - context: Optional[str] = None, | |
| 714 | - prompt: Optional[str] = None | |
| 715 | - ): | |
| 716 | - """Launch async translation task.""" | |
| 717 | - def _do_translate(): | |
| 718 | - try: | |
| 719 | - result = self.translate(text, target_lang, source_lang, context, prompt) | |
| 720 | - if result: | |
| 721 | - logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}") | |
| 722 | - except Exception as e: | |
| 723 | - logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}") | |
| 724 | - | |
| 725 | - self.executor.submit(_do_translate) | |
| 726 | - | |
| 727 | - def _add_ecommerce_context( | |
| 728 | - self, | |
| 729 | - text: str, | |
| 730 | - source_lang: Optional[str], | |
| 731 | - context: Optional[str] | |
| 732 | - ) -> tuple: | |
| 733 | - """ | |
| 734 | - Add e-commerce context to text for better disambiguation. | |
| 735 | - | |
| 736 | - For single-word ambiguous Chinese terms, we add context words that help | |
| 737 | - DeepL understand this is an e-commerce/product search context. | |
| 738 | - | |
| 739 | - Args: | |
| 740 | - text: Original text to translate | |
| 741 | - source_lang: Source language code | |
| 742 | - context: Context hint | |
| 743 | - | |
| 744 | - Returns: | |
| 745 | - Tuple of (text_with_context, needs_extraction) | |
| 746 | - - text_with_context: Text to send to DeepL | |
| 747 | - - needs_extraction: Whether we need to extract the term from the result | |
| 748 | - """ | |
| 749 | - # Only apply for e-commerce context and Chinese source | |
| 750 | - if not context or "e-commerce" not in context.lower(): | |
| 751 | - return text, False | |
| 752 | - | |
| 753 | - if not source_lang or source_lang.lower() != 'zh': | |
| 754 | - return text, False | |
| 755 | - | |
| 756 | - # For single-word queries, add context to help disambiguation | |
| 757 | - text_stripped = text.strip() | |
| 758 | - if len(text_stripped.split()) == 1 and len(text_stripped) <= 2: | |
| 759 | - # Common ambiguous Chinese e-commerce terms like "车" (car vs rook) | |
| 760 | - # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term]) | |
| 761 | - # This helps DeepL understand the e-commerce context | |
| 762 | - # We'll need to extract just the term from the translation result | |
| 763 | - context_phrase = f"购买 {text_stripped}" | |
| 764 | - return context_phrase, True | |
| 765 | - | |
| 766 | - # For multi-word queries, DeepL usually has enough context | |
| 767 | - return text, False | |
| 768 | - | |
| 769 | - def _extract_term_from_translation( | |
| 770 | - self, | |
| 771 | - translated_text: str, | |
| 772 | - original_text: str, | |
| 773 | - target_lang_code: str | |
| 774 | - ) -> str: | |
| 775 | - """ | |
| 776 | - Extract the actual term from a translation that included context. | |
| 777 | - | |
| 778 | - For example, if we translated "购买 车" (buy car) and got "buy car", | |
| 779 | - we want to extract just "car". | |
| 780 | - | |
| 781 | - Args: | |
| 782 | - translated_text: Full translation result | |
| 783 | - original_text: Original single-word query | |
| 784 | - target_lang_code: Target language code (EN, ZH, etc.) | |
| 785 | - | |
| 786 | - Returns: | |
| 787 | - Extracted term or original translation if extraction fails | |
| 788 | - """ | |
| 789 | - # For English target, try to extract the last word (the actual term) | |
| 790 | - if target_lang_code == "EN": | |
| 791 | - words = translated_text.strip().split() | |
| 792 | - if len(words) > 1: | |
| 793 | - # Usually the last word is the term we want | |
| 794 | - # But we need to be smart - if it's "buy car", we want "car" | |
| 795 | - # Common context words to skip: buy, purchase, product, item, etc. | |
| 796 | - context_words = {"buy", "purchase", "product", "item", "commodity", "goods"} | |
| 797 | - # Try to find the term (not a context word) | |
| 798 | - for word in reversed(words): | |
| 799 | - word_lower = word.lower().rstrip('.,!?;:') | |
| 800 | - if word_lower not in context_words: | |
| 801 | - return word_lower | |
| 802 | - # If all words are context words, return the last one | |
| 803 | - return words[-1].lower().rstrip('.,!?;:') | |
| 804 | - | |
| 805 | - # For other languages or if extraction fails, return as-is | |
| 806 | - # The user can configure a glossary for better results | |
| 807 | - return translated_text | |
| 808 | - | |
| 809 | - def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool: | |
| 810 | - """True if shop language matches index language (use source, no translate).""" | |
| 811 | - if not shop_lang_lower or not lang_code: | |
| 812 | - return False | |
| 813 | - if shop_lang_lower == lang_code: | |
| 814 | - return True | |
| 815 | - if lang_code == "zh" and "zh" in shop_lang_lower: | |
| 816 | - return True | |
| 817 | - if lang_code == "en" and "en" in shop_lang_lower: | |
| 818 | - return True | |
| 819 | - return False | |
| 820 | - | |
| 821 | - def translate_for_indexing( | |
| 822 | - self, | |
| 823 | - text: str, | |
| 824 | - shop_language: str, | |
| 825 | - source_lang: Optional[str] = None, | |
| 826 | - context: Optional[str] = None, | |
| 827 | - prompt: Optional[str] = None, | |
| 828 | - index_languages: Optional[List[str]] = None, | |
| 829 | - ) -> Dict[str, Optional[str]]: | |
| 830 | - """ | |
| 831 | - Translate text for indexing based on shop language and tenant index_languages. | |
| 832 | - | |
| 833 | - For each language in index_languages: use source text if shop language matches, | |
| 834 | - otherwise translate to that language. | |
| 835 | - | |
| 836 | - Args: | |
| 837 | - text: Text to translate | |
| 838 | - shop_language: Shop primary language (e.g. 'zh', 'en', 'ru') | |
| 839 | - source_lang: Source language code (optional) | |
| 840 | - context: Additional context for translation (optional) | |
| 841 | - prompt: Translation prompt (optional) | |
| 842 | - index_languages: Languages to index (from tenant_config). Default ["en", "zh"]. | |
| 843 | - | |
| 844 | - Returns: | |
| 845 | - Dict keyed by each index_language with translated or source text (or None). | |
| 846 | - """ | |
| 847 | - langs = index_languages if index_languages else ["en", "zh"] | |
| 848 | - results = {lang: None for lang in langs} | |
| 849 | - if not text or not text.strip(): | |
| 850 | - return results | |
| 851 | - if re.match(r'^[\d\s_-]+$', text): | |
| 852 | - logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'") | |
| 853 | - return results | |
| 854 | - | |
| 855 | - shop_lang_lower = (shop_language or "").strip().lower() | |
| 856 | - targets = [] | |
| 857 | - for lang in langs: | |
| 858 | - if self._shop_lang_matches(shop_lang_lower, lang): | |
| 859 | - results[lang] = text | |
| 860 | - else: | |
| 861 | - targets.append(lang) | |
| 862 | - | |
| 863 | - for target_lang in targets: | |
| 864 | - cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | |
| 865 | - if cached: | |
| 866 | - results[target_lang] = cached | |
| 867 | - logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}") | |
| 868 | - continue | |
| 869 | - translated = self.translate( | |
| 870 | - text, | |
| 871 | - target_lang=target_lang, | |
| 872 | - source_lang=source_lang or shop_language, | |
| 873 | - context=context, | |
| 874 | - prompt=prompt, | |
| 875 | - ) | |
| 876 | - results[target_lang] = translated | |
| 877 | - return results | |
| 878 | - | |
| 879 | - def get_translation_needs( | |
| 880 | - self, | |
| 881 | - detected_lang: str, | |
| 882 | - supported_langs: List[str] | |
| 883 | - ) -> List[str]: | |
| 884 | - """ | |
| 885 | - Determine which languages need translation. | |
| 886 | - | |
| 887 | - Args: | |
| 888 | - detected_lang: Detected query language | |
| 889 | - supported_langs: List of supported languages | |
| 890 | - | |
| 891 | - Returns: | |
| 892 | - List of language codes to translate to | |
| 893 | - """ | |
| 894 | - # If detected language is in supported list, translate to others | |
| 895 | - if detected_lang in supported_langs: | |
| 896 | - return [lang for lang in supported_langs if detected_lang != lang] | |
| 897 | - | |
| 898 | - # Otherwise, translate to all supported languages | |
| 899 | - return supported_langs | |
| 900 | - | |
| 901 | - def _is_english_text(self, text: str) -> bool: | |
| 902 | - """ | |
| 903 | - Check if text is primarily English (ASCII letters, numbers, common punctuation). | |
| 904 | - | |
| 905 | - Args: | |
| 906 | - text: Text to check | |
| 907 | - | |
| 908 | - Returns: | |
| 909 | - True if text appears to be English | |
| 910 | - """ | |
| 911 | - if not text or not text.strip(): | |
| 912 | - return True | |
| 913 | - | |
| 914 | - # Remove whitespace and common punctuation | |
| 915 | - text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) | |
| 916 | - if not text_clean: | |
| 917 | - return True | |
| 918 | - | |
| 919 | - # Check if all remaining characters are ASCII (letters, numbers) | |
| 920 | - # This is a simple heuristic: if most characters are ASCII, it's likely English | |
| 921 | - ascii_count = sum(1 for c in text_clean if ord(c) < 128) | |
| 922 | - ratio = ascii_count / len(text_clean) if text_clean else 0 | |
| 923 | - | |
| 924 | - # If more than 80% are ASCII characters, consider it English | |
| 925 | - return ratio > 0.8 | |
| 926 | - | |
| 927 | - def _contains_chinese(self, text: str) -> bool: | |
| 928 | - """ | |
| 929 | - Check if text contains Chinese characters (Han characters). | |
| 930 | - | |
| 931 | - Args: | |
| 932 | - text: Text to check | |
| 933 | - | |
| 934 | - Returns: | |
| 935 | - True if text contains Chinese characters | |
| 936 | - """ | |
| 937 | - if not text: | |
| 938 | - return False | |
| 939 | - | |
| 940 | - # Check for Chinese characters (Unicode range: \u4e00-\u9fff) | |
| 941 | - chinese_pattern = re.compile(r'[\u4e00-\u9fff]') | |
| 942 | - return bool(chinese_pattern.search(text)) | |
| 943 | - | |
| 944 | - def _is_pure_number(self, text: str) -> bool: | |
| 945 | - """ | |
| 946 | - Check if text is purely numeric (digits, possibly with spaces, dots, commas). | |
| 947 | - | |
| 948 | - Args: | |
| 949 | - text: Text to check | |
| 950 | - | |
| 951 | - Returns: | |
| 952 | - True if text is purely numeric | |
| 953 | - """ | |
| 954 | - if not text or not text.strip(): | |
| 955 | - return False | |
| 956 | - | |
| 957 | - # Remove whitespace, dots, commas (common number separators) | |
| 958 | - text_clean = re.sub(r'[\s\.,]', '', text.strip()) | |
| 959 | - if not text_clean: | |
| 960 | - return False | |
| 961 | - | |
| 962 | - # Check if all remaining characters are digits | |
| 963 | - return text_clean.isdigit() |
tests/test_embedding_pipeline.py
| ... | ... | @@ -77,12 +77,10 @@ def _build_test_config() -> SearchConfig: |
| 77 | 77 | enable_text_embedding=True, |
| 78 | 78 | enable_query_rewrite=False, |
| 79 | 79 | rewrite_dictionary={}, |
| 80 | - translation_prompts={"query_zh": "e-commerce domain", "query_en": "e-commerce domain"}, | |
| 81 | 80 | text_embedding_field="title_embedding", |
| 82 | 81 | image_embedding_field=None, |
| 83 | 82 | ), |
| 84 | 83 | function_score=FunctionScoreConfig(), |
| 85 | - function_score=FunctionScoreConfig(), | |
| 86 | 84 | rerank=RerankConfig(), |
| 87 | 85 | spu_config=SPUConfig(enabled=True, spu_field="spu_id", inner_hits_size=3), |
| 88 | 86 | es_index_name="test_products", | ... | ... |