"""
Translation service for multi-language query support.

Supports the DeepL API for high-quality translations.

Official documentation:
https://developers.deepl.com/api-reference/translate/request-translation
"""

import logging
import re
from concurrent.futures import Future, ThreadPoolExecutor
from datetime import timedelta
from typing import Dict, List, Optional, Tuple, Union

import redis
import requests

logger = logging.getLogger(__name__)

# Try to import DEEPL_AUTH_KEY and REDIS_CONFIG, but allow the import to fail
# so the module stays usable (mock mode, no cache config) without the project
# configuration package.
try:
    from config.env_config import DEEPL_AUTH_KEY, REDIS_CONFIG
except ImportError:
    DEEPL_AUTH_KEY = None
    REDIS_CONFIG = {}

# Patterns are compiled once at import time instead of on every call.
_CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')
_ENGLISH_STRIP_RE = re.compile(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]')
_NUMBER_SEPARATOR_RE = re.compile(r'[\s\.,]')
_SYMBOL_ONLY_RE = re.compile(r'^[\d\s_-]+$')


class Translator:
    """Multi-language translator using the DeepL API with an optional Redis cache."""

    DEEPL_API_URL = "https://api.deepl.com/v2/translate"  # Pro tier

    # Mapping from lower-case language codes used by callers to DeepL codes.
    # Codes missing from this map are simply upper-cased before being sent.
    LANG_CODE_MAP = {
        'zh': 'ZH',
        'en': 'EN',
        'ru': 'RU',
        'ar': 'AR',
        'ja': 'JA',
        'es': 'ES',
        'de': 'DE',
        'fr': 'FR',
        'it': 'IT',
        'pt': 'PT',
    }

    def __init__(
        self,
        api_key: Optional[str] = None,
        use_cache: bool = True,
        timeout: int = 10,
        glossary_id: Optional[str] = None,
        translation_context: Optional[str] = None
    ):
        """
        Initialize translator.

        Args:
            api_key: DeepL API key (or None to use DEEPL_AUTH_KEY from config)
            use_cache: Whether to cache translations in Redis
            timeout: HTTP request timeout in seconds
            glossary_id: DeepL glossary ID for custom terminology (optional)
            translation_context: Context hint for translation
                (e.g., "e-commerce", "product search"); defaults to
                "e-commerce product search"
        """
        # Get API key from config if not provided
        if api_key is None and DEEPL_AUTH_KEY:
            api_key = DEEPL_AUTH_KEY

        self.api_key = api_key
        self.timeout = timeout
        self.use_cache = use_cache
        self.glossary_id = glossary_id
        self.translation_context = translation_context or "e-commerce product search"

        # Cache attributes always exist, whatever happens below.
        self.redis_client = None
        self.cache = None

        # Initialize Redis cache if enabled
        if use_cache:
            try:
                client = redis.Redis(
                    host=REDIS_CONFIG.get('host', 'localhost'),
                    # NOTE(review): Redis' standard port is 6379 - confirm that
                    # the 6479 fallback is intentional for this deployment.
                    port=REDIS_CONFIG.get('port', 6479),
                    password=REDIS_CONFIG.get('password'),
                    decode_responses=True,  # Return str instead of bytes
                    socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),
                    socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),
                    retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),
                    health_check_interval=10,  # Avoid reusing broken connections
                )
                # Test connection
                client.ping()
                self.expire_time = timedelta(
                    days=REDIS_CONFIG.get('translation_cache_expire_days', 360)
                )
                self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans')
                # Publish the client only once it is known to be usable.
                self.redis_client = client
                logger.info("Redis cache initialized for translations")
            except Exception as e:
                # Best effort: the translator keeps working without a cache.
                logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache")
                self.redis_client = None

        # Thread pool for async translation
        self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate text to target language (synchronous mode).

        Args:
            text: Text to translate
            target_lang: Target language code ('zh', 'en', 'ru', etc.)
            source_lang: Source language code (optional, auto-detect if None)
            context: Additional context for translation (overrides default context)
            prompt: Translation prompt/instruction (optional, for better
                translation quality)

        Returns:
            Translated text. The original text is returned unchanged when it
            is empty/blank, when no translation is needed, when no API key is
            configured (mock mode), or when the DeepL call fails.
        """
        if not text or not text.strip():
            return text

        # Normalize language codes
        target_lang = target_lang.lower()
        if source_lang:
            source_lang = source_lang.lower()

        # Optimization: Skip translation if not needed
        if target_lang == 'en' and self._is_english_text(text):
            logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'")
            return text

        if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
            logger.info(f"[Translator] Text contains Chinese or is pure number, skipping translation: '{text[:50]}...'")
            return text

        # Use provided context or default context
        translation_context = context or self.translation_context

        # Check cache. The Redis key is built from the target language and the
        # text only; context and prompt are passed through but do not take
        # part in the key.
        if self.use_cache and self.redis_client:
            cached = self._get_cached_translation_redis(
                text, target_lang, source_lang, translation_context, prompt
            )
            if cached:
                logger.info(
                    f"[Translator] Cache hit: source={source_lang or 'auto'} "
                    f"target={target_lang} | text='{text[:80]}...' -> '{cached[:80]}...'"
                )
                return cached

        # If no API key, return mock translation (for testing)
        if not self.api_key:
            logger.debug("[Translator] No API key, returning original text (mock mode)")
            return text

        # Translate using DeepL (Pro endpoint only, no free fallback)
        logger.info(
            f"[Translator] Translating text: target={target_lang}, "
            f"source={source_lang or 'auto'}, context={translation_context}, "
            f"prompt={'yes' if prompt else 'no'} | text='{text[:80]}...'"
        )
        result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)

        # On failure fall back to the original text, but do NOT cache it:
        # caching the fallback would store the untranslated text as if it were
        # a valid translation for the whole (long, sliding) expiry period.
        if result is None:
            logger.warning(f"[Translator] Translation failed for '{text[:50]}...', returning original text")
            return text

        logger.info(
            f"[Translator] Translation completed: source={source_lang or 'auto'} "
            f"target={target_lang} | original='{text[:80]}...' -> '{result[:80]}...'"
        )

        # Cache successful, non-empty results only
        if result and self.use_cache and self.redis_client:
            self._set_cached_translation_redis(
                text, target_lang, result, source_lang, translation_context, prompt
            )

        return result

    def _translate_deepl(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using DeepL API with context and glossary support.

        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code (optional)
            context: Context hint for translation (e.g., "e-commerce product search")
            prompt: Translation prompt; when given it is sent as the DeepL
                "context" parameter instead of ``context``

        Returns:
            Translated text, or None on a non-200 response, an empty
            translation list, a timeout, or any other error.
        """
        # Map to DeepL language codes
        target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())

        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        # Use prompt as context parameter for DeepL API (not as text prefix).
        # According to the DeepL API, context is "Additional context that can
        # influence a translation but is not translated itself".
        # If prompt is provided, use it as context; otherwise use the default.
        api_context = prompt if prompt else context

        # For e-commerce, add context words to help DeepL understand the domain.
        # This is especially important for single-word ambiguous terms like
        # "车" (car vs rook).
        text_to_translate, needs_extraction = self._add_ecommerce_context(
            text, source_lang, api_context
        )

        payload = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }

        if source_lang:
            payload["source_lang"] = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())

        # Context influences translation but is not translated itself
        if api_context:
            payload["context"] = api_context

        # Add glossary if configured
        if self.glossary_id:
            payload["glossary_id"] = self.glossary_id

        try:
            response = requests.post(
                self.DEEPL_API_URL,
                headers=headers,
                json=payload,
                timeout=self.timeout
            )

            if response.status_code != 200:
                logger.error(f"[Translator] DeepL API error: {response.status_code} - {response.text}")
                return None

            data = response.json()
            if "translations" in data and len(data["translations"]) > 0:
                translated_text = data["translations"][0]["text"]
                # If we added context, extract just the term from the result
                if needs_extraction:
                    translated_text = self._extract_term_from_translation(
                        translated_text, text, target_code
                    )
                return translated_text

            # HTTP 200 without any translation: report it instead of silently
            # falling through.
            logger.error("[Translator] DeepL API returned no translations")
            return None

        except requests.Timeout:
            logger.warning("[Translator] Translation request timed out")
            return None
        except Exception as e:
            logger.error(f"[Translator] Translation failed: {e}", exc_info=True)
            return None

    # NOTE: _translate_deepl_free is intentionally not implemented.
    # We do not support automatic fallback to the free endpoint, to avoid
    # mixing Pro keys with https://api-free.deepl.com and related 403 errors.

    def translate_multi(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        async_mode: bool = True,
        prompt: Optional[str] = None
    ) -> Dict[str, Optional[str]]:
        """
        Translate text to multiple target languages.

        In async_mode=True (default):
        - Returns cached translations immediately if available
        - For translations that can be optimized (e.g., pure numbers, already
          in target language), returns the result immediately via a
          synchronous call
        - Launches async tasks for other missing translations (non-blocking)
        - Returns None for missing translations that require async processing

        In async_mode=False:
        - Waits for all translations to complete (blocking)

        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)
            async_mode: If True, return cached results immediately and
                translate missing ones in the background
            prompt: Translation prompt/instruction (optional)

        Returns:
            Dictionary mapping language code to translated text (in async
            mode, languages still being translated map to None)
        """
        results: Dict[str, Optional[str]] = {}
        missing_langs = []
        async_langs = []

        # First, get cached translations
        for lang in target_langs:
            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
            if cached is not None:
                results[lang] = cached
            else:
                missing_langs.append(lang)

        if async_mode and missing_langs:
            for lang in missing_langs:
                target_lang = lang.lower()
                # Same short-circuit conditions as in translate()
                can_optimize = (
                    (target_lang == 'en' and self._is_english_text(text))
                    or (
                        target_lang == 'zh'
                        and (self._contains_chinese(text) or self._is_pure_number(text))
                    )
                )
                if can_optimize:
                    # No API call needed, so the synchronous call returns at once
                    results[lang] = self.translate(text, lang, source_lang, context, prompt)
                else:
                    # Requires actual translation, handle in the background
                    async_langs.append(lang)

            # Launch background tasks and report None for their languages
            for lang in async_langs:
                self._translate_async(text, lang, source_lang, context, prompt)
            for lang in async_langs:
                results[lang] = None
        else:
            # Synchronous mode: wait for all translations
            for lang in missing_langs:
                results[lang] = self.translate(text, lang, source_lang, context, prompt)

        return results

    def translate_multi_async(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Dict[str, Union[str, Future]]:
        """
        Translate text to multiple target languages, returning Futures for
        the translations that are not cached.

        The returned dictionary maps each language code to:
        - the translation string, if it was found in the cache (immediate)
        - a Future submitted to the internal thread pool otherwise; its
          result is whatever translate() returns

        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)
            prompt: Translation prompt/instruction (optional)

        Returns:
            Dictionary mapping language code to either a translation string
            (cached) or a Future object
        """
        results: Dict[str, Union[str, Future]] = {}

        for lang in target_langs:
            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
            if cached is not None:
                results[lang] = cached
            else:
                results[lang] = self.executor.submit(
                    self.translate, text, lang, source_lang, context, prompt
                )

        return results

    def _get_cached_translation(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """Get translation from cache if available (None when no cache or no hit)."""
        if not self.redis_client:
            return None
        return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)

    def _get_cached_translation_redis(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Get translation from Redis cache with sliding expiration.

        The key is ``prefix:TARGET_LANG:text``. ``source_lang``, ``context``
        and ``prompt`` are accepted for signature symmetry but are not part of
        the key, to maximize cache hits.

        Returns:
            The cached value, or None on a miss, an empty value, a missing
            client, or any Redis error.
        """
        if not self.redis_client:
            return None

        try:
            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
            value = self.redis_client.get(cache_key)
            if value:
                # Sliding expiration: reset expiration time on access
                self.redis_client.expire(cache_key, self.expire_time)
                logger.info(
                    f"[Translator] Redis cache hit: key={cache_key}, "
                    f"target={target_lang}, value='{value[:80]}...'"
                )
                return value
            logger.debug(f"[Translator] Redis cache miss: key={cache_key}, target={target_lang}")
            return None
        except Exception as e:
            # Cache problems must never break translation; treat as a miss.
            logger.error(f"[Translator] Redis error during get translation cache: '{text}' {target_lang}: {e}")
            return None

    def _set_cached_translation_redis(
        self,
        text: str,
        target_lang: str,
        translation: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> None:
        """
        Store translation in Redis cache under ``prefix:TARGET_LANG:text``.

        Redis errors are logged and swallowed (best-effort caching).
        """
        if not self.redis_client:
            return

        try:
            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
            self.redis_client.setex(cache_key, self.expire_time, translation)
            # Truncate the logged value like every other log site does.
            logger.info(
                f"[Translator] Cached translation: key={cache_key}, "
                f"target={target_lang}, value='{translation[:80]}...'"
            )
        except Exception as e:
            logger.error(f"[Translator] Redis error during set translation cache: '{text}' {target_lang}: {e}")

    def _translate_async(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ):
        """
        Launch a fire-and-forget translation task on the thread pool.

        The task calls translate(), which stores a successful result in the
        cache when caching is active; the result itself is only logged.
        """
        def _do_translate():
            try:
                result = self.translate(text, target_lang, source_lang, context, prompt)
                if result:
                    logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")
            except Exception as e:
                logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}")

        self.executor.submit(_do_translate)

    def _add_ecommerce_context(
        self,
        text: str,
        source_lang: Optional[str],
        context: Optional[str]
    ) -> Tuple[str, bool]:
        """
        Add e-commerce context to text for better disambiguation.

        For very short single-token Chinese terms, a context word is
        prepended to help DeepL understand this is an e-commerce/product
        search context.

        Args:
            text: Original text to translate
            source_lang: Source language code
            context: Context hint

        Returns:
            Tuple of (text_with_context, needs_extraction)
            - text_with_context: Text to send to DeepL
            - needs_extraction: Whether the term must be extracted from the result
        """
        # Only apply for e-commerce context and Chinese source
        if not context or "e-commerce" not in context.lower():
            return text, False

        if not source_lang or source_lang.lower() != 'zh':
            return text, False

        # For single-token queries of at most two characters, add context
        text_stripped = text.strip()
        if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:
            # Ambiguous terms like "车" (car vs rook) are sent as
            # "购买 [term]" (buy [term]); the term is extracted from the
            # translation result afterwards.
            context_phrase = f"购买 {text_stripped}"
            return context_phrase, True

        # For longer queries, DeepL usually has enough context
        return text, False

    def _extract_term_from_translation(
        self,
        translated_text: str,
        original_text: str,
        target_lang_code: str
    ) -> str:
        """
        Extract the actual term from a translation that included context.

        For example, if we translated "购买 车" (buy car) and got "buy car",
        we want to extract just "car".

        Args:
            translated_text: Full translation result
            original_text: Original single-word query (currently unused)
            target_lang_code: Target language code (EN, ZH, etc.)

        Returns:
            For an EN target with more than one word: the last word that is
            not a known context word, lower-cased and stripped of trailing
            punctuation. Otherwise the translation unchanged.
        """
        if target_lang_code == "EN":
            words = translated_text.strip().split()
            if len(words) > 1:
                # Context words introduced by the added phrase, to be skipped
                context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
                # Scan from the end: the term usually comes last
                for word in reversed(words):
                    word_lower = word.lower().rstrip('.,!?;:')
                    if word_lower not in context_words:
                        return word_lower
                # If all words are context words, return the last one
                return words[-1].lower().rstrip('.,!?;:')

        # For other languages or if extraction does not apply, return as-is.
        # The user can configure a glossary for better results.
        return translated_text

    def translate_for_indexing(
        self,
        text: str,
        shop_language: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
        translate_to_en: bool = True,
        translate_to_zh: bool = True,
    ) -> Dict[str, Optional[str]]:
        """
        Translate text for indexing based on shop language and tenant configuration.

        Translation behavior:
        - If translate_to_zh=True and shop language does not contain 'zh',
          translate to Chinese (zh)
        - If translate_to_en=True and shop language does not contain 'en',
          translate to English (en)
        - If nothing needs translating, the original text is returned for the
          language(s) the shop language already contains

        Args:
            text: Text to translate
            shop_language: Shop's configured language (e.g., 'zh', 'en', 'ru')
            source_lang: Source language code (optional; falls back to
                shop_language when None)
            context: Additional context for translation (optional)
            prompt: Translation prompt/instruction (optional)
            translate_to_en: Whether to translate to English (from tenant_config)
            translate_to_zh: Whether to translate to Chinese (from tenant_config)

        Returns:
            Dictionary with 'zh' and 'en' keys containing translated text
            (or None if not needed/not enabled)
            Example: {'zh': '中文翻译', 'en': 'English translation'} or
            {'zh': None, 'en': None}
        """
        if not text or not text.strip():
            return {'zh': None, 'en': None}

        # Skip translation for queries made only of digits, whitespace,
        # underscores and hyphens
        if _SYMBOL_ONLY_RE.match(text):
            logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
            return {'zh': None, 'en': None}

        results: Dict[str, Optional[str]] = {'zh': None, 'en': None}
        shop_lang_lower = shop_language.lower() if shop_language else ""

        # Determine which languages need translation based on tenant configuration
        targets = []
        if translate_to_zh and "zh" not in shop_lang_lower:
            targets.append("zh")
        if translate_to_en and "en" not in shop_lang_lower:
            targets.append("en")

        if not targets:
            # Nothing to translate: use the original text for the language(s)
            # the shop already uses
            if "zh" in shop_lang_lower:
                results['zh'] = text
            if "en" in shop_lang_lower:
                results['en'] = text
            return results

        for target_lang in targets:
            # Check cache first
            cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
            if cached:
                results[target_lang] = cached
                logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}")
                continue

            # Translate synchronously for indexing (the result is needed immediately)
            results[target_lang] = self.translate(
                text,
                target_lang=target_lang,
                source_lang=source_lang or shop_language,
                context=context,
                prompt=prompt
            )

        return results

    def get_translation_needs(
        self,
        detected_lang: str,
        supported_langs: List[str]
    ) -> List[str]:
        """
        Determine which languages need translation.

        Args:
            detected_lang: Detected query language
            supported_langs: List of supported languages

        Returns:
            New list of language codes to translate to: every supported
            language except the detected one. The input list is never
            returned itself, so callers may mutate the result freely.
        """
        if detected_lang in supported_langs:
            return [lang for lang in supported_langs if lang != detected_lang]

        # Detected language is unsupported: translate to all supported languages
        return list(supported_langs)

    def _is_english_text(self, text: str) -> bool:
        """
        Check if text is primarily English (ASCII letters, numbers, common punctuation).

        Args:
            text: Text to check

        Returns:
            True if more than 80% of the characters left after removing
            whitespace and common punctuation are ASCII; also True for
            empty or punctuation-only text
        """
        if not text or not text.strip():
            return True

        # Remove whitespace and common punctuation
        text_clean = _ENGLISH_STRIP_RE.sub('', text)
        if not text_clean:
            return True

        # Simple heuristic: if most characters are ASCII, it's likely English
        ascii_count = sum(1 for c in text_clean if ord(c) < 128)
        return ascii_count / len(text_clean) > 0.8

    def _contains_chinese(self, text: str) -> bool:
        """
        Check if text contains Chinese characters (Han characters).

        Args:
            text: Text to check

        Returns:
            True if text contains at least one character in \\u4e00-\\u9fff
        """
        if not text:
            return False
        return bool(_CHINESE_CHAR_RE.search(text))

    def _is_pure_number(self, text: str) -> bool:
        """
        Check if text is purely numeric (digits, possibly with spaces, dots, commas).

        Args:
            text: Text to check

        Returns:
            True if only digits remain after removing whitespace, dots and commas
        """
        if not text or not text.strip():
            return False

        # Remove whitespace, dots, commas (common number separators)
        text_clean = _NUMBER_SEPARATOR_RE.sub('', text.strip())
        if not text_clean:
            return False

        return text_clean.isdigit()