""" Translation service for multi-language query support. Supports DeepL API for high-quality translations. #### 官方文档: https://developers.deepl.com/api-reference/translate/request-translation ##### """ import requests import re from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional from utils.cache import DictCache import logging logger = logging.getLogger(__name__) # Try to import DEEPL_AUTH_KEY, but allow import to fail try: from config.env_config import DEEPL_AUTH_KEY except ImportError: DEEPL_AUTH_KEY = None class Translator: """Multi-language translator using DeepL API.""" DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier # Language code mapping LANG_CODE_MAP = { 'zh': 'ZH', 'en': 'EN', 'ru': 'RU', 'ar': 'AR', 'ja': 'JA', 'es': 'ES', 'de': 'DE', 'fr': 'FR', 'it': 'IT', 'pt': 'PT', } def __init__( self, api_key: Optional[str] = None, use_cache: bool = True, timeout: int = 10, glossary_id: Optional[str] = None, translation_context: Optional[str] = None ): """ Initialize translator. Args: api_key: DeepL API key (or None to use from config/env) use_cache: Whether to cache translations timeout: Request timeout in seconds glossary_id: DeepL glossary ID for custom terminology (optional) translation_context: Context hint for translation (e.g., "e-commerce", "product search") """ # Get API key from config if not provided if api_key is None and DEEPL_AUTH_KEY: api_key = DEEPL_AUTH_KEY self.api_key = api_key self.timeout = timeout self.use_cache = use_cache self.glossary_id = glossary_id self.translation_context = translation_context or "e-commerce product search" if use_cache: self.cache = DictCache(".cache/translations.json") else: self.cache = None # Thread pool for async translation self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") def translate( self, text: str, target_lang: str, source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None ) -> Optional[str]: """ Translate text to target language (synchronous mode). Args: text: Text to translate target_lang: Target language code ('zh', 'en', 'ru', etc.) source_lang: Source language code (optional, auto-detect if None) context: Additional context for translation (overrides default context) prompt: Translation prompt/instruction (optional, for better translation quality) Returns: Translated text or None if translation fails """ if not text or not text.strip(): return text # Normalize language codes target_lang = target_lang.lower() if source_lang: source_lang = source_lang.lower() # Optimization: Skip translation if not needed if target_lang == 'en' and self._is_english_text(text): logger.debug(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") return text if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): logger.debug(f"[Translator] Text contains Chinese or is pure number, skipping translation: '{text[:50]}...'") return text # Use provided context or default context translation_context = context or self.translation_context # Build cache key (include prompt in cache key if provided) cache_key_parts = [source_lang or 'auto', target_lang, translation_context] if prompt: cache_key_parts.append(prompt) cache_key_parts.append(text) cache_key = ':'.join(cache_key_parts) # Check cache (include context and prompt in cache key for accuracy) if self.use_cache: cached = self.cache.get(cache_key, category="translations") if cached: return cached # If no API key, return mock translation (for testing) if not self.api_key: logger.debug(f"[Translator] No API key, returning original text (mock mode)") return text # Translate using DeepL with fallback result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) # If translation failed, try fallback to free API if result is None and "api.deepl.com" in self.DEEPL_API_URL: logger.debug(f"[Translator] Pro API failed, trying free API...") result = self._translate_deepl_free(text, target_lang, source_lang, translation_context, prompt) # If still failed, return original text with warning if result is None: logger.warning(f"[Translator] Translation failed for '{text[:50]}...', returning original text") result = text # Cache result if result and self.use_cache: self.cache.set(cache_key, result, category="translations") return result def _translate_deepl( self, text: str, target_lang: str, source_lang: Optional[str], context: Optional[str] = None, prompt: Optional[str] = None ) -> Optional[str]: """ Translate using DeepL API with context and glossary support. Args: text: Text to translate target_lang: Target language code source_lang: Source language code (optional) context: Context hint for translation (e.g., "e-commerce product search") """ # Map to DeepL language codes target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper()) headers = { "Authorization": f"DeepL-Auth-Key {self.api_key}", "Content-Type": "application/json", } # Use prompt as context parameter for DeepL API (not as text prefix) # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself" # If prompt is provided, use it as context; otherwise use the default context api_context = prompt if prompt else context # For e-commerce, add context words to help DeepL understand the domain # This is especially important for single-word ambiguous terms like "车" (car vs rook) text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context) payload = { "text": [text_to_translate], "target_lang": target_code, } if source_lang: source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) payload["source_lang"] = source_code # Add context parameter (prompt or default context) # Context influences translation but is not translated itself if api_context: payload["context"] = api_context # Add glossary if configured if self.glossary_id: payload["glossary_id"] = self.glossary_id # Note: DeepL API v2 supports "context" parameter for additional context # that influences translation but is not translated itself. # We use prompt as context parameter when provided. try: response = requests.post( self.DEEPL_API_URL, headers=headers, json=payload, timeout=self.timeout ) if response.status_code == 200: data = response.json() if "translations" in data and len(data["translations"]) > 0: translated_text = data["translations"][0]["text"] # If we added context, extract just the term from the result if needs_extraction: translated_text = self._extract_term_from_translation( translated_text, text, target_code ) return translated_text else: logger.error(f"[Translator] DeepL API error: {response.status_code} - {response.text}") return None except requests.Timeout: logger.warning(f"[Translator] Translation request timed out") return None except Exception as e: logger.error(f"[Translator] Translation failed: {e}", exc_info=True) return None def _translate_deepl_free( self, text: str, target_lang: str, source_lang: Optional[str], context: Optional[str] = None, prompt: Optional[str] = None ) -> Optional[str]: """ Translate using DeepL Free API. Note: Free API may not support glossary_id parameter. """ # Map to DeepL language codes target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper()) headers = { "Authorization": f"DeepL-Auth-Key {self.api_key}", "Content-Type": "application/json", } # Use prompt as context parameter for DeepL API api_context = prompt if prompt else context payload = { "text": [text], "target_lang": target_code, } if source_lang: source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) payload["source_lang"] = source_code # Add context parameter if api_context: payload["context"] = api_context # Note: Free API typically doesn't support glossary_id # But we can still use context hints in the text try: response = requests.post( "https://api-free.deepl.com/v2/translate", headers=headers, json=payload, timeout=self.timeout ) if response.status_code == 200: data = response.json() if "translations" in data and len(data["translations"]) > 0: return data["translations"][0]["text"] else: logger.error(f"[Translator] DeepL Free API error: {response.status_code} - {response.text}") return None except requests.Timeout: logger.warning(f"[Translator] Free API request timed out") return None except Exception as e: logger.error(f"[Translator] Free API translation failed: {e}", exc_info=True) return None def translate_multi( self, text: str, target_langs: List[str], source_lang: Optional[str] = None, context: Optional[str] = None, async_mode: bool = True, prompt: Optional[str] = None ) -> Dict[str, Optional[str]]: """ Translate text to multiple target languages. In async_mode=True (default): - Returns cached translations immediately if available - For translations that can be optimized (e.g., pure numbers, already in target language), returns result immediately via synchronous call - Launches async tasks for other missing translations (non-blocking) - Returns None for missing translations that require async processing In async_mode=False: - Waits for all translations to complete (blocking) Args: text: Text to translate target_langs: List of target language codes source_lang: Source language code (optional) context: Context hint for translation (optional) async_mode: If True, return cached results immediately and translate missing ones async prompt: Translation prompt/instruction (optional) Returns: Dictionary mapping language code to translated text (only cached results in async mode) """ results = {} missing_langs = [] async_langs = [] # First, get cached translations for lang in target_langs: cached = self._get_cached_translation(text, lang, source_lang, context, prompt) if cached is not None: results[lang] = cached else: missing_langs.append(lang) # If async mode and there are missing translations if async_mode and missing_langs: # Check if translation can be optimized (immediate return) for lang in missing_langs: target_lang = lang.lower() # Check optimization conditions (same as in translate method) can_optimize = False if target_lang == 'en' and self._is_english_text(text): can_optimize = True elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): can_optimize = True if can_optimize: # Can be optimized, call translate synchronously for immediate result results[lang] = self.translate(text, lang, source_lang, context, prompt) else: # Requires actual translation, add to async list async_langs.append(lang) # Launch async tasks for translations that require actual API calls if async_langs: for lang in async_langs: self._translate_async(text, lang, source_lang, context, prompt) # Return None for async translations for lang in async_langs: results[lang] = None else: # Synchronous mode: wait for all translations for lang in missing_langs: results[lang] = self.translate(text, lang, source_lang, context, prompt) return results def _get_cached_translation( self, text: str, target_lang: str, source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None ) -> Optional[str]: """Get translation from cache if available.""" if not self.cache: return None translation_context = context or self.translation_context cache_key_parts = [source_lang or 'auto', target_lang, translation_context] if prompt: cache_key_parts.append(prompt) cache_key_parts.append(text) cache_key = ':'.join(cache_key_parts) return self.cache.get(cache_key, category="translations") def _translate_async( self, text: str, target_lang: str, source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None ): """Launch async translation task.""" def _do_translate(): try: result = self.translate(text, target_lang, source_lang, context, prompt) if result: logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}") except Exception as e: logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}") self.executor.submit(_do_translate) def _add_ecommerce_context( self, text: str, source_lang: Optional[str], context: Optional[str] ) -> tuple: """ Add e-commerce context to text for better disambiguation. For single-word ambiguous Chinese terms, we add context words that help DeepL understand this is an e-commerce/product search context. Args: text: Original text to translate source_lang: Source language code context: Context hint Returns: Tuple of (text_with_context, needs_extraction) - text_with_context: Text to send to DeepL - needs_extraction: Whether we need to extract the term from the result """ # Only apply for e-commerce context and Chinese source if not context or "e-commerce" not in context.lower(): return text, False if not source_lang or source_lang.lower() != 'zh': return text, False # For single-word queries, add context to help disambiguation text_stripped = text.strip() if len(text_stripped.split()) == 1 and len(text_stripped) <= 2: # Common ambiguous Chinese e-commerce terms like "车" (car vs rook) # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term]) # This helps DeepL understand the e-commerce context # We'll need to extract just the term from the translation result context_phrase = f"购买 {text_stripped}" return context_phrase, True # For multi-word queries, DeepL usually has enough context return text, False def _extract_term_from_translation( self, translated_text: str, original_text: str, target_lang_code: str ) -> str: """ Extract the actual term from a translation that included context. For example, if we translated "购买 车" (buy car) and got "buy car", we want to extract just "car". Args: translated_text: Full translation result original_text: Original single-word query target_lang_code: Target language code (EN, ZH, etc.) Returns: Extracted term or original translation if extraction fails """ # For English target, try to extract the last word (the actual term) if target_lang_code == "EN": words = translated_text.strip().split() if len(words) > 1: # Usually the last word is the term we want # But we need to be smart - if it's "buy car", we want "car" # Common context words to skip: buy, purchase, product, item, etc. context_words = {"buy", "purchase", "product", "item", "commodity", "goods"} # Try to find the term (not a context word) for word in reversed(words): word_lower = word.lower().rstrip('.,!?;:') if word_lower not in context_words: return word_lower # If all words are context words, return the last one return words[-1].lower().rstrip('.,!?;:') # For other languages or if extraction fails, return as-is # The user can configure a glossary for better results return translated_text def get_translation_needs( self, detected_lang: str, supported_langs: List[str] ) -> List[str]: """ Determine which languages need translation. Args: detected_lang: Detected query language supported_langs: List of supported languages Returns: List of language codes to translate to """ # If detected language is in supported list, translate to others if detected_lang in supported_langs: return [lang for lang in supported_langs if lang != detected_lang] # Otherwise, translate to all supported languages return supported_langs def _is_english_text(self, text: str) -> bool: """ Check if text is primarily English (ASCII letters, numbers, common punctuation). Args: text: Text to check Returns: True if text appears to be English """ if not text or not text.strip(): return True # Remove whitespace and common punctuation text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) if not text_clean: return True # Check if all remaining characters are ASCII (letters, numbers) # This is a simple heuristic: if most characters are ASCII, it's likely English ascii_count = sum(1 for c in text_clean if ord(c) < 128) ratio = ascii_count / len(text_clean) if text_clean else 0 # If more than 80% are ASCII characters, consider it English return ratio > 0.8 def _contains_chinese(self, text: str) -> bool: """ Check if text contains Chinese characters (Han characters). Args: text: Text to check Returns: True if text contains Chinese characters """ if not text: return False # Check for Chinese characters (Unicode range: \u4e00-\u9fff) chinese_pattern = re.compile(r'[\u4e00-\u9fff]') return bool(chinese_pattern.search(text)) def _is_pure_number(self, text: str) -> bool: """ Check if text is purely numeric (digits, possibly with spaces, dots, commas). Args: text: Text to check Returns: True if text is purely numeric """ if not text or not text.strip(): return False # Remove whitespace, dots, commas (common number separators) text_clean = re.sub(r'[\s\.,]', '', text.strip()) if not text_clean: return False # Check if all remaining characters are digits return text_clean.isdigit()