# translator.py

"""
Translation service for multi-language query support.

Supports DeepL API for high-quality translations.


#### 官方文档：
https://developers.deepl.com/api-reference/translate/request-translation
#####


"""

import requests
import re
import redis
from concurrent.futures import ThreadPoolExecutor, Future
from datetime import timedelta
from typing import Dict, List, Optional, Union
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Try to import DEEPL_AUTH_KEY and REDIS_CONFIG, but allow the import to fail
# so this module can still be imported when the project config is unavailable.
try:
    from config.env_config import DEEPL_AUTH_KEY, REDIS_CONFIG
except ImportError:
    # Without config there is no default API key, and every REDIS_CONFIG.get()
    # lookup in this module falls back to the default passed at the call site.
    DEEPL_AUTH_KEY = None
    REDIS_CONFIG = {}


class Translator:
    """
    Multi-language translator using the DeepL API.

    Translations are requested over HTTP from DEEPL_API_URL and, when a Redis
    connection could be established at construction time, cached in Redis.
    """

    # DeepL Pro endpoint; this class does not fall back to the free endpoint.
    DEEPL_API_URL = "https://api.deepl.com/v2/translate"  # Pro tier

    # Maps the lowercase language codes used by callers to the codes sent to
    # DeepL. Codes missing from this map are sent upper-cased as-is.
    LANG_CODE_MAP = {
        'zh': 'ZH',
        'en': 'EN',
        'ru': 'RU',
        'ar': 'AR',
        'ja': 'JA',
        'es': 'ES',
        'de': 'DE',
        'fr': 'FR',
        'it': 'IT',
        'pt': 'PT',
    }

    def __init__(
        self,
        api_key: Optional[str] = None,
        use_cache: bool = True,
        timeout: int = 10,
        glossary_id: Optional[str] = None,
        translation_context: Optional[str] = None
    ):
        """
        Initialize translator.

        Args:
            api_key: DeepL API key (or None to use from config/env)
            use_cache: Whether to cache translations
            timeout: Request timeout in seconds
            glossary_id: DeepL glossary ID for custom terminology (optional)
            translation_context: Context hint for translation (e.g., "e-commerce", "product search")
        """
        # Get API key from config if not provided
        if api_key is None and DEEPL_AUTH_KEY:
            api_key = DEEPL_AUTH_KEY

        self.api_key = api_key
        self.timeout = timeout
        self.use_cache = use_cache
        self.glossary_id = glossary_id
        self.translation_context = translation_context or "e-commerce product search"

        # Initialize Redis cache if enabled
        if use_cache:
            try:
                self.redis_client = redis.Redis(
                    host=REDIS_CONFIG.get('host', 'localhost'),
                    port=REDIS_CONFIG.get('port', 6479),
                    password=REDIS_CONFIG.get('password'),
                    decode_responses=True,  # Return str instead of bytes
                    socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),
                    socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),
                    retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),
                    health_check_interval=10,  # 避免复用坏连接
                )
                # Test connection
                self.redis_client.ping()
                self.expire_time = timedelta(days=REDIS_CONFIG.get('translation_cache_expire_days', 360))
                self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans')
                logger.info("Redis cache initialized for translations")
            except Exception as e:
                logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache")
                self.redis_client = None
                self.cache = None
        else:
            self.redis_client = None
            self.cache = None

        # Thread pool for async translation
        self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate text to target language (synchronous mode).

        Args:
            text: Text to translate
            target_lang: Target language code ('zh', 'en', 'ru', etc.)
            source_lang: Source language code (optional, auto-detect if None)
            context: Additional context for translation (overrides default context)
            prompt: Translation prompt/instruction (optional, for better translation quality)

        Returns:
            Translated text or None if translation fails
        """
        if not text or not text.strip():
            return text

        # Normalize language codes
        target_lang = target_lang.lower()
        if source_lang:
            source_lang = source_lang.lower()

        # Optimization: Skip translation if not needed
        if target_lang == 'en' and self._is_english_text(text):
            logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'")
            return text

        if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
            logger.info(f"[Translator] Text contains Chinese or is pure number, skipping translation: '{text[:50]}...'")
            return text

        # Use provided context or default context
        translation_context = context or self.translation_context

        # Build cache key (include prompt in cache key if provided)
        cache_key_parts = [source_lang or 'auto', target_lang, translation_context]
        if prompt:
            cache_key_parts.append(prompt)
        cache_key_parts.append(text)
        cache_key = ':'.join(cache_key_parts)

        # Check cache (include context and prompt in cache key for accuracy)
        if self.use_cache and self.redis_client:
            cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt)
            if cached:
                logger.info(
                    f"[Translator] Cache hit: source={source_lang or 'auto'} "
                    f"target={target_lang} | text='{text[:80]}...' -> '{cached[:80]}...'"
                )
                return cached

        # If no API key, return mock translation (for testing)
        if not self.api_key:
            logger.debug(f"[Translator] No API key, returning original text (mock mode)")
            return text

        # Translate using DeepL (Pro endpoint only, no free fallback)
        logger.info(
            f"[Translator] Translating text: target={target_lang}, "
            f"source={source_lang or 'auto'}, context={translation_context}, "
            f"prompt={'yes' if prompt else 'no'} | text='{text[:80]}...'"
        )
        result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)

        # If still failed, return original text with warning
        if result is None:
            logger.warning(f"[Translator] Translation failed for '{text[:50]}...', returning original text")
            result = text

        logger.info(
            f"[Translator] Translation completed: source={source_lang or 'auto'} "
            f"target={target_lang} | original='{text[:80]}...' -> '{result[:80]}...'"
        )

        # Cache result
        if result and self.use_cache and self.redis_client:
            self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt)

        return result

    def _translate_deepl(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using DeepL API with context and glossary support.

        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code (optional)
            context: Context hint for translation (e.g., "e-commerce product search")
            prompt: Translation prompt/instruction; when given it is sent as
                the DeepL "context" parameter instead of `context`

        Returns:
            The translated text, or None when the request times out, fails,
            returns a non-200 status, or contains no translations
        """
        # Map to DeepL language codes
        target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())

        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        # Use prompt as context parameter for DeepL API (not as text prefix)
        # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"
        # If prompt is provided, use it as context; otherwise use the default context
        api_context = prompt if prompt else context

        # For e-commerce, add context words to help DeepL understand the domain
        # This is especially important for single-word ambiguous terms like "车" (car vs rook)
        text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)

        payload = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }

        if source_lang:
            source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
            payload["source_lang"] = source_code

        # Context influences translation but is not translated itself
        if api_context:
            payload["context"] = api_context

        # Add glossary if configured
        if self.glossary_id:
            payload["glossary_id"] = self.glossary_id

        try:
            response = requests.post(
                self.DEEPL_API_URL,
                headers=headers,
                json=payload,
                timeout=self.timeout
            )

            if response.status_code != 200:
                logger.error(f"[Translator] DeepL API error: {response.status_code} - {response.text}")
                return None

            data = response.json()
            translations = data.get("translations") if isinstance(data, dict) else None
            if not translations:
                # A 200 without any translation is still a failure for the
                # caller; report it instead of silently falling through.
                logger.warning("[Translator] DeepL response contained no translations")
                return None

            translated_text = translations[0]["text"]
            # If we added context, extract just the term from the result
            if needs_extraction:
                translated_text = self._extract_term_from_translation(
                    translated_text, text, target_code
                )
            return translated_text

        except requests.Timeout:
            logger.warning(f"[Translator] Translation request timed out")
            return None
        except Exception as e:
            logger.error(f"[Translator] Translation failed: {e}", exc_info=True)
            return None

    # NOTE: _translate_deepl_free is intentionally not implemented.
    # We do not support automatic fallback to the free endpoint, to avoid
    # mixing Pro keys with https://api-free.deepl.com and related 403 errors.

    def translate_multi(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        async_mode: bool = True,
        prompt: Optional[str] = None
    ) -> Dict[str, Optional[str]]:
        """
        Translate text to multiple target languages.
        
        In async_mode=True (default):
        - Returns cached translations immediately if available
        - For translations that can be optimized (e.g., pure numbers, already in target language),
          returns result immediately via synchronous call
        - Launches async tasks for other missing translations (non-blocking)
        - Returns None for missing translations that require async processing
        
        In async_mode=False:
        - Waits for all translations to complete (blocking)

        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)
            async_mode: If True, return cached results immediately and translate missing ones async
            prompt: Translation prompt/instruction (optional)

        Returns:
            Dictionary mapping language code to translated text (only cached results in async mode)
        """
        results = {}
        missing_langs = []
        async_langs = []

        # First, get cached translations
        for lang in target_langs:
            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
            if cached is not None:
                results[lang] = cached
            else:
                missing_langs.append(lang)

        # If async mode and there are missing translations
        if async_mode and missing_langs:
            # Check if translation can be optimized (immediate return)
            for lang in missing_langs:
                target_lang = lang.lower()
                # Check optimization conditions (same as in translate method)
                can_optimize = False
                if target_lang == 'en' and self._is_english_text(text):
                    can_optimize = True
                elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
                    can_optimize = True

                if can_optimize:
                    # Can be optimized, call translate synchronously for immediate result
                    results[lang] = self.translate(text, lang, source_lang, context, prompt)
                else:
                    # Requires actual translation, add to async list
                    async_langs.append(lang)

            # Launch async tasks for translations that require actual API calls
            if async_langs:
                for lang in async_langs:
                    self._translate_async(text, lang, source_lang, context, prompt)
                # Return None for async translations
                for lang in async_langs:
                    results[lang] = None
        else:
            # Synchronous mode: wait for all translations
            for lang in missing_langs:
                results[lang] = self.translate(text, lang, source_lang, context, prompt)

        return results

    def translate_multi_async(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Dict[str, Union[str, Future]]:
        """
        Translate text to multiple target languages asynchronously, returning Futures that can be awaited.
        
        This method returns a dictionary where:
        - If translation is cached, the value is the translation string (immediate)
        - If translation needs to be done, the value is a Future object that can be awaited
        
        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)
            prompt: Translation prompt/instruction (optional)

        Returns:
            Dictionary mapping language code to either translation string (cached) or Future object
        """
        results = {}
        missing_langs = []

        # First, get cached translations
        for lang in target_langs:
            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
            if cached is not None:
                results[lang] = cached
            else:
                missing_langs.append(lang)

        # For missing translations, submit async tasks and return Futures
        for lang in missing_langs:
            future = self.executor.submit(
                self.translate,
                text,
                lang,
                source_lang,
                context,
                prompt
            )
            results[lang] = future

        return results

    def _get_cached_translation(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """Get translation from cache if available."""
        if not self.redis_client:
            return None
        return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)

    def _get_cached_translation_redis(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """Get translation from Redis cache with sliding expiration."""
        if not self.redis_client:
            return None

        try:
            # Build cache key: prefix:target_lang:text
            # For simplicity, we use target_lang and text as key
            # Context and prompt are not included in key to maximize cache hits
            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
            value = self.redis_client.get(cache_key)
            if value:
                # Sliding expiration: reset expiration time on access
                self.redis_client.expire(cache_key, self.expire_time)
                logger.info(
                    f"[Translator] Redis cache hit: key={cache_key}, "
                    f"target={target_lang}, value='{value[:80]}...'"
                )
                return value
            logger.debug(f"[Translator] Redis cache miss: key={cache_key}, target={target_lang}")
            return None
        except Exception as e:
            logger.error(f"[Translator] Redis error during get translation cache: '{text}' {target_lang}: {e}")
            return None

    def _set_cached_translation_redis(
        self,
        text: str,
        target_lang: str,
        translation: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> None:
        """Store translation in Redis cache."""
        if not self.redis_client:
            return

        try:
            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
            self.redis_client.setex(cache_key, self.expire_time, translation)
            logger.info(
                f"[Translator] Cached translation: key={cache_key}, "
                f"target={target_lang}, value='{translation}...'"
            )
        except Exception as e:
            logger.error(f"[Translator] Redis error during set translation cache: '{text}' {target_lang}: {e}")

    def _translate_async(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ):
        """Launch async translation task."""
        def _do_translate():
            try:
                result = self.translate(text, target_lang, source_lang, context, prompt)
                if result:
                    logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")
            except Exception as e:
                logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}")

        self.executor.submit(_do_translate)

    def _add_ecommerce_context(
        self,
        text: str,
        source_lang: Optional[str],
        context: Optional[str]
    ) -> tuple:
        """
        Add e-commerce context to text for better disambiguation.
        
        For single-word ambiguous Chinese terms, we add context words that help
        DeepL understand this is an e-commerce/product search context.
        
        Args:
            text: Original text to translate
            source_lang: Source language code
            context: Context hint
            
        Returns:
            Tuple of (text_with_context, needs_extraction)
            - text_with_context: Text to send to DeepL
            - needs_extraction: Whether we need to extract the term from the result
        """
        # Only apply for e-commerce context and Chinese source
        if not context or "e-commerce" not in context.lower():
            return text, False

        if not source_lang or source_lang.lower() != 'zh':
            return text, False

        # For single-word queries, add context to help disambiguation
        text_stripped = text.strip()
        if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:
            # Common ambiguous Chinese e-commerce terms like "车" (car vs rook)
            # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term])
            # This helps DeepL understand the e-commerce context
            # We'll need to extract just the term from the translation result
            context_phrase = f"购买 {text_stripped}"
            return context_phrase, True

        # For multi-word queries, DeepL usually has enough context
        return text, False

    def _extract_term_from_translation(
        self,
        translated_text: str,
        original_text: str,
        target_lang_code: str
    ) -> str:
        """
        Extract the actual term from a translation that included context.
        
        For example, if we translated "购买 车" (buy car) and got "buy car",
        we want to extract just "car".
        
        Args:
            translated_text: Full translation result
            original_text: Original single-word query
            target_lang_code: Target language code (EN, ZH, etc.)
            
        Returns:
            Extracted term or original translation if extraction fails
        """
        # For English target, try to extract the last word (the actual term)
        if target_lang_code == "EN":
            words = translated_text.strip().split()
            if len(words) > 1:
                # Usually the last word is the term we want
                # But we need to be smart - if it's "buy car", we want "car"
                # Common context words to skip: buy, purchase, product, item, etc.
                context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
                # Try to find the term (not a context word)
                for word in reversed(words):
                    word_lower = word.lower().rstrip('.,!?;:')
                    if word_lower not in context_words:
                        return word_lower
                # If all words are context words, return the last one
                return words[-1].lower().rstrip('.,!?;:')

        # For other languages or if extraction fails, return as-is
        # The user can configure a glossary for better results
        return translated_text

    def translate_for_indexing(
        self,
        text: str,
        shop_language: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
        translate_to_en: bool = True,
        translate_to_zh: bool = True,
    ) -> Dict[str, Optional[str]]:
        """
        Translate text for indexing based on shop language and tenant configuration.
        
        Translation behavior:
        - If translate_to_zh=True and shop language is not 'zh', translate to Chinese (zh)
        - If translate_to_en=True and shop language is not 'en', translate to English (en)
        - If both flags are False, no translation is performed (returns None for both)
        
        Args:
            text: Text to translate
            shop_language: Shop's configured language (e.g., 'zh', 'en', 'ru')
            source_lang: Source language code (optional, auto-detect if None)
            context: Additional context for translation (optional)
            prompt: Translation prompt/instruction (optional)
            translate_to_en: Whether to translate to English (from tenant_config)
            translate_to_zh: Whether to translate to Chinese (from tenant_config)
            
        Returns:
            Dictionary with 'zh' and 'en' keys containing translated text (or None if not needed/not enabled)
            Example: {'zh': '中文翻译', 'en': 'English translation'} or {'zh': None, 'en': None}
        """
        if not text or not text.strip():
            return {'zh': None, 'en': None}

        # Skip translation for symbol-only queries
        if re.match(r'^[\d\s_-]+$', text):
            logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
            return {'zh': None, 'en': None}

        results = {'zh': None, 'en': None}
        shop_lang_lower = shop_language.lower() if shop_language else ""

        # Determine which languages need translation based on tenant configuration
        targets = []
        if translate_to_zh and "zh" not in shop_lang_lower:
            targets.append("zh")
        if translate_to_en and "en" not in shop_lang_lower:
            targets.append("en")

        # If shop language is already zh and en, no translation needed
        if not targets:
            # Use original text for both languages
            if "zh" in shop_lang_lower:
                results['zh'] = text
            if "en" in shop_lang_lower:
                results['en'] = text
            return results

        # Translate to each target language
        for target_lang in targets:
            # Check cache first
            cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
            if cached:
                results[target_lang] = cached
                logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}")
                continue

            # Translate synchronously for indexing (we need the result immediately)
            translated = self.translate(
                text,
                target_lang=target_lang,
                source_lang=source_lang or shop_language,
                context=context,
                prompt=prompt
            )
            results[target_lang] = translated

        return results

    def get_translation_needs(
        self,
        detected_lang: str,
        supported_langs: List[str]
    ) -> List[str]:
        """
        Determine which languages need translation.

        Args:
            detected_lang: Detected query language
            supported_langs: List of supported languages

        Returns:
            List of language codes to translate to
        """
        # If detected language is in supported list, translate to others
        if detected_lang in supported_langs:
            return [lang for lang in supported_langs if detected_lang != lang]

        # Otherwise, translate to all supported languages
        return supported_langs

    def _is_english_text(self, text: str) -> bool:
        """
        Check if text is primarily English (ASCII letters, numbers, common punctuation).
        
        Args:
            text: Text to check
            
        Returns:
            True if text appears to be English
        """
        if not text or not text.strip():
            return True

        # Remove whitespace and common punctuation
        text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text)
        if not text_clean:
            return True

        # Check if all remaining characters are ASCII (letters, numbers)
        # This is a simple heuristic: if most characters are ASCII, it's likely English
        ascii_count = sum(1 for c in text_clean if ord(c) < 128)
        ratio = ascii_count / len(text_clean) if text_clean else 0

        # If more than 80% are ASCII characters, consider it English
        return ratio > 0.8

    def _contains_chinese(self, text: str) -> bool:
        """
        Check if text contains Chinese characters (Han characters).
        
        Args:
            text: Text to check
            
        Returns:
            True if text contains Chinese characters
        """
        if not text:
            return False

        # Check for Chinese characters (Unicode range: \u4e00-\u9fff)
        chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
        return bool(chinese_pattern.search(text))

    def _is_pure_number(self, text: str) -> bool:
        """
        Check if text is purely numeric (digits, possibly with spaces, dots, commas).
        
        Args:
            text: Text to check
            
        Returns:
            True if text is purely numeric
        """
        if not text or not text.strip():
            return False

        # Remove whitespace, dots, commas (common number separators)
        text_clean = re.sub(r'[\s\.,]', '', text.strip())
        if not text_clean:
            return False

        # Check if all remaining characters are digits
        return text_clean.isdigit()