translator.py 17.8 KB
Edit Raw Blame History

"""
Translation service for multi-language query support.

Supports DeepL API for high-quality translations.


Official documentation:
https://developers.deepl.com/api-reference/translate/request-translation


"""

import requests
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional
from utils.cache import DictCache
import logging

logger = logging.getLogger(__name__)

# Try to import DEEPL_AUTH_KEY, but allow import to fail
try:
    from config.env_config import DEEPL_AUTH_KEY
except ImportError:
    DEEPL_AUTH_KEY = None


class Translator:
    """
    Multi-language translator using the DeepL API.

    Translations are requested from the DeepL Pro endpoint first and, when that
    fails, from the DeepL Free endpoint. Successful translations are optionally
    stored in a ``DictCache`` so that repeated queries avoid network calls.

    The instance owns a small thread pool used by ``translate_multi`` in async
    mode. Call ``close()`` (or use the instance as a context manager) to release
    the pool when the translator is no longer needed.
    """

    DEEPL_API_URL = "https://api.deepl.com/v2/translate"  # Pro tier
    DEEPL_FREE_API_URL = "https://api-free.deepl.com/v2/translate"  # Free tier (fallback)

    # Language code mapping (internal lower-case code -> DeepL code)
    LANG_CODE_MAP = {
        'zh': 'ZH',
        'en': 'EN',
        'ru': 'RU',
        'ar': 'AR',
        'ja': 'JA',
        'es': 'ES',
        'de': 'DE',
        'fr': 'FR',
        'it': 'IT',
        'pt': 'PT',
    }

    # Category under which every translation entry is stored in the cache
    _CACHE_CATEGORY = "translations"

    # English words that may come from the context phrase rather than from the
    # term itself; they are skipped when extracting the term from a result
    _CONTEXT_WORDS = frozenset({"buy", "purchase", "product", "item", "commodity", "goods"})

    # Trailing punctuation removed from each word before comparing it
    _TRAILING_PUNCTUATION = '.,!?;:'

    def __init__(
        self,
        api_key: Optional[str] = None,
        use_cache: bool = True,
        timeout: int = 10,
        glossary_id: Optional[str] = None,
        translation_context: Optional[str] = None
    ):
        """
        Initialize translator.

        Args:
            api_key: DeepL API key (or None to use from config/env)
            use_cache: Whether to cache translations
            timeout: Request timeout in seconds
            glossary_id: DeepL glossary ID for custom terminology (optional)
            translation_context: Context hint for translation (e.g., "e-commerce", "product search")
        """
        # Get API key from config if not provided
        if api_key is None and DEEPL_AUTH_KEY:
            api_key = DEEPL_AUTH_KEY

        self.api_key = api_key
        self.timeout = timeout
        self.use_cache = use_cache
        self.glossary_id = glossary_id
        self.translation_context = translation_context or "e-commerce product search"
        self.cache = DictCache(".cache/translations.json") if use_cache else None

        # Thread pool for async translation
        self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")

    def close(self, wait: bool = True) -> None:
        """
        Shut down the background thread pool.

        Args:
            wait: If True, block until already-submitted translations finish
        """
        self.executor.shutdown(wait=wait)

    def __enter__(self) -> "Translator":
        """Return the translator itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Release the thread pool when leaving a ``with`` block."""
        self.close()

    def _cache_enabled(self) -> bool:
        """Return True when caching is switched on and a cache object exists."""
        return bool(self.use_cache) and self.cache is not None

    def _build_cache_key(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> str:
        """
        Build the cache key for one translation request.

        Language codes are lower-cased here so that every caller produces the
        same key regardless of the casing it was given. The ':'-joined layout is
        kept unchanged so that entries already stored in the cache file stay
        reachable.

        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code ('auto' is used when missing)
            context: Context hint (falls back to the instance default)
            prompt: Translation prompt; part of the key only when provided

        Returns:
            Cache key string
        """
        key_parts = [
            source_lang.lower() if source_lang else 'auto',
            target_lang.lower(),
            context or self.translation_context,
        ]
        if prompt:
            key_parts.append(prompt)
        key_parts.append(text)
        return ':'.join(key_parts)

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate text to target language (synchronous mode).

        Args:
            text: Text to translate
            target_lang: Target language code ('zh', 'en', 'ru', etc.)
            source_lang: Source language code (optional, auto-detect if None)
            context: Additional context for translation (overrides default context)
            prompt: Translation prompt/instruction (optional, for better translation quality)

        Returns:
            Translated text. The input text is returned unchanged when it is
            empty/blank, when no API key is configured (mock mode), or when
            every translation attempt fails.
        """
        if not text or not text.strip():
            return text

        # Normalize language codes
        target_lang = target_lang.lower()
        if source_lang:
            source_lang = source_lang.lower()

        # Use provided context or default context
        translation_context = context or self.translation_context

        # Context and prompt are part of the key: they can change the result
        cache_key = self._build_cache_key(text, target_lang, source_lang, context, prompt)

        if self._cache_enabled():
            cached = self.cache.get(cache_key, category=self._CACHE_CATEGORY)
            if cached:
                return cached

        # If no API key, return mock translation (for testing)
        if not self.api_key:
            logger.debug("[Translator] No API key, returning original text (mock mode)")
            return text

        # Translate using DeepL with fallback
        result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)

        # If translation failed, try fallback to free API
        if result is None and "api.deepl.com" in self.DEEPL_API_URL:
            logger.debug("[Translator] Pro API failed, trying free API...")
            result = self._translate_deepl_free(text, target_lang, source_lang, translation_context, prompt)

        if result is None:
            # Return the original text, but do NOT cache it: storing the
            # fallback would turn a transient failure into a permanent
            # "translation" for this key.
            logger.warning(
                "[Translator] Translation failed for '%s...', returning original text",
                text[:50],
            )
            return text

        # Cache only real translations
        if result and self._cache_enabled():
            self.cache.set(cache_key, result, category=self._CACHE_CATEGORY)

        return result

    def _build_payload(
        self,
        text: str,
        target_code: str,
        source_lang: Optional[str],
        api_context: Optional[str]
    ) -> dict:
        """
        Build the JSON body shared by the Pro and Free translate requests.

        Args:
            text: Text to send for translation
            target_code: DeepL target language code
            source_lang: Source language code (omitted from the body if empty)
            api_context: Value for the DeepL "context" field (omitted if empty)

        Returns:
            Request body as a dict
        """
        payload = {
            "text": [text],
            "target_lang": target_code,
        }

        if source_lang:
            payload["source_lang"] = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())

        # Context influences translation but is not translated itself
        if api_context:
            payload["context"] = api_context

        return payload

    def _post_translation(
        self,
        url: str,
        payload: dict,
        api_label: str,
        timeout_message: str,
        failure_label: str
    ) -> Optional[str]:
        """
        POST a translate request and return the first translated text.

        Args:
            url: Endpoint to call
            payload: JSON request body
            api_label: API name used in the HTTP-error log message
            timeout_message: Log message used when the request times out
            failure_label: Prefix of the log message for any other failure

        Returns:
            First translation in the response, or None on any failure
            (non-200 status, missing/empty "translations", timeout, other error)
        """
        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        try:
            response = requests.post(
                url,
                headers=headers,
                json=payload,
                timeout=self.timeout
            )

            if response.status_code != 200:
                logger.error(
                    "[Translator] %s error: %s - %s",
                    api_label, response.status_code, response.text,
                )
                return None

            data = response.json()
            translations = data.get("translations") if isinstance(data, dict) else None
            if translations:
                return translations[0]["text"]
            return None

        except requests.Timeout:
            logger.warning("[Translator] %s", timeout_message)
            return None
        except Exception as e:
            # Best-effort boundary: a translation failure must not break the caller
            logger.error("[Translator] %s: %s", failure_label, e, exc_info=True)
            return None

    def _translate_deepl(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using DeepL API with context and glossary support.

        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code (optional)
            context: Context hint for translation (e.g., "e-commerce product search")
            prompt: Translation prompt; when provided it is sent as the DeepL
                "context" field instead of ``context``

        Returns:
            Translated text, or None if the request fails
        """
        # Map to DeepL language codes
        target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())

        # Use prompt as context parameter for DeepL API (not as text prefix).
        # According to DeepL API: context is "Additional context that can
        # influence a translation but is not translated itself".
        api_context = prompt if prompt else context

        # For e-commerce, add context words to help DeepL understand the domain.
        # This is especially important for single-word ambiguous terms like
        # "车" (car vs rook). The added phrase can only be removed again from
        # English output, so it is only added for English targets; otherwise
        # the translated phrase would be returned as the translation of the term.
        if target_code.split("-")[0] == "EN":
            text_to_translate, needs_extraction = self._add_ecommerce_context(
                text, source_lang, api_context
            )
        else:
            text_to_translate, needs_extraction = text, False

        payload = self._build_payload(text_to_translate, target_code, source_lang, api_context)

        # Add glossary if configured
        if self.glossary_id:
            payload["glossary_id"] = self.glossary_id

        translated_text = self._post_translation(
            self.DEEPL_API_URL,
            payload,
            api_label="DeepL API",
            timeout_message="Translation request timed out",
            failure_label="Translation failed",
        )

        # If we added context, extract just the term from the result
        if translated_text is not None and needs_extraction:
            translated_text = self._extract_term_from_translation(
                translated_text, text, target_code
            )
        return translated_text

    def _translate_deepl_free(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using DeepL Free API.

        Note: Free API may not support glossary_id parameter, so it is not sent.

        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code (optional)
            context: Context hint for translation
            prompt: Translation prompt; when provided it is sent as the DeepL
                "context" field instead of ``context``

        Returns:
            Translated text, or None if the request fails
        """
        # Map to DeepL language codes
        target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())

        # Use prompt as context parameter for DeepL API
        api_context = prompt if prompt else context

        payload = self._build_payload(text, target_code, source_lang, api_context)

        return self._post_translation(
            self.DEEPL_FREE_API_URL,
            payload,
            api_label="DeepL Free API",
            timeout_message="Free API request timed out",
            failure_label="Free API translation failed",
        )

    def translate_multi(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        async_mode: bool = True,
        prompt: Optional[str] = None
    ) -> Dict[str, Optional[str]]:
        """
        Translate text to multiple target languages.

        In async_mode=True (default):
        - Returns cached translations immediately if available
        - Launches async tasks for missing translations (non-blocking)
        - Returns None for missing translations (will be available in cache next time)

        In async_mode=False:
        - Waits for all translations to complete (blocking)

        When caching is disabled, async mode has nowhere to deliver its results,
        so the call falls back to the blocking behaviour.

        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)
            async_mode: If True, return cached results immediately and translate missing ones async
            prompt: Translation prompt/instruction (optional)

        Returns:
            Dictionary mapping language code (as given by the caller) to
            translated text (only cached results in async mode)
        """
        results: Dict[str, Optional[str]] = {}
        missing_langs: List[str] = []

        # First, get cached translations. dict.fromkeys drops duplicate codes
        # (keeping order) so the same language is never requested twice.
        for lang in dict.fromkeys(target_langs):
            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
            if cached is not None:
                results[lang] = cached
            else:
                missing_langs.append(lang)

        if async_mode and self._cache_enabled():
            # Launch background tasks; their results land in the cache and are
            # picked up by a later call
            for lang in missing_langs:
                self._translate_async(text, lang, source_lang, context, prompt)
                results[lang] = None
        else:
            # Synchronous mode: wait for all translations
            for lang in missing_langs:
                results[lang] = self.translate(text, lang, source_lang, context, prompt)

        return results

    def _get_cached_translation(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Get translation from cache if available.

        Uses the same key as ``translate`` so that entries written there are
        found here regardless of the casing of the language codes.

        Returns:
            Cached value, or None when caching is disabled
        """
        if not self._cache_enabled():
            return None

        cache_key = self._build_cache_key(text, target_lang, source_lang, context, prompt)
        return self.cache.get(cache_key, category=self._CACHE_CATEGORY)

    def _translate_async(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ):
        """
        Launch async translation task.

        The task calls ``translate`` on the thread pool; failures are logged
        and never propagated to the caller.
        """
        def _do_translate():
            try:
                result = self.translate(text, target_lang, source_lang, context, prompt)
                if result:
                    logger.debug(
                        "Async translation completed: %s -> %s: %s",
                        text, target_lang, result,
                    )
            except Exception as e:
                logger.warning("Async translation failed: %s -> %s: %s", text, target_lang, e)

        try:
            self.executor.submit(_do_translate)
        except RuntimeError as e:
            # submit() raises RuntimeError once the executor has been shut down
            logger.warning(
                "Async translation not scheduled: %s -> %s: %s",
                text, target_lang, e,
            )

    def _add_ecommerce_context(
        self,
        text: str,
        source_lang: Optional[str],
        context: Optional[str]
    ) -> tuple:
        """
        Add e-commerce context to text for better disambiguation.

        For single-word ambiguous Chinese terms, we add context words that help
        DeepL understand this is an e-commerce/product search context.

        Args:
            text: Original text to translate
            source_lang: Source language code
            context: Context hint

        Returns:
            Tuple of (text_with_context, needs_extraction)
            - text_with_context: Text to send to DeepL
            - needs_extraction: Whether we need to extract the term from the result
        """
        # Only apply for e-commerce context and Chinese source
        if not context or "e-commerce" not in context.lower():
            return text, False

        if not source_lang or source_lang.lower() != 'zh':
            return text, False

        # For single-token queries of at most two characters, add context to
        # help disambiguation
        text_stripped = text.strip()
        if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:
            # Common ambiguous Chinese e-commerce terms like "车" (car vs rook).
            # The phrase "购买 [term]" (buy [term]) is sent instead; the term is
            # extracted from the translation result afterwards.
            return f"购买 {text_stripped}", True

        # For longer queries, DeepL usually has enough context
        return text, False

    def _extract_term_from_translation(
        self,
        translated_text: str,
        original_text: str,
        target_lang_code: str
    ) -> str:
        """
        Extract the actual term from a translation that included context.

        For example, if we translated "购买 车" (buy car) and got "buy car",
        we want to extract just "car".

        Args:
            translated_text: Full translation result
            original_text: Original single-word query
            target_lang_code: Target language code (EN, EN-US, ZH, etc.)

        Returns:
            Extracted term (lower-cased, trailing punctuation removed), or the
            original translation if extraction is not possible
        """
        # Extraction is only implemented for English targets (including
        # regional variants such as EN-US / EN-GB)
        if target_lang_code.upper().split("-")[0] == "EN":
            words = translated_text.strip().split()
            if len(words) > 1:
                # Normalize each word and drop tokens that were pure
                # punctuation, so an empty string is never returned as the term
                cleaned = [
                    word.lower().rstrip(self._TRAILING_PUNCTUATION) for word in words
                ]
                cleaned = [word for word in cleaned if word]

                # Usually the last word is the term we want: scan from the end
                # and skip the words that came from the context phrase
                for word in reversed(cleaned):
                    if word not in self._CONTEXT_WORDS:
                        return word

                # If all words are context words, return the last one
                if cleaned:
                    return cleaned[-1]

        # For other languages or if extraction fails, return as-is
        # The user can configure a glossary for better results
        return translated_text

    def get_translation_needs(
        self,
        detected_lang: str,
        supported_langs: List[str]
    ) -> List[str]:
        """
        Determine which languages need translation.

        Args:
            detected_lang: Detected query language
            supported_langs: List of supported languages

        Returns:
            New list of language codes to translate to: every supported
            language except the detected one (all of them when the detected
            language is not supported). The input list is never returned
            itself, so callers may modify the result freely.
        """
        return [lang for lang in supported_langs if lang != detected_lang]