# translator.py
"""
Translation service for multi-language query support.

Supports DeepL API for high-quality translations.
"""

import requests
from typing import Dict, List, Optional
from utils.cache import DictCache

# Try to import DEEPL_AUTH_KEY, but allow import to fail
try:
    from config.env_config import DEEPL_AUTH_KEY
except ImportError:
    DEEPL_AUTH_KEY = None


class Translator:
    """Multi-language translator using DeepL API."""

    DEEPL_API_URL = "https://api.deepl.com/v2/translate"  # Pro tier

    # Language code mapping
    LANG_CODE_MAP = {
        'zh': 'ZH',
        'en': 'EN',
        'ru': 'RU',
        'ar': 'AR',
        'ja': 'JA',
        'es': 'ES',
        'de': 'DE',
        'fr': 'FR',
        'it': 'IT',
        'pt': 'PT',
    }

    def __init__(
        self,
        api_key: Optional[str] = None,
        use_cache: bool = True,
        timeout: int = 10,
        glossary_id: Optional[str] = None,
        translation_context: Optional[str] = None
    ):
        """
        Initialize translator.

        Args:
            api_key: DeepL API key (or None to use from config/env)
            use_cache: Whether to cache translations
            timeout: Request timeout in seconds
            glossary_id: DeepL glossary ID for custom terminology (optional)
            translation_context: Context hint for translation (e.g., "e-commerce", "product search")
        """
        # Get API key from config if not provided
        if api_key is None and DEEPL_AUTH_KEY:
            api_key = DEEPL_AUTH_KEY

        self.api_key = api_key
        self.timeout = timeout
        self.use_cache = use_cache
        self.glossary_id = glossary_id
        self.translation_context = translation_context or "e-commerce product search"

        if use_cache:
            self.cache = DictCache(".cache/translations.json")
        else:
            self.cache = None

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate text to target language.

        Args:
            text: Text to translate
            target_lang: Target language code ('zh', 'en', 'ru', etc.)
            source_lang: Source language code (optional, auto-detect if None)
            context: Additional context for translation (overrides default context)

        Returns:
            Translated text or None if translation fails
        """
        if not text or not text.strip():
            return text

        # Normalize language codes
        target_lang = target_lang.lower()
        if source_lang:
            source_lang = source_lang.lower()

        # Use provided context or default context
        translation_context = context or self.translation_context

        # Check cache (include context in cache key for accuracy)
        if self.use_cache:
            cache_key = f"{source_lang or 'auto'}:{target_lang}:{translation_context}:{text}"
            cached = self.cache.get(cache_key, category="translations")
            if cached:
                return cached

        # If no API key, return mock translation (for testing)
        if not self.api_key:
            print(f"[Translator] No API key, returning original text (mock mode)")
            return text

        # Translate using DeepL with fallback
        result = self._translate_deepl(text, target_lang, source_lang, translation_context)

        # If translation failed, try fallback to free API
        if result is None and "api.deepl.com" in self.DEEPL_API_URL:
            print(f"[Translator] Pro API failed, trying free API...")
            result = self._translate_deepl_free(text, target_lang, source_lang, translation_context)

        # If still failed, return original text with warning
        if result is None:
            print(f"[Translator] Translation failed, returning original text")
            result = text

        # Cache result
        if result and self.use_cache:
            cache_key = f"{source_lang or 'auto'}:{target_lang}:{translation_context}:{text}"
            self.cache.set(cache_key, result, category="translations")

        return result

    def _translate_deepl(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using DeepL API with context and glossary support.

        Posts a single-text request to ``self.DEEPL_API_URL``. For short
        Chinese e-commerce queries translated to English, a context prefix is
        added before sending and stripped from the result afterwards.

        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code (optional)
            context: Context hint for translation (e.g., "e-commerce product search")

        Returns:
            The translated text, or None when the API answers with a non-200
            status, returns no translations, times out, or any other
            exception is raised while handling the request.
        """
        # Map to DeepL language codes
        target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())

        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        # Build text with context for better disambiguation of single-word
        # ambiguous terms like "车" (car vs rook).
        # The prefix is only added when the target is "EN": the extraction
        # step can only strip the prefix from English output, so for any other
        # target the translated prefix would stay in the returned text.
        if target_code == "EN":
            text_to_translate, needs_extraction = self._add_ecommerce_context(
                text, source_lang, context
            )
        else:
            text_to_translate, needs_extraction = text, False

        payload = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }

        if source_lang:
            source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
            payload["source_lang"] = source_code

        # Add glossary if configured. DeepL documents that glossary_id
        # requires source_lang to be set, so a glossary without a source
        # language is dropped instead of sending a request that is rejected.
        if self.glossary_id:
            if source_lang:
                payload["glossary_id"] = self.glossary_id
            else:
                print("[Translator] glossary_id ignored: source_lang is required to use a glossary")

        # Domain adaptation currently relies on:
        # 1. A glossary for domain-specific terms (best solution)
        # 2. A context prefix for single-word queries (_add_ecommerce_context)
        # NOTE(review): current DeepL documentation describes a `context`
        # request parameter - confirm whether it should replace the prefix
        # workaround above.

        try:
            response = requests.post(
                self.DEEPL_API_URL,
                headers=headers,
                json=payload,
                timeout=self.timeout
            )

            if response.status_code != 200:
                print(f"[Translator] DeepL API error: {response.status_code} - {response.text}")
                return None

            data = response.json()
            translations = data.get("translations") if isinstance(data, dict) else None
            if not translations:
                # A 200 with an empty body is still a failed translation.
                print("[Translator] DeepL API returned no translations")
                return None

            translated_text = translations[0]["text"]
            # If we added context, extract just the term from the result
            if needs_extraction:
                translated_text = self._extract_term_from_translation(
                    translated_text, text, target_code
                )
            return translated_text

        except requests.Timeout:
            print(f"[Translator] Translation request timed out")
            return None
        except Exception as e:
            # Deliberate best-effort boundary: any failure (network, JSON
            # decoding, unexpected payload shape) is reported and turned into
            # None so the caller can fall back.
            print(f"[Translator] Translation failed: {e}")
            return None

    def _translate_deepl_free(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None
    ) -> Optional[str]:
        """
        Send a plain translation request to the DeepL Free endpoint.

        The request carries only the text, the target language and, when
        given, the source language. No glossary is attached and ``context``
        is accepted but not used by this method.

        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code (optional)
            context: Context hint (unused here)

        Returns:
            The first translation in the response, or None on a non-200
            status, a response without translations, a timeout, or any other
            exception.
        """
        lang_codes = self.LANG_CODE_MAP

        # Request body: language codes are mapped to DeepL's form, falling
        # back to the upper-cased input for unmapped codes.
        body = {
            "text": [text],
            "target_lang": lang_codes.get(target_lang, target_lang.upper()),
        }
        if source_lang:
            body["source_lang"] = lang_codes.get(source_lang, source_lang.upper())

        auth_headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        try:
            reply = requests.post(
                "https://api-free.deepl.com/v2/translate",
                headers=auth_headers,
                json=body,
                timeout=self.timeout
            )

            if reply.status_code != 200:
                print(f"[Translator] DeepL Free API error: {reply.status_code} - {reply.text}")
                return None

            parsed = reply.json()
            if "translations" not in parsed or len(parsed["translations"]) == 0:
                return None
            return parsed["translations"][0]["text"]

        except requests.Timeout:
            print(f"[Translator] Free API request timed out")
            return None
        except Exception as e:
            print(f"[Translator] Free API translation failed: {e}")
            return None

    def translate_multi(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None
    ) -> Dict[str, Optional[str]]:
        """
        Translate text to multiple target languages.

        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)

        Returns:
            Dictionary mapping language code to translated text
        """
        results = {}
        for lang in target_langs:
            results[lang] = self.translate(text, lang, source_lang, context)
        return results

    def _add_ecommerce_context(
        self,
        text: str,
        source_lang: Optional[str],
        context: Optional[str]
    ) -> tuple:
        """
        Add e-commerce context to text for better disambiguation.
        
        For single-word ambiguous Chinese terms, we add context words that help
        DeepL understand this is an e-commerce/product search context.
        
        Args:
            text: Original text to translate
            source_lang: Source language code
            context: Context hint
            
        Returns:
            Tuple of (text_with_context, needs_extraction)
            - text_with_context: Text to send to DeepL
            - needs_extraction: Whether we need to extract the term from the result
        """
        # Only apply for e-commerce context and Chinese source
        if not context or "e-commerce" not in context.lower():
            return text, False
            
        if not source_lang or source_lang.lower() != 'zh':
            return text, False
            
        # For single-word queries, add context to help disambiguation
        text_stripped = text.strip()
        if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:
            # Common ambiguous Chinese e-commerce terms like "车" (car vs rook)
            # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term])
            # This helps DeepL understand the e-commerce context
            # We'll need to extract just the term from the translation result
            context_phrase = f"购买 {text_stripped}"
            return context_phrase, True
        
        # For multi-word queries, DeepL usually has enough context
        return text, False

    def _extract_term_from_translation(
        self,
        translated_text: str,
        original_text: str,
        target_lang_code: str
    ) -> str:
        """
        Extract the actual term from a translation that included context.
        
        For example, if we translated "购买 车" (buy car) and got "buy car",
        we want to extract just "car".
        
        Args:
            translated_text: Full translation result
            original_text: Original single-word query
            target_lang_code: Target language code (EN, ZH, etc.)
            
        Returns:
            Extracted term or original translation if extraction fails
        """
        # For English target, try to extract the last word (the actual term)
        if target_lang_code == "EN":
            words = translated_text.strip().split()
            if len(words) > 1:
                # Usually the last word is the term we want
                # But we need to be smart - if it's "buy car", we want "car"
                # Common context words to skip: buy, purchase, product, item, etc.
                context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
                # Try to find the term (not a context word)
                for word in reversed(words):
                    word_lower = word.lower().rstrip('.,!?;:')
                    if word_lower not in context_words:
                        return word_lower
                # If all words are context words, return the last one
                return words[-1].lower().rstrip('.,!?;:')
        
        # For other languages or if extraction fails, return as-is
        # The user can configure a glossary for better results
        return translated_text

    def get_translation_needs(
        self,
        detected_lang: str,
        supported_langs: List[str]
    ) -> List[str]:
        """
        Determine which languages need translation.

        Args:
            detected_lang: Detected query language
            supported_langs: List of supported languages

        Returns:
            List of language codes to translate to
        """
        # If detected language is in supported list, translate to others
        if detected_lang in supported_langs:
            return [lang for lang in supported_langs if lang != detected_lang]

        # Otherwise, translate to all supported languages
        return supported_langs