translator.py 37.2 KB
Edit Raw Blame History

"""
Translation service for multi-language query support.

Supports multiple translation models:
- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model
- DeepL: DeepL API for high-quality translations

使用方法 (Usage):

```python
from query.translator import Translator

# 使用默认的 qwen 模型（推荐）
translator = Translator()  # 默认使用 qwen 模型

# 或显式指定模型
translator = Translator(model='qwen')  # 使用 qwen 模型
translator = Translator(model='deepl')  # 使用 DeepL 模型

# 翻译文本
result = translator.translate(
    text="我看到这个视频后没有笑",
    target_lang="en",
    source_lang="auto"  # 自动检测源语言
)
```

配置说明 (Configuration):
- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量（在 .env 文件中）
- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量（在 .env 文件中）

Qwen 模型参考文档：
- 官方文档：https://help.aliyun.com/zh/model-studio/get-api-key
- 模型：qwen-mt-flash（快速翻译模型）

DeepL 官方文档：
https://developers.deepl.com/api-reference/translate/request-translation
"""

import os
import requests
import re
import redis
from concurrent.futures import ThreadPoolExecutor, Future
from datetime import timedelta
from typing import Dict, List, Optional, Union
import logging
import time

logger = logging.getLogger(__name__)

from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG
from openai import OpenAI


class Translator:
    """
    Multi-language translator supporting Qwen and DeepL APIs.
    
    Default model is 'qwen' which uses Alibaba Cloud DashScope API.
    """
# 华北2（北京）：https://dashscope.aliyuncs.com/compatible-mode/v1
# 新加坡：https://dashscope-intl.aliyuncs.com/compatible-mode/v1
# 美国（弗吉尼亚）：https://dashscope-us.aliyuncs.com/compatible-mode/v1

    DEEPL_API_URL = "https://api.deepl.com/v2/translate"  # Pro tier
    QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"  # 北京地域
    # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"  # 新加坡
    # 如果使用新加坡地域的模型，需要将base_url替换为：https://dashscope-intl.aliyuncs.com/compatible-mode/v1
    QWEN_MODEL = "qwen-mt-flash"  # 快速翻译模型

    # Language code mapping
    LANG_CODE_MAP = {
        'zh': 'ZH',
        'en': 'EN',
        'ru': 'RU',
        'ar': 'AR',
        'ja': 'JA',
        'es': 'ES',
        'de': 'DE',
        'fr': 'FR',
        'it': 'IT',
        'pt': 'PT',
    }

    def __init__(
        self,
        model: str = "qwen",
        api_key: Optional[str] = None,
        use_cache: bool = True,
        timeout: int = 10,
        glossary_id: Optional[str] = None,
        translation_context: Optional[str] = None
    ):
        """
        Initialize translator.

        Args:
            model: Translation model to use. Options: 'qwen' (default) or 'deepl'
            api_key: API key for the selected model (or None to use from config/env)
            use_cache: Whether to cache translations
            timeout: Request timeout in seconds
            glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL)
            translation_context: Context hint for translation (e.g., "e-commerce", "product search")
        """
        self.model = model.lower()
        if self.model not in ['qwen', 'deepl']:
            raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'")

        # Get API key from config if not provided
        if api_key is None:
            if self.model == 'qwen':
                api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
            else:  # deepl
                api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY")

        self.api_key = api_key
        self.timeout = timeout
        self.use_cache = use_cache
        self.glossary_id = glossary_id
        self.translation_context = translation_context or "e-commerce product search"

        # Initialize OpenAI client for Qwen if needed
        self.qwen_client = None
        if self.model == 'qwen':
            if not self.api_key:
                logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.")
            else:
                self.qwen_client = OpenAI(
                    api_key=self.api_key,
                    base_url=self.QWEN_BASE_URL,
                )

        # Initialize Redis cache if enabled
        if use_cache:
            try:
                self.redis_client = redis.Redis(
                    host=REDIS_CONFIG.get('host', 'localhost'),
                    port=REDIS_CONFIG.get('port', 6479),
                    password=REDIS_CONFIG.get('password'),
                    decode_responses=True,  # Return str instead of bytes
                    socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),
                    socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),
                    retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),
                    health_check_interval=10,  # 避免复用坏连接
                )
                # Test connection
                self.redis_client.ping()
                expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360)
                self.expire_time = timedelta(days=expire_days)
                self.expire_seconds = int(self.expire_time.total_seconds())  # Redis 需要秒数
                self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans')
                logger.info("Redis cache initialized for translations")
            except Exception as e:
                logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache")
                self.redis_client = None
                self.cache = None
        else:
            self.redis_client = None
            self.cache = None

        # Thread pool for async translation
        self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate text to target language (synchronous mode).

        Args:
            text: Text to translate
            target_lang: Target language code ('zh', 'en', 'ru', etc.)
            source_lang: Source language code (option al, auto-detect if None)
            context: Additional context for translation (overrides default context)
            prompt: Translation prompt/instruction (optional, for better translation quality)

        Returns:
            Translated text or None if translation fails
        """
        if not text or not text.strip():
            return text

        # Normalize language codes
        target_lang = target_lang.lower()
        if source_lang:
            source_lang = source_lang.lower()

        # Optimization: Skip translation if not needed
        if target_lang == 'en' and self._is_english_text(text):
            logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'")
            return text

        if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
            logger.info(
                f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
                f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)"
            )
            return text

        # Use provided context or default context
        translation_context = context or self.translation_context

        # Build cache key (include prompt in cache key if provided)
        cache_key_parts = [source_lang or 'auto', target_lang, translation_context]
        if prompt:
            cache_key_parts.append(prompt)
        cache_key_parts.append(text)
        cache_key = ':'.join(cache_key_parts)

        # Check cache (include context and prompt in cache key for accuracy)
        if self.use_cache and self.redis_client:
            cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt)
            if cached:
                logger.info(
                    f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
                    f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit"
                )
                return cached

        # If no API key, return mock translation (for testing)
        if not self.api_key:
            logger.info(
                f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
                f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)"
            )
            return text

        # Translate using selected model
        logger.info(
            f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | "
            f"Source language: {source_lang or 'auto'} | Context: {translation_context} | "
            f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation"
        )

        if self.model == 'qwen':
            result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt)
        else:  # deepl
            result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)

        # Surface translation failure to the caller instead of silently
        # masquerading the source text as a successful translation.
        if result is None:
            logger.warning(
                f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
                f"Source language: {source_lang or 'auto'} | Status: Translation failed"
            )
        else:
            logger.info(
                f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
                f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful"
            )

        # Cache only successful translations. Failed attempts must not poison
        # Redis with the original text.
        if result is not None and self.use_cache and self.redis_client:
            self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt)

        return result

    def _translate_qwen(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using Qwen MT Flash model via Alibaba Cloud DashScope API.
        
        Args:
            text: Text to translate
            target_lang: Target language code ('zh', 'en', 'ru', etc.)
            source_lang: Source language code (optional, 'auto' if None)
            context: Context hint for translation (optional)
            prompt: Translation prompt/instruction (optional)
            
        Returns:
            Translated text or None if translation fails
        """
        if not self.qwen_client:
            logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.")
            return None

        # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping
        # 标准来自：你提供的“语言 / 英文名 / 代码”表
        qwen_lang_map = {
            "en": "English",
            "zh": "Chinese",
            "zh_tw": "Traditional Chinese",
            "ru": "Russian",
            "ja": "Japanese",
            "ko": "Korean",
            "es": "Spanish",
            "fr": "French",
            "pt": "Portuguese",
            "de": "German",
            "it": "Italian",
            "th": "Thai",
            "vi": "Vietnamese",
            "id": "Indonesian",
            "ms": "Malay",
            "ar": "Arabic",
            "hi": "Hindi",
            "he": "Hebrew",
            "my": "Burmese",
            "ta": "Tamil",
            "ur": "Urdu",
            "bn": "Bengali",
            "pl": "Polish",
            "nl": "Dutch",
            "ro": "Romanian",
            "tr": "Turkish",
            "km": "Khmer",
            "lo": "Lao",
            "yue": "Cantonese",
            "cs": "Czech",
            "el": "Greek",
            "sv": "Swedish",
            "hu": "Hungarian",
            "da": "Danish",
            "fi": "Finnish",
            "uk": "Ukrainian",
            "bg": "Bulgarian",
        }

        # Convert target language
        target_lang_normalized = target_lang.lower()
        target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize())

        # Convert source language
        source_lang_normalized = (source_lang or "").strip().lower()
        if not source_lang_normalized or source_lang_normalized == "auto":
            source_lang_qwen = "auto"
        else:
            source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize())

        # Prepare translation options
        translation_options = {
            "source_lang": source_lang_qwen,
            "target_lang": target_lang_qwen,
        }

        # Prepare messages
        messages = [
            {
                "role": "user",
                "content": text
            }
        ]

        start_time = time.time()
        try:
            completion = self.qwen_client.chat.completions.create(
                model=self.QWEN_MODEL,
                messages=messages,
                extra_body={
                    "translation_options": translation_options
                }
            )

            translated_text = completion.choices[0].message.content.strip()
            duration_ms = (time.time() - start_time) * 1000

            logger.info(
                f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | "
                f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms"
            )
            return translated_text

        except Exception as e:
            duration_ms = (time.time() - start_time) * 1000
            logger.error(
                f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | "
                f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True
            )
            return None

    def _translate_deepl(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using DeepL API with context and glossary support.
        
        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code (optional)
            context: Context hint for translation (e.g., "e-commerce product search")
        """
        # Map to DeepL language codes
        target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())

        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        # Use prompt as context parameter for DeepL API (not as text prefix)
        # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"
        # If prompt is provided, use it as context; otherwise use the default context
        api_context = prompt if prompt else context

        # For e-commerce, add context words to help DeepL understand the domain
        # This is especially important for single-word ambiguous terms like "车" (car vs rook)
        text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)

        payload = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }

        if source_lang:
            source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
            payload["source_lang"] = source_code

        # Add context parameter (prompt or default context)
        # Context influences translation but is not translated itself
        if api_context:
            payload["context"] = api_context

        # Add glossary if configured
        if self.glossary_id:
            payload["glossary_id"] = self.glossary_id

        # Note: DeepL API v2 supports "context" parameter for additional context
        # that influences translation but is not translated itself.
        # We use prompt as context parameter when provided.

        try:
            response = requests.post(
                self.DEEPL_API_URL,
                headers=headers,
                json=payload,
                timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()
                if "translations" in data and len(data["translations"]) > 0:
                    translated_text = data["translations"][0]["text"]
                    # If we added context, extract just the term from the result
                    if needs_extraction:
                        translated_text = self._extract_term_from_translation(
                            translated_text, text, target_code
                        )
                    logger.debug(
                        f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "
                        f"Translation result: '{translated_text}'"
                    )
                    return translated_text
            else:
                logger.error(
                    f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "
                    f"Status code: {response.status_code} | Error message: {response.text}"
                )
                return None

        except requests.Timeout:
            logger.warning(
                f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "
                f"Timeout: {self.timeout}s"
            )
            return None
        except Exception as e:
            logger.error(
                f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "
                f"Error: {e}", exc_info=True
            )
            return None

    # NOTE: _translate_deepl_free is intentionally not implemented.
    # We do not support automatic fallback to the free endpoint, to avoid
    # mixing Pro keys with https://api-free.deepl.com and related 403 errors.

    def translate_multi(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        async_mode: bool = True,
        prompt: Optional[str] = None
    ) -> Dict[str, Optional[str]]:
        """
        Translate text to multiple target languages.
        
        In async_mode=True (default):
        - Returns cached translations immediately if available
        - For translations that can be optimized (e.g., pure numbers, already in target language),
          returns result immediately via synchronous call
        - Launches async tasks for other missing translations (non-blocking)
        - Returns None for missing translations that require async processing
        
        In async_mode=False:
        - Waits for all translations to complete (blocking)

        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)
            async_mode: If True, return cached results immediately and translate missing ones async
            prompt: Translation prompt/instruction (optional)

        Returns:
            Dictionary mapping language code to translated text (only cached results in async mode)
        """
        results = {}
        missing_langs = []
        async_langs = []

        # First, get cached translations
        for lang in target_langs:
            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
            if cached is not None:
                results[lang] = cached
            else:
                missing_langs.append(lang)

        # If async mode and there are missing translations
        if async_mode and missing_langs:
            # Check if translation can be optimized (immediate return)
            for lang in missing_langs:
                target_lang = lang.lower()
                # Check optimization conditions (same as in translate method)
                can_optimize = False
                if target_lang == 'en' and self._is_english_text(text):
                    can_optimize = True
                elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
                    can_optimize = True

                if can_optimize:
                    # Can be optimized, call translate synchronously for immediate result
                    results[lang] = self.translate(text, lang, source_lang, context, prompt)
                else:
                    # Requires actual translation, add to async list
                    async_langs.append(lang)

            # Launch async tasks for translations that require actual API calls
            if async_langs:
                for lang in async_langs:
                    self._translate_async(text, lang, source_lang, context, prompt)
                # Return None for async translations
                for lang in async_langs:
                    results[lang] = None
        else:
            # Synchronous mode: wait for all translations
            for lang in missing_langs:
                results[lang] = self.translate(text, lang, source_lang, context, prompt)

        return results

    def translate_multi_async(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Dict[str, Union[str, Future]]:
        """
        Translate text to multiple target languages asynchronously, returning Futures that can be awaited.
        
        This method returns a dictionary where:
        - If translation is cached, the value is the translation string (immediate)
        - If translation needs to be done, the value is a Future object that can be awaited
        
        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)
            prompt: Translation prompt/instruction (optional)

        Returns:
            Dictionary mapping language code to either translation string (cached) or Future object
        """
        results = {}
        missing_langs = []

        # First, get cached translations
        for lang in target_langs:
            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
            if cached is not None:
                results[lang] = cached
            else:
                missing_langs.append(lang)

        # For missing translations, submit async tasks and return Futures
        for lang in missing_langs:
            future = self.executor.submit(
                self.translate,
                text,
                lang,
                source_lang,
                context,
                prompt
            )
            results[lang] = future

        return results

    def _get_cached_translation(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """Get translation from cache if available."""
        if not self.redis_client:
            return None
        return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)

    def _get_cached_translation_redis(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Get translation from Redis cache with sliding expiration.
        
        滑动过期机制：每次访问缓存时，重置过期时间为配置的过期时间（默认720天）。
        这样缓存会在最后一次访问后的720天才过期，而不是写入后的720天。
        这确保了常用的翻译缓存不会被过早删除。
        """
        if not self.redis_client:
            return None

        try:
            # Build cache key: prefix:target_lang:text
            # For simplicity, we use target_lang and text as key
            # Context and prompt are not included in key to maximize cache hits
            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
            value = self.redis_client.get(cache_key)
            if value:
                # Sliding expiration: reset expiration time on access
                # 每次读取缓存时，重置过期时间为配置的过期时间（最后一次访问后的N天才过期）
                try:
                    self.redis_client.expire(cache_key, self.expire_seconds)
                except Exception as expire_error:
                    # 即使 expire 失败，也返回缓存值（不影响功能）
                    logger.warning(
                        f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}"
                    )

                logger.debug(
                    f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "
                    f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s"
                )
                return value
            logger.debug(
                f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "
                f"Cache key: {cache_key}"
            )
            return None
        except Exception as e:
            logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}")
            return None

    def _set_cached_translation_redis(
        self,
        text: str,
        target_lang: str,
        translation: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> None:
        """Store translation in Redis cache."""
        if not self.redis_client:
            return

        try:
            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
            self.redis_client.setex(cache_key, self.expire_seconds, translation)
            logger.info(
                f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | "
                f"Cache key: {cache_key} | Translation result: '{translation}'"
            )
        except Exception as e:
            logger.error(
                f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | "
                f"Error: {e}"
            )

    def _translate_async(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ):
        """Launch async translation task."""
        def _do_translate():
            try:
                result = self.translate(text, target_lang, source_lang, context, prompt)
                if result:
                    logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")
            except Exception as e:
                logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}")

        self.executor.submit(_do_translate)

    def _add_ecommerce_context(
        self,
        text: str,
        source_lang: Optional[str],
        context: Optional[str]
    ) -> tuple:
        """
        Add e-commerce context to text for better disambiguation.
        
        For single-word ambiguous Chinese terms, we add context words that help
        DeepL understand this is an e-commerce/product search context.
        
        Args:
            text: Original text to translate
            source_lang: Source language code
            context: Context hint
            
        Returns:
            Tuple of (text_with_context, needs_extraction)
            - text_with_context: Text to send to DeepL
            - needs_extraction: Whether we need to extract the term from the result
        """
        # Only apply for e-commerce context and Chinese source
        if not context or "e-commerce" not in context.lower():
            return text, False

        if not source_lang or source_lang.lower() != 'zh':
            return text, False

        # For single-word queries, add context to help disambiguation
        text_stripped = text.strip()
        if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:
            # Common ambiguous Chinese e-commerce terms like "车" (car vs rook)
            # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term])
            # This helps DeepL understand the e-commerce context
            # We'll need to extract just the term from the translation result
            context_phrase = f"购买 {text_stripped}"
            return context_phrase, True

        # For multi-word queries, DeepL usually has enough context
        return text, False

    def _extract_term_from_translation(
        self,
        translated_text: str,
        original_text: str,
        target_lang_code: str
    ) -> str:
        """
        Extract the actual term from a translation that included context.
        
        For example, if we translated "购买 车" (buy car) and got "buy car",
        we want to extract just "car".
        
        Args:
            translated_text: Full translation result
            original_text: Original single-word query
            target_lang_code: Target language code (EN, ZH, etc.)
            
        Returns:
            Extracted term or original translation if extraction fails
        """
        # For English target, try to extract the last word (the actual term)
        if target_lang_code == "EN":
            words = translated_text.strip().split()
            if len(words) > 1:
                # Usually the last word is the term we want
                # But we need to be smart - if it's "buy car", we want "car"
                # Common context words to skip: buy, purchase, product, item, etc.
                context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
                # Try to find the term (not a context word)
                for word in reversed(words):
                    word_lower = word.lower().rstrip('.,!?;:')
                    if word_lower not in context_words:
                        return word_lower
                # If all words are context words, return the last one
                return words[-1].lower().rstrip('.,!?;:')

        # For other languages or if extraction fails, return as-is
        # The user can configure a glossary for better results
        return translated_text

    def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool:
        """True if shop language matches index language (use source, no translate)."""
        if not shop_lang_lower or not lang_code:
            return False
        if shop_lang_lower == lang_code:
            return True
        if lang_code == "zh" and "zh" in shop_lang_lower:
            return True
        if lang_code == "en" and "en" in shop_lang_lower:
            return True
        return False

    def translate_for_indexing(
        self,
        text: str,
        shop_language: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
        index_languages: Optional[List[str]] = None,
    ) -> Dict[str, Optional[str]]:
        """
        Translate text for indexing based on shop language and tenant index_languages.

        For each language in index_languages: use source text if shop language matches,
        otherwise translate to that language.

        Args:
            text: Text to translate
            shop_language: Shop primary language (e.g. 'zh', 'en', 'ru')
            source_lang: Source language code (optional)
            context: Additional context for translation (optional)
            prompt: Translation prompt (optional)
            index_languages: Languages to index (from tenant_config). Default ["en", "zh"].

        Returns:
            Dict keyed by each index_language with translated or source text (or None).
        """
        langs = index_languages if index_languages else ["en", "zh"]
        results = {lang: None for lang in langs}
        if not text or not text.strip():
            return results
        if re.match(r'^[\d\s_-]+$', text):
            logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
            return results

        shop_lang_lower = (shop_language or "").strip().lower()
        targets = []
        for lang in langs:
            if self._shop_lang_matches(shop_lang_lower, lang):
                results[lang] = text
            else:
                targets.append(lang)

        for target_lang in targets:
            cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
            if cached:
                results[target_lang] = cached
                logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}")
                continue
            translated = self.translate(
                text,
                target_lang=target_lang,
                source_lang=source_lang or shop_language,
                context=context,
                prompt=prompt,
            )
            results[target_lang] = translated
        return results

    def get_translation_needs(
        self,
        detected_lang: str,
        supported_langs: List[str]
    ) -> List[str]:
        """
        Determine which languages need translation.

        Args:
            detected_lang: Detected query language
            supported_langs: List of supported languages

        Returns:
            List of language codes to translate to
        """
        # If detected language is in supported list, translate to others
        if detected_lang in supported_langs:
            return [lang for lang in supported_langs if detected_lang != lang]

        # Otherwise, translate to all supported languages
        return supported_langs

    def _is_english_text(self, text: str) -> bool:
        """
        Check if text is primarily English (ASCII letters, numbers, common punctuation).
        
        Args:
            text: Text to check
            
        Returns:
            True if text appears to be English
        """
        if not text or not text.strip():
            return True

        # Remove whitespace and common punctuation
        text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text)
        if not text_clean:
            return True

        # Check if all remaining characters are ASCII (letters, numbers)
        # This is a simple heuristic: if most characters are ASCII, it's likely English
        ascii_count = sum(1 for c in text_clean if ord(c) < 128)
        ratio = ascii_count / len(text_clean) if text_clean else 0

        # If more than 80% are ASCII characters, consider it English
        return ratio > 0.8

    def _contains_chinese(self, text: str) -> bool:
        """
        Check if text contains Chinese characters (Han characters).
        
        Args:
            text: Text to check
            
        Returns:
            True if text contains Chinese characters
        """
        if not text:
            return False

        # Check for Chinese characters (Unicode range: \u4e00-\u9fff)
        chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
        return bool(chinese_pattern.search(text))

    def _is_pure_number(self, text: str) -> bool:
        """
        Check if text is purely numeric (digits, possibly with spaces, dots, commas).
        
        Args:
            text: Text to check
            
        Returns:
            True if text is purely numeric
        """
        if not text or not text.strip():
            return False

        # Remove whitespace, dots, commas (common number separators)
        text_clean = re.sub(r'[\s\.,]', '', text.strip())
        if not text_clean:
            return False

        # Check if all remaining characters are digits
        return text_clean.isdigit()