# deepl_provider.py 6.51 KB
"""
DeepL backend provider.

This module only handles network calls to DeepL.
It does not handle cache, async fanout, or fallback semantics.
"""

from __future__ import annotations

import logging
import os
import re
from typing import Dict, Optional, Tuple

import requests
from config.services_config import get_translation_config


logger = logging.getLogger(__name__)

# Built-in context hints: scene name -> {lower-case language code -> hint text}.
DEFAULT_CONTEXTS: Dict[str, Dict[str, str]] = {
    "sku_name": {"zh": "商品SKU名称", "en": "product SKU name"},
    "ecommerce_search_query": {"zh": "电商", "en": "e-commerce"},
    # Empty strings: this scene contributes no hint text.
    "general": {"zh": "", "en": ""},
}
# Names of the built-in scenes only, fixed at import time.
SCENE_NAMES = frozenset(DEFAULT_CONTEXTS)


def _merge_contexts(raw: object) -> Dict[str, Dict[str, str]]:
    """Overlay configured context hints on a copy of ``DEFAULT_CONTEXTS``.

    Args:
        raw: Expected to be ``{scene: {lang: hint}}``. Anything that is not a
            dict is ignored, as is any scene whose value is not a dict.

    Returns:
        A new mapping; ``DEFAULT_CONTEXTS`` itself is never mutated. Scene
        names are stripped, language codes are stripped and lower-cased, and
        entries whose language code or hint is empty after stripping are
        dropped (so an empty value cannot clear a built-in hint).
    """
    result = {name: {**hints} for name, hints in DEFAULT_CONTEXTS.items()}
    overrides = raw.items() if isinstance(raw, dict) else ()

    for raw_scene, raw_hints in overrides:
        if not isinstance(raw_hints, dict):
            continue
        name = str(raw_scene or "").strip()
        if not name:
            continue
        # The scene is registered even when none of its entries turn out usable.
        bucket = result.setdefault(name, {})
        for raw_lang, raw_hint in raw_hints.items():
            code = str(raw_lang or "").strip().lower()
            hint = str(raw_hint or "").strip()
            if code and hint:
                bucket[code] = hint

    return result


class DeepLProvider:
    """Blocking client for DeepL's ``/v2/translate`` endpoint.

    ``translate`` reports a missing key, a non-200 status, a timeout or a
    malformed response body as ``None`` (after logging) instead of raising.
    """

    API_URL = "https://api.deepl.com/v2/translate"  # Pro tier
    LANG_CODE_MAP = {
        "zh": "ZH",
        "en": "EN",
        "ru": "RU",
        "ar": "AR",
        "ja": "JA",
        "es": "ES",
        "de": "DE",
        "fr": "FR",
        "it": "IT",
        "pt": "PT",
    }

    # Words that the "购买" wrapper (or DeepL's rendering of it) may add to an
    # English translation and that must not be returned as the term itself.
    _WRAPPER_WORDS = frozenset(
        {"buy", "purchase", "product", "item", "commodity", "goods"}
    )
    _TRAILING_PUNCT_RE = re.compile(r"[.,!?;:]+$")

    def __init__(
        self,
        api_key: Optional[str],
        *,
        timeout: float = 10.0,
        glossary_id: Optional[str] = None,
    ) -> None:
        """Load provider settings from the translation config.

        Args:
            api_key: DeepL auth key; when falsy, the ``DEEPL_AUTH_KEY``
                environment variable is used instead.
            timeout: Request timeout in seconds. Only used when the ``deepl``
                provider config has no truthy ``timeout_sec``.
            glossary_id: Glossary to apply; when falsy, the config's
                ``glossary_id`` is used.
        """
        cfg = get_translation_config()
        providers = cfg.providers if isinstance(cfg.providers, dict) else {}
        # ``or {}`` also covers an explicit ``deepl: null`` entry, which would
        # otherwise make every ``provider_cfg.get`` below raise.
        provider_cfg = providers.get("deepl") or {}
        self.api_key = api_key or os.getenv("DEEPL_AUTH_KEY")
        self.timeout = float(provider_cfg.get("timeout_sec") or timeout or 10.0)
        self.glossary_id = glossary_id or provider_cfg.get("glossary_id")
        self.model = "deepl"
        self.context_presets = _merge_contexts(provider_cfg.get("contexts"))
        if not self.api_key:
            logger.warning("DEEPL_AUTH_KEY not set; DeepL translation is unavailable")

    @classmethod
    def _to_deepl_code(cls, lang: Optional[str]) -> str:
        """Map a language tag to DeepL's code; ``""`` when *lang* is blank."""
        key = (lang or "").strip().lower()
        return cls.LANG_CODE_MAP.get(key, key.upper())

    @staticmethod
    def _is_english_code(code: Optional[str]) -> bool:
        """Return True for ``EN`` and its regional variants such as ``EN-US``."""
        return (code or "").strip().upper().split("-")[0] == "EN"

    def _preset_hint(self, scene: Optional[str], target_lang: str) -> Optional[str]:
        """Look up the hint for *scene*, falling back to the ``"default"`` scene.

        Within the chosen scene the target language is tried first, then
        ``"en"``.
        """
        presets = self.context_presets
        scene_map = presets.get(scene) or presets.get("default") or {}
        tgt = (target_lang or "").strip().lower()
        return scene_map.get(tgt) or scene_map.get("en")

    def _resolve_request_context(
        self,
        target_lang: str,
        context: Optional[str],
        prompt: Optional[str],
    ) -> Optional[str]:
        """Choose the string sent as DeepL's ``context`` parameter.

        Precedence: an explicit *prompt*; then the preset for *context* when it
        names a built-in scene; then *context* itself as free text; finally the
        ``"default"`` preset.

        NOTE(review): the fallback scene is looked up as ``"default"`` while the
        built-in presets name it ``"general"``, so the fallback yields nothing
        unless the config defines a ``"default"`` scene. Scenes added only via
        config are also not in ``SCENE_NAMES`` and are therefore treated as
        free text here. Confirm both are intended.
        """
        if prompt:
            return prompt
        if context in SCENE_NAMES:
            return self._preset_hint(context, target_lang)
        if context:
            return context
        return self._preset_hint(None, target_lang)

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> Optional[str]:
        """Translate *text* with DeepL.

        Args:
            text: Text to translate.
            target_lang: Target language tag (e.g. ``"en"``); mapped through
                ``LANG_CODE_MAP`` or upper-cased.
            source_lang: Optional source language tag, mapped the same way.
            context: Built-in scene name or free-text context hint.
            prompt: Explicit context string; overrides *context*.

        Returns:
            The translated string, or ``None`` when no API key is configured,
            the input or target language is empty, or the request fails.
        """
        if not self.api_key:
            return None
        if not text:
            # An empty input can only end in ``None``; skip the round trip.
            return None

        target_code = self._to_deepl_code(target_lang)
        if not target_code:
            logger.warning("[deepl] Missing target language; request skipped")
            return None
        source_code = self._to_deepl_code(source_lang)

        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        api_context = self._resolve_request_context(target_lang, context, prompt)
        # The wrapper is only added when the extraction step below can remove
        # it again, i.e. for English targets.
        text_to_translate, needs_extraction = self._add_ecommerce_context(
            text, source_lang, api_context, target_lang_code=target_code
        )

        payload: Dict[str, object] = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }
        if source_code:
            payload["source_lang"] = source_code
        if api_context:
            payload["context"] = api_context
        if self.glossary_id:
            if source_code:
                payload["glossary_id"] = self.glossary_id
            else:
                # DeepL documents ``glossary_id`` as requiring ``source_lang``;
                # sending it alone would fail the whole request.
                logger.debug(
                    "[deepl] glossary_id ignored because source_lang is not set | tgt=%s",
                    target_code,
                )

        try:
            response = requests.post(self.API_URL, headers=headers, json=payload, timeout=self.timeout)
            if response.status_code != 200:
                logger.warning(
                    "[deepl] Failed | status=%s tgt=%s body=%s",
                    response.status_code,
                    target_code,
                    (response.text or "")[:200],
                )
                return None

            data = response.json()
            translations = data.get("translations") if isinstance(data, dict) else None
            if not translations:
                return None
            first = translations[0]
            translated = first.get("text") if isinstance(first, dict) else None
            if not translated:
                return None
            if needs_extraction:
                translated = self._extract_term_from_translation(translated, text, target_code)
            return translated
        except requests.Timeout:
            logger.warning("[deepl] Timeout | tgt=%s timeout=%.1fs", target_code, self.timeout)
            return None
        except Exception as exc:
            # Best-effort boundary: any other failure is logged, not raised.
            logger.warning("[deepl] Exception | tgt=%s error=%s", target_code, exc, exc_info=True)
            return None

    def _add_ecommerce_context(
        self,
        text: str,
        source_lang: Optional[str],
        context: Optional[str],
        *,
        target_lang_code: Optional[str] = None,
    ) -> Tuple[str, bool]:
        """Wrap a very short Chinese term as ``"购买 <term>"``.

        The wrapper is applied only when *context* mentions "e-commerce", the
        source language is ``zh`` and the stripped text is a single token of
        at most two characters.

        Args:
            text: Text about to be translated.
            source_lang: Source language tag.
            context: Resolved context string for the request.
            target_lang_code: DeepL target code. When given and not English,
                the wrapper is skipped, because the extraction step can only
                strip it from English output. ``None`` keeps the previous
                behaviour of not checking the target.

        Returns:
            ``(text_to_send, needs_extraction)``.
        """
        if not context or "e-commerce" not in context.lower():
            return text, False
        if (source_lang or "").strip().lower() != "zh":
            return text, False
        if target_lang_code is not None and not self._is_english_code(target_lang_code):
            return text, False

        term = (text or "").strip()
        if len(term.split()) == 1 and len(term) <= 2:
            return f"购买 {term}", True
        return text, False

    def _extract_term_from_translation(
        self,
        translated_text: str,
        original_text: str,
        target_lang_code: str,
    ) -> str:
        """Strip the wrapper words from an English translation.

        Scans the words from last to first and returns the first one that is
        not a wrapper word, lower-cased and without trailing punctuation.
        Non-English targets and single-word translations are returned
        unchanged.

        NOTE(review): only one word is kept, so a multi-word translation such
        as "air conditioner" is reduced to its last word. Confirm that is
        acceptable for callers.
        """
        del original_text  # unused; kept so the signature stays stable
        if not self._is_english_code(target_lang_code):
            return translated_text

        words = translated_text.strip().split()
        if len(words) <= 1:
            return translated_text
        for word in reversed(words):
            normalized = self._TRAILING_PUNCT_RE.sub("", word.lower())
            # Skip punctuation-only tokens, which normalise to "".
            if normalized and normalized not in self._WRAPPER_WORDS:
                return normalized
        # Every word was a wrapper word: fall back to the last one.
        return self._TRAILING_PUNCT_RE.sub("", words[-1].lower()) or translated_text