Blame view

query/deepl_provider.py 7.41 KB
d4cadc13   tangwang   翻译重构
1
2
3
4
5
6
7
8
9
10
11
12
  """
  DeepL backend provider.
  
  This module only handles network calls to DeepL.
  It does not handle cache, async fanout, or fallback semantics.
  """
  
  from __future__ import annotations
  
  import logging
  import os
  import re
6f7840cf   tangwang   refactor: rename ...
13
  from typing import Dict, List, Optional, Sequence, Tuple, Union
d4cadc13   tangwang   翻译重构
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
  
  import requests
  from config.services_config import get_translation_config
  
  
  logger = logging.getLogger(__name__)

  # Built-in context hints, keyed first by scene name and then by lowercase
  # language code. Each value is the hint text for that scene in that language.
  DEFAULT_CONTEXTS: Dict[str, Dict[str, str]] = {
      "sku_name": {"zh": "商品SKU名称", "en": "product SKU name"},
      "ecommerce_search_query": {"zh": "电商", "en": "e-commerce"},
      # The "general" scene ships with empty hints for both languages.
      "general": {"zh": "", "en": ""},
  }
  # Names of the scenes defined above (iterating a dict yields its keys).
  SCENE_NAMES = frozenset(DEFAULT_CONTEXTS)
  
  
  def _merge_contexts(raw: object) -> Dict[str, Dict[str, str]]:
      """Overlay configured context hints on a copy of ``DEFAULT_CONTEXTS``.

      Args:
          raw: Expected to be a ``{scene: {lang: hint}}`` dict. Any other type
              is ignored, as are scenes whose value is not a dict.

      Returns:
          A new two-level dict. Scene names are stripped; language codes are
          stripped and lower-cased; hints are stripped. Entries whose scene
          name, language code, or hint ends up empty are dropped, so an empty
          configured value never replaces a default.
      """
      # Copy the inner dicts too so the module-level defaults are never mutated.
      result: Dict[str, Dict[str, str]] = {}
      for name, hints in DEFAULT_CONTEXTS.items():
          result[name] = dict(hints)

      if isinstance(raw, dict):
          for raw_scene, raw_hints in raw.items():
              if not isinstance(raw_hints, dict):
                  continue
              name = str(raw_scene or "").strip()
              if not name:
                  continue
              # Scenes unknown to the defaults are created on first sight.
              bucket = result.setdefault(name, {})
              cleaned = (
                  (str(raw_lang or "").strip().lower(), str(raw_value or "").strip())
                  for raw_lang, raw_value in raw_hints.items()
              )
              bucket.update((code, hint) for code, hint in cleaned if code and hint)
      return result
  
  
  class DeepLProvider:
      """Minimal client for DeepL's ``/v2/translate`` HTTP endpoint.

      One HTTP request is issued per text. Request failures (missing key,
      non-200 status, timeout, unusable response body) are logged as warnings
      and reported to the caller as ``None``.
      """

      API_URL = "https://api.deepl.com/v2/translate"  # Pro tier
      # Lowercase caller-facing code -> code sent to DeepL. Codes missing from
      # this table are upper-cased and passed through as-is.
      LANG_CODE_MAP = {
          "zh": "ZH",
          "en": "EN",
          "ru": "RU",
          "ar": "AR",
          "ja": "JA",
          "es": "ES",
          "de": "DE",
          "fr": "FR",
          "it": "IT",
          "pt": "PT",
      }

      # Words treated as residue of the "购买 <term>" wrapper when the term is
      # extracted back out of an English translation.
      _CONTEXT_WORDS = frozenset({"buy", "purchase", "product", "item", "commodity", "goods"})
      # Trailing punctuation stripped from each token during extraction.
      _TRAILING_PUNCT_RE = re.compile(r"[.,!?;:]+$")

      def __init__(
          self,
          api_key: Optional[str],
          *,
          timeout: float = 10.0,
          glossary_id: Optional[str] = None,
      ) -> None:
          """Read provider settings and remember the credentials.

          Args:
              api_key: DeepL auth key; falls back to the ``DEEPL_AUTH_KEY``
                  environment variable when falsy.
              timeout: Request timeout in seconds. A truthy ``timeout_sec`` in
                  the provider config takes precedence over this argument.
              glossary_id: Glossary to send with every request; falls back to
                  ``glossary_id`` from the provider config when falsy.
          """
          cfg = get_translation_config()
          providers = cfg.providers
          # `or {}` also covers a "deepl" key that is present but set to None,
          # which would otherwise break the `.get` calls below.
          provider_cfg = (providers.get("deepl") or {}) if isinstance(providers, dict) else {}
          self.api_key = api_key or os.getenv("DEEPL_AUTH_KEY")
          self.timeout = float(provider_cfg.get("timeout_sec") or timeout or 10.0)
          self.glossary_id = glossary_id or provider_cfg.get("glossary_id")
          self.model = "deepl"
          self.context_presets = _merge_contexts(provider_cfg.get("contexts"))
          if not self.api_key:
              logger.warning("DEEPL_AUTH_KEY not set; DeepL translation is unavailable")

      @property
      def supports_batch(self) -> bool:
          """Whether callers may rely on native batching (currently ``False``).

          The DeepL HTTP API itself accepts several texts per request, but this
          returns ``False`` for now so the upper layer splits input item by
          item; the implementation can be adjusted later if true batching is
          wanted.
          """
          return False

      @staticmethod
      def _is_blank(value: object) -> bool:
          """Return True for ``None`` and for values whose string form is only whitespace."""
          return value is None or not str(value).strip()

      @staticmethod
      def _is_english_code(code: Optional[str]) -> bool:
          """Return True for ``EN`` and regional variants such as ``EN-US`` / ``EN-GB``."""
          return (code or "").strip().upper().split("-")[0] == "EN"

      def _resolve_request_context(
          self,
          target_lang: str,
          context: Optional[str],
          prompt: Optional[str],
      ) -> Optional[str]:
          """Pick the context string to send with a request.

          Precedence: a truthy ``prompt`` wins; a ``context`` naming a scene in
          ``self.context_presets`` (built-in or added through config) resolves
          to that scene's hint; any other truthy ``context`` is used verbatim;
          otherwise the optional ``"default"`` scene is consulted.

          Scene hints are looked up by the stripped, lower-cased target
          language, falling back to the ``"en"`` hint.

          Returns:
              The context string, or ``None``/empty when nothing applies.
          """
          if prompt:
              return prompt
          presets = self.context_presets
          # Free-form context: anything truthy that is not a known scene name.
          if context and context not in presets:
              return context
          # NOTE(review): the built-in presets define no "default" scene, so this
          # fallback only yields a hint when config supplies one - confirm that
          # is intended (the built-in catch-all scene is named "general").
          scene_map = (presets.get(context) if context else None) or presets.get("default") or {}
          tgt = (target_lang or "").strip().lower()
          return scene_map.get(tgt) or scene_map.get("en")

      def _build_payload(
          self,
          text_to_translate: str,
          target_code: str,
          source_lang: Optional[str],
          api_context: Optional[str],
      ) -> Dict[str, object]:
          """Assemble the JSON body for one single-text translate request."""
          payload: Dict[str, object] = {
              "text": [text_to_translate],
              "target_lang": target_code,
          }
          if source_lang:
              payload["source_lang"] = self.LANG_CODE_MAP.get(source_lang.lower(), source_lang.upper())
          if api_context:
              payload["context"] = api_context
          if self.glossary_id:
              # NOTE(review): DeepL's API docs say glossary_id requires
              # source_lang - confirm callers always pass one when a glossary
              # is configured.
              payload["glossary_id"] = self.glossary_id
          return payload

      def translate(
          self,
          text: Union[str, Sequence[str]],
          target_lang: str,
          source_lang: Optional[str] = None,
          context: Optional[str] = None,
          prompt: Optional[str] = None,
      ) -> Union[Optional[str], List[Optional[str]]]:
          """Translate one text, or each item of a list/tuple of texts.

          Args:
              text: A single string, or a list/tuple of strings.
              target_lang: Target language code (mapped via ``LANG_CODE_MAP``).
              source_lang: Optional source language code.
              context: Scene name or free-form context string.
              prompt: Explicit context string; overrides ``context`` when truthy.

          Returns:
              For a single text: the translation, or ``None`` when the API key
              is missing, the text is ``None``/blank, or the request fails.
              For a list/tuple: a list of the same length where ``None``/blank
              items are passed through unchanged and every other item is the
              result of translating it individually.
          """
          if isinstance(text, (list, tuple)):
              results: List[Optional[str]] = []
              for item in text:
                  if self._is_blank(item):
                      results.append(item)  # type: ignore[arg-type]
                      continue
                  results.append(
                      self.translate(
                          text=str(item),
                          target_lang=target_lang,
                          source_lang=source_lang,
                          context=context,
                          prompt=prompt,
                      )
                  )
              return results

          if not self.api_key:
              return None
          # Nothing to translate: skip the network round trip entirely.
          if self._is_blank(text):
              return None

          target_code = self.LANG_CODE_MAP.get((target_lang or "").lower(), (target_lang or "").upper())
          headers = {
              "Authorization": f"DeepL-Auth-Key {self.api_key}",
              "Content-Type": "application/json",
          }

          api_context = self._resolve_request_context(target_lang, context, prompt)
          # Passing the target code keeps the wrapper from being added when it
          # could not be stripped from the result afterwards.
          text_to_translate, needs_extraction = self._add_ecommerce_context(
              text, source_lang, api_context, target_code
          )
          payload = self._build_payload(text_to_translate, target_code, source_lang, api_context)

          try:
              response = requests.post(self.API_URL, headers=headers, json=payload, timeout=self.timeout)
              if response.status_code != 200:
                  logger.warning(
                      "[deepl] Failed | status=%s tgt=%s body=%s",
                      response.status_code,
                      target_code,
                      (response.text or "")[:200],
                  )
                  return None

              data = response.json()
              translations = data.get("translations") or []
              if not translations:
                  return None
              translated = translations[0].get("text")
              if not translated:
                  return None
              if needs_extraction:
                  translated = self._extract_term_from_translation(translated, text, target_code)
              return translated
          except requests.Timeout:
              logger.warning("[deepl] Timeout | tgt=%s timeout=%.1fs", target_code, self.timeout)
              return None
          except Exception as exc:
              # Boundary handler: any other failure is logged and becomes None.
              logger.warning("[deepl] Exception | tgt=%s error=%s", target_code, exc, exc_info=True)
              return None

      def _add_ecommerce_context(
          self,
          text: str,
          source_lang: Optional[str],
          context: Optional[str],
          target_lang_code: Optional[str] = None,
      ) -> Tuple[str, bool]:
          """Wrap a very short Chinese term as ``"购买 <term>"`` before translation.

          The wrapper is applied only when ``context`` contains "e-commerce"
          (case-insensitive), ``source_lang`` is ``zh``, and the stripped text
          is a single whitespace-free token of at most two characters.

          Args:
              text: Text about to be translated.
              source_lang: Source language code, if known.
              context: Context string resolved for the request.
              target_lang_code: DeepL target code. When given and not English,
                  no wrapper is added, because ``_extract_term_from_translation``
                  can only strip it from English output. ``None`` keeps the
                  legacy behavior of not considering the target at all.

          Returns:
              ``(text_to_send, needs_extraction)``; the flag is True only when
              the wrapper was added.
          """
          if not context or "e-commerce" not in context.lower():
              return text, False
          if (source_lang or "").lower() != "zh":
              return text, False
          if target_lang_code is not None and not self._is_english_code(target_lang_code):
              return text, False

          term = (text or "").strip()
          if len(term.split()) == 1 and len(term) <= 2:
              return f"购买 {term}", True
          return text, False

      def _extract_term_from_translation(
          self,
          translated_text: str,
          original_text: str,
          target_lang_code: str,
      ) -> str:
          """Recover the bare term from the translation of a wrapped text.

          For English targets (``EN`` or a regional variant) with more than one
          word, returns the last token, lower-cased and with trailing
          punctuation removed, that is not one of the wrapper words. Tokens that
          consist only of punctuation are ignored. If every token is a wrapper
          word, the last one is returned. In all other cases the translation is
          returned unchanged.

          Args:
              translated_text: Text returned by DeepL.
              original_text: Unused; kept for interface compatibility.
              target_lang_code: DeepL target language code of the request.
          """
          del original_text
          if not self._is_english_code(target_lang_code):
              return translated_text

          words = translated_text.strip().split()
          if len(words) <= 1:
              return translated_text

          tokens = [self._TRAILING_PUNCT_RE.sub("", word.lower()) for word in words]
          # A detached "." or "!" normalizes to "", which must not be returned.
          tokens = [token for token in tokens if token]
          if not tokens:
              return translated_text
          for token in reversed(tokens):
              if token not in self._CONTEXT_WORDS:
                  return token
          return tokens[-1]