# deepl_provider.py
"""
DeepL backend provider.
This module only handles network calls to DeepL.
It does not handle cache, async fanout, or fallback semantics.
"""
from __future__ import annotations
import logging
import os
import re
from typing import Dict, List, Optional, Sequence, Tuple, Union
import requests
from config.services_config import get_translation_config
logger = logging.getLogger(__name__)
# Built-in context hints, keyed first by scene name and then by lowercase
# language code. The "general" scene deliberately carries empty hints.
DEFAULT_CONTEXTS: Dict[str, Dict[str, str]] = {
    "sku_name": {"zh": "商品SKU名称", "en": "product SKU name"},
    "ecommerce_search_query": {"zh": "电商", "en": "e-commerce"},
    "general": {"zh": "", "en": ""},
}

# Immutable set of the built-in scene names (iterating a dict yields its keys).
SCENE_NAMES = frozenset(DEFAULT_CONTEXTS)
def _merge_contexts(raw: object) -> Dict[str, Dict[str, str]]:
    """Overlay configured context hints on top of ``DEFAULT_CONTEXTS``.

    Args:
        raw: Expected to be a ``{scene: {lang: hint}}`` mapping. Anything that
            is not a ``dict`` is ignored, as is any scene whose value is not a
            ``dict`` or whose name is blank after stripping.

    Returns:
        A new mapping that starts as a per-scene copy of ``DEFAULT_CONTEXTS``.
        Language keys from ``raw`` are stripped and lower-cased, hints are
        stripped, and only pairs where both are non-empty are written, so an
        empty override never erases a built-in hint.
    """
    result: Dict[str, Dict[str, str]] = {}
    for default_scene, default_hints in DEFAULT_CONTEXTS.items():
        # Copy each inner dict so later writes never touch the module constant.
        result[default_scene] = dict(default_hints)

    if isinstance(raw, dict):
        for scene_key, overrides in raw.items():
            if not isinstance(overrides, dict):
                continue
            name = str(scene_key or "").strip()
            if name == "":
                continue
            # The scene entry is created even if none of its hints survive.
            scene_hints = result.setdefault(name, {})
            for lang_code, hint in overrides.items():
                normalized_lang = str(lang_code or "").strip().lower()
                cleaned_hint = str(hint or "").strip()
                if not normalized_lang or not cleaned_hint:
                    continue
                scene_hints[normalized_lang] = cleaned_hint

    return result
class DeepLProvider:
    """Thin client for the DeepL ``/v2/translate`` HTTP endpoint.

    Every failure mode (missing key, non-200 response, empty result, timeout,
    unexpected exception) is reported to the caller as ``None`` rather than as
    an exception.
    """

    API_URL = "https://api.deepl.com/v2/translate"  # Pro tier
    LANG_CODE_MAP = {
        "zh": "ZH",
        "en": "EN",
        "ru": "RU",
        "ar": "AR",
        "ja": "JA",
        "es": "ES",
        "de": "DE",
        "fr": "FR",
        "it": "IT",
        "pt": "PT",
    }

    # English words dropped when pulling the real term back out of a
    # translation of the "购买 <term>" wrapper built by _add_ecommerce_context.
    _CONTEXT_WORDS = frozenset(
        {"buy", "purchase", "product", "item", "commodity", "goods"}
    )
    # Trailing sentence punctuation removed from each translated token.
    _TRAILING_PUNCT_RE = re.compile(r"[.,!?;:]+$")

    def __init__(
        self,
        api_key: Optional[str],
        *,
        timeout: float = 10.0,
        glossary_id: Optional[str] = None,
    ) -> None:
        """Build a provider from explicit arguments plus translation config.

        Args:
            api_key: DeepL auth key; falls back to the ``DEEPL_AUTH_KEY``
                environment variable when falsy.
            timeout: Request timeout in seconds. A ``timeout_sec`` value in
                the provider config takes precedence over this argument.
            glossary_id: Optional glossary id; falls back to the provider
                config's ``glossary_id``.
        """
        cfg = get_translation_config()
        providers = cfg.providers
        # ``or {}`` also covers a config that explicitly sets the "deepl"
        # entry to None, which would otherwise break the ``.get`` calls below.
        provider_cfg = (
            providers.get("deepl") if isinstance(providers, dict) else None
        ) or {}
        self.api_key = api_key or os.getenv("DEEPL_AUTH_KEY")
        self.timeout = float(provider_cfg.get("timeout_sec") or timeout or 10.0)
        self.glossary_id = glossary_id or provider_cfg.get("glossary_id")
        self.model = "deepl"
        self.context_presets = _merge_contexts(provider_cfg.get("contexts"))
        if not self.api_key:
            logger.warning("DEEPL_AUTH_KEY not set; DeepL translation is unavailable")

    @property
    def supports_batch(self) -> bool:
        """Report whether this provider translates several texts per request.

        The DeepL HTTP API itself accepts multiple texts in one call; this
        returns ``False`` for now so the upper layer splits requests item by
        item. The implementation can be adjusted later if real batching is
        needed.
        """
        return False

    @classmethod
    def _to_deepl_code(cls, lang: Optional[str]) -> str:
        """Map a language tag to the code sent to DeepL.

        Surrounding whitespace is ignored. Tags missing from
        ``LANG_CODE_MAP`` are passed through upper-cased; a blank or ``None``
        tag yields ``""``.
        """
        key = (lang or "").strip()
        return cls.LANG_CODE_MAP.get(key.lower(), key.upper())

    def _resolve_request_context(
        self,
        target_lang: str,
        context: Optional[str],
        prompt: Optional[str],
    ) -> Optional[str]:
        """Choose the context string to send with a request.

        Precedence:
            1. ``prompt`` when truthy, returned verbatim.
            2. ``context`` when it names a scene in ``self.context_presets``
               (built-in or config-defined): that scene's hint for the target
               language, falling back to its ``"en"`` hint.
            3. ``context`` when truthy but not a known scene: returned as
               free-form text.
            4. Otherwise the ``"default"`` preset, if one is configured.

        Returns:
            The context string, or ``None``/``""`` when nothing applies.
        """
        if prompt:
            return prompt

        presets = self.context_presets
        # Free-form text: not a known scene name, so forward it as given.
        if context and context not in presets:
            return context

        named_scene = presets.get(context) if context else None
        # NOTE(review): the built-in presets contain no "default" scene (the
        # neutral one is "general"), so this fallback only produces a value
        # when the config defines a "default" scene — confirm that is intended.
        scene_map = named_scene or presets.get("default") or {}
        tgt = (target_lang or "").strip().lower()
        return scene_map.get(tgt) or scene_map.get("en")

    def translate(
        self,
        text: Union[str, Sequence[str]],
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> Union[Optional[str], List[Optional[str]]]:
        """Translate one text, or each item of a list/tuple, via DeepL.

        Args:
            text: A single string, or a list/tuple of strings. For a sequence,
                each non-blank item is translated with its own request; blank
                or ``None`` items are passed through unchanged.
            target_lang: Target language tag (e.g. ``"en"``).
            source_lang: Optional source language tag; omitted from the
                request when blank.
            context: Scene name or free-form context text.
            prompt: Overrides ``context`` when truthy.

        Returns:
            The translated string, or ``None`` when the API key is missing,
            the response status is not 200, the response carries no
            translation text, or the request times out or raises. For a
            sequence input, a list with one such entry per item.
        """
        if isinstance(text, (list, tuple)):
            results: List[Optional[str]] = []
            for item in text:
                if item is None or not str(item).strip():
                    results.append(item)  # type: ignore[arg-type]
                    continue
                results.append(
                    self.translate(
                        text=str(item),
                        target_lang=target_lang,
                        source_lang=source_lang,
                        context=context,
                        prompt=prompt,
                    )
                )
            return results

        if not self.api_key:
            return None

        target_code = self._to_deepl_code(target_lang)
        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }
        api_context = self._resolve_request_context(target_lang, context, prompt)
        text_to_translate, needs_extraction = self._add_ecommerce_context(
            text, source_lang, api_context
        )

        payload: Dict[str, object] = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }
        source_code = self._to_deepl_code(source_lang)
        if source_code:
            payload["source_lang"] = source_code
        if api_context:
            payload["context"] = api_context
        if self.glossary_id:
            payload["glossary_id"] = self.glossary_id

        try:
            response = requests.post(
                self.API_URL, headers=headers, json=payload, timeout=self.timeout
            )
            if response.status_code != 200:
                logger.warning(
                    "[deepl] Failed | status=%s tgt=%s body=%s",
                    response.status_code,
                    target_code,
                    (response.text or "")[:200],
                )
                return None

            data = response.json()
            translations = data.get("translations") or []
            if not translations:
                return None
            translated = translations[0].get("text")
            if not translated:
                return None
            if needs_extraction:
                translated = self._extract_term_from_translation(
                    translated, text, target_code
                )
            return translated
        except requests.Timeout:
            logger.warning(
                "[deepl] Timeout | tgt=%s timeout=%.1fs", target_code, self.timeout
            )
            return None
        except Exception as exc:
            # Deliberate catch-all: this method reports every failure
            # (network, JSON decoding, unexpected payload shape) as None.
            logger.warning(
                "[deepl] Exception | tgt=%s error=%s", target_code, exc, exc_info=True
            )
            return None

    def _add_ecommerce_context(
        self,
        text: str,
        source_lang: Optional[str],
        context: Optional[str],
    ) -> Tuple[str, bool]:
        """Wrap a very short Chinese term as ``"购买 <term>"`` before translating.

        The wrapper is applied only when ``context`` contains "e-commerce"
        (case-insensitive), the source language is ``zh``, and the stripped
        text is a single token of at most two characters.

        Returns:
            ``(text_to_send, needs_extraction)``. ``needs_extraction`` is
            ``True`` only when the wrapper was added, signalling that the term
            must be pulled back out of the translation afterwards.
        """
        if not context or "e-commerce" not in context.lower():
            return text, False
        if (source_lang or "").strip().lower() != "zh":
            return text, False
        term = (text or "").strip()
        if len(term.split()) == 1 and len(term) <= 2:
            return f"购买 {term}", True
        return text, False

    def _extract_term_from_translation(
        self,
        translated_text: str,
        original_text: str,
        target_lang_code: str,
    ) -> str:
        """Recover the bare term from a translation of a ``"购买 <term>"`` input.

        Only English output (``target_lang_code == "EN"``) with more than one
        word is processed; anything else is returned untouched. Each word is
        lower-cased and stripped of trailing punctuation, and the last word
        that is not a known context word (buy, purchase, ...) is returned.

        Args:
            translated_text: Text returned by DeepL.
            original_text: Unused; kept for signature compatibility.
            target_lang_code: DeepL target code such as ``"EN"``.

        Returns:
            The extracted lower-case term, or ``translated_text`` unchanged
            when extraction does not apply or yields nothing usable.
        """
        del original_text
        if target_lang_code != "EN":
            return translated_text
        words = translated_text.strip().split()
        if len(words) <= 1:
            return translated_text

        # Drop tokens that are pure punctuation (e.g. a detached "!") so they
        # can never be returned as an empty "term".
        tokens = [
            token
            for token in (self._TRAILING_PUNCT_RE.sub("", w.lower()) for w in words)
            if token
        ]
        if not tokens:
            return translated_text
        for token in reversed(tokens):
            if token not in self._CONTEXT_WORDS:
                return token
        # Every word was a context word: fall back to the last one.
        return tokens[-1]