Blame view

query/qwen_mt_translate.py 9.75 KB
d4cadc13   tangwang   翻译重构
1
  """Qwen-MT translation orchestrator with cache and async helpers."""
a0a173ae   tangwang   last
2
  
d4cadc13   tangwang   翻译重构
3
  from __future__ import annotations
a0a173ae   tangwang   last
4
  
d4cadc13   tangwang   翻译重构
5
6
  import hashlib
  import logging
a0a173ae   tangwang   last
7
  import os
a0a173ae   tangwang   last
8
  import re
a0a173ae   tangwang   last
9
  import time
d4cadc13   tangwang   翻译重构
10
  from typing import Dict, List, Optional
a0a173ae   tangwang   last
11
  
d4cadc13   tangwang   翻译重构
12
  import redis
a0a173ae   tangwang   last
13
14
  from openai import OpenAI
  
d4cadc13   tangwang   翻译重构
15
16
17
  from config.env_config import DASHSCOPE_API_KEY, REDIS_CONFIG
  from config.services_config import get_translation_cache_config
  from config.translate_prompts import SOURCE_LANG_CODE_MAP
a0a173ae   tangwang   last
18
  
d4cadc13   tangwang   翻译重构
19
  # Module-level logger named after this module; used for cache and API diagnostics below.
  logger = logging.getLogger(__name__)
a0a173ae   tangwang   last
20
  
a0a173ae   tangwang   last
21
  
d4cadc13   tangwang   翻译重构
22
23
24
  class Translator:
      """Translate text with Qwen-MT through an OpenAI-compatible endpoint.

      Results are optionally cached in Redis. Every failure path (missing API
      key, client construction error, request error, Redis error) is logged and
      degrades to ``None`` / "no cache" instead of raising, so callers can treat
      translation as best-effort.
      """

      QWEN_DEFAULT_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
      QWEN_MODEL = "qwen-mt-flash"

      def __init__(
          self,
          model: str = "qwen",
          api_key: Optional[str] = None,
          use_cache: bool = True,
          timeout: int = 10,
          glossary_id: Optional[str] = None,
          translation_context: Optional[str] = None,
      ):
          """Configure the translator, its API client and its Redis cache.

          Args:
              model: Model family or concrete model name; must start with "qwen".
              api_key: API key; falls back to ``DASHSCOPE_API_KEY`` (config
                  constant, then environment variable) when omitted.
              use_cache: When false, Redis is never contacted.
              timeout: Per-request timeout passed to the chat-completions call.
              glossary_id: Stored on the instance; not used by the code in this
                  class.
              translation_context: Default context used for the cache key;
                  defaults to "e-commerce product search".

          Raises:
              ValueError: If ``model`` does not start with "qwen".
          """
          self.model = self._normalize_model(model)
          self.timeout = int(timeout)
          self.use_cache = bool(use_cache)
          self.glossary_id = glossary_id
          self.translation_context = translation_context or "e-commerce product search"

          # Cache behaviour is driven by the service configuration.
          cache_cfg = get_translation_cache_config()
          self.cache_prefix = str(cache_cfg.get("key_prefix", "trans:v2"))
          self.expire_seconds = int(cache_cfg.get("ttl_seconds", 360 * 24 * 3600))
          self.cache_sliding_expiration = bool(cache_cfg.get("sliding_expiration", True))
          self.cache_include_context = bool(cache_cfg.get("key_include_context", True))
          self.cache_include_prompt = bool(cache_cfg.get("key_include_prompt", True))
          self.cache_include_source_lang = bool(cache_cfg.get("key_include_source_lang", True))

          self.qwen_model_name = self._resolve_qwen_model_name(model)
          self._api_key = api_key or self._default_api_key(self.model)
          self._qwen_client: Optional[OpenAI] = None
          base_url = os.getenv("DASHSCOPE_BASE_URL") or self.QWEN_DEFAULT_BASE_URL
          if self._api_key:
              try:
                  self._qwen_client = OpenAI(api_key=self._api_key, base_url=base_url)
              except Exception as exc:
                  # Best-effort: leave the client unset so translate() returns None.
                  logger.warning("Failed to initialize qwen-mt client: %s", exc, exc_info=True)
          else:
              logger.warning("DASHSCOPE_API_KEY not set; qwen-mt translation unavailable")

          self.redis_client = None
          if self.use_cache and bool(cache_cfg.get("enabled", True)):
              self.redis_client = self._init_redis_client()

      @staticmethod
      def _normalize_model(model: str) -> str:
          """Collapse any "qwen*" model name to the family id "qwen-mt".

          Raises:
              ValueError: If the (stripped, lower-cased) name does not start
                  with "qwen". An empty/None name is treated as "qwen".
          """
          m = (model or "qwen").strip().lower()
          if m.startswith("qwen"):
              return "qwen-mt"
          raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'qwen-mt', 'qwen-mt-flash'")

      @staticmethod
      def _resolve_qwen_model_name(model: str) -> str:
          """Return the concrete model name sent to the API.

          The generic aliases "qwen" and "qwen-mt" map to ``QWEN_MODEL``; any
          other name is passed through lower-cased and stripped.
          """
          m = (model or "qwen").strip().lower()
          if m in {"qwen", "qwen-mt"}:
              return Translator.QWEN_MODEL
          return m

      @staticmethod
      def _default_api_key(model: str) -> Optional[str]:
          """Return the configured DashScope key; ``model`` is accepted but unused."""
          del model
          return DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")

      @staticmethod
      def _to_qwen_lang(code: str) -> str:
          """Map a normalized language code to the name used in the request.

          Codes missing from ``SOURCE_LANG_CODE_MAP`` fall back to the
          capitalized code.
          """
          return SOURCE_LANG_CODE_MAP.get(code, code.capitalize())

      def _init_redis_client(self):
          """Create and ping a Redis client; return ``None`` on any failure."""
          try:
              client = redis.Redis(
                  host=REDIS_CONFIG.get("host", "localhost"),
                  # NOTE(review): default port is 6479, not Redis' usual 6379 - confirm this is intentional.
                  port=REDIS_CONFIG.get("port", 6479),
                  password=REDIS_CONFIG.get("password"),
                  decode_responses=True,
                  socket_timeout=REDIS_CONFIG.get("socket_timeout", 1),
                  socket_connect_timeout=REDIS_CONFIG.get("socket_connect_timeout", 1),
                  retry_on_timeout=REDIS_CONFIG.get("retry_on_timeout", False),
                  health_check_interval=10,
              )
              # Fail fast here so an unreachable server disables caching up front.
              client.ping()
              return client
          except Exception as exc:
              logger.warning("Failed to initialize translation redis cache: %s", exc)
              return None

      def _build_cache_key(
          self,
          text: str,
          target_lang: str,
          source_lang: Optional[str],
          context: Optional[str],
          prompt: Optional[str],
      ) -> str:
          """Build the Redis key for one translation request.

          The key is ``<prefix>:<model>:<src>:<tgt>:<sha256>``, where the digest
          covers model, languages, optional context/prompt and the text. The
          ``cache_include_*`` flags decide whether source language, context and
          prompt take part in the key.
          """
          src = (source_lang or "auto").strip().lower() if self.cache_include_source_lang else "-"
          tgt = (target_lang or "").strip().lower()
          ctx = (context or "").strip() if self.cache_include_context else ""
          prm = (prompt or "").strip() if self.cache_include_prompt else ""
          payload = f"model={self.model}\nsrc={src}\ntgt={tgt}\nctx={ctx}\nprm={prm}\ntext={text}"
          digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()
          return f"{self.cache_prefix}:{self.model}:{src}:{tgt}:{digest}"

      def translate(
          self,
          text: str,
          target_lang: str,
          source_lang: Optional[str] = None,
          context: Optional[str] = None,
          prompt: Optional[str] = None,
      ) -> Optional[str]:
          """Translate ``text`` into ``target_lang``, using the cache when possible.

          Args:
              text: Text to translate. Empty or whitespace-only text is returned
                  unchanged.
              target_lang: Target language code (case-insensitive).
              source_lang: Source language code; ``None``/empty means auto-detect.
              context: Overrides the instance's default translation context.
              prompt: Optional prompt.

          Note:
              ``context`` and ``prompt`` only influence the cache key here; they
              are not forwarded to the model request.

          Returns:
              The translation, the original text when no translation is needed,
              or ``None`` when the model call fails or is unavailable.
          """
          if not text or not text.strip():
              return text

          tgt = (target_lang or "").strip().lower()
          src = (source_lang or "").strip().lower() or None

          # Skip the round trip when the text already looks like the target language.
          if tgt == "en" and self._is_english_text(text):
              return text
          if tgt == "zh" and (self._contains_chinese(text) or self._is_pure_number(text)):
              return text

          translation_context = context or self.translation_context
          cached = self._get_cached_translation_redis(text, tgt, src, translation_context, prompt)
          if cached is not None:
              return cached

          result = self._translate_qwen(text, tgt, src)
          if result is not None:
              self._set_cached_translation_redis(text, tgt, result, src, translation_context, prompt)
          return result

      def _translate_qwen(
          self,
          text: str,
          target_lang: str,
          source_lang: Optional[str],
      ) -> Optional[str]:
          """Call the Qwen-MT chat-completions endpoint once.

          Returns:
              The stripped translation, or ``None`` when no client is configured,
              the response is empty, or the request raises.
          """
          if not self._qwen_client:
              return None

          tgt_norm = (target_lang or "").strip().lower()
          src_norm = (source_lang or "").strip().lower()
          # The language map is a module-level import, not a class attribute;
          # looking it up on ``self`` raised AttributeError before any request.
          tgt_qwen = self._to_qwen_lang(tgt_norm)
          src_qwen = "auto" if not src_norm or src_norm == "auto" else self._to_qwen_lang(src_norm)

          # Monotonic clock: latency must not be skewed by wall-clock adjustments.
          start = time.perf_counter()
          try:
              completion = self._qwen_client.chat.completions.create(
                  model=self.qwen_model_name,
                  messages=[{"role": "user", "content": text}],
                  extra_body={
                      "translation_options": {
                          "source_lang": src_qwen,
                          "target_lang": tgt_qwen,
                      }
                  },
                  timeout=self.timeout,
              )
              content = (completion.choices[0].message.content or "").strip()
              if not content:
                  return None
              logger.info(
                  "[qwen-mt] Success | src=%s tgt=%s latency=%.1fms",
                  src_qwen,
                  tgt_qwen,
                  (time.perf_counter() - start) * 1000,
              )
              return content
          except Exception as exc:
              # Best-effort boundary: log with traceback and report "no translation".
              logger.warning(
                  "[qwen-mt] Failed | src=%s tgt=%s latency=%.1fms error=%s",
                  src_qwen,
                  tgt_qwen,
                  (time.perf_counter() - start) * 1000,
                  exc,
                  exc_info=True,
              )
              return None

      def _get_cached_translation_redis(
          self,
          text: str,
          target_lang: str,
          source_lang: Optional[str] = None,
          context: Optional[str] = None,
          prompt: Optional[str] = None,
      ) -> Optional[str]:
          """Return the cached translation, or ``None`` on miss/disabled/error.

          With sliding expiration enabled, a non-empty hit refreshes the TTL.
          """
          if not self.redis_client:
              return None
          key = self._build_cache_key(text, target_lang, source_lang, context, prompt)
          try:
              value = self.redis_client.get(key)
              if value and self.cache_sliding_expiration:
                  self.redis_client.expire(key, self.expire_seconds)
              return value
          except Exception as exc:
              logger.warning("Redis get translation cache failed: %s", exc)
              return None

      def _set_cached_translation_redis(
          self,
          text: str,
          target_lang: str,
          translation: str,
          source_lang: Optional[str] = None,
          context: Optional[str] = None,
          prompt: Optional[str] = None,
      ) -> None:
          """Store ``translation`` with the configured TTL; errors are only logged."""
          if not self.redis_client:
              return
          key = self._build_cache_key(text, target_lang, source_lang, context, prompt)
          try:
              self.redis_client.setex(key, self.expire_seconds, translation)
          except Exception as exc:
              logger.warning("Redis set translation cache failed: %s", exc)

      def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool:
          """Tell whether a (lower-cased) shop language matches ``lang_code``.

          Exact equality always matches. For "zh" and "en" a substring match is
          also accepted.
          """
          if not shop_lang_lower or not lang_code:
              return False
          if shop_lang_lower == lang_code:
              return True
          # NOTE(review): substring test, so any value containing "zh"/"en" matches.
          return lang_code in ("zh", "en") and lang_code in shop_lang_lower

      def get_translation_needs(self, detected_lang: str, supported_langs: List[str]) -> List[str]:
          """Return the supported languages the text still has to be translated into.

          The detected language is dropped when it is supported. A new list is
          always returned so callers cannot mutate the input through the result.
          """
          if detected_lang in supported_langs:
              return [lang for lang in supported_langs if lang != detected_lang]
          return list(supported_langs)

      def _is_english_text(self, text: str) -> bool:
          """Heuristic: more than 80% of non-punctuation characters are ASCII.

          Empty, whitespace-only and punctuation-only text counts as English.
          """
          if not text or not text.strip():
              return True
          text_clean = re.sub(r"[\s\.,!?;:\-\'\"\(\)\[\]{}]", "", text)
          if not text_clean:
              return True
          ascii_count = sum(1 for c in text_clean if ord(c) < 128)
          return (ascii_count / len(text_clean)) > 0.8

      def _contains_chinese(self, text: str) -> bool:
          """Return True if ``text`` has any character in U+4E00..U+9FFF."""
          if not text:
              return False
          return bool(re.search(r"[\u4e00-\u9fff]", text))

      def _is_pure_number(self, text: str) -> bool:
          """Return True if only digits remain after dropping whitespace, "." and ","."""
          if not text or not text.strip():
              return False
          text_clean = re.sub(r"[\s\.,]", "", text.strip())
          return bool(text_clean) and text_clean.isdigit()