Commit d4cadc13bd22491045c3580a54d0aaa1d4f625e6
1 parent a0a173ae
翻译重构
Showing 21 changed files with 832 additions and 2263 deletions
Show diff stats
api/routes/search.py
| @@ -472,7 +472,6 @@ async def get_es_raw_document(spu_id: str, http_request: Request): | @@ -472,7 +472,6 @@ async def get_es_raw_document(spu_id: str, http_request: Request): | ||
| 472 | index_name = get_tenant_index_name(tenant_id) | 472 | index_name = get_tenant_index_name(tenant_id) |
| 473 | 473 | ||
| 474 | body = { | 474 | body = { |
| 475 | - "size": 5, | ||
| 476 | "query": { | 475 | "query": { |
| 477 | "bool": { | 476 | "bool": { |
| 478 | "filter": [ | 477 | "filter": [ |
api/translator_app.py
| @@ -98,7 +98,9 @@ from pydantic import BaseModel, Field | @@ -98,7 +98,9 @@ from pydantic import BaseModel, Field | ||
| 98 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | 98 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| 99 | 99 | ||
| 100 | from query.qwen_mt_translate import Translator | 100 | from query.qwen_mt_translate import Translator |
| 101 | -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG | 101 | +from query.llm_translate import LLMTranslatorProvider |
| 102 | +from query.deepl_provider import DeepLProvider | ||
| 103 | +from config.services_config import get_translation_config | ||
| 102 | 104 | ||
| 103 | # Configure logging | 105 | # Configure logging |
| 104 | logging.basicConfig( | 106 | logging.basicConfig( |
| @@ -107,23 +109,52 @@ logging.basicConfig( | @@ -107,23 +109,52 @@ logging.basicConfig( | ||
| 107 | ) | 109 | ) |
| 108 | logger = logging.getLogger(__name__) | 110 | logger = logging.getLogger(__name__) |
| 109 | 111 | ||
| 110 | -# Fixed translation prompt | ||
| 111 | -TRANSLATION_PROMPT = "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language." | ||
| 112 | - | ||
| 113 | # Global translator instances cache (keyed by model) | 112 | # Global translator instances cache (keyed by model) |
| 114 | -_translators: Dict[str, Translator] = {} | 113 | +_translators: Dict[str, object] = {} |
| 114 | + | ||
| 115 | 115 | ||
| 116 | +def _resolve_default_model() -> str: | ||
| 117 | + """ | ||
| 118 | + Resolve the default translator model: env override first, then services.translation config. | ||
| 116 | 119 | ||
| 117 | -def get_translator(model: str = "qwen") -> Translator: | 120 | + Priority: |
| 121 | + 1) TRANSLATION_MODEL env (explicit runtime override) | ||
| 122 | + 2) services.translation.provider + providers.<provider>.model | ||
| 123 | + 3) qwen-mt | ||
| 124 | + """ | ||
| 125 | + env_model = (os.getenv("TRANSLATION_MODEL") or "").strip() | ||
| 126 | + if env_model: | ||
| 127 | + return env_model | ||
| 128 | + try: | ||
| 129 | + cfg = get_translation_config() | ||
| 130 | + provider = (cfg.provider or "").strip().lower() | ||
| 131 | + provider_cfg = cfg.get_provider_cfg() if hasattr(cfg, "get_provider_cfg") else {} | ||
| 132 | + model = (provider_cfg.get("model") or "").strip().lower() if isinstance(provider_cfg, dict) else "" | ||
| 133 | + if provider == "llm": | ||
| 134 | + return "llm" | ||
| 135 | + if provider in {"qwen-mt", "direct", "http"}: | ||
| 136 | + return model or "qwen-mt" | ||
| 137 | + if provider == "deepl": | ||
| 138 | + return "deepl" | ||
| 139 | + except Exception: | ||
| 140 | + pass | ||
| 141 | + return "qwen-mt" | ||
| 142 | + | ||
| 143 | + | ||
| 144 | +def get_translator(model: str = "qwen") -> object: | ||
| 118 | """Get or create translator instance for the specified model.""" | 145 | """Get or create translator instance for the specified model.""" |
| 119 | global _translators | 146 | global _translators |
| 120 | if model not in _translators: | 147 | if model not in _translators: |
| 121 | logger.info(f"Initializing translator with model: {model}...") | 148 | logger.info(f"Initializing translator with model: {model}...") |
| 122 | - _translators[model] = Translator( | ||
| 123 | - model=model, | ||
| 124 | - use_cache=True, | ||
| 125 | - timeout=10 | ||
| 126 | - ) | 149 | + normalized = (model or "qwen").strip().lower() |
| 150 | + if normalized in {"qwen", "qwen-mt", "qwen-mt-flash", "qwen-mt-flush"}: | ||
| 151 | + _translators[model] = Translator(model=normalized, use_cache=True, timeout=10) | ||
| 152 | + elif normalized == "deepl": | ||
| 153 | + _translators[model] = DeepLProvider(api_key=None, timeout=10.0) | ||
| 154 | + elif normalized == "llm": | ||
| 155 | + _translators[model] = LLMTranslatorProvider() | ||
| 156 | + else: | ||
| 157 | + raise ValueError(f"Unsupported model: {model}") | ||
| 127 | logger.info(f"Translator initialized with model: {model}") | 158 | logger.info(f"Translator initialized with model: {model}") |
| 128 | return _translators[model] | 159 | return _translators[model] |
| 129 | 160 | ||
| @@ -134,7 +165,9 @@ class TranslationRequest(BaseModel): | @@ -134,7 +165,9 @@ class TranslationRequest(BaseModel): | ||
| 134 | text: str = Field(..., description="Text to translate") | 165 | text: str = Field(..., description="Text to translate") |
| 135 | target_lang: str = Field(..., description="Target language code (zh, en, ru, etc.)") | 166 | target_lang: str = Field(..., description="Target language code (zh, en, ru, etc.)") |
| 136 | source_lang: Optional[str] = Field(None, description="Source language code (optional, auto-detect if not provided)") | 167 | source_lang: Optional[str] = Field(None, description="Source language code (optional, auto-detect if not provided)") |
| 137 | - model: Optional[str] = Field("qwen", description="Translation model: 'qwen' (default) or 'deepl'") | 168 | + model: Optional[str] = Field(None, description="Translation model: qwen-mt | deepl | llm") |
| 169 | + context: Optional[str] = Field(None, description="Optional translation scene or context") | ||
| 170 | + prompt: Optional[str] = Field(None, description="Optional prompt override") | ||
| 138 | 171 | ||
| 139 | class Config: | 172 | class Config: |
| 140 | json_schema_extra = { | 173 | json_schema_extra = { |
| @@ -142,7 +175,8 @@ class TranslationRequest(BaseModel): | @@ -142,7 +175,8 @@ class TranslationRequest(BaseModel): | ||
| 142 | "text": "商品名称", | 175 | "text": "商品名称", |
| 143 | "target_lang": "en", | 176 | "target_lang": "en", |
| 144 | "source_lang": "zh", | 177 | "source_lang": "zh", |
| 145 | - "model": "qwen" | 178 | + "model": "llm", |
| 179 | + "context": "sku_name" | ||
| 146 | } | 180 | } |
| 147 | } | 181 | } |
| 148 | 182 | ||
| @@ -180,8 +214,7 @@ app.add_middleware( | @@ -180,8 +214,7 @@ app.add_middleware( | ||
| 180 | async def startup_event(): | 214 | async def startup_event(): |
| 181 | """Initialize translator on startup.""" | 215 | """Initialize translator on startup.""" |
| 182 | logger.info("Starting Translation Service API on port 6006") | 216 | logger.info("Starting Translation Service API on port 6006") |
| 183 | - # Get default model from environment variable or use 'qwen' | ||
| 184 | - default_model = os.getenv("TRANSLATION_MODEL", "qwen") | 217 | + default_model = _resolve_default_model() |
| 185 | try: | 218 | try: |
| 186 | get_translator(model=default_model) | 219 | get_translator(model=default_model) |
| 187 | logger.info(f"Translation service ready with default model: {default_model}") | 220 | logger.info(f"Translation service ready with default model: {default_model}") |
| @@ -194,15 +227,17 @@ async def startup_event(): | @@ -194,15 +227,17 @@ async def startup_event(): | ||
| 194 | async def health_check(): | 227 | async def health_check(): |
| 195 | """Health check endpoint.""" | 228 | """Health check endpoint.""" |
| 196 | try: | 229 | try: |
| 197 | - default_model = os.getenv("TRANSLATION_MODEL", "qwen") | ||
| 198 | - translator = get_translator(model=default_model) | 230 | + # 仅做轻量级本地检查,避免在健康检查中触发潜在的阻塞初始化或外部依赖 |
| 231 | + default_model = _resolve_default_model() | ||
| 232 | + # 如果启动事件成功,默认模型通常会已经初始化到缓存中 | ||
| 233 | + translator = _translators.get(default_model) or next(iter(_translators.values()), None) | ||
| 199 | return { | 234 | return { |
| 200 | "status": "healthy", | 235 | "status": "healthy", |
| 201 | "service": "translation", | 236 | "service": "translation", |
| 202 | "default_model": default_model, | 237 | "default_model": default_model, |
| 203 | "available_models": list(_translators.keys()), | 238 | "available_models": list(_translators.keys()), |
| 204 | "translator_initialized": translator is not None, | 239 | "translator_initialized": translator is not None, |
| 205 | - "cache_enabled": translator.use_cache if translator else False | 240 | + "cache_enabled": bool(getattr(translator, "use_cache", False)) |
| 206 | } | 241 | } |
| 207 | except Exception as e: | 242 | except Exception as e: |
| 208 | logger.error(f"Health check failed: {e}") | 243 | logger.error(f"Health check failed: {e}") |
| @@ -238,11 +273,11 @@ async def translate(request: TranslationRequest): | @@ -238,11 +273,11 @@ async def translate(request: TranslationRequest): | ||
| 238 | ) | 273 | ) |
| 239 | 274 | ||
| 240 | # Validate model parameter | 275 | # Validate model parameter |
| 241 | - model = request.model.lower() if request.model else "qwen" | ||
| 242 | - if model not in ['qwen', 'deepl']: | 276 | + model = request.model.lower() if request.model else _resolve_default_model().lower() |
| 277 | + if model not in ["qwen", "qwen-mt", "deepl", "llm"]: | ||
| 243 | raise HTTPException( | 278 | raise HTTPException( |
| 244 | status_code=400, | 279 | status_code=400, |
| 245 | - detail=f"Invalid model: {model}. Supported models: 'qwen', 'deepl'" | 280 | + detail="Invalid model. Supported models: 'qwen-mt', 'deepl', 'llm'" |
| 246 | ) | 281 | ) |
| 247 | 282 | ||
| 248 | try: | 283 | try: |
| @@ -254,7 +289,8 @@ async def translate(request: TranslationRequest): | @@ -254,7 +289,8 @@ async def translate(request: TranslationRequest): | ||
| 254 | text=request.text, | 289 | text=request.text, |
| 255 | target_lang=request.target_lang, | 290 | target_lang=request.target_lang, |
| 256 | source_lang=request.source_lang, | 291 | source_lang=request.source_lang, |
| 257 | - prompt=TRANSLATION_PROMPT | 292 | + context=request.context, |
| 293 | + prompt=request.prompt, | ||
| 258 | ) | 294 | ) |
| 259 | 295 | ||
| 260 | if translated_text is None: | 296 | if translated_text is None: |
| @@ -269,7 +305,7 @@ async def translate(request: TranslationRequest): | @@ -269,7 +305,7 @@ async def translate(request: TranslationRequest): | ||
| 269 | source_lang=request.source_lang, | 305 | source_lang=request.source_lang, |
| 270 | translated_text=translated_text, | 306 | translated_text=translated_text, |
| 271 | status="success", | 307 | status="success", |
| 272 | - model=translator.model | 308 | + model=str(getattr(translator, "model", model)) |
| 273 | ) | 309 | ) |
| 274 | 310 | ||
| 275 | except HTTPException: | 311 | except HTTPException: |
config/__init__.py
| @@ -28,6 +28,7 @@ from .services_config import ( | @@ -28,6 +28,7 @@ from .services_config import ( | ||
| 28 | get_translation_base_url, | 28 | get_translation_base_url, |
| 29 | get_embedding_base_url, | 29 | get_embedding_base_url, |
| 30 | get_rerank_service_url, | 30 | get_rerank_service_url, |
| 31 | + get_translation_cache_config, | ||
| 31 | ServiceConfig, | 32 | ServiceConfig, |
| 32 | ) | 33 | ) |
| 33 | 34 | ||
| @@ -53,5 +54,6 @@ __all__ = [ | @@ -53,5 +54,6 @@ __all__ = [ | ||
| 53 | 'get_translation_base_url', | 54 | 'get_translation_base_url', |
| 54 | 'get_embedding_base_url', | 55 | 'get_embedding_base_url', |
| 55 | 'get_rerank_service_url', | 56 | 'get_rerank_service_url', |
| 57 | + 'get_translation_cache_config', | ||
| 56 | 'ServiceConfig', | 58 | 'ServiceConfig', |
| 57 | ] | 59 | ] |
config/config.yaml
| @@ -81,18 +81,6 @@ query_config: | @@ -81,18 +81,6 @@ query_config: | ||
| 81 | translation_service: "deepl" | 81 | translation_service: "deepl" |
| 82 | translation_api_key: null # 通过环境变量设置 | 82 | translation_api_key: null # 通过环境变量设置 |
| 83 | 83 | ||
| 84 | - # 翻译提示词配置(用于提高翻译质量,作为DeepL API的context参数) | ||
| 85 | - translation_prompts: | ||
| 86 | - # 商品标题翻译提示词 | ||
| 87 | - product_title_zh: "请将原文翻译成中文商品SKU名称,要求:确保精确、完整地传达原文信息的基础上,语言简洁清晰、地道、专业。" | ||
| 88 | - product_title_en: "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language." | ||
| 89 | - # query翻译提示词 | ||
| 90 | - query_zh: "电商领域" | ||
| 91 | - query_en: "e-commerce domain" | ||
| 92 | - # 默认翻译用词 | ||
| 93 | - default_zh: "电商领域" | ||
| 94 | - default_en: "e-commerce domain" | ||
| 95 | - | ||
| 96 | # 返回字段配置(_source includes) | 84 | # 返回字段配置(_source includes) |
| 97 | # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 | 85 | # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 |
| 98 | source_fields: null | 86 | source_fields: null |
| @@ -119,16 +107,24 @@ rerank: | @@ -119,16 +107,24 @@ rerank: | ||
| 119 | # 可扩展服务/provider 注册表(单一配置源) | 107 | # 可扩展服务/provider 注册表(单一配置源) |
| 120 | services: | 108 | services: |
| 121 | translation: | 109 | translation: |
| 122 | - provider: "llm" # direct | http | google(reserved) | 110 | + provider: "llm" # qwen-mt | deepl | http | llm |
| 123 | base_url: "http://127.0.0.1:6006" | 111 | base_url: "http://127.0.0.1:6006" |
| 124 | - model: "qwen" | 112 | + model: "qwen-flash" |
| 125 | timeout_sec: 10.0 | 113 | timeout_sec: 10.0 |
| 114 | + cache: | ||
| 115 | + enabled: true | ||
| 116 | + key_prefix: "trans:v2" | ||
| 117 | + ttl_seconds: 62208000 | ||
| 118 | + sliding_expiration: true | ||
| 119 | + key_include_context: true | ||
| 120 | + key_include_prompt: true | ||
| 121 | + key_include_source_lang: true | ||
| 126 | providers: | 122 | providers: |
| 127 | - direct: | ||
| 128 | - model: "qwen" | 123 | + qwen-mt: |
| 124 | + model: "qwen-mt-flush" | ||
| 129 | http: | 125 | http: |
| 130 | base_url: "http://127.0.0.1:6006" | 126 | base_url: "http://127.0.0.1:6006" |
| 131 | - model: "qwen" | 127 | + model: "qwen-mt-flush" |
| 132 | timeout_sec: 10.0 | 128 | timeout_sec: 10.0 |
| 133 | llm: | 129 | llm: |
| 134 | model: "qwen-flash" | 130 | model: "qwen-flash" |
| @@ -136,6 +132,11 @@ services: | @@ -136,6 +132,11 @@ services: | ||
| 136 | # base_url 留空则使用 DASHSCOPE_BASE_URL 或默认地域 | 132 | # base_url 留空则使用 DASHSCOPE_BASE_URL 或默认地域 |
| 137 | base_url: "" | 133 | base_url: "" |
| 138 | timeout_sec: 30.0 | 134 | timeout_sec: 30.0 |
| 135 | + deepl: | ||
| 136 | + model: "deepl" | ||
| 137 | + timeout_sec: 10.0 | ||
| 138 | + # 可选:用于术语表翻译(由 query_config.translation_glossary_id 衔接) | ||
| 139 | + glossary_id: "" | ||
| 139 | google: | 140 | google: |
| 140 | enabled: false | 141 | enabled: false |
| 141 | project_id: "" | 142 | project_id: "" |
config/config_loader.py
| @@ -42,7 +42,6 @@ class QueryConfig: | @@ -42,7 +42,6 @@ class QueryConfig: | ||
| 42 | translation_api_key: Optional[str] = None | 42 | translation_api_key: Optional[str] = None |
| 43 | translation_glossary_id: Optional[str] = None | 43 | translation_glossary_id: Optional[str] = None |
| 44 | translation_context: str = "e-commerce product search" | 44 | translation_context: str = "e-commerce product search" |
| 45 | - translation_prompts: Dict[str, str] = field(default_factory=dict) | ||
| 46 | 45 | ||
| 47 | # Embedding field names | 46 | # Embedding field names |
| 48 | text_embedding_field: Optional[str] = "title_embedding" | 47 | text_embedding_field: Optional[str] = "title_embedding" |
| @@ -250,7 +249,6 @@ class ConfigLoader: | @@ -250,7 +249,6 @@ class ConfigLoader: | ||
| 250 | translation_service=query_config_data.get("translation_service") or "deepl", | 249 | translation_service=query_config_data.get("translation_service") or "deepl", |
| 251 | translation_glossary_id=query_config_data.get("translation_glossary_id"), | 250 | translation_glossary_id=query_config_data.get("translation_glossary_id"), |
| 252 | translation_context=query_config_data.get("translation_context") or "e-commerce product search", | 251 | translation_context=query_config_data.get("translation_context") or "e-commerce product search", |
| 253 | - translation_prompts=query_config_data.get("translation_prompts", {}), | ||
| 254 | text_embedding_field=query_config_data.get("text_embedding_field"), | 252 | text_embedding_field=query_config_data.get("text_embedding_field"), |
| 255 | image_embedding_field=query_config_data.get("image_embedding_field"), | 253 | image_embedding_field=query_config_data.get("image_embedding_field"), |
| 256 | source_fields=query_config_data.get("source_fields"), | 254 | source_fields=query_config_data.get("source_fields"), |
config/services_config.py
| @@ -72,12 +72,12 @@ def _resolve_translation() -> ServiceConfig: | @@ -72,12 +72,12 @@ def _resolve_translation() -> ServiceConfig: | ||
| 72 | config_provider=cfg.get("provider"), | 72 | config_provider=cfg.get("provider"), |
| 73 | capability="translation", | 73 | capability="translation", |
| 74 | ) | 74 | ) |
| 75 | - if provider not in ("direct", "local", "inprocess", "http", "service"): | 75 | + if provider not in ("qwen-mt", "deepl", "direct", "local", "inprocess", "http", "service", "llm"): |
| 76 | raise ValueError(f"Unsupported translation provider: {provider}") | 76 | raise ValueError(f"Unsupported translation provider: {provider}") |
| 77 | 77 | ||
| 78 | # Env override for http base_url | 78 | # Env override for http base_url |
| 79 | env_url = os.getenv("TRANSLATION_SERVICE_URL") | 79 | env_url = os.getenv("TRANSLATION_SERVICE_URL") |
| 80 | - if env_url and provider == "http": | 80 | + if env_url and provider in ("http", "service"): |
| 81 | providers = dict(providers) | 81 | providers = dict(providers) |
| 82 | providers["http"] = dict(providers.get("http", {})) | 82 | providers["http"] = dict(providers.get("http", {})) |
| 83 | providers["http"]["base_url"] = env_url.rstrip("/") | 83 | providers["http"]["base_url"] = env_url.rstrip("/") |
| @@ -206,6 +206,27 @@ def get_translation_base_url() -> str: | @@ -206,6 +206,27 @@ def get_translation_base_url() -> str: | ||
| 206 | return str(base).rstrip("/") | 206 | return str(base).rstrip("/") |
| 207 | 207 | ||
| 208 | 208 | ||
| 209 | +def get_translation_cache_config() -> Dict[str, Any]: | ||
| 210 | + """ | ||
| 211 | + Resolve translation cache policy from services.translation.cache. | ||
| 212 | + | ||
| 213 | + All translation cache key/TTL behavior should be configured in config.yaml, | ||
| 214 | + not hardcoded in code. | ||
| 215 | + """ | ||
| 216 | + raw = _load_services_raw() | ||
| 217 | + cfg = raw.get("translation", {}) if isinstance(raw.get("translation"), dict) else {} | ||
| 218 | + cache_cfg = cfg.get("cache", {}) if isinstance(cfg.get("cache"), dict) else {} | ||
| 219 | + return { | ||
| 220 | + "enabled": bool(cache_cfg.get("enabled", True)), | ||
| 221 | + "key_prefix": str(cache_cfg.get("key_prefix", "trans:v2")), | ||
| 222 | + "ttl_seconds": int(cache_cfg.get("ttl_seconds", 360 * 24 * 3600)), | ||
| 223 | + "sliding_expiration": bool(cache_cfg.get("sliding_expiration", True)), | ||
| 224 | + "key_include_context": bool(cache_cfg.get("key_include_context", True)), | ||
| 225 | + "key_include_prompt": bool(cache_cfg.get("key_include_prompt", True)), | ||
| 226 | + "key_include_source_lang": bool(cache_cfg.get("key_include_source_lang", True)), | ||
| 227 | + } | ||
| 228 | + | ||
| 229 | + | ||
| 209 | def get_embedding_base_url() -> str: | 230 | def get_embedding_base_url() -> str: |
| 210 | """Resolve embedding HTTP base URL.""" | 231 | """Resolve embedding HTTP base URL.""" |
| 211 | base = ( | 232 | base = ( |
| @@ -0,0 +1,82 @@ | @@ -0,0 +1,82 @@ | ||
| 1 | +SOURCE_LANG_CODE_MAP = { | ||
| 2 | + "en": "English", | ||
| 3 | + "zh": "Chinese", | ||
| 4 | + "zh_tw": "Traditional Chinese", | ||
| 5 | + "ru": "Russian", | ||
| 6 | + "ja": "Japanese", | ||
| 7 | + "ko": "Korean", | ||
| 8 | + "es": "Spanish", | ||
| 9 | + "fr": "French", | ||
| 10 | + "pt": "Portuguese", | ||
| 11 | + "de": "German", | ||
| 12 | + "it": "Italian", | ||
| 13 | + "th": "Thai", | ||
| 14 | + "vi": "Vietnamese", | ||
| 15 | + "id": "Indonesian", | ||
| 16 | + "ms": "Malay", | ||
| 17 | + "ar": "Arabic", | ||
| 18 | + "hi": "Hindi", | ||
| 19 | + "he": "Hebrew", | ||
| 20 | + "my": "Burmese", | ||
| 21 | + "ta": "Tamil", | ||
| 22 | + "ur": "Urdu", | ||
| 23 | + "bn": "Bengali", | ||
| 24 | + "pl": "Polish", | ||
| 25 | + "nl": "Dutch", | ||
| 26 | + "ro": "Romanian", | ||
| 27 | + "tr": "Turkish", | ||
| 28 | + "km": "Khmer", | ||
| 29 | + "lo": "Lao", | ||
| 30 | + "yue": "Cantonese", | ||
| 31 | + "cs": "Czech", | ||
| 32 | + "el": "Greek", | ||
| 33 | + "sv": "Swedish", | ||
| 34 | + "hu": "Hungarian", | ||
| 35 | + "da": "Danish", | ||
| 36 | + "fi": "Finnish", | ||
| 37 | + "uk": "Ukrainian", | ||
| 38 | + "bg": "Bulgarian", | ||
| 39 | +} | ||
| 40 | + | ||
| 41 | +TARGET_LANG_CODE_MAP = {v: k for k, v in SOURCE_LANG_CODE_MAP.items()} | ||
| 42 | + | ||
| 43 | +TRANSLATION_PROMPTS = { | ||
| 44 | + "general": { | ||
| 45 | + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译专家,请准确传达原文含义并符合{target_lang}语言习惯,只输出翻译结果:{text}", | ||
| 46 | + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Accurately convey the meaning following {target_lang} grammar and usage, output only the translation: {text}", | ||
| 47 | + "ru": "Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Точно передайте смысл текста, соблюдая нормы {target_lang}, выводите только перевод: {text}", | ||
| 48 | + "ar": "أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). انقل المعنى بدقة وفق قواعد {target_lang} وأخرج الترجمة فقط: {text}", | ||
| 49 | + "ja": "あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロ翻訳者です。意味を正確に伝え、{target_lang}の表現に従い、翻訳のみ出力してください:{text}", | ||
| 50 | + "es": "Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Transmite con precisión el significado y devuelve solo la traducción: {text}", | ||
| 51 | + "de": "Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Gib die Bedeutung korrekt wieder und gib nur die Übersetzung aus: {text}", | ||
| 52 | + "fr": "Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Transmettez fidèlement le sens et produisez uniquement la traduction : {text}", | ||
| 53 | + "it": "Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Trasmetti accuratamente il significato e restituisci solo la traduzione: {text}", | ||
| 54 | + "pt": "Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Transmita o significado com precisão e produza apenas a tradução: {text}" | ||
| 55 | + }, | ||
| 56 | + | ||
| 57 | + "sku_name": { | ||
| 58 | + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})电商翻译专家,请将原文翻译为{target_lang}商品SKU名称,要求准确完整、简洁专业,只输出结果:{text}", | ||
| 59 | + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) ecommerce translator. Translate into a concise and accurate {target_lang} product SKU name, output only the result: {text}", | ||
| 60 | + "ru": "Вы переводчик e-commerce с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Переведите в краткое и точное название SKU товара на {target_lang}, выводите только результат: {text}", | ||
| 61 | + "ar": "أنت مترجم تجارة إلكترونية من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). ترجم إلى اسم SKU للمنتج بلغة {target_lang} بدقة واختصار، وأخرج النتيجة فقط: {text}", | ||
| 62 | + "ja": "{source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのEC翻訳者として、簡潔で正確な{target_lang}の商品SKU名に翻訳し、結果のみ出力してください:{text}", | ||
| 63 | + "es": "Eres un traductor ecommerce de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce a un nombre SKU de producto en {target_lang}, preciso y conciso, devuelve solo el resultado: {text}", | ||
| 64 | + "de": "Du bist ein E-Commerce-Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Übersetze in einen präzisen und kurzen {target_lang} Produkt-SKU-Namen, nur Ergebnis ausgeben: {text}", | ||
| 65 | + "fr": "Vous êtes un traducteur e-commerce de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Traduisez en un nom SKU produit {target_lang} précis et concis, sortie uniquement : {text}", | ||
| 66 | + "it": "Sei un traduttore ecommerce da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce in un nome SKU prodotto {target_lang} conciso e accurato, restituisci solo il risultato: {text}", | ||
| 67 | + "pt": "Você é um tradutor de e-commerce de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Traduza para um nome SKU de produto {target_lang} conciso e preciso, produza apenas o resultado: {text}" | ||
| 68 | + }, | ||
| 69 | + | ||
| 70 | + "ecommerce_search_query": { | ||
| 71 | + "zh": "你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译助手,请将电商搜索词准确翻译为{target_lang}并符合搜索习惯,只输出结果:{text}", | ||
| 72 | + "en": "You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Translate the ecommerce search query accurately following {target_lang} search habits, output only the result: {text}", | ||
| 73 | + "ru": "Вы переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Переведите поисковый запрос e-commerce с учётом привычек поиска, выводите только результат: {text}", | ||
| 74 | + "ar": "أنت مترجم من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). ترجم عبارة البحث للتجارة الإلكترونية بما يناسب عادات البحث وأخرج النتيجة فقط: {text}", | ||
| 75 | + "ja": "{source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})への翻訳者として、EC検索キーワードを{target_lang}の検索習慣に合わせて翻訳し、結果のみ出力してください:{text}", | ||
| 76 | + "es": "Eres un traductor de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce la consulta de búsqueda ecommerce según los hábitos de búsqueda y devuelve solo el resultado: {text}", | ||
| 77 | + "de": "Du bist ein Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Übersetze die E-Commerce-Suchanfrage entsprechend den Suchgewohnheiten, nur Ergebnis ausgeben: {text}", | ||
| 78 | + "fr": "Vous êtes un traducteur de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Traduisez la requête de recherche e-commerce selon les habitudes de recherche, sortie uniquement : {text}", | ||
| 79 | + "it": "Sei un traduttore da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Traduce la query di ricerca ecommerce secondo le abitudini di ricerca e restituisci solo il risultato: {text}", | ||
| 80 | + "pt": "Você é um tradutor de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Traduza a consulta de busca de ecommerce conforme os hábitos de busca e produza apenas o resultado: {text}" | ||
| 81 | + } | ||
| 82 | +} |
docs/搜索API对接指南.md
| @@ -1814,7 +1814,8 @@ curl "http://localhost:6007/health" | @@ -1814,7 +1814,8 @@ curl "http://localhost:6007/health" | ||
| 1814 | "text": "商品名称", | 1814 | "text": "商品名称", |
| 1815 | "target_lang": "en", | 1815 | "target_lang": "en", |
| 1816 | "source_lang": "zh", | 1816 | "source_lang": "zh", |
| 1817 | - "model": "qwen" | 1817 | + "model": "qwen", |
| 1818 | + "context": "sku_name" | ||
| 1818 | } | 1819 | } |
| 1819 | ``` | 1820 | ``` |
| 1820 | 1821 | ||
| @@ -1823,7 +1824,8 @@ curl "http://localhost:6007/health" | @@ -1823,7 +1824,8 @@ curl "http://localhost:6007/health" | ||
| 1823 | | `text` | string | Y | 待翻译文本 | | 1824 | | `text` | string | Y | 待翻译文本 | |
| 1824 | | `target_lang` | string | Y | 目标语言:`zh`、`en`、`ru` 等 | | 1825 | | `target_lang` | string | Y | 目标语言:`zh`、`en`、`ru` 等 | |
| 1825 | | `source_lang` | string | N | 源语言,不传则自动检测 | | 1826 | | `source_lang` | string | N | 源语言,不传则自动检测 | |
| 1826 | -| `model` | string | N | `qwen`(默认)或 `deepl` | | 1827 | +| `model` | string | N | `qwen`(默认)、`deepl` 或 `llm` | |
| 1828 | +| `context` | string | N | 翻译场景参数:商品标题翻译使用 `sku_name`,搜索请求中的 query 翻译使用 `ecommerce_search_query`,其它通用场景可不传或使用 `general` | | ||
| 1827 | 1829 | ||
| 1828 | **响应**: | 1830 | **响应**: |
| 1829 | ```json | 1831 | ```json |
indexer/document_transformer.py
| @@ -36,7 +36,6 @@ class SPUDocumentTransformer: | @@ -36,7 +36,6 @@ class SPUDocumentTransformer: | ||
| 36 | searchable_option_dimensions: List[str], | 36 | searchable_option_dimensions: List[str], |
| 37 | tenant_config: Optional[Dict[str, Any]] = None, | 37 | tenant_config: Optional[Dict[str, Any]] = None, |
| 38 | translator: Optional[Any] = None, | 38 | translator: Optional[Any] = None, |
| 39 | - translation_prompts: Optional[Dict[str, str]] = None, | ||
| 40 | encoder: Optional[Any] = None, | 39 | encoder: Optional[Any] = None, |
| 41 | enable_title_embedding: bool = True, | 40 | enable_title_embedding: bool = True, |
| 42 | image_encoder: Optional[Any] = None, | 41 | image_encoder: Optional[Any] = None, |
| @@ -50,7 +49,6 @@ class SPUDocumentTransformer: | @@ -50,7 +49,6 @@ class SPUDocumentTransformer: | ||
| 50 | searchable_option_dimensions: 可搜索的option维度列表 | 49 | searchable_option_dimensions: 可搜索的option维度列表 |
| 51 | tenant_config: 租户配置(包含主语言和翻译配置) | 50 | tenant_config: 租户配置(包含主语言和翻译配置) |
| 52 | translator: 翻译器实例(可选,如果提供则启用翻译功能) | 51 | translator: 翻译器实例(可选,如果提供则启用翻译功能) |
| 53 | - translation_prompts: 翻译提示词配置(可选) | ||
| 54 | encoder: 文本编码器实例(可选,用于生成title_embedding) | 52 | encoder: 文本编码器实例(可选,用于生成title_embedding) |
| 55 | enable_title_embedding: 是否启用标题向量化(默认True) | 53 | enable_title_embedding: 是否启用标题向量化(默认True) |
| 56 | image_encoder: 图片编码器实例(可选,需实现 encode_image_urls(urls) -> List[Optional[np.ndarray]]) | 54 | image_encoder: 图片编码器实例(可选,需实现 encode_image_urls(urls) -> List[Optional[np.ndarray]]) |
| @@ -60,12 +58,33 @@ class SPUDocumentTransformer: | @@ -60,12 +58,33 @@ class SPUDocumentTransformer: | ||
| 60 | self.searchable_option_dimensions = searchable_option_dimensions | 58 | self.searchable_option_dimensions = searchable_option_dimensions |
| 61 | self.tenant_config = tenant_config or {} | 59 | self.tenant_config = tenant_config or {} |
| 62 | self.translator = translator | 60 | self.translator = translator |
| 63 | - self.translation_prompts = translation_prompts or {} | ||
| 64 | self.encoder = encoder | 61 | self.encoder = encoder |
| 65 | self.enable_title_embedding = enable_title_embedding | 62 | self.enable_title_embedding = enable_title_embedding |
| 66 | self.image_encoder = image_encoder | 63 | self.image_encoder = image_encoder |
| 67 | self.enable_image_embedding = bool(enable_image_embedding and image_encoder is not None) | 64 | self.enable_image_embedding = bool(enable_image_embedding and image_encoder is not None) |
| 68 | 65 | ||
| 66 | + def _translate_index_languages( | ||
| 67 | + self, | ||
| 68 | + text: str, | ||
| 69 | + source_lang: str, | ||
| 70 | + index_languages: List[str], | ||
| 71 | + scene: str, | ||
| 72 | + ) -> Dict[str, Optional[str]]: | ||
| 73 | + translations: Dict[str, Optional[str]] = {} | ||
| 74 | + if not self.translator or not text or not str(text).strip(): | ||
| 75 | + return translations | ||
| 76 | + for lang in index_languages: | ||
| 77 | + if lang == source_lang: | ||
| 78 | + translations[lang] = text | ||
| 79 | + continue | ||
| 80 | + translations[lang] = self.translator.translate( | ||
| 81 | + text=text, | ||
| 82 | + target_lang=lang, | ||
| 83 | + source_lang=source_lang, | ||
| 84 | + context=scene, | ||
| 85 | + ) | ||
| 86 | + return translations | ||
| 87 | + | ||
| 69 | def transform_spu_to_doc( | 88 | def transform_spu_to_doc( |
| 70 | self, | 89 | self, |
| 71 | tenant_id: str, | 90 | tenant_id: str, |
| @@ -322,15 +341,12 @@ class SPUDocumentTransformer: | @@ -322,15 +341,12 @@ class SPUDocumentTransformer: | ||
| 322 | title_text = str(spu_row['title']) | 341 | title_text = str(spu_row['title']) |
| 323 | translations: Dict[str, Optional[str]] = {} | 342 | translations: Dict[str, Optional[str]] = {} |
| 324 | if self.translator: | 343 | if self.translator: |
| 325 | - prompt_zh = self.translation_prompts.get('product_title_zh') or self.translation_prompts.get('default_zh') | ||
| 326 | - prompt_en = self.translation_prompts.get('product_title_en') or self.translation_prompts.get('default_en') | ||
| 327 | - translations = self.translator.translate_for_indexing( | ||
| 328 | - title_text, | ||
| 329 | - shop_language=primary_lang, | 344 | + translations = self._translate_index_languages( |
| 345 | + text=title_text, | ||
| 330 | source_lang=primary_lang, | 346 | source_lang=primary_lang, |
| 331 | - prompt=prompt_zh if primary_lang == 'zh' else prompt_en, | ||
| 332 | index_languages=index_langs, | 347 | index_languages=index_langs, |
| 333 | - ) or {} | 348 | + scene="product_title", |
| 349 | + ) | ||
| 334 | _set_lang_obj("title", title_text, translations) | 350 | _set_lang_obj("title", title_text, translations) |
| 335 | 351 | ||
| 336 | # Brief | 352 | # Brief |
| @@ -338,14 +354,12 @@ class SPUDocumentTransformer: | @@ -338,14 +354,12 @@ class SPUDocumentTransformer: | ||
| 338 | brief_text = str(spu_row['brief']) | 354 | brief_text = str(spu_row['brief']) |
| 339 | translations = {} | 355 | translations = {} |
| 340 | if self.translator: | 356 | if self.translator: |
| 341 | - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') | ||
| 342 | - translations = self.translator.translate_for_indexing( | ||
| 343 | - brief_text, | ||
| 344 | - shop_language=primary_lang, | 357 | + translations = self._translate_index_languages( |
| 358 | + text=brief_text, | ||
| 345 | source_lang=primary_lang, | 359 | source_lang=primary_lang, |
| 346 | - prompt=prompt, | ||
| 347 | index_languages=index_langs, | 360 | index_languages=index_langs, |
| 348 | - ) or {} | 361 | + scene="default", |
| 362 | + ) | ||
| 349 | _set_lang_obj("brief", brief_text, translations) | 363 | _set_lang_obj("brief", brief_text, translations) |
| 350 | 364 | ||
| 351 | # Description | 365 | # Description |
| @@ -353,14 +367,12 @@ class SPUDocumentTransformer: | @@ -353,14 +367,12 @@ class SPUDocumentTransformer: | ||
| 353 | desc_text = str(spu_row['description']) | 367 | desc_text = str(spu_row['description']) |
| 354 | translations = {} | 368 | translations = {} |
| 355 | if self.translator: | 369 | if self.translator: |
| 356 | - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') | ||
| 357 | - translations = self.translator.translate_for_indexing( | ||
| 358 | - desc_text, | ||
| 359 | - shop_language=primary_lang, | 370 | + translations = self._translate_index_languages( |
| 371 | + text=desc_text, | ||
| 360 | source_lang=primary_lang, | 372 | source_lang=primary_lang, |
| 361 | - prompt=prompt, | ||
| 362 | index_languages=index_langs, | 373 | index_languages=index_langs, |
| 363 | - ) or {} | 374 | + scene="default", |
| 375 | + ) | ||
| 364 | _set_lang_obj("description", desc_text, translations) | 376 | _set_lang_obj("description", desc_text, translations) |
| 365 | 377 | ||
| 366 | # Vendor | 378 | # Vendor |
| @@ -368,14 +380,12 @@ class SPUDocumentTransformer: | @@ -368,14 +380,12 @@ class SPUDocumentTransformer: | ||
| 368 | vendor_text = str(spu_row['vendor']) | 380 | vendor_text = str(spu_row['vendor']) |
| 369 | translations = {} | 381 | translations = {} |
| 370 | if self.translator: | 382 | if self.translator: |
| 371 | - prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') | ||
| 372 | - translations = self.translator.translate_for_indexing( | ||
| 373 | - vendor_text, | ||
| 374 | - shop_language=primary_lang, | 383 | + translations = self._translate_index_languages( |
| 384 | + text=vendor_text, | ||
| 375 | source_lang=primary_lang, | 385 | source_lang=primary_lang, |
| 376 | - prompt=prompt, | ||
| 377 | index_languages=index_langs, | 386 | index_languages=index_langs, |
| 378 | - ) or {} | 387 | + scene="default", |
| 388 | + ) | ||
| 379 | _set_lang_obj("vendor", vendor_text, translations) | 389 | _set_lang_obj("vendor", vendor_text, translations) |
| 380 | 390 | ||
| 381 | def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series): | 391 | def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series): |
indexer/incremental_service.py
| @@ -39,7 +39,6 @@ class IncrementalIndexerService: | @@ -39,7 +39,6 @@ class IncrementalIndexerService: | ||
| 39 | self._config: Optional[Any] = None | 39 | self._config: Optional[Any] = None |
| 40 | self._config_lock = threading.Lock() | 40 | self._config_lock = threading.Lock() |
| 41 | self._translator: Optional[Any] = None | 41 | self._translator: Optional[Any] = None |
| 42 | - self._translation_prompts: Optional[Dict[str, Any]] = None | ||
| 43 | self._searchable_option_dimensions: Optional[List[str]] = None | 42 | self._searchable_option_dimensions: Optional[List[str]] = None |
| 44 | self._shared_text_encoder: Optional[Any] = None | 43 | self._shared_text_encoder: Optional[Any] = None |
| 45 | self._shared_image_encoder: Optional[Any] = None | 44 | self._shared_image_encoder: Optional[Any] = None |
| @@ -52,7 +51,6 @@ class IncrementalIndexerService: | @@ -52,7 +51,6 @@ class IncrementalIndexerService: | ||
| 52 | def _eager_init(self) -> None: | 51 | def _eager_init(self) -> None: |
| 53 | """Strict eager initialization. Any dependency failure should fail fast.""" | 52 | """Strict eager initialization. Any dependency failure should fail fast.""" |
| 54 | self._config = ConfigLoader("config/config.yaml").load_config() | 53 | self._config = ConfigLoader("config/config.yaml").load_config() |
| 55 | - self._translation_prompts = getattr(self._config.query_config, "translation_prompts", {}) or {} | ||
| 56 | self._searchable_option_dimensions = ( | 54 | self._searchable_option_dimensions = ( |
| 57 | getattr(self._config.spu_config, "searchable_option_dimensions", None) | 55 | getattr(self._config.spu_config, "searchable_option_dimensions", None) |
| 58 | or ["option1", "option2", "option3"] | 56 | or ["option1", "option2", "option3"] |
| @@ -110,7 +108,6 @@ class IncrementalIndexerService: | @@ -110,7 +108,6 @@ class IncrementalIndexerService: | ||
| 110 | tenant_id=tenant_id, | 108 | tenant_id=tenant_id, |
| 111 | searchable_option_dimensions=self._searchable_option_dimensions, | 109 | searchable_option_dimensions=self._searchable_option_dimensions, |
| 112 | translator=self._translator, | 110 | translator=self._translator, |
| 113 | - translation_prompts=self._translation_prompts, | ||
| 114 | encoder=encoder, | 111 | encoder=encoder, |
| 115 | enable_title_embedding=False, # batch fill later | 112 | enable_title_embedding=False, # batch fill later |
| 116 | image_encoder=image_encoder, | 113 | image_encoder=image_encoder, |
indexer/indexing_utils.py
| @@ -57,7 +57,6 @@ def create_document_transformer( | @@ -57,7 +57,6 @@ def create_document_transformer( | ||
| 57 | tenant_id: str, | 57 | tenant_id: str, |
| 58 | searchable_option_dimensions: Optional[list] = None, | 58 | searchable_option_dimensions: Optional[list] = None, |
| 59 | translator: Optional[Any] = None, | 59 | translator: Optional[Any] = None, |
| 60 | - translation_prompts: Optional[Dict[str, str]] = None, | ||
| 61 | encoder: Optional[Any] = None, | 60 | encoder: Optional[Any] = None, |
| 62 | enable_title_embedding: bool = True, | 61 | enable_title_embedding: bool = True, |
| 63 | image_encoder: Optional[Any] = None, | 62 | image_encoder: Optional[Any] = None, |
| @@ -72,7 +71,6 @@ def create_document_transformer( | @@ -72,7 +71,6 @@ def create_document_transformer( | ||
| 72 | tenant_id: 租户ID | 71 | tenant_id: 租户ID |
| 73 | searchable_option_dimensions: 可搜索的option维度列表(如果为None则从配置加载) | 72 | searchable_option_dimensions: 可搜索的option维度列表(如果为None则从配置加载) |
| 74 | translator: 翻译器实例(如果为None则根据配置初始化) | 73 | translator: 翻译器实例(如果为None则根据配置初始化) |
| 75 | - translation_prompts: 翻译提示词配置(如果为None则从配置加载) | ||
| 76 | encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化) | 74 | encoder: 文本编码器实例(如果为None且enable_title_embedding为True则根据配置初始化) |
| 77 | enable_title_embedding: 是否启用标题向量化(默认True) | 75 | enable_title_embedding: 是否启用标题向量化(默认True) |
| 78 | image_encoder: 图片编码器(可选,需实现 encode_image_urls(urls)) | 76 | image_encoder: 图片编码器(可选,需实现 encode_image_urls(urls)) |
| @@ -89,7 +87,6 @@ def create_document_transformer( | @@ -89,7 +87,6 @@ def create_document_transformer( | ||
| 89 | if ( | 87 | if ( |
| 90 | searchable_option_dimensions is None | 88 | searchable_option_dimensions is None |
| 91 | or translator is None | 89 | or translator is None |
| 92 | - or translation_prompts is None | ||
| 93 | or (encoder is None and enable_title_embedding) | 90 | or (encoder is None and enable_title_embedding) |
| 94 | or config is None | 91 | or config is None |
| 95 | ): | 92 | ): |
| @@ -107,9 +104,6 @@ def create_document_transformer( | @@ -107,9 +104,6 @@ def create_document_transformer( | ||
| 107 | 104 | ||
| 108 | translator = create_translation_provider(config.query_config) | 105 | translator = create_translation_provider(config.query_config) |
| 109 | 106 | ||
| 110 | - if translation_prompts is None: | ||
| 111 | - translation_prompts = config.query_config.translation_prompts | ||
| 112 | - | ||
| 113 | # 初始化encoder(如果启用标题向量化且未提供encoder) | 107 | # 初始化encoder(如果启用标题向量化且未提供encoder) |
| 114 | if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding: | 108 | if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding: |
| 115 | from embeddings.text_encoder import TextEmbeddingEncoder | 109 | from embeddings.text_encoder import TextEmbeddingEncoder |
| @@ -122,7 +116,6 @@ def create_document_transformer( | @@ -122,7 +116,6 @@ def create_document_transformer( | ||
| 122 | searchable_option_dimensions=searchable_option_dimensions, | 116 | searchable_option_dimensions=searchable_option_dimensions, |
| 123 | tenant_config=tenant_config, | 117 | tenant_config=tenant_config, |
| 124 | translator=translator, | 118 | translator=translator, |
| 125 | - translation_prompts=translation_prompts, | ||
| 126 | encoder=encoder, | 119 | encoder=encoder, |
| 127 | enable_title_embedding=enable_title_embedding, | 120 | enable_title_embedding=enable_title_embedding, |
| 128 | image_encoder=image_encoder, | 121 | image_encoder=image_encoder, |
indexer/test_indexing.py
| @@ -285,7 +285,6 @@ def test_document_transformer(): | @@ -285,7 +285,6 @@ def test_document_transformer(): | ||
| 285 | searchable_option_dimensions=['option1', 'option2', 'option3'], | 285 | searchable_option_dimensions=['option1', 'option2', 'option3'], |
| 286 | tenant_config=tenant_config, | 286 | tenant_config=tenant_config, |
| 287 | translator=translator, | 287 | translator=translator, |
| 288 | - translation_prompts=config.query_config.translation_prompts | ||
| 289 | ) | 288 | ) |
| 290 | 289 | ||
| 291 | # 转换文档 | 290 | # 转换文档 |
providers/translation.py
| 1 | -""" | ||
| 2 | -Translation provider - direct (in-process) or HTTP service. | ||
| 3 | -""" | 1 | +"""Translation provider factory and HTTP provider implementation.""" |
| 4 | from __future__ import annotations | 2 | from __future__ import annotations |
| 5 | 3 | ||
| 6 | import logging | 4 | import logging |
| 7 | -from typing import Any, Dict, List, Optional, Union | ||
| 8 | - | ||
| 9 | -from concurrent.futures import Future, ThreadPoolExecutor | 5 | +from typing import Any, Dict, Optional |
| 10 | import requests | 6 | import requests |
| 11 | 7 | ||
| 12 | from config.services_config import get_translation_config, get_translation_base_url | 8 | from config.services_config import get_translation_config, get_translation_base_url |
| @@ -22,19 +18,18 @@ class HttpTranslationProvider: | @@ -22,19 +18,18 @@ class HttpTranslationProvider: | ||
| 22 | base_url: str, | 18 | base_url: str, |
| 23 | model: str = "qwen", | 19 | model: str = "qwen", |
| 24 | timeout_sec: float = 10.0, | 20 | timeout_sec: float = 10.0, |
| 25 | - translation_context: Optional[str] = None, | ||
| 26 | ): | 21 | ): |
| 27 | self.base_url = (base_url or "").rstrip("/") | 22 | self.base_url = (base_url or "").rstrip("/") |
| 28 | self.model = model or "qwen" | 23 | self.model = model or "qwen" |
| 29 | self.timeout_sec = float(timeout_sec or 10.0) | 24 | self.timeout_sec = float(timeout_sec or 10.0) |
| 30 | - self.translation_context = translation_context or "e-commerce product search" | ||
| 31 | - self.executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="http-translator") | ||
| 32 | 25 | ||
| 33 | def _translate_once( | 26 | def _translate_once( |
| 34 | self, | 27 | self, |
| 35 | text: str, | 28 | text: str, |
| 36 | target_lang: str, | 29 | target_lang: str, |
| 37 | source_lang: Optional[str] = None, | 30 | source_lang: Optional[str] = None, |
| 31 | + context: Optional[str] = None, | ||
| 32 | + prompt: Optional[str] = None, | ||
| 38 | ) -> Optional[str]: | 33 | ) -> Optional[str]: |
| 39 | if not text or not str(text).strip(): | 34 | if not text or not str(text).strip(): |
| 40 | return text | 35 | return text |
| @@ -46,6 +41,10 @@ class HttpTranslationProvider: | @@ -46,6 +41,10 @@ class HttpTranslationProvider: | ||
| 46 | "source_lang": source_lang or "auto", | 41 | "source_lang": source_lang or "auto", |
| 47 | "model": self.model, | 42 | "model": self.model, |
| 48 | } | 43 | } |
| 44 | + if context: | ||
| 45 | + payload["context"] = context | ||
| 46 | + if prompt: | ||
| 47 | + payload["prompt"] = prompt | ||
| 49 | response = requests.post(url, json=payload, timeout=self.timeout_sec) | 48 | response = requests.post(url, json=payload, timeout=self.timeout_sec) |
| 50 | if response.status_code != 200: | 49 | if response.status_code != 200: |
| 51 | logger.warning( | 50 | logger.warning( |
| @@ -69,58 +68,13 @@ class HttpTranslationProvider: | @@ -69,58 +68,13 @@ class HttpTranslationProvider: | ||
| 69 | context: Optional[str] = None, | 68 | context: Optional[str] = None, |
| 70 | prompt: Optional[str] = None, | 69 | prompt: Optional[str] = None, |
| 71 | ) -> Optional[str]: | 70 | ) -> Optional[str]: |
| 72 | - del context, prompt | ||
| 73 | - result = self._translate_once(text=text, target_lang=target_lang, source_lang=source_lang) | ||
| 74 | - return result if result is not None else text | ||
| 75 | - | ||
| 76 | - def translate_multi( | ||
| 77 | - self, | ||
| 78 | - text: str, | ||
| 79 | - target_langs: List[str], | ||
| 80 | - source_lang: Optional[str] = None, | ||
| 81 | - context: Optional[str] = None, | ||
| 82 | - async_mode: bool = True, | ||
| 83 | - prompt: Optional[str] = None, | ||
| 84 | - ) -> Dict[str, Optional[str]]: | ||
| 85 | - del context, async_mode, prompt | ||
| 86 | - out: Dict[str, Optional[str]] = {} | ||
| 87 | - for lang in target_langs: | ||
| 88 | - out[lang] = self.translate(text, lang, source_lang=source_lang) | ||
| 89 | - return out | ||
| 90 | - | ||
| 91 | - def translate_multi_async( | ||
| 92 | - self, | ||
| 93 | - text: str, | ||
| 94 | - target_langs: List[str], | ||
| 95 | - source_lang: Optional[str] = None, | ||
| 96 | - context: Optional[str] = None, | ||
| 97 | - prompt: Optional[str] = None, | ||
| 98 | - ) -> Dict[str, Union[str, Future]]: | ||
| 99 | - del context, prompt | ||
| 100 | - out: Dict[str, Union[str, Future]] = {} | ||
| 101 | - for lang in target_langs: | ||
| 102 | - out[lang] = self.executor.submit(self.translate, text, lang, source_lang) | ||
| 103 | - return out | ||
| 104 | - | ||
| 105 | - def translate_for_indexing( | ||
| 106 | - self, | ||
| 107 | - text: str, | ||
| 108 | - shop_language: str, | ||
| 109 | - source_lang: Optional[str] = None, | ||
| 110 | - context: Optional[str] = None, | ||
| 111 | - prompt: Optional[str] = None, | ||
| 112 | - index_languages: Optional[List[str]] = None, | ||
| 113 | - ) -> Dict[str, Optional[str]]: | ||
| 114 | - del context, prompt | ||
| 115 | - langs = index_languages if index_languages else ["en", "zh"] | ||
| 116 | - source = source_lang or shop_language or "auto" | ||
| 117 | - out: Dict[str, Optional[str]] = {} | ||
| 118 | - for lang in langs: | ||
| 119 | - if lang == shop_language: | ||
| 120 | - out[lang] = text | ||
| 121 | - else: | ||
| 122 | - out[lang] = self.translate(text, target_lang=lang, source_lang=source) | ||
| 123 | - return out | 71 | + return self._translate_once( |
| 72 | + text=text, | ||
| 73 | + target_lang=target_lang, | ||
| 74 | + source_lang=source_lang, | ||
| 75 | + context=context, | ||
| 76 | + prompt=prompt, | ||
| 77 | + ) | ||
| 124 | 78 | ||
| 125 | 79 | ||
| 126 | def create_translation_provider(query_config: Any = None) -> Any: | 80 | def create_translation_provider(query_config: Any = None) -> Any: |
| @@ -133,9 +87,9 @@ def create_translation_provider(query_config: Any = None) -> Any: | @@ -133,9 +87,9 @@ def create_translation_provider(query_config: Any = None) -> Any: | ||
| 133 | provider = cfg.provider | 87 | provider = cfg.provider |
| 134 | pc = cfg.get_provider_cfg() | 88 | pc = cfg.get_provider_cfg() |
| 135 | 89 | ||
| 136 | - if provider in ("direct", "local", "inprocess"): | 90 | + if provider in ("qwen-mt", "direct", "local", "inprocess"): |
| 137 | from query.qwen_mt_translate import Translator | 91 | from query.qwen_mt_translate import Translator |
| 138 | - model = pc.get("model") or "qwen" | 92 | + model = pc.get("model") or "qwen-mt-flash" |
| 139 | qc = query_config or _empty_query_config() | 93 | qc = query_config or _empty_query_config() |
| 140 | return Translator( | 94 | return Translator( |
| 141 | model=model, | 95 | model=model, |
| @@ -145,7 +99,7 @@ def create_translation_provider(query_config: Any = None) -> Any: | @@ -145,7 +99,7 @@ def create_translation_provider(query_config: Any = None) -> Any: | ||
| 145 | translation_context=getattr(qc, "translation_context", "e-commerce product search"), | 99 | translation_context=getattr(qc, "translation_context", "e-commerce product search"), |
| 146 | ) | 100 | ) |
| 147 | 101 | ||
| 148 | - if provider in ("http", "service"): | 102 | + elif provider in ("http", "service"): |
| 149 | base_url = get_translation_base_url() | 103 | base_url = get_translation_base_url() |
| 150 | model = pc.get("model") or "qwen" | 104 | model = pc.get("model") or "qwen" |
| 151 | timeout = pc.get("timeout_sec", 10.0) | 105 | timeout = pc.get("timeout_sec", 10.0) |
| @@ -154,7 +108,26 @@ def create_translation_provider(query_config: Any = None) -> Any: | @@ -154,7 +108,26 @@ def create_translation_provider(query_config: Any = None) -> Any: | ||
| 154 | base_url=base_url, | 108 | base_url=base_url, |
| 155 | model=model, | 109 | model=model, |
| 156 | timeout_sec=float(timeout), | 110 | timeout_sec=float(timeout), |
| 157 | - translation_context=getattr(qc, "translation_context", "e-commerce product search"), | 111 | + ) |
| 112 | + | ||
| 113 | + elif provider == "llm": | ||
| 114 | + from query.llm_translate import LLMTranslatorProvider | ||
| 115 | + model = pc.get("model") | ||
| 116 | + timeout = float(pc.get("timeout_sec", 30.0)) | ||
| 117 | + base_url = (pc.get("base_url") or "").strip() or None | ||
| 118 | + return LLMTranslatorProvider( | ||
| 119 | + model=model, | ||
| 120 | + timeout_sec=timeout, | ||
| 121 | + base_url=base_url, | ||
| 122 | + ) | ||
| 123 | + | ||
| 124 | + elif provider == "deepl": | ||
| 125 | + from query.deepl_provider import DeepLProvider | ||
| 126 | + qc = query_config or _empty_query_config() | ||
| 127 | + return DeepLProvider( | ||
| 128 | + api_key=getattr(qc, "translation_api_key", None), | ||
| 129 | + timeout=float(pc.get("timeout_sec", 10.0)), | ||
| 130 | + glossary_id=pc.get("glossary_id") or getattr(qc, "translation_glossary_id", None), | ||
| 158 | ) | 131 | ) |
| 159 | 132 | ||
| 160 | raise ValueError(f"Unsupported translation provider: {provider}") | 133 | raise ValueError(f"Unsupported translation provider: {provider}") |
query/deepl_provider.py
| @@ -0,0 +1,203 @@ | @@ -0,0 +1,203 @@ | ||
| 1 | +""" | ||
| 2 | +DeepL backend provider. | ||
| 3 | + | ||
| 4 | +This module only handles network calls to DeepL. | ||
| 5 | +It does not handle cache, async fanout, or fallback semantics. | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +from __future__ import annotations | ||
| 9 | + | ||
| 10 | +import logging | ||
| 11 | +import os | ||
| 12 | +import re | ||
| 13 | +from typing import Dict, Optional, Tuple | ||
| 14 | + | ||
| 15 | +import requests | ||
| 16 | +from config.services_config import get_translation_config | ||
| 17 | + | ||
| 18 | + | ||
| 19 | +logger = logging.getLogger(__name__) | ||
| 20 | + | ||
| 21 | +DEFAULT_CONTEXTS: Dict[str, Dict[str, str]] = { | ||
| 22 | + "sku_name": { | ||
| 23 | + "zh": "商品SKU名称", | ||
| 24 | + "en": "product SKU name", | ||
| 25 | + }, | ||
| 26 | + "ecommerce_search_query": { | ||
| 27 | + "zh": "电商", | ||
| 28 | + "en": "e-commerce", | ||
| 29 | + }, | ||
| 30 | + "general": { | ||
| 31 | + "zh": "", | ||
| 32 | + "en": "", | ||
| 33 | + }, | ||
| 34 | +} | ||
| 35 | +SCENE_NAMES = frozenset(DEFAULT_CONTEXTS.keys()) | ||
| 36 | + | ||
| 37 | + | ||
| 38 | +def _merge_contexts(raw: object) -> Dict[str, Dict[str, str]]: | ||
| 39 | + merged: Dict[str, Dict[str, str]] = { | ||
| 40 | + scene: dict(lang_map) for scene, lang_map in DEFAULT_CONTEXTS.items() | ||
| 41 | + } | ||
| 42 | + if not isinstance(raw, dict): | ||
| 43 | + return merged | ||
| 44 | + for scene, lang_map in raw.items(): | ||
| 45 | + if not isinstance(lang_map, dict): | ||
| 46 | + continue | ||
| 47 | + scene_name = str(scene or "").strip() | ||
| 48 | + if not scene_name: | ||
| 49 | + continue | ||
| 50 | + merged.setdefault(scene_name, {}) | ||
| 51 | + for lang, value in lang_map.items(): | ||
| 52 | + lang_key = str(lang or "").strip().lower() | ||
| 53 | + context_value = str(value or "").strip() | ||
| 54 | + if lang_key and context_value: | ||
| 55 | + merged[scene_name][lang_key] = context_value | ||
| 56 | + return merged | ||
| 57 | + | ||
| 58 | + | ||
| 59 | +class DeepLProvider: | ||
| 60 | + API_URL = "https://api.deepl.com/v2/translate" # Pro tier | ||
| 61 | + LANG_CODE_MAP = { | ||
| 62 | + "zh": "ZH", | ||
| 63 | + "en": "EN", | ||
| 64 | + "ru": "RU", | ||
| 65 | + "ar": "AR", | ||
| 66 | + "ja": "JA", | ||
| 67 | + "es": "ES", | ||
| 68 | + "de": "DE", | ||
| 69 | + "fr": "FR", | ||
| 70 | + "it": "IT", | ||
| 71 | + "pt": "PT", | ||
| 72 | + } | ||
| 73 | + | ||
| 74 | + def __init__( | ||
| 75 | + self, | ||
| 76 | + api_key: Optional[str], | ||
| 77 | + *, | ||
| 78 | + timeout: float = 10.0, | ||
| 79 | + glossary_id: Optional[str] = None, | ||
| 80 | + ) -> None: | ||
| 81 | + cfg = get_translation_config() | ||
| 82 | + provider_cfg = cfg.providers.get("deepl", {}) if isinstance(cfg.providers, dict) else {} | ||
| 83 | + self.api_key = api_key or os.getenv("DEEPL_AUTH_KEY") | ||
| 84 | + self.timeout = float(provider_cfg.get("timeout_sec") or timeout or 10.0) | ||
| 85 | + self.glossary_id = glossary_id or provider_cfg.get("glossary_id") | ||
| 86 | + self.model = "deepl" | ||
| 87 | + self.context_presets = _merge_contexts(provider_cfg.get("contexts")) | ||
| 88 | + if not self.api_key: | ||
| 89 | + logger.warning("DEEPL_AUTH_KEY not set; DeepL translation is unavailable") | ||
| 90 | + | ||
| 91 | + def _resolve_request_context( | ||
| 92 | + self, | ||
| 93 | + target_lang: str, | ||
| 94 | + context: Optional[str], | ||
| 95 | + prompt: Optional[str], | ||
| 96 | + ) -> Optional[str]: | ||
| 97 | + if prompt: | ||
| 98 | + return prompt | ||
| 99 | + if context in SCENE_NAMES: | ||
| 100 | + scene_map = self.context_presets.get(context) or self.context_presets.get("default") or {} | ||
| 101 | + tgt = (target_lang or "").strip().lower() | ||
| 102 | + return scene_map.get(tgt) or scene_map.get("en") | ||
| 103 | + if context: | ||
| 104 | + return context | ||
| 105 | + scene_map = self.context_presets.get("default") or {} | ||
| 106 | + tgt = (target_lang or "").strip().lower() | ||
| 107 | + return scene_map.get(tgt) or scene_map.get("en") | ||
| 108 | + | ||
| 109 | + def translate( | ||
| 110 | + self, | ||
| 111 | + text: str, | ||
| 112 | + target_lang: str, | ||
| 113 | + source_lang: Optional[str] = None, | ||
| 114 | + context: Optional[str] = None, | ||
| 115 | + prompt: Optional[str] = None, | ||
| 116 | + ) -> Optional[str]: | ||
| 117 | + if not self.api_key: | ||
| 118 | + return None | ||
| 119 | + | ||
| 120 | + target_code = self.LANG_CODE_MAP.get((target_lang or "").lower(), (target_lang or "").upper()) | ||
| 121 | + headers = { | ||
| 122 | + "Authorization": f"DeepL-Auth-Key {self.api_key}", | ||
| 123 | + "Content-Type": "application/json", | ||
| 124 | + } | ||
| 125 | + | ||
| 126 | + api_context = self._resolve_request_context(target_lang, context, prompt) | ||
| 127 | + text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context) | ||
| 128 | + | ||
| 129 | + payload = { | ||
| 130 | + "text": [text_to_translate], | ||
| 131 | + "target_lang": target_code, | ||
| 132 | + } | ||
| 133 | + if source_lang: | ||
| 134 | + payload["source_lang"] = self.LANG_CODE_MAP.get(source_lang.lower(), source_lang.upper()) | ||
| 135 | + if api_context: | ||
| 136 | + payload["context"] = api_context | ||
| 137 | + if self.glossary_id: | ||
| 138 | + payload["glossary_id"] = self.glossary_id | ||
| 139 | + | ||
| 140 | + try: | ||
| 141 | + response = requests.post(self.API_URL, headers=headers, json=payload, timeout=self.timeout) | ||
| 142 | + if response.status_code != 200: | ||
| 143 | + logger.warning( | ||
| 144 | + "[deepl] Failed | status=%s tgt=%s body=%s", | ||
| 145 | + response.status_code, | ||
| 146 | + target_code, | ||
| 147 | + (response.text or "")[:200], | ||
| 148 | + ) | ||
| 149 | + return None | ||
| 150 | + | ||
| 151 | + data = response.json() | ||
| 152 | + translations = data.get("translations") or [] | ||
| 153 | + if not translations: | ||
| 154 | + return None | ||
| 155 | + translated = translations[0].get("text") | ||
| 156 | + if not translated: | ||
| 157 | + return None | ||
| 158 | + if needs_extraction: | ||
| 159 | + translated = self._extract_term_from_translation(translated, text, target_code) | ||
| 160 | + return translated | ||
| 161 | + except requests.Timeout: | ||
| 162 | + logger.warning("[deepl] Timeout | tgt=%s timeout=%.1fs", target_code, self.timeout) | ||
| 163 | + return None | ||
| 164 | + except Exception as exc: | ||
| 165 | + logger.warning("[deepl] Exception | tgt=%s error=%s", target_code, exc, exc_info=True) | ||
| 166 | + return None | ||
| 167 | + | ||
| 168 | + def _add_ecommerce_context( | ||
| 169 | + self, | ||
| 170 | + text: str, | ||
| 171 | + source_lang: Optional[str], | ||
| 172 | + context: Optional[str], | ||
| 173 | + ) -> Tuple[str, bool]: | ||
| 174 | + if not context or "e-commerce" not in context.lower(): | ||
| 175 | + return text, False | ||
| 176 | + if (source_lang or "").lower() != "zh": | ||
| 177 | + return text, False | ||
| 178 | + | ||
| 179 | + term = (text or "").strip() | ||
| 180 | + if len(term.split()) == 1 and len(term) <= 2: | ||
| 181 | + return f"购买 {term}", True | ||
| 182 | + return text, False | ||
| 183 | + | ||
| 184 | + def _extract_term_from_translation( | ||
| 185 | + self, | ||
| 186 | + translated_text: str, | ||
| 187 | + original_text: str, | ||
| 188 | + target_lang_code: str, | ||
| 189 | + ) -> str: | ||
| 190 | + del original_text | ||
| 191 | + if target_lang_code != "EN": | ||
| 192 | + return translated_text | ||
| 193 | + | ||
| 194 | + words = translated_text.strip().split() | ||
| 195 | + if len(words) <= 1: | ||
| 196 | + return translated_text | ||
| 197 | + context_words = {"buy", "purchase", "product", "item", "commodity", "goods"} | ||
| 198 | + for word in reversed(words): | ||
| 199 | + normalized = re.sub(r"[.,!?;:]+$", "", word.lower()) | ||
| 200 | + if normalized not in context_words: | ||
| 201 | + return normalized | ||
| 202 | + return re.sub(r"[.,!?;:]+$", "", words[-1].lower()) | ||
| 203 | + |
query/llm_translate.py
| 1 | """ | 1 | """ |
| 2 | -LLM-based translation helper using Qwen chat model. | 2 | +LLM-based translation backend (DashScope-compatible OpenAI API). |
| 3 | 3 | ||
| 4 | -This module provides a thin wrapper around DashScope's `qwen-flash` model | ||
| 5 | -for high-quality, prompt-controlled translation, independent of the main | ||
| 6 | -`Translator` (machine translation) pipeline. | ||
| 7 | - | ||
| 8 | -Usage example: | ||
| 9 | - | ||
| 10 | - from query.llm_translate import llm_translate | ||
| 11 | - | ||
| 12 | - result = llm_translate( | ||
| 13 | - text="我看到这个视频后没有笑", | ||
| 14 | - target_lang="en", | ||
| 15 | - source_lang="zh", | ||
| 16 | - source_lang_label="中文", | ||
| 17 | - target_lang_label="英文", | ||
| 18 | - ) | 4 | +Failure semantics are strict: |
| 5 | +- success: translated string | ||
| 6 | +- failure: None | ||
| 19 | """ | 7 | """ |
| 20 | 8 | ||
| 21 | from __future__ import annotations | 9 | from __future__ import annotations |
| @@ -23,113 +11,159 @@ from __future__ import annotations | @@ -23,113 +11,159 @@ from __future__ import annotations | ||
| 23 | import logging | 11 | import logging |
| 24 | import os | 12 | import os |
| 25 | import time | 13 | import time |
| 26 | -from typing import Dict, Optional | 14 | +from typing import Optional |
| 27 | 15 | ||
| 28 | from openai import OpenAI | 16 | from openai import OpenAI |
| 29 | 17 | ||
| 30 | from config.env_config import DASHSCOPE_API_KEY | 18 | from config.env_config import DASHSCOPE_API_KEY |
| 31 | from config.services_config import get_translation_config | 19 | from config.services_config import get_translation_config |
| 20 | +from config.translate_prompts import TRANSLATION_PROMPTS, SOURCE_LANG_CODE_MAP | ||
| 21 | + | ||
| 32 | 22 | ||
| 33 | logger = logging.getLogger(__name__) | 23 | logger = logging.getLogger(__name__) |
| 34 | 24 | ||
| 35 | 25 | ||
| 36 | -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 | ||
| 37 | -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | ||
| 38 | -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 | ||
| 39 | -# | ||
| 40 | -# 默认保持与现有翻译/索引脚本相同的美国地域,可通过环境变量覆盖: | ||
| 41 | -# DASHSCOPE_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1 | ||
| 42 | DEFAULT_QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" | 26 | DEFAULT_QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" |
| 43 | -QWEN_MODEL_NAME = "qwen-flash" | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -# 由调用方提供的语言标签/代码填充,占位符说明: | ||
| 47 | -# - source_lang: 源语言的人类可读名称(按目标语言本地化,例如 "中文", "English") | ||
| 48 | -# - target_lang: 目标语言的人类可读名称 | ||
| 49 | -# - src_lang_code: 源语言代码,例如 "zh" | ||
| 50 | -# - tgt_lang_code: 目标语言代码,例如 "en" | ||
| 51 | -TRANSLATION_PROMPTS: Dict[str, str] = { | ||
| 52 | - "zh": """你是一名专业的 {source_lang}({src_lang_code})到 {target_lang}({tgt_lang_code})翻译员。你的目标是在遵循 {target_lang} 的语法、词汇和文化习惯的前提下,准确传达原始 {source_lang} 文本的含义和细微差别。请只输出 {target_lang} 的翻译内容,不要包含任何额外的解释或评论。请将以下 {source_lang} 文本翻译成 {target_lang}: | ||
| 53 | - | ||
| 54 | -{text}""", | ||
| 55 | - "en": """You are a professional {source_lang} ({src_lang_code}) to {target_lang} ({tgt_lang_code}) translator. Your goal is to accurately convey the meaning and nuances of the original {source_lang} text while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities. Produce only the {target_lang} translation, without any additional explanations or commentary. Please translate the following {source_lang} text into {target_lang}: | ||
| 56 | - | ||
| 57 | -{text}""", | ||
| 58 | - "ru": """Вы профессиональный переводчик с {source_lang} ({src_lang_code}) на {target_lang} ({tgt_lang_code}). Ваша задача — точно передать смысл и нюансы исходного текста на {source_lang}, соблюдая грамматику, лексику и культурные особенности {target_lang}. Выводите только перевод на {target_lang}, без каких-либо дополнительных объяснений или комментариев. Пожалуйста, переведите следующий текст с {source_lang} на {target_lang}: | ||
| 59 | - | ||
| 60 | -{text}""", | ||
| 61 | - "ar": """أنت مترجم محترف من {source_lang} ({src_lang_code}) إلى {target_lang} ({tgt_lang_code}). هدفك هو نقل المعنى والدلالات الدقيقة للنص الأصلي بلغة {source_lang} بدقة، مع الالتزام بقواعد اللغة والمفردات والحساسيات الثقافية الخاصة بلغة {target_lang}. قم بإنتاج الترجمة إلى {target_lang} فقط دون أي شروحات أو تعليقات إضافية. يرجى ترجمة النص التالي من {source_lang} إلى {target_lang}: | ||
| 62 | - | ||
| 63 | -{text}""", | ||
| 64 | - "ja": """あなたは {source_lang}({src_lang_code})から {target_lang}({tgt_lang_code})へのプロの翻訳者です。{target_lang} の文法、語彙、文化的配慮に従いながら、元の {source_lang} テキストの意味やニュアンスを正確に伝えることが目的です。追加の説明やコメントは一切含めず、{target_lang} の翻訳のみを出力してください。次の {source_lang} テキストを {target_lang} に翻訳してください: | ||
| 65 | - | ||
| 66 | -{text}""", | ||
| 67 | - "es": """Eres un traductor profesional de {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Tu objetivo es transmitir con precisión el significado y los matices del texto original en {source_lang}, respetando la gramática, el vocabulario y las sensibilidades culturales de {target_lang}. Produce únicamente la traducción en {target_lang}, sin explicaciones ni comentarios adicionales. Por favor, traduce el siguiente texto de {source_lang} a {target_lang}: | ||
| 68 | - | ||
| 69 | -{text}""", | ||
| 70 | - "de": """Du bist ein professioneller Übersetzer von {source_lang} ({src_lang_code}) nach {target_lang} ({tgt_lang_code}). Dein Ziel ist es, die Bedeutung und Nuancen des ursprünglichen {source_lang}-Textes genau zu vermitteln und dabei die Grammatik, den Wortschatz und die kulturellen Besonderheiten von {target_lang} zu berücksichtigen. Gib ausschließlich die Übersetzung in {target_lang} aus, ohne zusätzliche Erklärungen oder Kommentare. Bitte übersetze den folgenden {source_lang}-Text in {target_lang}: | ||
| 71 | - | ||
| 72 | -{text}""", | ||
| 73 | - "fr": """Vous êtes un traducteur professionnel de {source_lang} ({src_lang_code}) vers {target_lang} ({tgt_lang_code}). Votre objectif est de transmettre fidèlement le sens et les nuances du texte original en {source_lang}, tout en respectant la grammaire, le vocabulaire et les sensibilités culturelles de {target_lang}. Produisez uniquement la traduction en {target_lang}, sans explications ni commentaires supplémentaires. Veuillez traduire le texte suivant de {source_lang} vers {target_lang} : | ||
| 74 | - | ||
| 75 | -{text}""", | ||
| 76 | - "it": """Sei un traduttore professionista da {source_lang} ({src_lang_code}) a {target_lang} ({tgt_lang_code}). Il tuo obiettivo è trasmettere con precisione il significato e le sfumature del testo originale in {source_lang}, rispettando la grammatica, il vocabolario e le sensibilità culturali di {target_lang}. Produci solo la traduzione in {target_lang}, senza spiegazioni o commenti aggiuntivi. Per favore traduci il seguente testo da {source_lang} a {target_lang}: | ||
| 77 | - | ||
| 78 | -{text}""", | ||
| 79 | - "pt": """Você é um tradutor profissional de {source_lang} ({src_lang_code}) para {target_lang} ({tgt_lang_code}). Seu objetivo é transmitir com precisão o significado e as nuances do texto original em {source_lang}, respeitando a gramática, o vocabulário e as sensibilidades culturais de {target_lang}. Produza apenas a tradução em {target_lang}, sem quaisquer explicações ou comentários adicionais. Por favor, traduza o seguinte texto de {source_lang} para {target_lang}: | ||
| 80 | - | ||
| 81 | -{text}""", | ||
| 82 | -} | ||
| 83 | - | ||
| 84 | - | ||
| 85 | -def _get_qwen_client(base_url: Optional[str] = None) -> Optional[OpenAI]: | ||
| 86 | - """ | ||
| 87 | - Lazily construct an OpenAI-compatible client for DashScope. | ||
| 88 | - | ||
| 89 | - Uses DASHSCOPE_API_KEY and base_url (provider config / env) to configure endpoint. | ||
| 90 | - """ | ||
| 91 | - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | ||
| 92 | - if not api_key: | ||
| 93 | - logger.warning("DASHSCOPE_API_KEY not set; llm-based translation will be disabled") | ||
| 94 | - return None | ||
| 95 | - | ||
| 96 | - # 优先使用显式传入的 base_url,其次环境变量,最后默认地域。 | ||
| 97 | - base_url = ( | ||
| 98 | - (base_url or "").strip() | ||
| 99 | - or os.getenv("DASHSCOPE_BASE_URL") | ||
| 100 | - or DEFAULT_QWEN_BASE_URL | ||
| 101 | - ) | ||
| 102 | - | ||
| 103 | - try: | ||
| 104 | - client = OpenAI(api_key=api_key, base_url=base_url) | ||
| 105 | - return client | ||
| 106 | - except Exception as exc: | ||
| 107 | - logger.error("Failed to initialize DashScope OpenAI client: %s", exc, exc_info=True) | ||
| 108 | - return None | 27 | +DEFAULT_LLM_MODEL = "qwen-flash" |
| 109 | 28 | ||
| 110 | 29 | ||
| 111 | def _build_prompt( | 30 | def _build_prompt( |
| 112 | text: str, | 31 | text: str, |
| 32 | + *, | ||
| 33 | + source_lang: Optional[str], | ||
| 113 | target_lang: str, | 34 | target_lang: str, |
| 114 | - source_lang_label: str, | ||
| 115 | - target_lang_label: str, | ||
| 116 | - src_lang_code: str, | ||
| 117 | - tgt_lang_code: str, | 35 | + scene: Optional[str], |
| 118 | ) -> str: | 36 | ) -> str: |
| 119 | """ | 37 | """ |
| 120 | - Build translation prompt for given target language, defaulting to English template. | 38 | + 从 config.translate_prompts.TRANSLATION_PROMPTS 中构建提示词。 |
| 39 | + | ||
| 40 | + 要求:模板必须包含 {source_lang}({src_lang_code}){target_lang}({tgt_lang_code})。 | ||
| 41 | + 语言 label 通过 SOURCE_LANG_CODE_MAP 按 code 查得(查不到时回退为 code 本身),外部接口仍然只传语言 code。 | ||
| 121 | """ | 42 | """ |
| 122 | - key = (target_lang or "").lower() | ||
| 123 | - template = TRANSLATION_PROMPTS.get(key) or TRANSLATION_PROMPTS["en"] | 43 | + tgt = (target_lang or "").lower() or "en" |
| 44 | + src = (source_lang or "auto").lower() | ||
| 45 | + | ||
| 46 | + # 将业务上下文 scene 映射为模板分组名 | ||
| 47 | + normalized_scene = (scene or "").strip() or "general" | ||
| 48 | + # 历史别名(如 "query"、"product_title")在此直接归一化为对应的模板分组名,不会报错 | ||
| 49 | + if normalized_scene in {"query", "ecommerce_search", "ecommerce_search_query"}: | ||
| 50 | + group_key = "ecommerce_search_query" | ||
| 51 | + elif normalized_scene in {"product_title", "sku_name"}: | ||
| 52 | + group_key = "sku_name" | ||
| 53 | + else: | ||
| 54 | + group_key = normalized_scene | ||
| 55 | + group = TRANSLATION_PROMPTS.get(group_key) or TRANSLATION_PROMPTS["general"] | ||
| 56 | + | ||
| 57 | + # 先按目标语言 code 取模板,取不到回退到英文 | ||
| 58 | + template = group.get(tgt) or group.get("en") | ||
| 59 | + if not template: | ||
| 60 | + # 理论上不会发生,兜底一个简单模板 | ||
| 61 | + template = ( | ||
| 62 | + "You are a professional {source_lang} ({src_lang_code}) to " | ||
| 63 | + "{target_lang} ({tgt_lang_code}) translator, output only the translation: {text}" | ||
| 64 | + ) | ||
| 65 | + | ||
| 66 | + # 通过 SOURCE_LANG_CODE_MAP 按 code 查 label;查不到时直接使用 code 作为 label | ||
| 67 | + source_lang_label = SOURCE_LANG_CODE_MAP.get(src, src) | ||
| 68 | + target_lang_label = SOURCE_LANG_CODE_MAP.get(tgt, tgt) | ||
| 69 | + | ||
| 124 | return template.format( | 70 | return template.format( |
| 125 | source_lang=source_lang_label, | 71 | source_lang=source_lang_label, |
| 72 | + src_lang_code=src, | ||
| 126 | target_lang=target_lang_label, | 73 | target_lang=target_lang_label, |
| 127 | - src_lang_code=src_lang_code, | ||
| 128 | - tgt_lang_code=tgt_lang_code, | 74 | + tgt_lang_code=tgt, |
| 129 | text=text, | 75 | text=text, |
| 130 | ) | 76 | ) |
| 131 | 77 | ||
| 132 | 78 | ||
| 79 | +class LLMTranslatorProvider: | ||
| 80 | + def __init__( | ||
| 81 | + self, | ||
| 82 | + *, | ||
| 83 | + model: Optional[str] = None, | ||
| 84 | + timeout_sec: float = 30.0, | ||
| 85 | + base_url: Optional[str] = None, | ||
| 86 | + ) -> None: | ||
| 87 | + cfg = get_translation_config() | ||
| 88 | + llm_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {} | ||
| 89 | + self.model = model or llm_cfg.get("model") or DEFAULT_LLM_MODEL | ||
| 90 | + self.timeout_sec = float(llm_cfg.get("timeout_sec") or timeout_sec or 30.0) | ||
| 91 | + self.base_url = ( | ||
| 92 | + (base_url or "").strip() | ||
| 93 | + or (llm_cfg.get("base_url") or "").strip() | ||
| 94 | + or os.getenv("DASHSCOPE_BASE_URL") | ||
| 95 | + or DEFAULT_QWEN_BASE_URL | ||
| 96 | + ) | ||
| 97 | + self.client = self._create_client() | ||
| 98 | + | ||
| 99 | + def _create_client(self) -> Optional[OpenAI]: | ||
| 100 | + api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | ||
| 101 | + if not api_key: | ||
| 102 | + logger.warning("DASHSCOPE_API_KEY not set; llm translation unavailable") | ||
| 103 | + return None | ||
| 104 | + try: | ||
| 105 | + return OpenAI(api_key=api_key, base_url=self.base_url) | ||
| 106 | + except Exception as exc: | ||
| 107 | + logger.error("Failed to initialize llm translation client: %s", exc, exc_info=True) | ||
| 108 | + return None | ||
| 109 | + | ||
| 110 | + def translate( | ||
| 111 | + self, | ||
| 112 | + text: str, | ||
| 113 | + target_lang: str, | ||
| 114 | + source_lang: Optional[str] = None, | ||
| 115 | + context: Optional[str] = None, | ||
| 116 | + prompt: Optional[str] = None, | ||
| 117 | + ) -> Optional[str]: | ||
| 118 | + if not text or not str(text).strip(): | ||
| 119 | + return text | ||
| 120 | + if not self.client: | ||
| 121 | + return None | ||
| 122 | + | ||
| 123 | + tgt = (target_lang or "").lower() or "en" | ||
| 124 | + src = (source_lang or "auto").lower() | ||
| 125 | + scene = context or "default" | ||
| 126 | + user_prompt = prompt or _build_prompt( | ||
| 127 | + text=text, | ||
| 128 | + source_lang=src, | ||
| 129 | + target_lang=tgt, | ||
| 130 | + scene=scene, | ||
| 131 | + ) | ||
| 132 | + start = time.time() | ||
| 133 | + try: | ||
| 134 | + logger.info( | ||
| 135 | + "[llm] Request | src=%s tgt=%s model=%s prompt=%s", | ||
| 136 | + src, | ||
| 137 | + tgt, | ||
| 138 | + self.model, | ||
| 139 | + user_prompt, | ||
| 140 | + ) | ||
| 141 | + completion = self.client.chat.completions.create( | ||
| 142 | + model=self.model, | ||
| 143 | + messages=[{"role": "user", "content": user_prompt}], | ||
| 144 | + timeout=self.timeout_sec, | ||
| 145 | + ) | ||
| 146 | + content = (completion.choices[0].message.content or "").strip() | ||
| 147 | + latency_ms = (time.time() - start) * 1000 | ||
| 148 | + if not content: | ||
| 149 | + logger.warning("[llm] Empty result | src=%s tgt=%s latency=%.1fms", src, tgt, latency_ms) | ||
| 150 | + return None | ||
| 151 | + logger.info("[llm] Response | src=%s tgt=%s response=%s", src, tgt, content) | ||
| 152 | + logger.info("[llm] Success | src=%s tgt=%s latency=%.1fms", src, tgt, latency_ms) | ||
| 153 | + return content | ||
| 154 | + except Exception as exc: | ||
| 155 | + latency_ms = (time.time() - start) * 1000 | ||
| 156 | + logger.warning( | ||
| 157 | + "[llm] Failed | src=%s tgt=%s latency=%.1fms error=%s", | ||
| 158 | + src, | ||
| 159 | + tgt, | ||
| 160 | + latency_ms, | ||
| 161 | + exc, | ||
| 162 | + exc_info=True, | ||
| 163 | + ) | ||
| 164 | + return None | ||
| 165 | + | ||
| 166 | + | ||
| 133 | def llm_translate( | 167 | def llm_translate( |
| 134 | text: str, | 168 | text: str, |
| 135 | target_lang: str, | 169 | target_lang: str, |
| @@ -139,100 +173,13 @@ def llm_translate( | @@ -139,100 +173,13 @@ def llm_translate( | ||
| 139 | target_lang_label: Optional[str] = None, | 173 | target_lang_label: Optional[str] = None, |
| 140 | timeout_sec: Optional[float] = None, | 174 | timeout_sec: Optional[float] = None, |
| 141 | ) -> Optional[str]: | 175 | ) -> Optional[str]: |
| 142 | - """ | ||
| 143 | - Translate text with Qwen chat model using rich prompts. | ||
| 144 | - | ||
| 145 | - - 根据目标语言选择提示词,如果没匹配到则退回英文模板。 | ||
| 146 | - - 不对 text 做语言检测或缓存,调用方自行控制。 | ||
| 147 | - | ||
| 148 | - Args: | ||
| 149 | - text: 原始文本 | ||
| 150 | - target_lang: 目标语言代码(如 "zh", "en") | ||
| 151 | - source_lang: 源语言代码(可选,不影响提示词选择,仅用于日志) | ||
| 152 | - source_lang_label: 源语言展示名称,用于 prompt(默认使用 source_lang) | ||
| 153 | - target_lang_label: 目标语言展示名称,用于 prompt(默认使用 target_lang) | ||
| 154 | - timeout_sec: 请求超时时间(秒,可选;若未配置则从 config 读取或采用默认) | ||
| 155 | - | ||
| 156 | - Returns: | ||
| 157 | - 翻译后的文本;如失败则返回 None。 | ||
| 158 | - """ | ||
| 159 | - if not text or not str(text).strip(): | ||
| 160 | - return text | ||
| 161 | - | ||
| 162 | - cfg = get_translation_config() | ||
| 163 | - provider_cfg = cfg.providers.get("llm", {}) if isinstance(cfg.providers, dict) else {} | ||
| 164 | - | ||
| 165 | - model_name = provider_cfg.get("model") or QWEN_MODEL_NAME | ||
| 166 | - req_timeout = float(provider_cfg.get("timeout_sec") or timeout_sec or 30.0) | ||
| 167 | - base_url = (provider_cfg.get("base_url") or "").strip() or None | ||
| 168 | - | ||
| 169 | - client = _get_qwen_client(base_url=base_url) | ||
| 170 | - if not client: | ||
| 171 | - # 无法调用云端,直接回退 | ||
| 172 | - logger.warning( | ||
| 173 | - "[llm_translate] Client init failed; returning original text. " | ||
| 174 | - "text=%r target_lang=%s source_lang=%s", | ||
| 175 | - text[:80], | ||
| 176 | - target_lang, | ||
| 177 | - source_lang or "auto", | ||
| 178 | - ) | ||
| 179 | - return text | ||
| 180 | - | ||
| 181 | - tgt = (target_lang or "").lower() or "en" | ||
| 182 | - src = (source_lang or "auto").lower() | ||
| 183 | - src_label = source_lang_label or src | ||
| 184 | - tgt_label = target_lang_label or tgt | ||
| 185 | - | ||
| 186 | - prompt = _build_prompt( | 176 | + provider = LLMTranslatorProvider(timeout_sec=timeout_sec or 30.0) |
| 177 | + return provider.translate( | ||
| 187 | text=text, | 178 | text=text, |
| 188 | - target_lang=tgt, | ||
| 189 | - source_lang_label=src_label, | ||
| 190 | - target_lang_label=tgt_label, | ||
| 191 | - src_lang_code=src, | ||
| 192 | - tgt_lang_code=tgt, | 179 | + target_lang=target_lang, |
| 180 | + source_lang=source_lang, | ||
| 181 | + context=None, | ||
| 193 | ) | 182 | ) |
| 194 | 183 | ||
| 195 | - start = time.time() | ||
| 196 | - try: | ||
| 197 | - completion = client.chat.completions.create( | ||
| 198 | - model=model_name, | ||
| 199 | - messages=[ | ||
| 200 | - { | ||
| 201 | - "role": "user", | ||
| 202 | - "content": prompt, | ||
| 203 | - } | ||
| 204 | - ], | ||
| 205 | - timeout=req_timeout, | ||
| 206 | - ) | ||
| 207 | - content = (completion.choices[0].message.content or "").strip() | ||
| 208 | - duration_ms = (time.time() - start) * 1000 | ||
| 209 | - logger.info( | ||
| 210 | - "[llm_translate] Success | model=%s src=%s tgt=%s latency=%.1fms text=%r -> %r", | ||
| 211 | - model_name, | ||
| 212 | - src, | ||
| 213 | - tgt, | ||
| 214 | - duration_ms, | ||
| 215 | - text[:80], | ||
| 216 | - content[:80], | ||
| 217 | - ) | ||
| 218 | - return content or text | ||
| 219 | - except Exception as exc: | ||
| 220 | - duration_ms = (time.time() - start) * 1000 | ||
| 221 | - logger.warning( | ||
| 222 | - "[llm_translate] Failed | model=%s src=%s tgt=%s latency=%.1fms error=%s", | ||
| 223 | - model_name, | ||
| 224 | - src, | ||
| 225 | - tgt, | ||
| 226 | - duration_ms, | ||
| 227 | - exc, | ||
| 228 | - exc_info=True, | ||
| 229 | - ) | ||
| 230 | - # 安全回退:出错时返回原文,避免中断上游流程 | ||
| 231 | - return text | ||
| 232 | - | ||
| 233 | - | ||
| 234 | -__all__ = [ | ||
| 235 | - "TRANSLATION_PROMPTS", | ||
| 236 | - "llm_translate", | ||
| 237 | -] | ||
| 238 | 184 | ||
| 185 | +__all__ = ["LLMTranslatorProvider", "llm_translate"] |
query/query_parser.py
| @@ -8,7 +8,7 @@ from typing import Dict, List, Optional, Any, Union | @@ -8,7 +8,7 @@ from typing import Dict, List, Optional, Any, Union | ||
| 8 | import numpy as np | 8 | import numpy as np |
| 9 | import logging | 9 | import logging |
| 10 | import re | 10 | import re |
| 11 | -from concurrent.futures import Future, ThreadPoolExecutor, as_completed | 11 | +from concurrent.futures import ThreadPoolExecutor, as_completed, wait |
| 12 | 12 | ||
| 13 | from embeddings.text_encoder import TextEmbeddingEncoder | 13 | from embeddings.text_encoder import TextEmbeddingEncoder |
| 14 | from config import SearchConfig | 14 | from config import SearchConfig |
| @@ -135,6 +135,7 @@ class QueryParser: | @@ -135,6 +135,7 @@ class QueryParser: | ||
| 135 | cfg = get_translation_config() | 135 | cfg = get_translation_config() |
| 136 | logger.info("Initializing translator at QueryParser construction (provider=%s)...", cfg.provider) | 136 | logger.info("Initializing translator at QueryParser construction (provider=%s)...", cfg.provider) |
| 137 | self._translator = create_translation_provider(self.config.query_config) | 137 | self._translator = create_translation_provider(self.config.query_config) |
| 138 | + self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation") | ||
| 138 | 139 | ||
| 139 | @property | 140 | @property |
| 140 | def text_encoder(self) -> TextEmbeddingEncoder: | 141 | def text_encoder(self) -> TextEmbeddingEncoder: |
| @@ -265,6 +266,7 @@ class QueryParser: | @@ -265,6 +266,7 @@ class QueryParser: | ||
| 265 | # Stage 4: Translation (with async support and conditional waiting) | 266 | # Stage 4: Translation (with async support and conditional waiting) |
| 266 | translations = {} | 267 | translations = {} |
| 267 | translation_futures = {} | 268 | translation_futures = {} |
| 269 | + translation_executor = None | ||
| 268 | index_langs = ["en", "zh"] | 270 | index_langs = ["en", "zh"] |
| 269 | try: | 271 | try: |
| 270 | # 根据租户配置的 index_languages 决定翻译目标语言 | 272 | # 根据租户配置的 index_languages 决定翻译目标语言 |
| @@ -287,48 +289,33 @@ class QueryParser: | @@ -287,48 +289,33 @@ class QueryParser: | ||
| 287 | target_langs = target_langs_for_translation | 289 | target_langs = target_langs_for_translation |
| 288 | 290 | ||
| 289 | if target_langs: | 291 | if target_langs: |
| 290 | - # Use e-commerce context for better disambiguation | ||
| 291 | - translation_context = self.config.query_config.translation_context | ||
| 292 | - # For query translation, we use a general prompt (not language-specific) | ||
| 293 | - query_prompt = ( | ||
| 294 | - self.config.query_config.translation_prompts.get(f"query_{detected_lang}") | ||
| 295 | - or self.config.query_config.translation_prompts.get("query_en") | ||
| 296 | - or self.config.query_config.translation_prompts.get("default_en") | ||
| 297 | - or self.config.query_config.translation_prompts.get("default_zh") | ||
| 298 | - ) | ||
| 299 | - | ||
| 300 | # Determine if we need to wait for translation results | 292 | # Determine if we need to wait for translation results |
| 301 | # If detected_lang is not in index_languages, we must wait for translation | 293 | # If detected_lang is not in index_languages, we must wait for translation |
| 302 | need_wait_translation = detected_lang not in index_langs | 294 | need_wait_translation = detected_lang not in index_langs |
| 303 | - | 295 | + |
| 304 | if need_wait_translation: | 296 | if need_wait_translation: |
| 305 | - # Use async method that returns Futures, so we can wait for results | ||
| 306 | - translation_results = self.translator.translate_multi_async( | ||
| 307 | - query_text, | ||
| 308 | - target_langs, | ||
| 309 | - source_lang=detected_lang, | ||
| 310 | - context=translation_context, | ||
| 311 | - prompt=query_prompt | 297 | + translation_executor = ThreadPoolExecutor( |
| 298 | + max_workers=max(1, min(len(target_langs), 4)), | ||
| 299 | + thread_name_prefix="query-translation-wait", | ||
| 312 | ) | 300 | ) |
| 313 | - # Separate cached results and futures | ||
| 314 | - for lang, result in translation_results.items(): | ||
| 315 | - if isinstance(result, Future): | ||
| 316 | - translation_futures[lang] = result | ||
| 317 | - else: | ||
| 318 | - translations[lang] = result | 301 | + for lang in target_langs: |
| 302 | + translation_futures[lang] = translation_executor.submit( | ||
| 303 | + self.translator.translate, | ||
| 304 | + query_text, | ||
| 305 | + lang, | ||
| 306 | + detected_lang, | ||
| 307 | + "ecommerce_search_query", | ||
| 308 | + ) | ||
| 319 | else: | 309 | else: |
| 320 | - # Use async mode: returns cached translations immediately, missing ones translated in background | ||
| 321 | - translations = self.translator.translate_multi( | ||
| 322 | - query_text, | ||
| 323 | - target_langs, | ||
| 324 | - source_lang=detected_lang, | ||
| 325 | - context=translation_context, | ||
| 326 | - async_mode=True, | ||
| 327 | - prompt=query_prompt | ||
| 328 | - ) | ||
| 329 | - # Filter out None values (missing translations that are being processed async) | ||
| 330 | - translations = {k: v for k, v in translations.items() if v is not None} | ||
| 331 | - | 310 | + for lang in target_langs: |
| 311 | + self._translation_executor.submit( | ||
| 312 | + self.translator.translate, | ||
| 313 | + query_text, | ||
| 314 | + lang, | ||
| 315 | + detected_lang, | ||
| 316 | + "ecommerce_search_query", | ||
| 317 | + ) | ||
| 318 | + | ||
| 332 | if translations: | 319 | if translations: |
| 333 | log_info(f"Translation completed (cache hit) | Query text: '{query_text}' | Results: {translations}") | 320 | log_info(f"Translation completed (cache hit) | Query text: '{query_text}' | Results: {translations}") |
| 334 | if translation_futures: | 321 | if translation_futures: |
| @@ -407,15 +394,18 @@ class QueryParser: | @@ -407,15 +394,18 @@ class QueryParser: | ||
| 407 | all_futures.append(embedding_future) | 394 | all_futures.append(embedding_future) |
| 408 | future_to_lang[embedding_future] = ('embedding', None) | 395 | future_to_lang[embedding_future] = ('embedding', None) |
| 409 | 396 | ||
| 410 | - # Wait for all futures to complete | ||
| 411 | - for future in as_completed(all_futures): | 397 | + # Wait at most 300ms for all pending futures (translations and query embedding) |
| 398 | + done, not_done = wait(all_futures, timeout=0.3) | ||
| 399 | + for future in done: | ||
| 412 | task_type, lang = future_to_lang[future] | 400 | task_type, lang = future_to_lang[future] |
| 413 | try: | 401 | try: |
| 414 | result = future.result() | 402 | result = future.result() |
| 415 | if task_type == 'translation': | 403 | if task_type == 'translation': |
| 416 | if result: | 404 | if result: |
| 417 | translations[lang] = result | 405 | translations[lang] = result |
| 418 | - log_info(f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'") | 406 | + log_info( |
| 407 | + f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'" | ||
| 408 | + ) | ||
| 419 | if context: | 409 | if context: |
| 420 | context.store_intermediate_result(f'translation_{lang}', result) | 410 | context.store_intermediate_result(f'translation_{lang}', result) |
| 421 | elif task_type == 'embedding': | 411 | elif task_type == 'embedding': |
| @@ -434,10 +424,27 @@ class QueryParser: | @@ -434,10 +424,27 @@ class QueryParser: | ||
| 434 | log_info(error_msg) | 424 | log_info(error_msg) |
| 435 | if context: | 425 | if context: |
| 436 | context.add_warning(error_msg) | 426 | context.add_warning(error_msg) |
| 437 | - | 427 | + |
| 428 | + # Log timeouts for any futures that did not finish within 300ms | ||
| 429 | + if not_done: | ||
| 430 | + for future in not_done: | ||
| 431 | + task_type, lang = future_to_lang[future] | ||
| 432 | + if task_type == 'translation': | ||
| 433 | + timeout_msg = ( | ||
| 434 | + f"Translation timeout (>300ms) | Language: {lang} | " | ||
| 435 | + f"Query text: '{query_text}'" | ||
| 436 | + ) | ||
| 437 | + else: | ||
| 438 | + timeout_msg = "Query vector generation timeout (>300ms), proceeding without embedding result" | ||
| 439 | + log_info(timeout_msg) | ||
| 440 | + if context: | ||
| 441 | + context.add_warning(timeout_msg) | ||
| 442 | + | ||
| 438 | # Clean up encoding executor | 443 | # Clean up encoding executor |
| 439 | if encoding_executor: | 444 | if encoding_executor: |
| 440 | encoding_executor.shutdown(wait=False) | 445 | encoding_executor.shutdown(wait=False) |
| 446 | + if translation_executor: | ||
| 447 | + translation_executor.shutdown(wait=False) | ||
| 441 | 448 | ||
| 442 | # Update translations in context after all are complete | 449 | # Update translations in context after all are complete |
| 443 | if translations and context: | 450 | if translations and context: |
query/qwen_mt_translate.py
| 1 | -""" | ||
| 2 | -Translation service for multi-language query support. | 1 | +"""Qwen-MT translation orchestrator with cache and async helpers.""" |
| 3 | 2 | ||
| 4 | -Supports multiple translation models: | ||
| 5 | -- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model | ||
| 6 | -- DeepL: DeepL API for high-quality translations | ||
| 7 | - | ||
| 8 | -重要说明(Qwen 机翻限速): | ||
| 9 | -- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)** | ||
| 10 | -- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流 | ||
| 11 | -- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端 | ||
| 12 | - | ||
| 13 | -使用方法 (Usage): | ||
| 14 | - | ||
| 15 | -```python | ||
| 16 | -from query.translator import Translator | ||
| 17 | - | ||
| 18 | -# 使用默认的 qwen 模型(推荐) | ||
| 19 | -translator = Translator() # 默认使用 qwen 模型 | ||
| 20 | - | ||
| 21 | -# 或显式指定模型 | ||
| 22 | -translator = Translator(model='qwen') # 使用 qwen 模型 | ||
| 23 | -translator = Translator(model='deepl') # 使用 DeepL 模型 | ||
| 24 | - | ||
| 25 | -# 翻译文本 | ||
| 26 | -result = translator.translate( | ||
| 27 | - text="我看到这个视频后没有笑", | ||
| 28 | - target_lang="en", | ||
| 29 | - source_lang="auto" # 自动检测源语言 | ||
| 30 | -) | ||
| 31 | -``` | ||
| 32 | - | ||
| 33 | -配置说明 (Configuration): | ||
| 34 | -- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中) | ||
| 35 | -- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中) | ||
| 36 | - | ||
| 37 | -Qwen 模型参考文档: | ||
| 38 | -- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key | ||
| 39 | -- 模型:qwen-mt-flash(快速翻译模型) | ||
| 40 | - | ||
| 41 | -DeepL 官方文档: | ||
| 42 | -https://developers.deepl.com/api-reference/translate/request-translation | ||
| 43 | -""" | 3 | +from __future__ import annotations |
| 44 | 4 | ||
| 5 | +import hashlib | ||
| 6 | +import logging | ||
| 45 | import os | 7 | import os |
| 46 | -import requests | ||
| 47 | import re | 8 | import re |
| 48 | -import redis | ||
| 49 | -from concurrent.futures import ThreadPoolExecutor, Future | ||
| 50 | -from datetime import timedelta | ||
| 51 | -from typing import Dict, List, Optional, Union | ||
| 52 | -import logging | ||
| 53 | import time | 9 | import time |
| 10 | +from typing import Dict, List, Optional | ||
| 54 | 11 | ||
| 55 | -logger = logging.getLogger(__name__) | ||
| 56 | - | ||
| 57 | -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG | 12 | +import redis |
| 58 | from openai import OpenAI | 13 | from openai import OpenAI |
| 59 | 14 | ||
| 15 | +from config.env_config import DASHSCOPE_API_KEY, REDIS_CONFIG | ||
| 16 | +from config.services_config import get_translation_cache_config | ||
| 17 | +from config.translate_prompts import SOURCE_LANG_CODE_MAP | ||
| 60 | 18 | ||
| 61 | -class Translator: | ||
| 62 | - """ | ||
| 63 | - Multi-language translator supporting Qwen and DeepL APIs. | ||
| 64 | - | ||
| 65 | - Default model is 'qwen' which uses Alibaba Cloud DashScope API. | ||
| 66 | - """ | ||
| 67 | -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 | ||
| 68 | -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | ||
| 69 | -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 | 19 | +logger = logging.getLogger(__name__) |
| 70 | 20 | ||
| 71 | - DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier | ||
| 72 | - QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" # 北京地域 | ||
| 73 | - # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡 | ||
| 74 | - # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | ||
| 75 | - QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型 | ||
| 76 | 21 | ||
| 77 | - # Language code mapping | ||
| 78 | - LANG_CODE_MAP = { | ||
| 79 | - 'zh': 'ZH', | ||
| 80 | - 'en': 'EN', | ||
| 81 | - 'ru': 'RU', | ||
| 82 | - 'ar': 'AR', | ||
| 83 | - 'ja': 'JA', | ||
| 84 | - 'es': 'ES', | ||
| 85 | - 'de': 'DE', | ||
| 86 | - 'fr': 'FR', | ||
| 87 | - 'it': 'IT', | ||
| 88 | - 'pt': 'PT', | ||
| 89 | - } | 22 | +class Translator: |
| 23 | + QWEN_DEFAULT_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" | ||
| 24 | + QWEN_MODEL = "qwen-mt-flash" | ||
| 90 | 25 | ||
| 91 | def __init__( | 26 | def __init__( |
| 92 | self, | 27 | self, |
| @@ -95,77 +30,90 @@ class Translator: | @@ -95,77 +30,90 @@ class Translator: | ||
| 95 | use_cache: bool = True, | 30 | use_cache: bool = True, |
| 96 | timeout: int = 10, | 31 | timeout: int = 10, |
| 97 | glossary_id: Optional[str] = None, | 32 | glossary_id: Optional[str] = None, |
| 98 | - translation_context: Optional[str] = None | 33 | + translation_context: Optional[str] = None, |
| 99 | ): | 34 | ): |
| 100 | - """ | ||
| 101 | - Initialize translator. | ||
| 102 | - | ||
| 103 | - Args: | ||
| 104 | - model: Translation model to use. Options: 'qwen' (default) or 'deepl' | ||
| 105 | - api_key: API key for the selected model (or None to use from config/env) | ||
| 106 | - use_cache: Whether to cache translations | ||
| 107 | - timeout: Request timeout in seconds | ||
| 108 | - glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL) | ||
| 109 | - translation_context: Context hint for translation (e.g., "e-commerce", "product search") | ||
| 110 | - """ | ||
| 111 | - self.model = model.lower() | ||
| 112 | - if self.model not in ['qwen', 'deepl']: | ||
| 113 | - raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'") | ||
| 114 | - | ||
| 115 | - # Get API key from config if not provided | ||
| 116 | - if api_key is None: | ||
| 117 | - if self.model == 'qwen': | ||
| 118 | - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | ||
| 119 | - else: # deepl | ||
| 120 | - api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY") | ||
| 121 | - | ||
| 122 | - self.api_key = api_key | ||
| 123 | - self.timeout = timeout | ||
| 124 | - self.use_cache = use_cache | 35 | + self.model = self._normalize_model(model) |
| 36 | + self.timeout = int(timeout) | ||
| 37 | + self.use_cache = bool(use_cache) | ||
| 125 | self.glossary_id = glossary_id | 38 | self.glossary_id = glossary_id |
| 126 | self.translation_context = translation_context or "e-commerce product search" | 39 | self.translation_context = translation_context or "e-commerce product search" |
| 127 | - | ||
| 128 | - # Initialize OpenAI client for Qwen if needed | ||
| 129 | - self.qwen_client = None | ||
| 130 | - if self.model == 'qwen': | ||
| 131 | - if not self.api_key: | ||
| 132 | - logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.") | ||
| 133 | - else: | ||
| 134 | - self.qwen_client = OpenAI( | ||
| 135 | - api_key=self.api_key, | ||
| 136 | - base_url=self.QWEN_BASE_URL, | ||
| 137 | - ) | ||
| 138 | 40 | ||
| 139 | - # Initialize Redis cache if enabled | ||
| 140 | - if use_cache: | 41 | + cache_cfg = get_translation_cache_config() |
| 42 | + self.cache_prefix = str(cache_cfg.get("key_prefix", "trans:v2")) | ||
| 43 | + self.expire_seconds = int(cache_cfg.get("ttl_seconds", 360 * 24 * 3600)) | ||
| 44 | + self.cache_sliding_expiration = bool(cache_cfg.get("sliding_expiration", True)) | ||
| 45 | + self.cache_include_context = bool(cache_cfg.get("key_include_context", True)) | ||
| 46 | + self.cache_include_prompt = bool(cache_cfg.get("key_include_prompt", True)) | ||
| 47 | + self.cache_include_source_lang = bool(cache_cfg.get("key_include_source_lang", True)) | ||
| 48 | + | ||
| 49 | + self.qwen_model_name = self._resolve_qwen_model_name(model) | ||
| 50 | + self._api_key = api_key or self._default_api_key(self.model) | ||
| 51 | + self._qwen_client: Optional[OpenAI] = None | ||
| 52 | + base_url = os.getenv("DASHSCOPE_BASE_URL") or self.QWEN_DEFAULT_BASE_URL | ||
| 53 | + if self._api_key: | ||
| 141 | try: | 54 | try: |
| 142 | - self.redis_client = redis.Redis( | ||
| 143 | - host=REDIS_CONFIG.get('host', 'localhost'), | ||
| 144 | - port=REDIS_CONFIG.get('port', 6479), | ||
| 145 | - password=REDIS_CONFIG.get('password'), | ||
| 146 | - decode_responses=True, # Return str instead of bytes | ||
| 147 | - socket_timeout=REDIS_CONFIG.get('socket_timeout', 1), | ||
| 148 | - socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1), | ||
| 149 | - retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False), | ||
| 150 | - health_check_interval=10, # 避免复用坏连接 | ||
| 151 | - ) | ||
| 152 | - # Test connection | ||
| 153 | - self.redis_client.ping() | ||
| 154 | - expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360) | ||
| 155 | - self.expire_time = timedelta(days=expire_days) | ||
| 156 | - self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数 | ||
| 157 | - self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans') | ||
| 158 | - logger.info("Redis cache initialized for translations") | ||
| 159 | - except Exception as e: | ||
| 160 | - logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache") | ||
| 161 | - self.redis_client = None | ||
| 162 | - self.cache = None | 55 | + self._qwen_client = OpenAI(api_key=self._api_key, base_url=base_url) |
| 56 | + except Exception as exc: | ||
| 57 | + logger.warning("Failed to initialize qwen-mt client: %s", exc, exc_info=True) | ||
| 163 | else: | 58 | else: |
| 164 | - self.redis_client = None | ||
| 165 | - self.cache = None | ||
| 166 | - | ||
| 167 | - # Thread pool for async translation | ||
| 168 | - self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") | 59 | + logger.warning("DASHSCOPE_API_KEY not set; qwen-mt translation unavailable") |
| 60 | + | ||
| 61 | + self.redis_client = None | ||
| 62 | + if self.use_cache and bool(cache_cfg.get("enabled", True)): | ||
| 63 | + self.redis_client = self._init_redis_client() | ||
| 64 | + | ||
| 65 | + @staticmethod | ||
| 66 | + def _normalize_model(model: str) -> str: | ||
| 67 | + m = (model or "qwen").strip().lower() | ||
| 68 | + if m.startswith("qwen"): | ||
| 69 | + return "qwen-mt" | ||
| 70 | + raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'qwen-mt', 'qwen-mt-flash'") | ||
| 71 | + | ||
| 72 | + @staticmethod | ||
| 73 | + def _resolve_qwen_model_name(model: str) -> str: | ||
| 74 | + m = (model or "qwen").strip().lower() | ||
| 75 | + if m in {"qwen", "qwen-mt"}: | ||
| 76 | + return "qwen-mt-flash" | ||
| 77 | + return m | ||
| 78 | + | ||
| 79 | + @staticmethod | ||
| 80 | + def _default_api_key(model: str) -> Optional[str]: | ||
| 81 | + del model | ||
| 82 | + return DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | ||
| 83 | + | ||
| 84 | + def _init_redis_client(self): | ||
| 85 | + try: | ||
| 86 | + client = redis.Redis( | ||
| 87 | + host=REDIS_CONFIG.get("host", "localhost"), | ||
| 88 | + port=REDIS_CONFIG.get("port", 6479), | ||
| 89 | + password=REDIS_CONFIG.get("password"), | ||
| 90 | + decode_responses=True, | ||
| 91 | + socket_timeout=REDIS_CONFIG.get("socket_timeout", 1), | ||
| 92 | + socket_connect_timeout=REDIS_CONFIG.get("socket_connect_timeout", 1), | ||
| 93 | + retry_on_timeout=REDIS_CONFIG.get("retry_on_timeout", False), | ||
| 94 | + health_check_interval=10, | ||
| 95 | + ) | ||
| 96 | + client.ping() | ||
| 97 | + return client | ||
| 98 | + except Exception as exc: | ||
| 99 | + logger.warning("Failed to initialize translation redis cache: %s", exc) | ||
| 100 | + return None | ||
| 101 | + | ||
| 102 | + def _build_cache_key( | ||
| 103 | + self, | ||
| 104 | + text: str, | ||
| 105 | + target_lang: str, | ||
| 106 | + source_lang: Optional[str], | ||
| 107 | + context: Optional[str], | ||
| 108 | + prompt: Optional[str], | ||
| 109 | + ) -> str: | ||
| 110 | + src = (source_lang or "auto").strip().lower() if self.cache_include_source_lang else "-" | ||
| 111 | + tgt = (target_lang or "").strip().lower() | ||
| 112 | + ctx = (context or "").strip() if self.cache_include_context else "" | ||
| 113 | + prm = (prompt or "").strip() if self.cache_include_prompt else "" | ||
| 114 | + payload = f"model={self.model}\nsrc={src}\ntgt={tgt}\nctx={ctx}\nprm={prm}\ntext={text}" | ||
| 115 | + digest = hashlib.sha256(payload.encode("utf-8")).hexdigest() | ||
| 116 | + return f"{self.cache_prefix}:{self.model}:{src}:{tgt}:{digest}" | ||
| 169 | 117 | ||
| 170 | def translate( | 118 | def translate( |
| 171 | self, | 119 | self, |
| @@ -173,99 +121,27 @@ class Translator: | @@ -173,99 +121,27 @@ class Translator: | ||
| 173 | target_lang: str, | 121 | target_lang: str, |
| 174 | source_lang: Optional[str] = None, | 122 | source_lang: Optional[str] = None, |
| 175 | context: Optional[str] = None, | 123 | context: Optional[str] = None, |
| 176 | - prompt: Optional[str] = None | 124 | + prompt: Optional[str] = None, |
| 177 | ) -> Optional[str]: | 125 | ) -> Optional[str]: |
| 178 | - """ | ||
| 179 | - Translate text to target language (synchronous mode). | ||
| 180 | - | ||
| 181 | - Args: | ||
| 182 | - text: Text to translate | ||
| 183 | - target_lang: Target language code ('zh', 'en', 'ru', etc.) | ||
| 184 | - source_lang: Source language code (option al, auto-detect if None) | ||
| 185 | - context: Additional context for translation (overrides default context) | ||
| 186 | - prompt: Translation prompt/instruction (optional, for better translation quality) | ||
| 187 | - | ||
| 188 | - Returns: | ||
| 189 | - Translated text or None if translation fails | ||
| 190 | - """ | ||
| 191 | if not text or not text.strip(): | 126 | if not text or not text.strip(): |
| 192 | return text | 127 | return text |
| 193 | 128 | ||
| 194 | - # Normalize language codes | ||
| 195 | - target_lang = target_lang.lower() | ||
| 196 | - if source_lang: | ||
| 197 | - source_lang = source_lang.lower() | ||
| 198 | - | ||
| 199 | - # Optimization: Skip translation if not needed | ||
| 200 | - if target_lang == 'en' and self._is_english_text(text): | ||
| 201 | - logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") | 129 | + tgt = (target_lang or "").strip().lower() |
| 130 | + src = (source_lang or "").strip().lower() or None | ||
| 131 | + if tgt == "en" and self._is_english_text(text): | ||
| 202 | return text | 132 | return text |
| 203 | - | ||
| 204 | - if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | ||
| 205 | - logger.info( | ||
| 206 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 207 | - f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)" | ||
| 208 | - ) | 133 | + if tgt == "zh" and (self._contains_chinese(text) or self._is_pure_number(text)): |
| 209 | return text | 134 | return text |
| 210 | 135 | ||
| 211 | - # Use provided context or default context | ||
| 212 | translation_context = context or self.translation_context | 136 | translation_context = context or self.translation_context |
| 213 | - | ||
| 214 | - # Build cache key (include prompt in cache key if provided) | ||
| 215 | - cache_key_parts = [source_lang or 'auto', target_lang, translation_context] | ||
| 216 | - if prompt: | ||
| 217 | - cache_key_parts.append(prompt) | ||
| 218 | - cache_key_parts.append(text) | ||
| 219 | - cache_key = ':'.join(cache_key_parts) | 137 | + cached = self._get_cached_translation_redis(text, tgt, src, translation_context, prompt) |
| 138 | + if cached is not None: | ||
| 139 | + return cached | ||
| 220 | 140 | ||
| 221 | - # Check cache (include context and prompt in cache key for accuracy) | ||
| 222 | - if self.use_cache and self.redis_client: | ||
| 223 | - cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt) | ||
| 224 | - if cached: | ||
| 225 | - logger.info( | ||
| 226 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 227 | - f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit" | ||
| 228 | - ) | ||
| 229 | - return cached | ||
| 230 | - | ||
| 231 | - # If no API key, return mock translation (for testing) | ||
| 232 | - if not self.api_key: | ||
| 233 | - logger.info( | ||
| 234 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 235 | - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)" | ||
| 236 | - ) | ||
| 237 | - return text | ||
| 238 | - | ||
| 239 | - # Translate using selected model | ||
| 240 | - logger.info( | ||
| 241 | - f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | " | ||
| 242 | - f"Source language: {source_lang or 'auto'} | Context: {translation_context} | " | ||
| 243 | - f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation" | ||
| 244 | - ) | ||
| 245 | - | ||
| 246 | - if self.model == 'qwen': | ||
| 247 | - result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt) | ||
| 248 | - else: # deepl | ||
| 249 | - result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) | ||
| 250 | - | ||
| 251 | - # Surface translation failure to the caller instead of silently | ||
| 252 | - # masquerading the source text as a successful translation. | ||
| 253 | - if result is None: | ||
| 254 | - logger.warning( | ||
| 255 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 256 | - f"Source language: {source_lang or 'auto'} | Status: Translation failed" | ||
| 257 | - ) | ||
| 258 | - else: | ||
| 259 | - logger.info( | ||
| 260 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 261 | - f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" | ||
| 262 | - ) | ||
| 263 | - | ||
| 264 | - # Cache only successful translations. Failed attempts must not poison | ||
| 265 | - # Redis with the original text. | ||
| 266 | - if result is not None and self.use_cache and self.redis_client: | ||
| 267 | - self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) | 141 | + result = self._translate_qwen(text, tgt, src) |
| 268 | 142 | ||
| 143 | + if result is not None: | ||
| 144 | + self._set_cached_translation_redis(text, tgt, result, src, translation_context, prompt) | ||
| 269 | return result | 145 | return result |
| 270 | 146 | ||
| 271 | def _translate_qwen( | 147 | def _translate_qwen( |
| @@ -273,412 +149,63 @@ class Translator: | @@ -273,412 +149,63 @@ class Translator: | ||
| 273 | text: str, | 149 | text: str, |
| 274 | target_lang: str, | 150 | target_lang: str, |
| 275 | source_lang: Optional[str], | 151 | source_lang: Optional[str], |
| 276 | - context: Optional[str] = None, | ||
| 277 | - prompt: Optional[str] = None | ||
| 278 | ) -> Optional[str]: | 152 | ) -> Optional[str]: |
| 279 | - """ | ||
| 280 | - Translate using Qwen MT Flash model via Alibaba Cloud DashScope API. | ||
| 281 | - | ||
| 282 | - Args: | ||
| 283 | - text: Text to translate | ||
| 284 | - target_lang: Target language code ('zh', 'en', 'ru', etc.) | ||
| 285 | - source_lang: Source language code (optional, 'auto' if None) | ||
| 286 | - context: Context hint for translation (optional) | ||
| 287 | - prompt: Translation prompt/instruction (optional) | ||
| 288 | - | ||
| 289 | - Returns: | ||
| 290 | - Translated text or None if translation fails | ||
| 291 | - """ | ||
| 292 | - if not self.qwen_client: | ||
| 293 | - logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.") | 153 | + if not self._qwen_client: |
| 294 | return None | 154 | return None |
| 295 | - | ||
| 296 | - # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping | ||
| 297 | - # 标准来自:你提供的“语言 / 英文名 / 代码”表 | ||
| 298 | - qwen_lang_map = { | ||
| 299 | - "en": "English", | ||
| 300 | - "zh": "Chinese", | ||
| 301 | - "zh_tw": "Traditional Chinese", | ||
| 302 | - "ru": "Russian", | ||
| 303 | - "ja": "Japanese", | ||
| 304 | - "ko": "Korean", | ||
| 305 | - "es": "Spanish", | ||
| 306 | - "fr": "French", | ||
| 307 | - "pt": "Portuguese", | ||
| 308 | - "de": "German", | ||
| 309 | - "it": "Italian", | ||
| 310 | - "th": "Thai", | ||
| 311 | - "vi": "Vietnamese", | ||
| 312 | - "id": "Indonesian", | ||
| 313 | - "ms": "Malay", | ||
| 314 | - "ar": "Arabic", | ||
| 315 | - "hi": "Hindi", | ||
| 316 | - "he": "Hebrew", | ||
| 317 | - "my": "Burmese", | ||
| 318 | - "ta": "Tamil", | ||
| 319 | - "ur": "Urdu", | ||
| 320 | - "bn": "Bengali", | ||
| 321 | - "pl": "Polish", | ||
| 322 | - "nl": "Dutch", | ||
| 323 | - "ro": "Romanian", | ||
| 324 | - "tr": "Turkish", | ||
| 325 | - "km": "Khmer", | ||
| 326 | - "lo": "Lao", | ||
| 327 | - "yue": "Cantonese", | ||
| 328 | - "cs": "Czech", | ||
| 329 | - "el": "Greek", | ||
| 330 | - "sv": "Swedish", | ||
| 331 | - "hu": "Hungarian", | ||
| 332 | - "da": "Danish", | ||
| 333 | - "fi": "Finnish", | ||
| 334 | - "uk": "Ukrainian", | ||
| 335 | - "bg": "Bulgarian", | ||
| 336 | - } | ||
| 337 | - | ||
| 338 | - # Convert target language | ||
| 339 | - target_lang_normalized = target_lang.lower() | ||
| 340 | - target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize()) | ||
| 341 | - | ||
| 342 | - # Convert source language | ||
| 343 | - source_lang_normalized = (source_lang or "").strip().lower() | ||
| 344 | - if not source_lang_normalized or source_lang_normalized == "auto": | ||
| 345 | - source_lang_qwen = "auto" | ||
| 346 | - else: | ||
| 347 | - source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize()) | ||
| 348 | - | ||
| 349 | - # Prepare translation options | ||
| 350 | - translation_options = { | ||
| 351 | - "source_lang": source_lang_qwen, | ||
| 352 | - "target_lang": target_lang_qwen, | ||
| 353 | - } | ||
| 354 | - | ||
| 355 | - # Prepare messages | ||
| 356 | - messages = [ | ||
| 357 | - { | ||
| 358 | - "role": "user", | ||
| 359 | - "content": text | ||
| 360 | - } | ||
| 361 | - ] | ||
| 362 | - | ||
| 363 | - start_time = time.time() | 155 | + tgt_norm = (target_lang or "").strip().lower() |
| 156 | + src_norm = (source_lang or "").strip().lower() | ||
| 157 | + tgt_qwen = self.SOURCE_LANG_CODE_MAP.get(tgt_norm, tgt_norm.capitalize()) | ||
| 158 | + src_qwen = "auto" if not src_norm or src_norm == "auto" else self.SOURCE_LANG_CODE_MAP.get(src_norm, src_norm.capitalize()) | ||
| 159 | + start = time.time() | ||
| 364 | try: | 160 | try: |
| 365 | - completion = self.qwen_client.chat.completions.create( | ||
| 366 | - model=self.QWEN_MODEL, | ||
| 367 | - messages=messages, | 161 | + completion = self._qwen_client.chat.completions.create( |
| 162 | + model=self.qwen_model_name, | ||
| 163 | + messages=[{"role": "user", "content": text}], | ||
| 368 | extra_body={ | 164 | extra_body={ |
| 369 | - "translation_options": translation_options | ||
| 370 | - } | ||
| 371 | - ) | ||
| 372 | - | ||
| 373 | - translated_text = completion.choices[0].message.content.strip() | ||
| 374 | - duration_ms = (time.time() - start_time) * 1000 | ||
| 375 | - | ||
| 376 | - logger.info( | ||
| 377 | - f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | " | ||
| 378 | - f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms" | ||
| 379 | - ) | ||
| 380 | - return translated_text | ||
| 381 | - | ||
| 382 | - except Exception as e: | ||
| 383 | - duration_ms = (time.time() - start_time) * 1000 | ||
| 384 | - logger.error( | ||
| 385 | - f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | " | ||
| 386 | - f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True | ||
| 387 | - ) | ||
| 388 | - return None | ||
| 389 | - | ||
| 390 | - def _translate_deepl( | ||
| 391 | - self, | ||
| 392 | - text: str, | ||
| 393 | - target_lang: str, | ||
| 394 | - source_lang: Optional[str], | ||
| 395 | - context: Optional[str] = None, | ||
| 396 | - prompt: Optional[str] = None | ||
| 397 | - ) -> Optional[str]: | ||
| 398 | - """ | ||
| 399 | - Translate using DeepL API with context and glossary support. | ||
| 400 | - | ||
| 401 | - Args: | ||
| 402 | - text: Text to translate | ||
| 403 | - target_lang: Target language code | ||
| 404 | - source_lang: Source language code (optional) | ||
| 405 | - context: Context hint for translation (e.g., "e-commerce product search") | ||
| 406 | - """ | ||
| 407 | - # Map to DeepL language codes | ||
| 408 | - target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper()) | ||
| 409 | - | ||
| 410 | - headers = { | ||
| 411 | - "Authorization": f"DeepL-Auth-Key {self.api_key}", | ||
| 412 | - "Content-Type": "application/json", | ||
| 413 | - } | ||
| 414 | - | ||
| 415 | - # Use prompt as context parameter for DeepL API (not as text prefix) | ||
| 416 | - # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself" | ||
| 417 | - # If prompt is provided, use it as context; otherwise use the default context | ||
| 418 | - api_context = prompt if prompt else context | ||
| 419 | - | ||
| 420 | - # For e-commerce, add context words to help DeepL understand the domain | ||
| 421 | - # This is especially important for single-word ambiguous terms like "车" (car vs rook) | ||
| 422 | - text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context) | ||
| 423 | - | ||
| 424 | - payload = { | ||
| 425 | - "text": [text_to_translate], | ||
| 426 | - "target_lang": target_code, | ||
| 427 | - } | ||
| 428 | - | ||
| 429 | - if source_lang: | ||
| 430 | - source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) | ||
| 431 | - payload["source_lang"] = source_code | ||
| 432 | - | ||
| 433 | - # Add context parameter (prompt or default context) | ||
| 434 | - # Context influences translation but is not translated itself | ||
| 435 | - if api_context: | ||
| 436 | - payload["context"] = api_context | ||
| 437 | - | ||
| 438 | - # Add glossary if configured | ||
| 439 | - if self.glossary_id: | ||
| 440 | - payload["glossary_id"] = self.glossary_id | ||
| 441 | - | ||
| 442 | - # Note: DeepL API v2 supports "context" parameter for additional context | ||
| 443 | - # that influences translation but is not translated itself. | ||
| 444 | - # We use prompt as context parameter when provided. | ||
| 445 | - | ||
| 446 | - try: | ||
| 447 | - response = requests.post( | ||
| 448 | - self.DEEPL_API_URL, | ||
| 449 | - headers=headers, | ||
| 450 | - json=payload, | ||
| 451 | - timeout=self.timeout | 165 | + "translation_options": { |
| 166 | + "source_lang": src_qwen, | ||
| 167 | + "target_lang": tgt_qwen, | ||
| 168 | + } | ||
| 169 | + }, | ||
| 170 | + timeout=self.timeout, | ||
| 452 | ) | 171 | ) |
| 453 | - | ||
| 454 | - if response.status_code == 200: | ||
| 455 | - data = response.json() | ||
| 456 | - if "translations" in data and len(data["translations"]) > 0: | ||
| 457 | - translated_text = data["translations"][0]["text"] | ||
| 458 | - # If we added context, extract just the term from the result | ||
| 459 | - if needs_extraction: | ||
| 460 | - translated_text = self._extract_term_from_translation( | ||
| 461 | - translated_text, text, target_code | ||
| 462 | - ) | ||
| 463 | - logger.debug( | ||
| 464 | - f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | " | ||
| 465 | - f"Translation result: '{translated_text}'" | ||
| 466 | - ) | ||
| 467 | - return translated_text | ||
| 468 | - else: | ||
| 469 | - logger.error( | ||
| 470 | - f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | " | ||
| 471 | - f"Status code: {response.status_code} | Error message: {response.text}" | ||
| 472 | - ) | 172 | + content = (completion.choices[0].message.content or "").strip() |
| 173 | + if not content: | ||
| 473 | return None | 174 | return None |
| 474 | - | ||
| 475 | - except requests.Timeout: | 175 | + logger.info("[qwen-mt] Success | src=%s tgt=%s latency=%.1fms", src_qwen, tgt_qwen, (time.time() - start) * 1000) |
| 176 | + return content | ||
| 177 | + except Exception as exc: | ||
| 476 | logger.warning( | 178 | logger.warning( |
| 477 | - f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | " | ||
| 478 | - f"Timeout: {self.timeout}s" | ||
| 479 | - ) | ||
| 480 | - return None | ||
| 481 | - except Exception as e: | ||
| 482 | - logger.error( | ||
| 483 | - f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | " | ||
| 484 | - f"Error: {e}", exc_info=True | 179 | + "[qwen-mt] Failed | src=%s tgt=%s latency=%.1fms error=%s", |
| 180 | + src_qwen, | ||
| 181 | + tgt_qwen, | ||
| 182 | + (time.time() - start) * 1000, | ||
| 183 | + exc, | ||
| 184 | + exc_info=True, | ||
| 485 | ) | 185 | ) |
| 486 | return None | 186 | return None |
| 487 | 187 | ||
| 488 | - # NOTE: _translate_deepl_free is intentionally not implemented. | ||
| 489 | - # We do not support automatic fallback to the free endpoint, to avoid | ||
| 490 | - # mixing Pro keys with https://api-free.deepl.com and related 403 errors. | ||
| 491 | - | ||
| 492 | - def translate_multi( | ||
| 493 | - self, | ||
| 494 | - text: str, | ||
| 495 | - target_langs: List[str], | ||
| 496 | - source_lang: Optional[str] = None, | ||
| 497 | - context: Optional[str] = None, | ||
| 498 | - async_mode: bool = True, | ||
| 499 | - prompt: Optional[str] = None | ||
| 500 | - ) -> Dict[str, Optional[str]]: | ||
| 501 | - """ | ||
| 502 | - Translate text to multiple target languages. | ||
| 503 | - | ||
| 504 | - In async_mode=True (default): | ||
| 505 | - - Returns cached translations immediately if available | ||
| 506 | - - For translations that can be optimized (e.g., pure numbers, already in target language), | ||
| 507 | - returns result immediately via synchronous call | ||
| 508 | - - Launches async tasks for other missing translations (non-blocking) | ||
| 509 | - - Returns None for missing translations that require async processing | ||
| 510 | - | ||
| 511 | - In async_mode=False: | ||
| 512 | - - Waits for all translations to complete (blocking) | ||
| 513 | - | ||
| 514 | - Args: | ||
| 515 | - text: Text to translate | ||
| 516 | - target_langs: List of target language codes | ||
| 517 | - source_lang: Source language code (optional) | ||
| 518 | - context: Context hint for translation (optional) | ||
| 519 | - async_mode: If True, return cached results immediately and translate missing ones async | ||
| 520 | - prompt: Translation prompt/instruction (optional) | ||
| 521 | 188 | ||
| 522 | - Returns: | ||
| 523 | - Dictionary mapping language code to translated text (only cached results in async mode) | ||
| 524 | - """ | ||
| 525 | - results = {} | ||
| 526 | - missing_langs = [] | ||
| 527 | - async_langs = [] | ||
| 528 | - | ||
| 529 | - # First, get cached translations | ||
| 530 | - for lang in target_langs: | ||
| 531 | - cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | ||
| 532 | - if cached is not None: | ||
| 533 | - results[lang] = cached | ||
| 534 | - else: | ||
| 535 | - missing_langs.append(lang) | ||
| 536 | - | ||
| 537 | - # If async mode and there are missing translations | ||
| 538 | - if async_mode and missing_langs: | ||
| 539 | - # Check if translation can be optimized (immediate return) | ||
| 540 | - for lang in missing_langs: | ||
| 541 | - target_lang = lang.lower() | ||
| 542 | - # Check optimization conditions (same as in translate method) | ||
| 543 | - can_optimize = False | ||
| 544 | - if target_lang == 'en' and self._is_english_text(text): | ||
| 545 | - can_optimize = True | ||
| 546 | - elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | ||
| 547 | - can_optimize = True | ||
| 548 | - | ||
| 549 | - if can_optimize: | ||
| 550 | - # Can be optimized, call translate synchronously for immediate result | ||
| 551 | - results[lang] = self.translate(text, lang, source_lang, context, prompt) | ||
| 552 | - else: | ||
| 553 | - # Requires actual translation, add to async list | ||
| 554 | - async_langs.append(lang) | ||
| 555 | - | ||
| 556 | - # Launch async tasks for translations that require actual API calls | ||
| 557 | - if async_langs: | ||
| 558 | - for lang in async_langs: | ||
| 559 | - self._translate_async(text, lang, source_lang, context, prompt) | ||
| 560 | - # Return None for async translations | ||
| 561 | - for lang in async_langs: | ||
| 562 | - results[lang] = None | ||
| 563 | - else: | ||
| 564 | - # Synchronous mode: wait for all translations | ||
| 565 | - for lang in missing_langs: | ||
| 566 | - results[lang] = self.translate(text, lang, source_lang, context, prompt) | ||
| 567 | - | ||
| 568 | - return results | ||
| 569 | - | ||
| 570 | - def translate_multi_async( | ||
| 571 | - self, | ||
| 572 | - text: str, | ||
| 573 | - target_langs: List[str], | ||
| 574 | - source_lang: Optional[str] = None, | ||
| 575 | - context: Optional[str] = None, | ||
| 576 | - prompt: Optional[str] = None | ||
| 577 | - ) -> Dict[str, Union[str, Future]]: | ||
| 578 | - """ | ||
| 579 | - Translate text to multiple target languages asynchronously, returning Futures that can be awaited. | ||
| 580 | - | ||
| 581 | - This method returns a dictionary where: | ||
| 582 | - - If translation is cached, the value is the translation string (immediate) | ||
| 583 | - - If translation needs to be done, the value is a Future object that can be awaited | ||
| 584 | - | ||
| 585 | - Args: | ||
| 586 | - text: Text to translate | ||
| 587 | - target_langs: List of target language codes | ||
| 588 | - source_lang: Source language code (optional) | ||
| 589 | - context: Context hint for translation (optional) | ||
| 590 | - prompt: Translation prompt/instruction (optional) | ||
| 591 | - | ||
| 592 | - Returns: | ||
| 593 | - Dictionary mapping language code to either translation string (cached) or Future object | ||
| 594 | - """ | ||
| 595 | - results = {} | ||
| 596 | - missing_langs = [] | ||
| 597 | - | ||
| 598 | - # First, get cached translations | ||
| 599 | - for lang in target_langs: | ||
| 600 | - cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | ||
| 601 | - if cached is not None: | ||
| 602 | - results[lang] = cached | ||
| 603 | - else: | ||
| 604 | - missing_langs.append(lang) | ||
| 605 | - | ||
| 606 | - # For missing translations, submit async tasks and return Futures | ||
| 607 | - for lang in missing_langs: | ||
| 608 | - future = self.executor.submit( | ||
| 609 | - self.translate, | ||
| 610 | - text, | ||
| 611 | - lang, | ||
| 612 | - source_lang, | ||
| 613 | - context, | ||
| 614 | - prompt | ||
| 615 | - ) | ||
| 616 | - results[lang] = future | ||
| 617 | - | ||
| 618 | - return results | ||
| 619 | - | ||
| 620 | - def _get_cached_translation( | ||
| 621 | - self, | ||
| 622 | - text: str, | ||
| 623 | - target_lang: str, | ||
| 624 | - source_lang: Optional[str] = None, | ||
| 625 | - context: Optional[str] = None, | ||
| 626 | - prompt: Optional[str] = None | ||
| 627 | - ) -> Optional[str]: | ||
| 628 | - """Get translation from cache if available.""" | ||
| 629 | - if not self.redis_client: | ||
| 630 | - return None | ||
| 631 | - return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | ||
| 632 | - | ||
| 633 | def _get_cached_translation_redis( | 189 | def _get_cached_translation_redis( |
| 634 | self, | 190 | self, |
| 635 | text: str, | 191 | text: str, |
| 636 | target_lang: str, | 192 | target_lang: str, |
| 637 | source_lang: Optional[str] = None, | 193 | source_lang: Optional[str] = None, |
| 638 | context: Optional[str] = None, | 194 | context: Optional[str] = None, |
| 639 | - prompt: Optional[str] = None | 195 | + prompt: Optional[str] = None, |
| 640 | ) -> Optional[str]: | 196 | ) -> Optional[str]: |
| 641 | - """ | ||
| 642 | - Get translation from Redis cache with sliding expiration. | ||
| 643 | - | ||
| 644 | - 滑动过期机制:每次访问缓存时,重置过期时间为配置的过期时间(默认720天)。 | ||
| 645 | - 这样缓存会在最后一次访问后的720天才过期,而不是写入后的720天。 | ||
| 646 | - 这确保了常用的翻译缓存不会被过早删除。 | ||
| 647 | - """ | ||
| 648 | if not self.redis_client: | 197 | if not self.redis_client: |
| 649 | return None | 198 | return None |
| 650 | - | 199 | + key = self._build_cache_key(text, target_lang, source_lang, context, prompt) |
| 651 | try: | 200 | try: |
| 652 | - # Build cache key: prefix:target_lang:text | ||
| 653 | - # For simplicity, we use target_lang and text as key | ||
| 654 | - # Context and prompt are not included in key to maximize cache hits | ||
| 655 | - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" | ||
| 656 | - value = self.redis_client.get(cache_key) | ||
| 657 | - if value: | ||
| 658 | - # Sliding expiration: reset expiration time on access | ||
| 659 | - # 每次读取缓存时,重置过期时间为配置的过期时间(最后一次访问后的N天才过期) | ||
| 660 | - try: | ||
| 661 | - self.redis_client.expire(cache_key, self.expire_seconds) | ||
| 662 | - except Exception as expire_error: | ||
| 663 | - # 即使 expire 失败,也返回缓存值(不影响功能) | ||
| 664 | - logger.warning( | ||
| 665 | - f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}" | ||
| 666 | - ) | ||
| 667 | - | ||
| 668 | - logger.debug( | ||
| 669 | - f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | " | ||
| 670 | - f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s" | ||
| 671 | - ) | ||
| 672 | - return value | ||
| 673 | - logger.debug( | ||
| 674 | - f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | " | ||
| 675 | - f"Cache key: {cache_key}" | ||
| 676 | - ) | 201 | + value = self.redis_client.get(key) |
| 202 | + if value and self.cache_sliding_expiration: | ||
| 203 | + self.redis_client.expire(key, self.expire_seconds) | ||
| 204 | + return value | ||
| 205 | + except Exception as exc: | ||
| 206 | + logger.warning("Redis get translation cache failed: %s", exc) | ||
| 677 | return None | 207 | return None |
| 678 | - except Exception as e: | ||
| 679 | - logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}") | ||
| 680 | - return None | ||
| 681 | - | 208 | + |
| 682 | def _set_cached_translation_redis( | 209 | def _set_cached_translation_redis( |
| 683 | self, | 210 | self, |
| 684 | text: str, | 211 | text: str, |
| @@ -686,128 +213,17 @@ class Translator: | @@ -686,128 +213,17 @@ class Translator: | ||
| 686 | translation: str, | 213 | translation: str, |
| 687 | source_lang: Optional[str] = None, | 214 | source_lang: Optional[str] = None, |
| 688 | context: Optional[str] = None, | 215 | context: Optional[str] = None, |
| 689 | - prompt: Optional[str] = None | 216 | + prompt: Optional[str] = None, |
| 690 | ) -> None: | 217 | ) -> None: |
| 691 | - """Store translation in Redis cache.""" | ||
| 692 | if not self.redis_client: | 218 | if not self.redis_client: |
| 693 | return | 219 | return |
| 694 | - | 220 | + key = self._build_cache_key(text, target_lang, source_lang, context, prompt) |
| 695 | try: | 221 | try: |
| 696 | - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" | ||
| 697 | - self.redis_client.setex(cache_key, self.expire_seconds, translation) | ||
| 698 | - logger.info( | ||
| 699 | - f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | " | ||
| 700 | - f"Cache key: {cache_key} | Translation result: '{translation}'" | ||
| 701 | - ) | ||
| 702 | - except Exception as e: | ||
| 703 | - logger.error( | ||
| 704 | - f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | " | ||
| 705 | - f"Error: {e}" | ||
| 706 | - ) | ||
| 707 | - | ||
| 708 | - def _translate_async( | ||
| 709 | - self, | ||
| 710 | - text: str, | ||
| 711 | - target_lang: str, | ||
| 712 | - source_lang: Optional[str] = None, | ||
| 713 | - context: Optional[str] = None, | ||
| 714 | - prompt: Optional[str] = None | ||
| 715 | - ): | ||
| 716 | - """Launch async translation task.""" | ||
| 717 | - def _do_translate(): | ||
| 718 | - try: | ||
| 719 | - result = self.translate(text, target_lang, source_lang, context, prompt) | ||
| 720 | - if result: | ||
| 721 | - logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}") | ||
| 722 | - except Exception as e: | ||
| 723 | - logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}") | ||
| 724 | - | ||
| 725 | - self.executor.submit(_do_translate) | ||
| 726 | - | ||
| 727 | - def _add_ecommerce_context( | ||
| 728 | - self, | ||
| 729 | - text: str, | ||
| 730 | - source_lang: Optional[str], | ||
| 731 | - context: Optional[str] | ||
| 732 | - ) -> tuple: | ||
| 733 | - """ | ||
| 734 | - Add e-commerce context to text for better disambiguation. | ||
| 735 | - | ||
| 736 | - For single-word ambiguous Chinese terms, we add context words that help | ||
| 737 | - DeepL understand this is an e-commerce/product search context. | ||
| 738 | - | ||
| 739 | - Args: | ||
| 740 | - text: Original text to translate | ||
| 741 | - source_lang: Source language code | ||
| 742 | - context: Context hint | ||
| 743 | - | ||
| 744 | - Returns: | ||
| 745 | - Tuple of (text_with_context, needs_extraction) | ||
| 746 | - - text_with_context: Text to send to DeepL | ||
| 747 | - - needs_extraction: Whether we need to extract the term from the result | ||
| 748 | - """ | ||
| 749 | - # Only apply for e-commerce context and Chinese source | ||
| 750 | - if not context or "e-commerce" not in context.lower(): | ||
| 751 | - return text, False | ||
| 752 | - | ||
| 753 | - if not source_lang or source_lang.lower() != 'zh': | ||
| 754 | - return text, False | ||
| 755 | - | ||
| 756 | - # For single-word queries, add context to help disambiguation | ||
| 757 | - text_stripped = text.strip() | ||
| 758 | - if len(text_stripped.split()) == 1 and len(text_stripped) <= 2: | ||
| 759 | - # Common ambiguous Chinese e-commerce terms like "车" (car vs rook) | ||
| 760 | - # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term]) | ||
| 761 | - # This helps DeepL understand the e-commerce context | ||
| 762 | - # We'll need to extract just the term from the translation result | ||
| 763 | - context_phrase = f"购买 {text_stripped}" | ||
| 764 | - return context_phrase, True | ||
| 765 | - | ||
| 766 | - # For multi-word queries, DeepL usually has enough context | ||
| 767 | - return text, False | ||
| 768 | - | ||
| 769 | - def _extract_term_from_translation( | ||
| 770 | - self, | ||
| 771 | - translated_text: str, | ||
| 772 | - original_text: str, | ||
| 773 | - target_lang_code: str | ||
| 774 | - ) -> str: | ||
| 775 | - """ | ||
| 776 | - Extract the actual term from a translation that included context. | ||
| 777 | - | ||
| 778 | - For example, if we translated "购买 车" (buy car) and got "buy car", | ||
| 779 | - we want to extract just "car". | ||
| 780 | - | ||
| 781 | - Args: | ||
| 782 | - translated_text: Full translation result | ||
| 783 | - original_text: Original single-word query | ||
| 784 | - target_lang_code: Target language code (EN, ZH, etc.) | ||
| 785 | - | ||
| 786 | - Returns: | ||
| 787 | - Extracted term or original translation if extraction fails | ||
| 788 | - """ | ||
| 789 | - # For English target, try to extract the last word (the actual term) | ||
| 790 | - if target_lang_code == "EN": | ||
| 791 | - words = translated_text.strip().split() | ||
| 792 | - if len(words) > 1: | ||
| 793 | - # Usually the last word is the term we want | ||
| 794 | - # But we need to be smart - if it's "buy car", we want "car" | ||
| 795 | - # Common context words to skip: buy, purchase, product, item, etc. | ||
| 796 | - context_words = {"buy", "purchase", "product", "item", "commodity", "goods"} | ||
| 797 | - # Try to find the term (not a context word) | ||
| 798 | - for word in reversed(words): | ||
| 799 | - word_lower = word.lower().rstrip('.,!?;:') | ||
| 800 | - if word_lower not in context_words: | ||
| 801 | - return word_lower | ||
| 802 | - # If all words are context words, return the last one | ||
| 803 | - return words[-1].lower().rstrip('.,!?;:') | ||
| 804 | - | ||
| 805 | - # For other languages or if extraction fails, return as-is | ||
| 806 | - # The user can configure a glossary for better results | ||
| 807 | - return translated_text | 222 | + self.redis_client.setex(key, self.expire_seconds, translation) |
| 223 | + except Exception as exc: | ||
| 224 | + logger.warning("Redis set translation cache failed: %s", exc) | ||
| 808 | 225 | ||
| 809 | def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool: | 226 | def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool: |
| 810 | - """True if shop language matches index language (use source, no translate).""" | ||
| 811 | if not shop_lang_lower or not lang_code: | 227 | if not shop_lang_lower or not lang_code: |
| 812 | return False | 228 | return False |
| 813 | if shop_lang_lower == lang_code: | 229 | if shop_lang_lower == lang_code: |
| @@ -818,146 +234,27 @@ class Translator: | @@ -818,146 +234,27 @@ class Translator: | ||
| 818 | return True | 234 | return True |
| 819 | return False | 235 | return False |
| 820 | 236 | ||
| 821 | - def translate_for_indexing( | ||
| 822 | - self, | ||
| 823 | - text: str, | ||
| 824 | - shop_language: str, | ||
| 825 | - source_lang: Optional[str] = None, | ||
| 826 | - context: Optional[str] = None, | ||
| 827 | - prompt: Optional[str] = None, | ||
| 828 | - index_languages: Optional[List[str]] = None, | ||
| 829 | - ) -> Dict[str, Optional[str]]: | ||
| 830 | - """ | ||
| 831 | - Translate text for indexing based on shop language and tenant index_languages. | ||
| 832 | - | ||
| 833 | - For each language in index_languages: use source text if shop language matches, | ||
| 834 | - otherwise translate to that language. | ||
| 835 | - | ||
| 836 | - Args: | ||
| 837 | - text: Text to translate | ||
| 838 | - shop_language: Shop primary language (e.g. 'zh', 'en', 'ru') | ||
| 839 | - source_lang: Source language code (optional) | ||
| 840 | - context: Additional context for translation (optional) | ||
| 841 | - prompt: Translation prompt (optional) | ||
| 842 | - index_languages: Languages to index (from tenant_config). Default ["en", "zh"]. | ||
| 843 | - | ||
| 844 | - Returns: | ||
| 845 | - Dict keyed by each index_language with translated or source text (or None). | ||
| 846 | - """ | ||
| 847 | - langs = index_languages if index_languages else ["en", "zh"] | ||
| 848 | - results = {lang: None for lang in langs} | ||
| 849 | - if not text or not text.strip(): | ||
| 850 | - return results | ||
| 851 | - if re.match(r'^[\d\s_-]+$', text): | ||
| 852 | - logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'") | ||
| 853 | - return results | ||
| 854 | - | ||
| 855 | - shop_lang_lower = (shop_language or "").strip().lower() | ||
| 856 | - targets = [] | ||
| 857 | - for lang in langs: | ||
| 858 | - if self._shop_lang_matches(shop_lang_lower, lang): | ||
| 859 | - results[lang] = text | ||
| 860 | - else: | ||
| 861 | - targets.append(lang) | ||
| 862 | - | ||
| 863 | - for target_lang in targets: | ||
| 864 | - cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | ||
| 865 | - if cached: | ||
| 866 | - results[target_lang] = cached | ||
| 867 | - logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}") | ||
| 868 | - continue | ||
| 869 | - translated = self.translate( | ||
| 870 | - text, | ||
| 871 | - target_lang=target_lang, | ||
| 872 | - source_lang=source_lang or shop_language, | ||
| 873 | - context=context, | ||
| 874 | - prompt=prompt, | ||
| 875 | - ) | ||
| 876 | - results[target_lang] = translated | ||
| 877 | - return results | ||
| 878 | - | ||
| 879 | - def get_translation_needs( | ||
| 880 | - self, | ||
| 881 | - detected_lang: str, | ||
| 882 | - supported_langs: List[str] | ||
| 883 | - ) -> List[str]: | ||
| 884 | - """ | ||
| 885 | - Determine which languages need translation. | ||
| 886 | - | ||
| 887 | - Args: | ||
| 888 | - detected_lang: Detected query language | ||
| 889 | - supported_langs: List of supported languages | ||
| 890 | - | ||
| 891 | - Returns: | ||
| 892 | - List of language codes to translate to | ||
| 893 | - """ | ||
| 894 | - # If detected language is in supported list, translate to others | 237 | + def get_translation_needs(self, detected_lang: str, supported_langs: List[str]) -> List[str]: |
| 895 | if detected_lang in supported_langs: | 238 | if detected_lang in supported_langs: |
| 896 | - return [lang for lang in supported_langs if detected_lang != lang] | ||
| 897 | - | ||
| 898 | - # Otherwise, translate to all supported languages | 239 | + return [lang for lang in supported_langs if lang != detected_lang] |
| 899 | return supported_langs | 240 | return supported_langs |
| 900 | - | 241 | + |
| 901 | def _is_english_text(self, text: str) -> bool: | 242 | def _is_english_text(self, text: str) -> bool: |
| 902 | - """ | ||
| 903 | - Check if text is primarily English (ASCII letters, numbers, common punctuation). | ||
| 904 | - | ||
| 905 | - Args: | ||
| 906 | - text: Text to check | ||
| 907 | - | ||
| 908 | - Returns: | ||
| 909 | - True if text appears to be English | ||
| 910 | - """ | ||
| 911 | if not text or not text.strip(): | 243 | if not text or not text.strip(): |
| 912 | return True | 244 | return True |
| 913 | - | ||
| 914 | - # Remove whitespace and common punctuation | ||
| 915 | - text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) | 245 | + text_clean = re.sub(r"[\s\.,!?;:\-\'\"\(\)\[\]{}]", "", text) |
| 916 | if not text_clean: | 246 | if not text_clean: |
| 917 | return True | 247 | return True |
| 918 | - | ||
| 919 | - # Check if all remaining characters are ASCII (letters, numbers) | ||
| 920 | - # This is a simple heuristic: if most characters are ASCII, it's likely English | ||
| 921 | ascii_count = sum(1 for c in text_clean if ord(c) < 128) | 248 | ascii_count = sum(1 for c in text_clean if ord(c) < 128) |
| 922 | - ratio = ascii_count / len(text_clean) if text_clean else 0 | ||
| 923 | - | ||
| 924 | - # If more than 80% are ASCII characters, consider it English | ||
| 925 | - return ratio > 0.8 | ||
| 926 | - | 249 | + return (ascii_count / len(text_clean)) > 0.8 |
| 250 | + | ||
| 927 | def _contains_chinese(self, text: str) -> bool: | 251 | def _contains_chinese(self, text: str) -> bool: |
| 928 | - """ | ||
| 929 | - Check if text contains Chinese characters (Han characters). | ||
| 930 | - | ||
| 931 | - Args: | ||
| 932 | - text: Text to check | ||
| 933 | - | ||
| 934 | - Returns: | ||
| 935 | - True if text contains Chinese characters | ||
| 936 | - """ | ||
| 937 | if not text: | 252 | if not text: |
| 938 | return False | 253 | return False |
| 939 | - | ||
| 940 | - # Check for Chinese characters (Unicode range: \u4e00-\u9fff) | ||
| 941 | - chinese_pattern = re.compile(r'[\u4e00-\u9fff]') | ||
| 942 | - return bool(chinese_pattern.search(text)) | ||
| 943 | - | 254 | + return bool(re.search(r"[\u4e00-\u9fff]", text)) |
| 255 | + | ||
| 944 | def _is_pure_number(self, text: str) -> bool: | 256 | def _is_pure_number(self, text: str) -> bool: |
| 945 | - """ | ||
| 946 | - Check if text is purely numeric (digits, possibly with spaces, dots, commas). | ||
| 947 | - | ||
| 948 | - Args: | ||
| 949 | - text: Text to check | ||
| 950 | - | ||
| 951 | - Returns: | ||
| 952 | - True if text is purely numeric | ||
| 953 | - """ | ||
| 954 | if not text or not text.strip(): | 257 | if not text or not text.strip(): |
| 955 | return False | 258 | return False |
| 956 | - | ||
| 957 | - # Remove whitespace, dots, commas (common number separators) | ||
| 958 | - text_clean = re.sub(r'[\s\.,]', '', text.strip()) | ||
| 959 | - if not text_clean: | ||
| 960 | - return False | ||
| 961 | - | ||
| 962 | - # Check if all remaining characters are digits | ||
| 963 | - return text_clean.isdigit() | 259 | + text_clean = re.sub(r"[\s\.,]", "", text.strip()) |
| 260 | + return bool(text_clean) and text_clean.isdigit() |
query/test_translation.py
| @@ -14,6 +14,7 @@ Test content: | @@ -14,6 +14,7 @@ Test content: | ||
| 14 | import sys | 14 | import sys |
| 15 | import os | 15 | import os |
| 16 | from pathlib import Path | 16 | from pathlib import Path |
| 17 | +from concurrent.futures import ThreadPoolExecutor | ||
| 17 | 18 | ||
| 18 | # Add parent directory to path | 19 | # Add parent directory to path |
| 19 | sys.path.insert(0, str(Path(__file__).parent.parent)) | 20 | sys.path.insert(0, str(Path(__file__).parent.parent)) |
| @@ -42,9 +43,6 @@ def test_config_loading(): | @@ -42,9 +43,6 @@ def test_config_loading(): | ||
| 42 | 43 | ||
| 43 | print(f"✓ Configuration loaded successfully") | 44 | print(f"✓ Configuration loaded successfully") |
| 44 | print(f" Translation service: {config.query_config.translation_service}") | 45 | print(f" Translation service: {config.query_config.translation_service}") |
| 45 | - print(f" Translation prompt configuration:") | ||
| 46 | - for key, value in config.query_config.translation_prompts.items(): | ||
| 47 | - print(f" {key}: {value[:60]}..." if len(value) > 60 else f" {key}: {value}") | ||
| 48 | 46 | ||
| 49 | return config | 47 | return config |
| 50 | except Exception as e: | 48 | except Exception as e: |
| @@ -72,34 +70,23 @@ def test_translator_sync(config): | @@ -72,34 +70,23 @@ def test_translator_sync(config): | ||
| 72 | translation_context=config.query_config.translation_context | 70 | translation_context=config.query_config.translation_context |
| 73 | ) | 71 | ) |
| 74 | 72 | ||
| 75 | - # 测试商品标题翻译(使用product_title提示词) | 73 | + # 测试商品标题翻译(使用sku_name提示词) |
| 76 | test_texts = [ | 74 | test_texts = [ |
| 77 | - ("蓝牙耳机", "zh", "en", "product_title"), | ||
| 78 | - ("Wireless Headphones", "en", "zh", "product_title"), | 75 | + ("蓝牙耳机", "zh", "en", "sku_name"), |
| 76 | + ("Wireless Headphones", "en", "zh", "sku_name"), | ||
| 79 | ] | 77 | ] |
| 80 | 78 | ||
| 81 | - for text, source_lang, target_lang, prompt_type in test_texts: | ||
| 82 | - if prompt_type == "product_title": | ||
| 83 | - if target_lang == "zh": | ||
| 84 | - prompt = config.query_config.translation_prompts.get('product_title_zh') | ||
| 85 | - else: | ||
| 86 | - prompt = config.query_config.translation_prompts.get('product_title_en') | ||
| 87 | - else: | ||
| 88 | - if target_lang == "zh": | ||
| 89 | - prompt = config.query_config.translation_prompts.get('default_zh') | ||
| 90 | - else: | ||
| 91 | - prompt = config.query_config.translation_prompts.get('default_en') | ||
| 92 | - | 79 | + for text, source_lang, target_lang, scene in test_texts: |
| 93 | print(f"\nTranslation test:") | 80 | print(f"\nTranslation test:") |
| 94 | print(f" Original text ({source_lang}): {text}") | 81 | print(f" Original text ({source_lang}): {text}") |
| 95 | print(f" Target language: {target_lang}") | 82 | print(f" Target language: {target_lang}") |
| 96 | - print(f" Prompt: {prompt[:50] if prompt else 'None'}...") | 83 | + print(f" Scene: {scene}") |
| 97 | 84 | ||
| 98 | result = translator.translate( | 85 | result = translator.translate( |
| 99 | text, | 86 | text, |
| 100 | target_lang=target_lang, | 87 | target_lang=target_lang, |
| 101 | source_lang=source_lang, | 88 | source_lang=source_lang, |
| 102 | - prompt=prompt | 89 | + context=scene, |
| 103 | ) | 90 | ) |
| 104 | 91 | ||
| 105 | if result: | 92 | if result: |
| @@ -131,43 +118,25 @@ def test_translator_async(config, translator): | @@ -131,43 +118,25 @@ def test_translator_async(config, translator): | ||
| 131 | query_text = "手机" | 118 | query_text = "手机" |
| 132 | target_langs = ['en'] | 119 | target_langs = ['en'] |
| 133 | source_lang = 'zh' | 120 | source_lang = 'zh' |
| 134 | - | ||
| 135 | - query_prompt = config.query_config.translation_prompts.get('query_zh') | ||
| 136 | - | 121 | + |
| 137 | print(f"Query text: {query_text}") | 122 | print(f"Query text: {query_text}") |
| 138 | print(f"Target languages: {target_langs}") | 123 | print(f"Target languages: {target_langs}") |
| 139 | - print(f"Prompt: {query_prompt}") | ||
| 140 | - | ||
| 141 | - # 异步模式(立即返回,后台翻译) | ||
| 142 | - results = translator.translate_multi( | ||
| 143 | - query_text, | ||
| 144 | - target_langs, | ||
| 145 | - source_lang=source_lang, | ||
| 146 | - context=config.query_config.translation_context, | ||
| 147 | - async_mode=True, | ||
| 148 | - prompt=query_prompt | ||
| 149 | - ) | ||
| 150 | - | ||
| 151 | - print(f"\nAsynchronous translation results:") | ||
| 152 | - for lang, translation in results.items(): | ||
| 153 | - if translation: | ||
| 154 | - print(f" {lang}: {translation} (cache hit)") | ||
| 155 | - else: | ||
| 156 | - print(f" {lang}: None (translating in background...)") | ||
| 157 | - | ||
| 158 | - # 同步模式(等待完成) | ||
| 159 | - print(f"\nSynchronous translation (waiting for completion):") | ||
| 160 | - results_sync = translator.translate_multi( | ||
| 161 | - query_text, | ||
| 162 | - target_langs, | ||
| 163 | - source_lang=source_lang, | ||
| 164 | - context=config.query_config.translation_context, | ||
| 165 | - async_mode=False, | ||
| 166 | - prompt=query_prompt | ||
| 167 | - ) | 124 | + print("Scene: ecommerce_search_query") |
| 168 | 125 | ||
| 169 | - for lang, translation in results_sync.items(): | ||
| 170 | - print(f" {lang}: {translation}") | 126 | + print(f"\nConcurrent translation via generic translate():") |
| 127 | + with ThreadPoolExecutor(max_workers=len(target_langs)) as executor: | ||
| 128 | + futures = { | ||
| 129 | + lang: executor.submit( | ||
| 130 | + translator.translate, | ||
| 131 | + query_text, | ||
| 132 | + lang, | ||
| 133 | + source_lang, | ||
| 134 | + "ecommerce_search_query", | ||
| 135 | + ) | ||
| 136 | + for lang in target_langs | ||
| 137 | + } | ||
| 138 | + for lang, future in futures.items(): | ||
| 139 | + print(f" {lang}: {future.result()}") | ||
| 171 | 140 | ||
| 172 | except Exception as e: | 141 | except Exception as e: |
| 173 | print(f"✗ Asynchronous translation test failed: {e}") | 142 | print(f"✗ Asynchronous translation test failed: {e}") |
| @@ -193,14 +162,13 @@ def test_cache(): | @@ -193,14 +162,13 @@ def test_cache(): | ||
| 193 | test_text = "测试文本" | 162 | test_text = "测试文本" |
| 194 | target_lang = "en" | 163 | target_lang = "en" |
| 195 | source_lang = "zh" | 164 | source_lang = "zh" |
| 196 | - prompt = config.query_config.translation_prompts.get('default_zh') | ||
| 197 | 165 | ||
| 198 | print(f"First translation (should call API or return mock):") | 166 | print(f"First translation (should call API or return mock):") |
| 199 | - result1 = translator.translate(test_text, target_lang, source_lang, prompt=prompt) | 167 | + result1 = translator.translate(test_text, target_lang, source_lang, context="default") |
| 200 | print(f" Result: {result1}") | 168 | print(f" Result: {result1}") |
| 201 | 169 | ||
| 202 | print(f"\nSecond translation (should use cache):") | 170 | print(f"\nSecond translation (should use cache):") |
| 203 | - result2 = translator.translate(test_text, target_lang, source_lang, prompt=prompt) | 171 | + result2 = translator.translate(test_text, target_lang, source_lang, context="default") |
| 204 | print(f" Result: {result2}") | 172 | print(f" Result: {result2}") |
| 205 | 173 | ||
| 206 | if result1 == result2: | 174 | if result1 == result2: |
| @@ -231,17 +199,16 @@ def test_context_parameter(): | @@ -231,17 +199,16 @@ def test_context_parameter(): | ||
| 231 | 199 | ||
| 232 | # 测试带context和不带context的翻译 | 200 | # 测试带context和不带context的翻译 |
| 233 | text = "手机" | 201 | text = "手机" |
| 234 | - prompt = config.query_config.translation_prompts.get('query_zh') | ||
| 235 | 202 | ||
| 236 | print(f"Test text: {text}") | 203 | print(f"Test text: {text}") |
| 237 | - print(f"Prompt (as context): {prompt}") | 204 | + print("Scene: ecommerce_search_query") |
| 238 | 205 | ||
| 239 | # 带context的翻译 | 206 | # 带context的翻译 |
| 240 | result_with_context = translator.translate( | 207 | result_with_context = translator.translate( |
| 241 | text, | 208 | text, |
| 242 | target_lang='en', | 209 | target_lang='en', |
| 243 | source_lang='zh', | 210 | source_lang='zh', |
| 244 | - prompt=prompt | 211 | + context="ecommerce_search_query", |
| 245 | ) | 212 | ) |
| 246 | print(f"\nTranslation result with context: {result_with_context}") | 213 | print(f"\nTranslation result with context: {result_with_context}") |
| 247 | 214 |
query/translator.py deleted
| @@ -1,963 +0,0 @@ | @@ -1,963 +0,0 @@ | ||
| 1 | -""" | ||
| 2 | -Translation service for multi-language query support. | ||
| 3 | - | ||
| 4 | -Supports multiple translation models: | ||
| 5 | -- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model | ||
| 6 | -- DeepL: DeepL API for high-quality translations | ||
| 7 | - | ||
| 8 | -重要说明(Qwen 机翻限速): | ||
| 9 | -- 当前默认使用的 `qwen-mt-flash` 为云端机翻模型,**官方限速较低,约 RPM=60(每分钟约 60 请求)** | ||
| 10 | -- 在高并发场景必须依赖 Redis 翻译缓存与批量预热,避免在用户实时请求路径上直接打满 DashScope 限流 | ||
| 11 | -- 若业务侧存在大规模离线翻译或更高吞吐需求,建议评估 DeepL 或自建翻译后端 | ||
| 12 | - | ||
| 13 | -使用方法 (Usage): | ||
| 14 | - | ||
| 15 | -```python | ||
| 16 | -from query.qwen_mt_translate import Translator | ||
| 17 | - | ||
| 18 | -# 使用默认的 qwen 模型(推荐) | ||
| 19 | -translator = Translator() # 默认使用 qwen 模型 | ||
| 20 | - | ||
| 21 | -# 或显式指定模型 | ||
| 22 | -translator = Translator(model='qwen') # 使用 qwen 模型 | ||
| 23 | -translator = Translator(model='deepl') # 使用 DeepL 模型 | ||
| 24 | - | ||
| 25 | -# 翻译文本 | ||
| 26 | -result = translator.translate( | ||
| 27 | - text="我看到这个视频后没有笑", | ||
| 28 | - target_lang="en", | ||
| 29 | - source_lang="auto" # 自动检测源语言 | ||
| 30 | -) | ||
| 31 | -``` | ||
| 32 | - | ||
| 33 | -配置说明 (Configuration): | ||
| 34 | -- Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中) | ||
| 35 | -- DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中) | ||
| 36 | - | ||
| 37 | -Qwen 模型参考文档: | ||
| 38 | -- 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key | ||
| 39 | -- 模型:qwen-mt-flash(快速翻译模型) | ||
| 40 | - | ||
| 41 | -DeepL 官方文档: | ||
| 42 | -https://developers.deepl.com/api-reference/translate/request-translation | ||
| 43 | -""" | ||
| 44 | - | ||
| 45 | -import os | ||
| 46 | -import requests | ||
| 47 | -import re | ||
| 48 | -import redis | ||
| 49 | -from concurrent.futures import ThreadPoolExecutor, Future | ||
| 50 | -from datetime import timedelta | ||
| 51 | -from typing import Dict, List, Optional, Union | ||
| 52 | -import logging | ||
| 53 | -import time | ||
| 54 | - | ||
| 55 | -logger = logging.getLogger(__name__) | ||
| 56 | - | ||
| 57 | -from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG | ||
| 58 | -from openai import OpenAI | ||
| 59 | - | ||
| 60 | - | ||
| 61 | -class Translator: | ||
| 62 | - """ | ||
| 63 | - Multi-language translator supporting Qwen and DeepL APIs. | ||
| 64 | - | ||
| 65 | - Default model is 'qwen' which uses Alibaba Cloud DashScope API. | ||
| 66 | - """ | ||
| 67 | -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 | ||
| 68 | -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | ||
| 69 | -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 | ||
| 70 | - | ||
| 71 | - DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier | ||
| 72 | - QWEN_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" # 美国(弗吉尼亚)地域 | |
| 73 | - # QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" # 新加坡 | ||
| 74 | - # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | ||
| 75 | - QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型 | ||
| 76 | - | ||
| 77 | - # Language code mapping | ||
| 78 | - LANG_CODE_MAP = { | ||
| 79 | - 'zh': 'ZH', | ||
| 80 | - 'en': 'EN', | ||
| 81 | - 'ru': 'RU', | ||
| 82 | - 'ar': 'AR', | ||
| 83 | - 'ja': 'JA', | ||
| 84 | - 'es': 'ES', | ||
| 85 | - 'de': 'DE', | ||
| 86 | - 'fr': 'FR', | ||
| 87 | - 'it': 'IT', | ||
| 88 | - 'pt': 'PT', | ||
| 89 | - } | ||
| 90 | - | ||
| 91 | - def __init__( | ||
| 92 | - self, | ||
| 93 | - model: str = "qwen", | ||
| 94 | - api_key: Optional[str] = None, | ||
| 95 | - use_cache: bool = True, | ||
| 96 | - timeout: int = 10, | ||
| 97 | - glossary_id: Optional[str] = None, | ||
| 98 | - translation_context: Optional[str] = None | ||
| 99 | - ): | ||
| 100 | - """ | ||
| 101 | - Initialize translator. | ||
| 102 | - | ||
| 103 | - Args: | ||
| 104 | - model: Translation model to use. Options: 'qwen' (default) or 'deepl' | ||
| 105 | - api_key: API key for the selected model (or None to use from config/env) | ||
| 106 | - use_cache: Whether to cache translations | ||
| 107 | - timeout: Request timeout in seconds | ||
| 108 | - glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL) | ||
| 109 | - translation_context: Context hint for translation (e.g., "e-commerce", "product search") | ||
| 110 | - """ | ||
| 111 | - self.model = model.lower() | ||
| 112 | - if self.model not in ['qwen', 'deepl']: | ||
| 113 | - raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'") | ||
| 114 | - | ||
| 115 | - # Get API key from config if not provided | ||
| 116 | - if api_key is None: | ||
| 117 | - if self.model == 'qwen': | ||
| 118 | - api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") | ||
| 119 | - else: # deepl | ||
| 120 | - api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY") | ||
| 121 | - | ||
| 122 | - self.api_key = api_key | ||
| 123 | - self.timeout = timeout | ||
| 124 | - self.use_cache = use_cache | ||
| 125 | - self.glossary_id = glossary_id | ||
| 126 | - self.translation_context = translation_context or "e-commerce product search" | ||
| 127 | - | ||
| 128 | - # Initialize OpenAI client for Qwen if needed | ||
| 129 | - self.qwen_client = None | ||
| 130 | - if self.model == 'qwen': | ||
| 131 | - if not self.api_key: | ||
| 132 | - logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.") | ||
| 133 | - else: | ||
| 134 | - self.qwen_client = OpenAI( | ||
| 135 | - api_key=self.api_key, | ||
| 136 | - base_url=self.QWEN_BASE_URL, | ||
| 137 | - ) | ||
| 138 | - | ||
| 139 | - # Initialize Redis cache if enabled | ||
| 140 | - if use_cache: | ||
| 141 | - try: | ||
| 142 | - self.redis_client = redis.Redis( | ||
| 143 | - host=REDIS_CONFIG.get('host', 'localhost'), | ||
| 144 | - port=REDIS_CONFIG.get('port', 6479), | ||
| 145 | - password=REDIS_CONFIG.get('password'), | ||
| 146 | - decode_responses=True, # Return str instead of bytes | ||
| 147 | - socket_timeout=REDIS_CONFIG.get('socket_timeout', 1), | ||
| 148 | - socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1), | ||
| 149 | - retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False), | ||
| 150 | - health_check_interval=10, # 避免复用坏连接 | ||
| 151 | - ) | ||
| 152 | - # Test connection | ||
| 153 | - self.redis_client.ping() | ||
| 154 | - expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360) | ||
| 155 | - self.expire_time = timedelta(days=expire_days) | ||
| 156 | - self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数 | ||
| 157 | - self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans') | ||
| 158 | - logger.info("Redis cache initialized for translations") | ||
| 159 | - except Exception as e: | ||
| 160 | - logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache") | ||
| 161 | - self.redis_client = None | ||
| 162 | - self.cache = None | ||
| 163 | - else: | ||
| 164 | - self.redis_client = None | ||
| 165 | - self.cache = None | ||
| 166 | - | ||
| 167 | - # Thread pool for async translation | ||
| 168 | - self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") | ||
| 169 | - | ||
| 170 | - def translate( | ||
| 171 | - self, | ||
| 172 | - text: str, | ||
| 173 | - target_lang: str, | ||
| 174 | - source_lang: Optional[str] = None, | ||
| 175 | - context: Optional[str] = None, | ||
| 176 | - prompt: Optional[str] = None | ||
| 177 | - ) -> Optional[str]: | ||
| 178 | - """ | ||
| 179 | - Translate text to target language (synchronous mode). | ||
| 180 | - | ||
| 181 | - Args: | ||
| 182 | - text: Text to translate | ||
| 183 | - target_lang: Target language code ('zh', 'en', 'ru', etc.) | ||
| 184 | - source_lang: Source language code (optional, auto-detect if None) | |
| 185 | - context: Additional context for translation (overrides default context) | ||
| 186 | - prompt: Translation prompt/instruction (optional, for better translation quality) | ||
| 187 | - | ||
| 188 | - Returns: | ||
| 189 | - Translated text or None if translation fails | ||
| 190 | - """ | ||
| 191 | - if not text or not text.strip(): | ||
| 192 | - return text | ||
| 193 | - | ||
| 194 | - # Normalize language codes | ||
| 195 | - target_lang = target_lang.lower() | ||
| 196 | - if source_lang: | ||
| 197 | - source_lang = source_lang.lower() | ||
| 198 | - | ||
| 199 | - # Optimization: Skip translation if not needed | ||
| 200 | - if target_lang == 'en' and self._is_english_text(text): | ||
| 201 | - logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") | ||
| 202 | - return text | ||
| 203 | - | ||
| 204 | - if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | ||
| 205 | - logger.info( | ||
| 206 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 207 | - f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)" | ||
| 208 | - ) | ||
| 209 | - return text | ||
| 210 | - | ||
| 211 | - # Use provided context or default context | ||
| 212 | - translation_context = context or self.translation_context | ||
| 213 | - | ||
| 214 | - # Build cache key (include prompt in cache key if provided) | ||
| 215 | - cache_key_parts = [source_lang or 'auto', target_lang, translation_context] | ||
| 216 | - if prompt: | ||
| 217 | - cache_key_parts.append(prompt) | ||
| 218 | - cache_key_parts.append(text) | ||
| 219 | - cache_key = ':'.join(cache_key_parts) | ||
| 220 | - | ||
| 221 | - # Check cache (include context and prompt in cache key for accuracy) | ||
| 222 | - if self.use_cache and self.redis_client: | ||
| 223 | - cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt) | ||
| 224 | - if cached: | ||
| 225 | - logger.info( | ||
| 226 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 227 | - f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit" | ||
| 228 | - ) | ||
| 229 | - return cached | ||
| 230 | - | ||
| 231 | - # If no API key, return mock translation (for testing) | ||
| 232 | - if not self.api_key: | ||
| 233 | - logger.info( | ||
| 234 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 235 | - f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)" | ||
| 236 | - ) | ||
| 237 | - return text | ||
| 238 | - | ||
| 239 | - # Translate using selected model | ||
| 240 | - logger.info( | ||
| 241 | - f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | " | ||
| 242 | - f"Source language: {source_lang or 'auto'} | Context: {translation_context} | " | ||
| 243 | - f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation" | ||
| 244 | - ) | ||
| 245 | - | ||
| 246 | - if self.model == 'qwen': | ||
| 247 | - result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt) | ||
| 248 | - else: # deepl | ||
| 249 | - result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) | ||
| 250 | - | ||
| 251 | - # Surface translation failure to the caller instead of silently | ||
| 252 | - # masquerading the source text as a successful translation. | ||
| 253 | - if result is None: | ||
| 254 | - logger.warning( | ||
| 255 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 256 | - f"Source language: {source_lang or 'auto'} | Status: Translation failed" | ||
| 257 | - ) | ||
| 258 | - else: | ||
| 259 | - logger.info( | ||
| 260 | - f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | ||
| 261 | - f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" | ||
| 262 | - ) | ||
| 263 | - | ||
| 264 | - # Cache only successful translations. Failed attempts must not poison | ||
| 265 | - # Redis with the original text. | ||
| 266 | - if result is not None and self.use_cache and self.redis_client: | ||
| 267 | - self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) | ||
| 268 | - | ||
| 269 | - return result | ||
| 270 | - | ||
| 271 | - def _translate_qwen( | ||
| 272 | - self, | ||
| 273 | - text: str, | ||
| 274 | - target_lang: str, | ||
| 275 | - source_lang: Optional[str], | ||
| 276 | - context: Optional[str] = None, | ||
| 277 | - prompt: Optional[str] = None | ||
| 278 | - ) -> Optional[str]: | ||
| 279 | - """ | ||
| 280 | - Translate using Qwen MT Flash model via Alibaba Cloud DashScope API. | ||
| 281 | - | ||
| 282 | - Args: | ||
| 283 | - text: Text to translate | ||
| 284 | - target_lang: Target language code ('zh', 'en', 'ru', etc.) | ||
| 285 | - source_lang: Source language code (optional, 'auto' if None) | ||
| 286 | - context: Context hint for translation (optional) | ||
| 287 | - prompt: Translation prompt/instruction (optional) | ||
| 288 | - | ||
| 289 | - Returns: | ||
| 290 | - Translated text or None if translation fails | ||
| 291 | - """ | ||
| 292 | - if not self.qwen_client: | ||
| 293 | - logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.") | ||
| 294 | - return None | ||
| 295 | - | ||
| 296 | - # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping | ||
| 297 | - # 标准来自:你提供的“语言 / 英文名 / 代码”表 | ||
| 298 | - qwen_lang_map = { | ||
| 299 | - "en": "English", | ||
| 300 | - "zh": "Chinese", | ||
| 301 | - "zh_tw": "Traditional Chinese", | ||
| 302 | - "ru": "Russian", | ||
| 303 | - "ja": "Japanese", | ||
| 304 | - "ko": "Korean", | ||
| 305 | - "es": "Spanish", | ||
| 306 | - "fr": "French", | ||
| 307 | - "pt": "Portuguese", | ||
| 308 | - "de": "German", | ||
| 309 | - "it": "Italian", | ||
| 310 | - "th": "Thai", | ||
| 311 | - "vi": "Vietnamese", | ||
| 312 | - "id": "Indonesian", | ||
| 313 | - "ms": "Malay", | ||
| 314 | - "ar": "Arabic", | ||
| 315 | - "hi": "Hindi", | ||
| 316 | - "he": "Hebrew", | ||
| 317 | - "my": "Burmese", | ||
| 318 | - "ta": "Tamil", | ||
| 319 | - "ur": "Urdu", | ||
| 320 | - "bn": "Bengali", | ||
| 321 | - "pl": "Polish", | ||
| 322 | - "nl": "Dutch", | ||
| 323 | - "ro": "Romanian", | ||
| 324 | - "tr": "Turkish", | ||
| 325 | - "km": "Khmer", | ||
| 326 | - "lo": "Lao", | ||
| 327 | - "yue": "Cantonese", | ||
| 328 | - "cs": "Czech", | ||
| 329 | - "el": "Greek", | ||
| 330 | - "sv": "Swedish", | ||
| 331 | - "hu": "Hungarian", | ||
| 332 | - "da": "Danish", | ||
| 333 | - "fi": "Finnish", | ||
| 334 | - "uk": "Ukrainian", | ||
| 335 | - "bg": "Bulgarian", | ||
| 336 | - } | ||
| 337 | - | ||
| 338 | - # Convert target language | ||
| 339 | - target_lang_normalized = target_lang.lower() | ||
| 340 | - target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize()) | ||
| 341 | - | ||
| 342 | - # Convert source language | ||
| 343 | - source_lang_normalized = (source_lang or "").strip().lower() | ||
| 344 | - if not source_lang_normalized or source_lang_normalized == "auto": | ||
| 345 | - source_lang_qwen = "auto" | ||
| 346 | - else: | ||
| 347 | - source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize()) | ||
| 348 | - | ||
| 349 | - # Prepare translation options | ||
| 350 | - translation_options = { | ||
| 351 | - "source_lang": source_lang_qwen, | ||
| 352 | - "target_lang": target_lang_qwen, | ||
| 353 | - } | ||
| 354 | - | ||
| 355 | - # Prepare messages | ||
| 356 | - messages = [ | ||
| 357 | - { | ||
| 358 | - "role": "user", | ||
| 359 | - "content": text | ||
| 360 | - } | ||
| 361 | - ] | ||
| 362 | - | ||
| 363 | - start_time = time.time() | ||
| 364 | - try: | ||
| 365 | - completion = self.qwen_client.chat.completions.create( | ||
| 366 | - model=self.QWEN_MODEL, | ||
| 367 | - messages=messages, | ||
| 368 | - extra_body={ | ||
| 369 | - "translation_options": translation_options | ||
| 370 | - } | ||
| 371 | - ) | ||
| 372 | - | ||
| 373 | - translated_text = completion.choices[0].message.content.strip() | ||
| 374 | - duration_ms = (time.time() - start_time) * 1000 | ||
| 375 | - | ||
| 376 | - logger.info( | ||
| 377 | - f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | " | ||
| 378 | - f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms" | ||
| 379 | - ) | ||
| 380 | - return translated_text | ||
| 381 | - | ||
| 382 | - except Exception as e: | ||
| 383 | - duration_ms = (time.time() - start_time) * 1000 | ||
| 384 | - logger.error( | ||
| 385 | - f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | " | ||
| 386 | - f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True | ||
| 387 | - ) | ||
| 388 | - return None | ||
| 389 | - | ||
| 390 | - def _translate_deepl( | ||
| 391 | - self, | ||
| 392 | - text: str, | ||
| 393 | - target_lang: str, | ||
| 394 | - source_lang: Optional[str], | ||
| 395 | - context: Optional[str] = None, | ||
| 396 | - prompt: Optional[str] = None | ||
| 397 | - ) -> Optional[str]: | ||
| 398 | - """ | ||
| 399 | - Translate using DeepL API with context and glossary support. | ||
| 400 | - | ||
| 401 | - Args: | ||
| 402 | - text: Text to translate | ||
| 403 | - target_lang: Target language code | ||
| 404 | - source_lang: Source language code (optional) | ||
| 405 | - context: Context hint for translation (e.g., "e-commerce product search") | ||
| 406 | - """ | ||
| 407 | - # Map to DeepL language codes | ||
| 408 | - target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper()) | ||
| 409 | - | ||
| 410 | - headers = { | ||
| 411 | - "Authorization": f"DeepL-Auth-Key {self.api_key}", | ||
| 412 | - "Content-Type": "application/json", | ||
| 413 | - } | ||
| 414 | - | ||
| 415 | - # Use prompt as context parameter for DeepL API (not as text prefix) | ||
| 416 | - # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself" | ||
| 417 | - # If prompt is provided, use it as context; otherwise use the default context | ||
| 418 | - api_context = prompt if prompt else context | ||
| 419 | - | ||
| 420 | - # For e-commerce, add context words to help DeepL understand the domain | ||
| 421 | - # This is especially important for single-word ambiguous terms like "车" (car vs rook) | ||
| 422 | - text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context) | ||
| 423 | - | ||
| 424 | - payload = { | ||
| 425 | - "text": [text_to_translate], | ||
| 426 | - "target_lang": target_code, | ||
| 427 | - } | ||
| 428 | - | ||
| 429 | - if source_lang: | ||
| 430 | - source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) | ||
| 431 | - payload["source_lang"] = source_code | ||
| 432 | - | ||
| 433 | - # Add context parameter (prompt or default context) | ||
| 434 | - # Context influences translation but is not translated itself | ||
| 435 | - if api_context: | ||
| 436 | - payload["context"] = api_context | ||
| 437 | - | ||
| 438 | - # Add glossary if configured | ||
| 439 | - if self.glossary_id: | ||
| 440 | - payload["glossary_id"] = self.glossary_id | ||
| 441 | - | ||
| 442 | - # Note: DeepL API v2 supports "context" parameter for additional context | ||
| 443 | - # that influences translation but is not translated itself. | ||
| 444 | - # We use prompt as context parameter when provided. | ||
| 445 | - | ||
| 446 | - try: | ||
| 447 | - response = requests.post( | ||
| 448 | - self.DEEPL_API_URL, | ||
| 449 | - headers=headers, | ||
| 450 | - json=payload, | ||
| 451 | - timeout=self.timeout | ||
| 452 | - ) | ||
| 453 | - | ||
| 454 | - if response.status_code == 200: | ||
| 455 | - data = response.json() | ||
| 456 | - if "translations" in data and len(data["translations"]) > 0: | ||
| 457 | - translated_text = data["translations"][0]["text"] | ||
| 458 | - # If we added context, extract just the term from the result | ||
| 459 | - if needs_extraction: | ||
| 460 | - translated_text = self._extract_term_from_translation( | ||
| 461 | - translated_text, text, target_code | ||
| 462 | - ) | ||
| 463 | - logger.debug( | ||
| 464 | - f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | " | ||
| 465 | - f"Translation result: '{translated_text}'" | ||
| 466 | - ) | ||
| 467 | - return translated_text | ||
| 468 | - else: | ||
| 469 | - logger.error( | ||
| 470 | - f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | " | ||
| 471 | - f"Status code: {response.status_code} | Error message: {response.text}" | ||
| 472 | - ) | ||
| 473 | - return None | ||
| 474 | - | ||
| 475 | - except requests.Timeout: | ||
| 476 | - logger.warning( | ||
| 477 | - f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | " | ||
| 478 | - f"Timeout: {self.timeout}s" | ||
| 479 | - ) | ||
| 480 | - return None | ||
| 481 | - except Exception as e: | ||
| 482 | - logger.error( | ||
| 483 | - f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | " | ||
| 484 | - f"Error: {e}", exc_info=True | ||
| 485 | - ) | ||
| 486 | - return None | ||
| 487 | - | ||
| 488 | - # NOTE: _translate_deepl_free is intentionally not implemented. | ||
| 489 | - # We do not support automatic fallback to the free endpoint, to avoid | ||
| 490 | - # mixing Pro keys with https://api-free.deepl.com and related 403 errors. | ||
| 491 | - | ||
| 492 | - def translate_multi( | ||
| 493 | - self, | ||
| 494 | - text: str, | ||
| 495 | - target_langs: List[str], | ||
| 496 | - source_lang: Optional[str] = None, | ||
| 497 | - context: Optional[str] = None, | ||
| 498 | - async_mode: bool = True, | ||
| 499 | - prompt: Optional[str] = None | ||
| 500 | - ) -> Dict[str, Optional[str]]: | ||
| 501 | - """ | ||
| 502 | - Translate text to multiple target languages. | ||
| 503 | - | ||
| 504 | - In async_mode=True (default): | ||
| 505 | - - Returns cached translations immediately if available | ||
| 506 | - - For translations that can be optimized (e.g., pure numbers, already in target language), | ||
| 507 | - returns result immediately via synchronous call | ||
| 508 | - - Launches async tasks for other missing translations (non-blocking) | ||
| 509 | - - Returns None for missing translations that require async processing | ||
| 510 | - | ||
| 511 | - In async_mode=False: | ||
| 512 | - - Waits for all translations to complete (blocking) | ||
| 513 | - | ||
| 514 | - Args: | ||
| 515 | - text: Text to translate | ||
| 516 | - target_langs: List of target language codes | ||
| 517 | - source_lang: Source language code (optional) | ||
| 518 | - context: Context hint for translation (optional) | ||
| 519 | - async_mode: If True, return cached results immediately and translate missing ones async | ||
| 520 | - prompt: Translation prompt/instruction (optional) | ||
| 521 | - | ||
| 522 | - Returns: | ||
| 523 | - Dictionary mapping language code to translated text (only cached results in async mode) | ||
| 524 | - """ | ||
| 525 | - results = {} | ||
| 526 | - missing_langs = [] | ||
| 527 | - async_langs = [] | ||
| 528 | - | ||
| 529 | - # First, get cached translations | ||
| 530 | - for lang in target_langs: | ||
| 531 | - cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | ||
| 532 | - if cached is not None: | ||
| 533 | - results[lang] = cached | ||
| 534 | - else: | ||
| 535 | - missing_langs.append(lang) | ||
| 536 | - | ||
| 537 | - # If async mode and there are missing translations | ||
| 538 | - if async_mode and missing_langs: | ||
| 539 | - # Check if translation can be optimized (immediate return) | ||
| 540 | - for lang in missing_langs: | ||
| 541 | - target_lang = lang.lower() | ||
| 542 | - # Check optimization conditions (same as in translate method) | ||
| 543 | - can_optimize = False | ||
| 544 | - if target_lang == 'en' and self._is_english_text(text): | ||
| 545 | - can_optimize = True | ||
| 546 | - elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | ||
| 547 | - can_optimize = True | ||
| 548 | - | ||
| 549 | - if can_optimize: | ||
| 550 | - # Can be optimized, call translate synchronously for immediate result | ||
| 551 | - results[lang] = self.translate(text, lang, source_lang, context, prompt) | ||
| 552 | - else: | ||
| 553 | - # Requires actual translation, add to async list | ||
| 554 | - async_langs.append(lang) | ||
| 555 | - | ||
| 556 | - # Launch async tasks for translations that require actual API calls | ||
| 557 | - if async_langs: | ||
| 558 | - for lang in async_langs: | ||
| 559 | - self._translate_async(text, lang, source_lang, context, prompt) | ||
| 560 | - # Return None for async translations | ||
| 561 | - for lang in async_langs: | ||
| 562 | - results[lang] = None | ||
| 563 | - else: | ||
| 564 | - # Synchronous mode: wait for all translations | ||
| 565 | - for lang in missing_langs: | ||
| 566 | - results[lang] = self.translate(text, lang, source_lang, context, prompt) | ||
| 567 | - | ||
| 568 | - return results | ||
| 569 | - | ||
| 570 | - def translate_multi_async( | ||
| 571 | - self, | ||
| 572 | - text: str, | ||
| 573 | - target_langs: List[str], | ||
| 574 | - source_lang: Optional[str] = None, | ||
| 575 | - context: Optional[str] = None, | ||
| 576 | - prompt: Optional[str] = None | ||
| 577 | - ) -> Dict[str, Union[str, Future]]: | ||
| 578 | - """ | ||
| 579 | - Translate text to multiple target languages asynchronously, returning Futures that can be awaited. | ||
| 580 | - | ||
| 581 | - This method returns a dictionary where: | ||
| 582 | - - If translation is cached, the value is the translation string (immediate) | ||
| 583 | - - If translation needs to be done, the value is a Future object that can be awaited | ||
| 584 | - | ||
| 585 | - Args: | ||
| 586 | - text: Text to translate | ||
| 587 | - target_langs: List of target language codes | ||
| 588 | - source_lang: Source language code (optional) | ||
| 589 | - context: Context hint for translation (optional) | ||
| 590 | - prompt: Translation prompt/instruction (optional) | ||
| 591 | - | ||
| 592 | - Returns: | ||
| 593 | - Dictionary mapping language code to either translation string (cached) or Future object | ||
| 594 | - """ | ||
| 595 | - results = {} | ||
| 596 | - missing_langs = [] | ||
| 597 | - | ||
| 598 | - # First, get cached translations | ||
| 599 | - for lang in target_langs: | ||
| 600 | - cached = self._get_cached_translation(text, lang, source_lang, context, prompt) | ||
| 601 | - if cached is not None: | ||
| 602 | - results[lang] = cached | ||
| 603 | - else: | ||
| 604 | - missing_langs.append(lang) | ||
| 605 | - | ||
| 606 | - # For missing translations, submit async tasks and return Futures | ||
| 607 | - for lang in missing_langs: | ||
| 608 | - future = self.executor.submit( | ||
| 609 | - self.translate, | ||
| 610 | - text, | ||
| 611 | - lang, | ||
| 612 | - source_lang, | ||
| 613 | - context, | ||
| 614 | - prompt | ||
| 615 | - ) | ||
| 616 | - results[lang] = future | ||
| 617 | - | ||
| 618 | - return results | ||
| 619 | - | ||
| 620 | - def _get_cached_translation( | ||
| 621 | - self, | ||
| 622 | - text: str, | ||
| 623 | - target_lang: str, | ||
| 624 | - source_lang: Optional[str] = None, | ||
| 625 | - context: Optional[str] = None, | ||
| 626 | - prompt: Optional[str] = None | ||
| 627 | - ) -> Optional[str]: | ||
| 628 | - """Get translation from cache if available.""" | ||
| 629 | - if not self.redis_client: | ||
| 630 | - return None | ||
| 631 | - return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | ||
| 632 | - | ||
| 633 | - def _get_cached_translation_redis( | ||
| 634 | - self, | ||
| 635 | - text: str, | ||
| 636 | - target_lang: str, | ||
| 637 | - source_lang: Optional[str] = None, | ||
| 638 | - context: Optional[str] = None, | ||
| 639 | - prompt: Optional[str] = None | ||
| 640 | - ) -> Optional[str]: | ||
| 641 | - """ | ||
| 642 | - Get translation from Redis cache with sliding expiration. | ||
| 643 | - | ||
| 644 | - 滑动过期机制:每次访问缓存时,重置过期时间为配置的过期时间(默认720天)。 | ||
| 645 | - 这样缓存会在最后一次访问后的720天才过期,而不是写入后的720天。 | ||
| 646 | - 这确保了常用的翻译缓存不会被过早删除。 | ||
| 647 | - """ | ||
| 648 | - if not self.redis_client: | ||
| 649 | - return None | ||
| 650 | - | ||
| 651 | - try: | ||
| 652 | - # Build cache key: prefix:target_lang:text | ||
| 653 | - # For simplicity, we use target_lang and text as key | ||
| 654 | - # Context and prompt are not included in key to maximize cache hits | ||
| 655 | - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" | ||
| 656 | - value = self.redis_client.get(cache_key) | ||
| 657 | - if value: | ||
| 658 | - # Sliding expiration: reset expiration time on access | ||
| 659 | - # 每次读取缓存时,重置过期时间为配置的过期时间(最后一次访问后的N天才过期) | ||
| 660 | - try: | ||
| 661 | - self.redis_client.expire(cache_key, self.expire_seconds) | ||
| 662 | - except Exception as expire_error: | ||
| 663 | - # 即使 expire 失败,也返回缓存值(不影响功能) | ||
| 664 | - logger.warning( | ||
| 665 | - f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}" | ||
| 666 | - ) | ||
| 667 | - | ||
| 668 | - logger.debug( | ||
| 669 | - f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | " | ||
| 670 | - f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s" | ||
| 671 | - ) | ||
| 672 | - return value | ||
| 673 | - logger.debug( | ||
| 674 | - f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | " | ||
| 675 | - f"Cache key: {cache_key}" | ||
| 676 | - ) | ||
| 677 | - return None | ||
| 678 | - except Exception as e: | ||
| 679 | - logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}") | ||
| 680 | - return None | ||
| 681 | - | ||
| 682 | - def _set_cached_translation_redis( | ||
| 683 | - self, | ||
| 684 | - text: str, | ||
| 685 | - target_lang: str, | ||
| 686 | - translation: str, | ||
| 687 | - source_lang: Optional[str] = None, | ||
| 688 | - context: Optional[str] = None, | ||
| 689 | - prompt: Optional[str] = None | ||
| 690 | - ) -> None: | ||
| 691 | - """Store translation in Redis cache.""" | ||
| 692 | - if not self.redis_client: | ||
| 693 | - return | ||
| 694 | - | ||
| 695 | - try: | ||
| 696 | - cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" | ||
| 697 | - self.redis_client.setex(cache_key, self.expire_seconds, translation) | ||
| 698 | - logger.info( | ||
| 699 | - f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | " | ||
| 700 | - f"Cache key: {cache_key} | Translation result: '{translation}'" | ||
| 701 | - ) | ||
| 702 | - except Exception as e: | ||
| 703 | - logger.error( | ||
| 704 | - f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | " | ||
| 705 | - f"Error: {e}" | ||
| 706 | - ) | ||
| 707 | - | ||
| 708 | - def _translate_async( | ||
| 709 | - self, | ||
| 710 | - text: str, | ||
| 711 | - target_lang: str, | ||
| 712 | - source_lang: Optional[str] = None, | ||
| 713 | - context: Optional[str] = None, | ||
| 714 | - prompt: Optional[str] = None | ||
| 715 | - ): | ||
| 716 | - """Launch async translation task.""" | ||
| 717 | - def _do_translate(): | ||
| 718 | - try: | ||
| 719 | - result = self.translate(text, target_lang, source_lang, context, prompt) | ||
| 720 | - if result: | ||
| 721 | - logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}") | ||
| 722 | - except Exception as e: | ||
| 723 | - logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}") | ||
| 724 | - | ||
| 725 | - self.executor.submit(_do_translate) | ||
| 726 | - | ||
| 727 | - def _add_ecommerce_context( | ||
| 728 | - self, | ||
| 729 | - text: str, | ||
| 730 | - source_lang: Optional[str], | ||
| 731 | - context: Optional[str] | ||
| 732 | - ) -> tuple: | ||
| 733 | - """ | ||
| 734 | - Add e-commerce context to text for better disambiguation. | ||
| 735 | - | ||
| 736 | - For single-word ambiguous Chinese terms, we add context words that help | ||
| 737 | - DeepL understand this is an e-commerce/product search context. | ||
| 738 | - | ||
| 739 | - Args: | ||
| 740 | - text: Original text to translate | ||
| 741 | - source_lang: Source language code | ||
| 742 | - context: Context hint | ||
| 743 | - | ||
| 744 | - Returns: | ||
| 745 | - Tuple of (text_with_context, needs_extraction) | ||
| 746 | - - text_with_context: Text to send to DeepL | ||
| 747 | - - needs_extraction: Whether we need to extract the term from the result | ||
| 748 | - """ | ||
| 749 | - # Only apply for e-commerce context and Chinese source | ||
| 750 | - if not context or "e-commerce" not in context.lower(): | ||
| 751 | - return text, False | ||
| 752 | - | ||
| 753 | - if not source_lang or source_lang.lower() != 'zh': | ||
| 754 | - return text, False | ||
| 755 | - | ||
| 756 | - # For single-word queries, add context to help disambiguation | ||
| 757 | - text_stripped = text.strip() | ||
| 758 | - if len(text_stripped.split()) == 1 and len(text_stripped) <= 2: | ||
| 759 | - # Common ambiguous Chinese e-commerce terms like "车" (car vs rook) | ||
| 760 | - # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term]) | ||
| 761 | - # This helps DeepL understand the e-commerce context | ||
| 762 | - # We'll need to extract just the term from the translation result | ||
| 763 | - context_phrase = f"购买 {text_stripped}" | ||
| 764 | - return context_phrase, True | ||
| 765 | - | ||
| 766 | - # For multi-word queries, DeepL usually has enough context | ||
| 767 | - return text, False | ||
| 768 | - | ||
| 769 | - def _extract_term_from_translation( | ||
| 770 | - self, | ||
| 771 | - translated_text: str, | ||
| 772 | - original_text: str, | ||
| 773 | - target_lang_code: str | ||
| 774 | - ) -> str: | ||
| 775 | - """ | ||
| 776 | - Extract the actual term from a translation that included context. | ||
| 777 | - | ||
| 778 | - For example, if we translated "购买 车" (buy car) and got "buy car", | ||
| 779 | - we want to extract just "car". | ||
| 780 | - | ||
| 781 | - Args: | ||
| 782 | - translated_text: Full translation result | ||
| 783 | - original_text: Original single-word query | ||
| 784 | - target_lang_code: Target language code (EN, ZH, etc.) | ||
| 785 | - | ||
| 786 | - Returns: | ||
| 787 | - Extracted term or original translation if extraction fails | ||
| 788 | - """ | ||
| 789 | - # For English target, try to extract the last word (the actual term) | ||
| 790 | - if target_lang_code == "EN": | ||
| 791 | - words = translated_text.strip().split() | ||
| 792 | - if len(words) > 1: | ||
| 793 | - # Usually the last word is the term we want | ||
| 794 | - # But we need to be smart - if it's "buy car", we want "car" | ||
| 795 | - # Common context words to skip: buy, purchase, product, item, etc. | ||
| 796 | - context_words = {"buy", "purchase", "product", "item", "commodity", "goods"} | ||
| 797 | - # Try to find the term (not a context word) | ||
| 798 | - for word in reversed(words): | ||
| 799 | - word_lower = word.lower().rstrip('.,!?;:') | ||
| 800 | - if word_lower not in context_words: | ||
| 801 | - return word_lower | ||
| 802 | - # If all words are context words, return the last one | ||
| 803 | - return words[-1].lower().rstrip('.,!?;:') | ||
| 804 | - | ||
| 805 | - # For other languages or if extraction fails, return as-is | ||
| 806 | - # The user can configure a glossary for better results | ||
| 807 | - return translated_text | ||
| 808 | - | ||
| 809 | - def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool: | ||
| 810 | - """True if shop language matches index language (use source, no translate).""" | ||
| 811 | - if not shop_lang_lower or not lang_code: | ||
| 812 | - return False | ||
| 813 | - if shop_lang_lower == lang_code: | ||
| 814 | - return True | ||
| 815 | - if lang_code == "zh" and "zh" in shop_lang_lower: | ||
| 816 | - return True | ||
| 817 | - if lang_code == "en" and "en" in shop_lang_lower: | ||
| 818 | - return True | ||
| 819 | - return False | ||
| 820 | - | ||
| 821 | - def translate_for_indexing( | ||
| 822 | - self, | ||
| 823 | - text: str, | ||
| 824 | - shop_language: str, | ||
| 825 | - source_lang: Optional[str] = None, | ||
| 826 | - context: Optional[str] = None, | ||
| 827 | - prompt: Optional[str] = None, | ||
| 828 | - index_languages: Optional[List[str]] = None, | ||
| 829 | - ) -> Dict[str, Optional[str]]: | ||
| 830 | - """ | ||
| 831 | - Translate text for indexing based on shop language and tenant index_languages. | ||
| 832 | - | ||
| 833 | - For each language in index_languages: use source text if shop language matches, | ||
| 834 | - otherwise translate to that language. | ||
| 835 | - | ||
| 836 | - Args: | ||
| 837 | - text: Text to translate | ||
| 838 | - shop_language: Shop primary language (e.g. 'zh', 'en', 'ru') | ||
| 839 | - source_lang: Source language code (optional) | ||
| 840 | - context: Additional context for translation (optional) | ||
| 841 | - prompt: Translation prompt (optional) | ||
| 842 | - index_languages: Languages to index (from tenant_config). Default ["en", "zh"]. | ||
| 843 | - | ||
| 844 | - Returns: | ||
| 845 | - Dict keyed by each index_language with translated or source text (or None). | ||
| 846 | - """ | ||
| 847 | - langs = index_languages if index_languages else ["en", "zh"] | ||
| 848 | - results = {lang: None for lang in langs} | ||
| 849 | - if not text or not text.strip(): | ||
| 850 | - return results | ||
| 851 | - if re.match(r'^[\d\s_-]+$', text): | ||
| 852 | - logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'") | ||
| 853 | - return results | ||
| 854 | - | ||
| 855 | - shop_lang_lower = (shop_language or "").strip().lower() | ||
| 856 | - targets = [] | ||
| 857 | - for lang in langs: | ||
| 858 | - if self._shop_lang_matches(shop_lang_lower, lang): | ||
| 859 | - results[lang] = text | ||
| 860 | - else: | ||
| 861 | - targets.append(lang) | ||
| 862 | - | ||
| 863 | - for target_lang in targets: | ||
| 864 | - cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) | ||
| 865 | - if cached: | ||
| 866 | - results[target_lang] = cached | ||
| 867 | - logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}") | ||
| 868 | - continue | ||
| 869 | - translated = self.translate( | ||
| 870 | - text, | ||
| 871 | - target_lang=target_lang, | ||
| 872 | - source_lang=source_lang or shop_language, | ||
| 873 | - context=context, | ||
| 874 | - prompt=prompt, | ||
| 875 | - ) | ||
| 876 | - results[target_lang] = translated | ||
| 877 | - return results | ||
| 878 | - | ||
| 879 | - def get_translation_needs( | ||
| 880 | - self, | ||
| 881 | - detected_lang: str, | ||
| 882 | - supported_langs: List[str] | ||
| 883 | - ) -> List[str]: | ||
| 884 | - """ | ||
| 885 | - Determine which languages need translation. | ||
| 886 | - | ||
| 887 | - Args: | ||
| 888 | - detected_lang: Detected query language | ||
| 889 | - supported_langs: List of supported languages | ||
| 890 | - | ||
| 891 | - Returns: | ||
| 892 | - List of language codes to translate to | ||
| 893 | - """ | ||
| 894 | - # If detected language is in supported list, translate to others | ||
| 895 | - if detected_lang in supported_langs: | ||
| 896 | - return [lang for lang in supported_langs if detected_lang != lang] | ||
| 897 | - | ||
| 898 | - # Otherwise, translate to all supported languages | ||
| 899 | - return supported_langs | ||
| 900 | - | ||
| 901 | - def _is_english_text(self, text: str) -> bool: | ||
| 902 | - """ | ||
| 903 | - Check if text is primarily English (ASCII letters, numbers, common punctuation). | ||
| 904 | - | ||
| 905 | - Args: | ||
| 906 | - text: Text to check | ||
| 907 | - | ||
| 908 | - Returns: | ||
| 909 | - True if text appears to be English | ||
| 910 | - """ | ||
| 911 | - if not text or not text.strip(): | ||
| 912 | - return True | ||
| 913 | - | ||
| 914 | - # Remove whitespace and common punctuation | ||
| 915 | - text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) | ||
| 916 | - if not text_clean: | ||
| 917 | - return True | ||
| 918 | - | ||
| 919 | - # Check if all remaining characters are ASCII (letters, numbers) | ||
| 920 | - # This is a simple heuristic: if most characters are ASCII, it's likely English | ||
| 921 | - ascii_count = sum(1 for c in text_clean if ord(c) < 128) | ||
| 922 | - ratio = ascii_count / len(text_clean) if text_clean else 0 | ||
| 923 | - | ||
| 924 | - # If more than 80% are ASCII characters, consider it English | ||
| 925 | - return ratio > 0.8 | ||
| 926 | - | ||
| 927 | - def _contains_chinese(self, text: str) -> bool: | ||
| 928 | - """ | ||
| 929 | - Check if text contains Chinese characters (Han characters). | ||
| 930 | - | ||
| 931 | - Args: | ||
| 932 | - text: Text to check | ||
| 933 | - | ||
| 934 | - Returns: | ||
| 935 | - True if text contains Chinese characters | ||
| 936 | - """ | ||
| 937 | - if not text: | ||
| 938 | - return False | ||
| 939 | - | ||
| 940 | - # Check for Chinese characters (Unicode range: \u4e00-\u9fff) | ||
| 941 | - chinese_pattern = re.compile(r'[\u4e00-\u9fff]') | ||
| 942 | - return bool(chinese_pattern.search(text)) | ||
| 943 | - | ||
| 944 | - def _is_pure_number(self, text: str) -> bool: | ||
| 945 | - """ | ||
| 946 | - Check if text is purely numeric (digits, possibly with spaces, dots, commas). | ||
| 947 | - | ||
| 948 | - Args: | ||
| 949 | - text: Text to check | ||
| 950 | - | ||
| 951 | - Returns: | ||
| 952 | - True if text is purely numeric | ||
| 953 | - """ | ||
| 954 | - if not text or not text.strip(): | ||
| 955 | - return False | ||
| 956 | - | ||
| 957 | - # Remove whitespace, dots, commas (common number separators) | ||
| 958 | - text_clean = re.sub(r'[\s\.,]', '', text.strip()) | ||
| 959 | - if not text_clean: | ||
| 960 | - return False | ||
| 961 | - | ||
| 962 | - # Check if all remaining characters are digits | ||
| 963 | - return text_clean.isdigit() |
tests/test_embedding_pipeline.py
| @@ -77,12 +77,10 @@ def _build_test_config() -> SearchConfig: | @@ -77,12 +77,10 @@ def _build_test_config() -> SearchConfig: | ||
| 77 | enable_text_embedding=True, | 77 | enable_text_embedding=True, |
| 78 | enable_query_rewrite=False, | 78 | enable_query_rewrite=False, |
| 79 | rewrite_dictionary={}, | 79 | rewrite_dictionary={}, |
| 80 | - translation_prompts={"query_zh": "e-commerce domain", "query_en": "e-commerce domain"}, | ||
| 81 | text_embedding_field="title_embedding", | 80 | text_embedding_field="title_embedding", |
| 82 | image_embedding_field=None, | 81 | image_embedding_field=None, |
| 83 | ), | 82 | ), |
| 84 | function_score=FunctionScoreConfig(), | 83 | function_score=FunctionScoreConfig(), |
| 85 | - function_score=FunctionScoreConfig(), | ||
| 86 | rerank=RerankConfig(), | 84 | rerank=RerankConfig(), |
| 87 | spu_config=SPUConfig(enabled=True, spu_field="spu_id", inner_hits_size=3), | 85 | spu_config=SPUConfig(enabled=True, spu_field="spu_id", inner_hits_size=3), |
| 88 | es_index_name="test_products", | 86 | es_index_name="test_products", |