From 522a39647a24d001f7da792bc30ba54b4f01f238 Mon Sep 17 00:00:00 2001 From: tangwang Date: Thu, 13 Nov 2025 15:59:22 +0800 Subject: [PATCH] 多语言搜索翻译的优化(deepL添加上下文提示词) --- DEEPL_OPTIMIZATION.md | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ config/config.yaml | 2 ++ config/config_loader.py | 6 +++++- query/query_parser.py | 9 +++++++-- query/translator.py | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------- 5 files changed, 347 insertions(+), 18 deletions(-) create mode 100644 DEEPL_OPTIMIZATION.md diff --git a/DEEPL_OPTIMIZATION.md b/DEEPL_OPTIMIZATION.md new file mode 100644 index 0000000..589e57b --- /dev/null +++ b/DEEPL_OPTIMIZATION.md @@ -0,0 +1,185 @@ +# DeepL 翻译优化指南 + +## 问题描述 + +在电商搜索环境中,DeepL 翻译可能会遇到多义词翻译不准确的问题。例如: +- "车" 被翻译为 "rook"(象棋中的车)而不是 "car"(汽车) + +## 解决方案 + +我们实现了以下优化方案来改善 DeepL 在电商场景下的翻译准确性: + +### 1. 上下文提示(Context Hints) + +系统会自动为单字查询添加电商上下文,帮助 DeepL 理解查询的领域。 + +**工作原理:** +- 对于中文单字查询(如 "车"),系统会自动添加上下文 "购买 车" +- DeepL 会根据上下文将 "车" 翻译为 "car" 而不是 "rook" +- 翻译完成后,系统会自动提取实际的查询词("car") + +**配置:** +在 `config/config.yaml` 中可以设置翻译上下文: + +```yaml +query_config: + translation_context: "e-commerce product search" # 默认值 +``` + +### 2. 术语表(Glossary)支持(推荐方案) + +DeepL 支持使用自定义术语表来确保特定词汇的准确翻译。这是解决多义词问题的最佳方案。 + +#### 创建术语表 + +1. 
**使用 DeepL API 创建术语表:** + +```python +import requests + +# 创建术语表 +api_url = "https://api.deepl.com/v2/glossaries" +headers = { + "Authorization": "DeepL-Auth-Key YOUR_API_KEY", + "Content-Type": "application/json", +} + +# 术语表内容(TSV 格式) +glossary_entries = """车\tcar +手机\tmobile phone +电脑\tcomputer""" + +payload = { + "name": "e-commerce-glossary", + "source_lang": "ZH", + "target_lang": "EN", + "entries": glossary_entries, + "entries_format": "tsv" +} + +response = requests.post(api_url, headers=headers, json=payload) +if response.status_code == 201: + glossary_id = response.json()["glossary_id"] + print(f"术语表创建成功,ID: {glossary_id}") +``` + +2. **或者使用 DeepL 网页界面创建:** + - 登录 DeepL Pro 账户 + - 进入术语表管理页面 + - 创建新的术语表,添加 "车" -> "car" 等映射 + +#### 配置术语表 + +在 `config/config.yaml` 中配置术语表 ID: + +```yaml +query_config: + translation_glossary_id: "your-glossary-id-here" # DeepL 术语表 ID +``` + +#### 术语表格式 + +术语表使用 TSV(Tab-Separated Values)格式,每行一个词条: + +``` +车 car +手机 mobile phone +电脑 computer +``` + +**注意:** +- 术语表功能需要 DeepL Pro 账户(付费版) +- Free API 不支持术语表功能 + +### 3. 自动上下文处理 + +系统会自动检测以下情况并应用优化: + +- **单字中文查询**:自动添加电商上下文 +- **多字查询**:DeepL 通常有足够的上下文,无需特殊处理 +- **非中文查询**:不应用上下文优化 + +## 使用示例 + +### 示例 1:使用上下文提示(自动) + +查询 "车" 时: +1. 系统检测到这是单字中文查询 +2. 自动添加上下文:"购买 车" +3. DeepL 翻译为 "buy car" +4. 系统提取实际查询词:"car" + +### 示例 2:使用术语表(推荐) + +1. 创建术语表,包含 "车" -> "car" 的映射 +2. 在配置中设置 `translation_glossary_id` +3. 查询 "车" 时,DeepL 直接使用术语表翻译为 "car" + +## 最佳实践 + +1. **优先使用术语表**: + - 对于常见的电商术语,创建术语表是最可靠的方案 + - 术语表可以确保翻译的一致性和准确性 + +2. **上下文提示作为补充**: + - 对于未在术语表中的词汇,上下文提示可以提供帮助 + - 系统已默认启用,无需额外配置 + +3. **定期更新术语表**: + - 根据实际使用情况,不断添加新的术语映射 + - 特别是品牌名、产品类别等专业术语 + +## 技术实现细节 + +### 上下文添加逻辑 + +```python +# 对于单字查询(长度 <= 2 个字符) +if len(text.strip().split()) == 1 and len(text.strip()) <= 2: + context_phrase = f"购买 {text}" # 添加 "购买" 前缀 + return context_phrase, True # 需要从结果中提取 +``` + +### 结果提取逻辑 + +翻译结果 "buy car" 会被处理: +1. 识别上下文词(buy, purchase, product 等) +2. 提取非上下文词作为实际查询词 +3. 
返回 "car" + +## 常见问题 + +### Q: 为什么 "车" 会被翻译为 "rook"? + +A: DeepL 在处理单字查询时,缺乏上下文来判断词义。"车" 在中文中既可以指汽车,也可以指象棋中的车。通过添加电商上下文或使用术语表,可以解决这个问题。 + +### Q: 术语表和上下文提示哪个更好? + +A: 术语表是更可靠的方案,因为它直接指定了翻译映射。上下文提示是自动的补充方案,适用于未在术语表中的词汇。 + +### Q: Free API 可以使用术语表吗? + +A: 不可以。术语表功能需要 DeepL Pro(付费版)账户。Free API 只能使用上下文提示优化。 + +### Q: 如何测试翻译效果? + +A: 可以通过搜索 API 测试翻译结果,查看返回的 `translations` 字段: + +```bash +curl -X POST http://localhost:6002/api/search \ + -H "Content-Type: application/json" \ + -d '{"query": "车", "tenant_id": "test"}' +``` + +## 相关文件 + +- `query/translator.py` - 翻译器实现 +- `query/query_parser.py` - 查询解析器(调用翻译器) +- `config/config.yaml` - 配置文件 +- `config/config_loader.py` - 配置加载器 + +## 参考资源 + +- [DeepL API 文档](https://www.deepl.com/docs-api) +- [DeepL 术语表功能](https://www.deepl.com/docs-api/managing-glossaries/) + diff --git a/config/config.yaml b/config/config.yaml index 509a5e5..5b6ef04 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -242,6 +242,8 @@ query_config: # Translation API (DeepL) translation_service: "deepl" translation_api_key: null # Set via environment variable + # translation_glossary_id: null # Optional: DeepL glossary ID for custom terminology (e.g., "车" -> "car") + # translation_context: "e-commerce product search" # Context hint for better translation disambiguation # Ranking Configuration ranking: diff --git a/config/config_loader.py b/config/config_loader.py index 27ae6c2..aea676e 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -51,6 +51,8 @@ class QueryConfig: # Translation API settings translation_api_key: Optional[str] = None translation_service: str = "deepl" # deepl, google, etc. 
+ translation_glossary_id: Optional[str] = None # DeepL glossary ID for custom terminology + translation_context: str = "e-commerce product search" # Context hint for translation # ES source fields configuration - fields to return in search results source_fields: List[str] = field(default_factory=lambda: [ @@ -209,7 +211,9 @@ class ConfigLoader: enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), rewrite_dictionary=rewrite_dictionary, translation_api_key=query_config_data.get("translation_api_key"), - translation_service=query_config_data.get("translation_service", "deepl") + translation_service=query_config_data.get("translation_service", "deepl"), + translation_glossary_id=query_config_data.get("translation_glossary_id"), + translation_context=query_config_data.get("translation_context", "e-commerce product search") ) # Parse ranking config diff --git a/query/query_parser.py b/query/query_parser.py index a31b4c5..bf4de44 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -98,7 +98,9 @@ class QueryParser: print("[QueryParser] Initializing translator...") self._translator = Translator( api_key=self.query_config.translation_api_key, - use_cache=True + use_cache=True, + glossary_id=getattr(self.query_config, 'translation_glossary_id', None), + translation_context=getattr(self.query_config, 'translation_context', 'e-commerce product search') ) return self._translator @@ -195,10 +197,13 @@ class QueryParser: if target_langs: log_info(f"开始翻译 | 源语言: {detected_lang} | 目标语言: {target_langs}") + # Use e-commerce context for better disambiguation + translation_context = getattr(self.query_config, 'translation_context', 'e-commerce product search') translations = self.translator.translate_multi( query_text, target_langs, - source_lang=detected_lang + source_lang=detected_lang, + context=translation_context ) log_info(f"翻译完成 | 结果: {translations}") if context: diff --git a/query/translator.py b/query/translator.py index 1cd203c..dd9117b 
100644 --- a/query/translator.py +++ b/query/translator.py @@ -32,7 +32,9 @@ class Translator: self, api_key: Optional[str] = None, use_cache: bool = True, - timeout: int = 10 + timeout: int = 10, + glossary_id: Optional[str] = None, + translation_context: Optional[str] = None ): """ Initialize translator. @@ -41,6 +43,8 @@ class Translator: api_key: DeepL API key (or None to use from config/env) use_cache: Whether to cache translations timeout: Request timeout in seconds + glossary_id: DeepL glossary ID for custom terminology (optional) + translation_context: Context hint for translation (e.g., "e-commerce", "product search") """ # Get API key from config if not provided if api_key is None: @@ -53,6 +57,8 @@ class Translator: self.api_key = api_key self.timeout = timeout self.use_cache = use_cache + self.glossary_id = glossary_id + self.translation_context = translation_context or "e-commerce product search" if use_cache: self.cache = DictCache(".cache/translations.json") @@ -63,7 +69,8 @@ class Translator: self, text: str, target_lang: str, - source_lang: Optional[str] = None + source_lang: Optional[str] = None, + context: Optional[str] = None ) -> Optional[str]: """ Translate text to target language. @@ -72,6 +79,7 @@ class Translator: text: Text to translate target_lang: Target language code ('zh', 'en', 'ru', etc.) 
source_lang: Source language code (optional, auto-detect if None) + context: Additional context for translation (overrides default context) Returns: Translated text or None if translation fails @@ -84,9 +92,12 @@ class Translator: if source_lang: source_lang = source_lang.lower() - # Check cache + # Use provided context or default context + translation_context = context or self.translation_context + + # Check cache (include context in cache key for accuracy) if self.use_cache: - cache_key = f"{source_lang or 'auto'}:{target_lang}:{text}" + cache_key = f"{source_lang or 'auto'}:{target_lang}:{translation_context}:{text}" cached = self.cache.get(cache_key, category="translations") if cached: return cached @@ -97,12 +108,12 @@ class Translator: return text # Translate using DeepL with fallback - result = self._translate_deepl(text, target_lang, source_lang) + result = self._translate_deepl(text, target_lang, source_lang, translation_context) # If translation failed, try fallback to free API if result is None and "api.deepl.com" in self.DEEPL_API_URL: print(f"[Translator] Pro API failed, trying free API...") - result = self._translate_deepl_free(text, target_lang, source_lang) + result = self._translate_deepl_free(text, target_lang, source_lang, translation_context) # If still failed, return original text with warning if result is None: @@ -111,7 +122,7 @@ class Translator: # Cache result if result and self.use_cache: - cache_key = f"{source_lang or 'auto'}:{target_lang}:{text}" + cache_key = f"{source_lang or 'auto'}:{target_lang}:{translation_context}:{text}" self.cache.set(cache_key, result, category="translations") return result @@ -120,9 +131,18 @@ class Translator: self, text: str, target_lang: str, - source_lang: Optional[str] + source_lang: Optional[str], + context: Optional[str] = None ) -> Optional[str]: - """Translate using DeepL API.""" + """ + Translate using DeepL API with context and glossary support. 
+ + Args: + text: Text to translate + target_lang: Target language code + source_lang: Source language code (optional) + context: Context hint for translation (e.g., "e-commerce product search") + """ # Map to DeepL language codes target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper()) @@ -131,8 +151,13 @@ class Translator: "Content-Type": "application/json", } + # Build text with context for better disambiguation + # For e-commerce, add context words to help DeepL understand the domain + # This is especially important for single-word ambiguous terms like "车" (car vs rook) + text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, context) + payload = { - "text": [text], + "text": [text_to_translate], "target_lang": target_code, } @@ -140,6 +165,16 @@ class Translator: source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) payload["source_lang"] = source_code + # Add glossary if configured + if self.glossary_id: + payload["glossary_id"] = self.glossary_id + + # Note: this request does not send DeepL's optional "context" request field; + # instead, translation quality is improved by: + # 1. Using glossary for domain-specific terms (best solution) + # 2. Adding context words to the text (for single-word queries) - implemented in _add_ecommerce_context + # 3. 
Using more specific source language detection + try: response = requests.post( self.DEEPL_API_URL, @@ -151,7 +186,13 @@ class Translator: if response.status_code == 200: data = response.json() if "translations" in data and len(data["translations"]) > 0: - return data["translations"][0]["text"] + translated_text = data["translations"][0]["text"] + # If we added context, extract just the term from the result + if needs_extraction: + translated_text = self._extract_term_from_translation( + translated_text, text, target_code + ) + return translated_text else: print(f"[Translator] DeepL API error: {response.status_code} - {response.text}") return None @@ -167,9 +208,14 @@ class Translator: self, text: str, target_lang: str, - source_lang: Optional[str] + source_lang: Optional[str], + context: Optional[str] = None ) -> Optional[str]: - """Translate using DeepL Free API.""" + """ + Translate using DeepL Free API. + + Note: Free API may not support glossary_id parameter. + """ # Map to DeepL language codes target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper()) @@ -187,6 +233,9 @@ class Translator: source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) payload["source_lang"] = source_code + # Note: Free API typically doesn't support glossary_id + # NOTE(review): `context` is accepted by this method but is not applied to the text yet + try: response = requests.post( "https://api-free.deepl.com/v2/translate", @@ -214,7 +263,8 @@ class Translator: self, text: str, target_langs: List[str], - source_lang: Optional[str] = None + source_lang: Optional[str] = None, + context: Optional[str] = None ) -> Dict[str, Optional[str]]: """ Translate text to multiple target languages. 
@@ -223,15 +273,98 @@ class Translator: text: Text to translate target_langs: List of target language codes source_lang: Source language code (optional) + context: Context hint for translation (optional) Returns: Dictionary mapping language code to translated text """ results = {} for lang in target_langs: - results[lang] = self.translate(text, lang, source_lang) + results[lang] = self.translate(text, lang, source_lang, context) return results + def _add_ecommerce_context( + self, + text: str, + source_lang: Optional[str], + context: Optional[str] + ) -> tuple: + """ + Add e-commerce context to text for better disambiguation. + + For single-word ambiguous Chinese terms, we add context words that help + DeepL understand this is an e-commerce/product search context. + + Args: + text: Original text to translate + source_lang: Source language code + context: Context hint + + Returns: + Tuple of (text_with_context, needs_extraction) + - text_with_context: Text to send to DeepL + - needs_extraction: Whether we need to extract the term from the result + """ + # Only apply for e-commerce context and Chinese source + if not context or "e-commerce" not in context.lower(): + return text, False + + if not source_lang or source_lang.lower() != 'zh': + return text, False + + # For single-word queries, add context to help disambiguation + text_stripped = text.strip() + if len(text_stripped.split()) == 1 and len(text_stripped) <= 2: + # Common ambiguous Chinese e-commerce terms like "车" (car vs rook) + # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term]) + # This helps DeepL understand the e-commerce context + # We'll need to extract just the term from the translation result + context_phrase = f"购买 {text_stripped}" + return context_phrase, True + + # For multi-word queries, DeepL usually has enough context + return text, False + + def _extract_term_from_translation( + self, + translated_text: str, + original_text: str, + target_lang_code: str + ) -> 
str: + """ + Extract the actual term from a translation that included context. + + For example, if we translated "购买 车" (buy car) and got "buy car", + we want to extract just "car". + + Args: + translated_text: Full translation result + original_text: Original single-word query + target_lang_code: Target language code (EN, ZH, etc.) + + Returns: + Extracted term or original translation if extraction fails + """ + # For English target, try to extract the last word (the actual term) + if target_lang_code == "EN": + words = translated_text.strip().split() + if len(words) > 1: + # Usually the last word is the term we want + # But we need to be smart - if it's "buy car", we want "car" + # Common context words to skip: buy, purchase, product, item, etc. + context_words = {"buy", "purchase", "product", "item", "commodity", "goods"} + # Try to find the term (not a context word) + for word in reversed(words): + word_lower = word.lower().rstrip('.,!?;:') + if word_lower not in context_words: + return word_lower + # If all words are context words, return the last one + return words[-1].lower().rstrip('.,!?;:') + + # For other languages or if extraction fails, return as-is + # The user can configure a glossary for better results + return translated_text + def get_translation_needs( self, detected_lang: str, -- libgit2 0.21.2