diff --git a/README.md b/README.md index 1eef882..67cbb92 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ - # TODO +**多语言索引**:已改为可配置的 `index_languages`(默认为 `["en", "zh"]`),商家可勾选主市场语言。支持语言见 `config.tenant_config_loader.SUPPORTED_INDEX_LANGUAGES`(含 en, zh, zh_tw, ru, ja, ko, es, fr, pt, de, it, th, vi, id, ms, ar, hi, he, my, ta, ur, bn, pl, nl, ro, tr, km, lo, yue, cs, el, sv, hu, da, fi, uk, bg 等)。 + 前端: 搜索模态框 点击搜索的时候,弹出 搜索模态框,参考 react、AJAX等技术来实现,搜索模态框的页面宽度和原始页面相同(占满),左侧是suggestions,右侧是即使刷新的搜索结果(每输入一个字母都刷新一次结果)。 diff --git a/config/config.yaml b/config/config.yaml index 7e96f73..7d9b404 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -149,32 +149,25 @@ spu_config: searchable_option_dimensions: ['option1', 'option2', 'option3'] # 租户配置(Tenant Configuration) -# 每个租户可以配置主语言和翻译选项 +# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选) +# 默认 index_languages: [en, zh],可配置为任意 SUPPORTED_INDEX_LANGUAGES 的子集 tenant_config: - # 默认配置(未配置的租户使用此配置) default: primary_language: "en" - translate_to_en: true - translate_to_zh: false - # 租户特定配置 + index_languages: ["en", "zh"] tenants: "1": primary_language: "zh" - translate_to_en: true - translate_to_zh: false + index_languages: ["zh", "en"] "2": primary_language: "en" - translate_to_en: false - translate_to_zh: true + index_languages: ["en", "zh"] "3": primary_language: "zh" - translate_to_en: true - translate_to_zh: false + index_languages: ["zh", "en"] "162": primary_language: "zh" - translate_to_en: true - translate_to_zh: true + index_languages: ["zh", "en"] "170": primary_language: "en" - translate_to_en: true - translate_to_zh: true + index_languages: ["en", "zh"] diff --git a/config/tenant_config_loader.py b/config/tenant_config_loader.py index 70c4598..402f7cc 100644 --- a/config/tenant_config_loader.py +++ b/config/tenant_config_loader.py @@ -1,14 +1,102 @@ """ 租户配置加载器。 -从统一配置文件(config.yaml)加载租户配置,包括主语言和翻译配置。 +从统一配置文件(config.yaml)加载租户配置,包括主语言和索引语言(index_languages)。 +支持旧配置 translate_to_en / 
# Supported index languages: code -> English display name. Used wherever a
# merchant picks the market languages their catalogue should be indexed in.
SUPPORTED_INDEX_LANGUAGES: Dict[str, str] = {
    "en": "English",
    "zh": "Chinese",
    "zh_tw": "Traditional Chinese",
    "ru": "Russian",
    "ja": "Japanese",
    "ko": "Korean",
    "es": "Spanish",
    "fr": "French",
    "pt": "Portuguese",
    "de": "German",
    "it": "Italian",
    "th": "Thai",
    "vi": "Vietnamese",
    "id": "Indonesian",
    "ms": "Malay",
    "ar": "Arabic",
    "hi": "Hindi",
    "he": "Hebrew",
    "my": "Burmese",
    "ta": "Tamil",
    "ur": "Urdu",
    "bn": "Bengali",
    "pl": "Polish",
    "nl": "Dutch",
    "ro": "Romanian",
    "tr": "Turkish",
    "km": "Khmer",
    "lo": "Lao",
    "yue": "Cantonese",
    "cs": "Czech",
    "el": "Greek",
    "sv": "Swedish",
    "hu": "Hungarian",
    "da": "Danish",
    "fi": "Finnish",
    "uk": "Ukrainian",
    "bg": "Bulgarian",
}

# Languages indexed when a tenant does not configure anything usable.
DEFAULT_INDEX_LANGUAGES: List[str] = ["en", "zh"]

# Legacy boolean switches and the index language each one used to enable.
_LEGACY_TRANSLATE_FLAGS = (("en", "translate_to_en"), ("zh", "translate_to_zh"))


def _canonical_language_code(raw: Any) -> str:
    """Return ``raw`` as a lower-case, underscore-separated code.

    Non-string values yield ``""`` instead of raising. YAML 1.1 loaders turn
    bare ``yes``/``on``/``y`` into booleans and digits into ints, so a config
    list can legitimately contain non-strings.
    """
    if not isinstance(raw, str):
        return ""
    # "zh-TW" and "zh_tw" must both resolve to the "zh_tw" table key.
    return raw.strip().lower().replace("-", "_")


def normalize_index_languages(value: Any, primary_language: str = "en") -> List[str]:
    """Normalise an ``index_languages`` setting into a list of supported codes.

    Args:
        value: Raw configuration value. A list/tuple of codes is expected; a
            single string is also accepted and split on commas
            (``"zh"`` or ``"en, zh"``).
        primary_language: Accepted for interface compatibility; it does not
            influence the result.

    Returns:
        Supported codes in first-seen order, without duplicates. Unsupported
        or non-string entries are dropped. A fresh copy of
        ``DEFAULT_INDEX_LANGUAGES`` is returned when ``value`` is ``None``,
        has an unexpected type, or contains no supported code.
    """
    if isinstance(value, str):
        # Tolerate a scalar where a list was expected instead of silently
        # replacing the tenant's choice with the default.
        value = value.split(",")
    if not isinstance(value, (list, tuple)):
        return list(DEFAULT_INDEX_LANGUAGES)

    codes = (_canonical_language_code(item) for item in value)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    valid = list(dict.fromkeys(c for c in codes if c in SUPPORTED_INDEX_LANGUAGES))
    return valid or list(DEFAULT_INDEX_LANGUAGES)


def resolve_index_languages(tenant_config: Dict[str, Any]) -> List[str]:
    """Resolve the effective index languages for one tenant configuration.

    If the ``index_languages`` key is present it wins, even when its value is
    ``None`` or empty (which yields the default). Otherwise the list is
    derived from the legacy ``translate_to_en`` / ``translate_to_zh`` flags:
    the primary language first (when supported), then ``en`` and ``zh`` for
    each flag that is truthy.

    Args:
        tenant_config: Raw tenant configuration mapping.

    Returns:
        A non-empty list of supported language codes; a copy of
        ``DEFAULT_INDEX_LANGUAGES`` when nothing usable can be derived.
    """
    if "index_languages" in tenant_config:
        return normalize_index_languages(
            tenant_config["index_languages"],
            tenant_config.get("primary_language") or "en",
        )

    primary = _canonical_language_code(tenant_config.get("primary_language") or "en")
    langs: List[str] = [primary] if primary in SUPPORTED_INDEX_LANGUAGES else []
    for code, flag in _LEGACY_TRANSLATE_FLAGS:
        if (
            tenant_config.get(flag)
            and code not in langs
            and code in SUPPORTED_INDEX_LANGUAGES
        ):
            langs.append(code)
    return langs or list(DEFAULT_INDEX_LANGUAGES)
+172,32 @@ class SPUDocumentTransformer: primary_lang: str ): """ - 填充文本字段(根据租户配置处理多语言翻译)。 - - 翻译逻辑: - - 根据 tenant_config 中的 translate_to_zh 和 translate_to_en 决定翻译方向 - - 如果 translate_to_zh=true,且店铺语言不是zh,则翻译到中文 - - 如果 translate_to_en=true,且店铺语言不是en,则翻译到英文 - - 如果两个都是false,则不进行翻译,只填充主语言字段 + 填充文本字段(根据租户 index_languages 处理多语言翻译)。 + 仅写入 primary_language 及 index_languages 中配置的语言。 """ - # 从租户配置中读取翻译方向 - translate_to_en = bool(self.tenant_config.get('translate_to_en')) - translate_to_zh = bool(self.tenant_config.get('translate_to_zh')) - - def _set_lang_obj(field_name: str, source_text: Optional[str], translations: Optional[Dict[str, str]] = None): - """ - Write multilingual text field as an object, e.g.: - doc[field_name] = {"zh": "...", "en": "..."} - Only writes keys based on tenant primary_language + translate_to_en/translate_to_zh. - """ + index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] + + def _set_lang_obj(field_name: str, source_text: Optional[str], translations: Optional[Dict[str, Optional[str]]] = None): + """写入多语言对象 doc[field_name] = {"zh": "...", "en": "...", ...},仅包含 index_languages。""" if not source_text or not str(source_text).strip(): return - obj: Dict[str, str] = {} src = str(source_text) obj[primary_lang] = src - tr = translations or {} - if translate_to_en and primary_lang != "en": - en_text = tr.get("en") - if en_text and str(en_text).strip(): - obj["en"] = str(en_text) - if translate_to_zh and primary_lang != "zh": - zh_text = tr.get("zh") - if zh_text and str(zh_text).strip(): - obj["zh"] = str(zh_text) - + for lang in index_langs: + if lang == primary_lang: + continue + val = tr.get(lang) + if val and str(val).strip(): + obj[lang] = str(val) if obj: doc[field_name] = obj # Title if pd.notna(spu_row.get('title')): title_text = str(spu_row['title']) - - translations: Dict[str, str] = {} + translations: Dict[str, Optional[str]] = {} if self.translator: prompt_zh = self.translation_prompts.get('product_title_zh') or 
self.translation_prompts.get('default_zh') prompt_en = self.translation_prompts.get('product_title_en') or self.translation_prompts.get('default_en') @@ -223,16 +206,14 @@ class SPUDocumentTransformer: shop_language=primary_lang, source_lang=primary_lang, prompt=prompt_zh if primary_lang == 'zh' else prompt_en, - translate_to_en=translate_to_en, - translate_to_zh=translate_to_zh, + index_languages=index_langs, ) or {} - _set_lang_obj("title", title_text, translations) # Brief if pd.notna(spu_row.get('brief')): brief_text = str(spu_row['brief']) - translations: Dict[str, str] = {} + translations = {} if self.translator: prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') translations = self.translator.translate_for_indexing( @@ -240,15 +221,14 @@ class SPUDocumentTransformer: shop_language=primary_lang, source_lang=primary_lang, prompt=prompt, - translate_to_en=translate_to_en, - translate_to_zh=translate_to_zh, + index_languages=index_langs, ) or {} _set_lang_obj("brief", brief_text, translations) # Description if pd.notna(spu_row.get('description')): desc_text = str(spu_row['description']) - translations: Dict[str, str] = {} + translations = {} if self.translator: prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') translations = self.translator.translate_for_indexing( @@ -256,15 +236,14 @@ class SPUDocumentTransformer: shop_language=primary_lang, source_lang=primary_lang, prompt=prompt, - translate_to_en=translate_to_en, - translate_to_zh=translate_to_zh, + index_languages=index_langs, ) or {} _set_lang_obj("description", desc_text, translations) # Vendor if pd.notna(spu_row.get('vendor')): vendor_text = str(spu_row['vendor']) - translations: Dict[str, str] = {} + translations = {} if self.translator: prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') translations = self.translator.translate_for_indexing( @@ -272,8 +251,7 @@ 
class SPUDocumentTransformer: shop_language=primary_lang, source_lang=primary_lang, prompt=prompt, - translate_to_en=translate_to_en, - translate_to_zh=translate_to_zh, + index_languages=index_langs, ) or {} _set_lang_obj("vendor", vendor_text, translations) diff --git a/indexer/indexing_utils.py b/indexer/indexing_utils.py index 6aff22a..857bd75 100644 --- a/indexer/indexing_utils.py +++ b/indexer/indexing_utils.py @@ -96,11 +96,9 @@ def create_document_transformer( if searchable_option_dimensions is None: searchable_option_dimensions = config.spu_config.searchable_option_dimensions - # 根据租户配置决定是否需要翻译:只要开启任一方向的翻译,就初始化翻译器 - translate_to_en = bool(tenant_config.get("translate_to_en")) - translate_to_zh = bool(tenant_config.get("translate_to_zh")) - - if translator is None and (translate_to_en or translate_to_zh): + index_langs = tenant_config.get("index_languages") or [] + need_translator = len(index_langs) > 1 + if translator is None and need_translator: from query.translator import Translator translator = Translator( api_key=config.query_config.translation_api_key, diff --git a/indexer/test_indexing.py b/indexer/test_indexing.py index 58330d7..5fe7944 100755 --- a/indexer/test_indexing.py +++ b/indexer/test_indexing.py @@ -44,19 +44,25 @@ def test_tenant_config(): # 测试默认配置 default_config = tenant_config_loader.get_tenant_config("999") print(f"默认配置: {default_config}") + assert "index_languages" in default_config, "默认配置应包含 index_languages" + assert "en" in default_config["index_languages"] and "zh" in default_config["index_languages"], \ + "默认 index_languages 应包含 en, zh" + print("✓ 默认配置正确(index_languages 含 en, zh)") - # 测试租户162(翻译关闭) + # 测试租户162(index_languages: zh, en) tenant_162_config = tenant_config_loader.get_tenant_config("162") print(f"租户162配置: {tenant_162_config}") - assert tenant_162_config['translate_to_en'] == False, "租户162翻译应该关闭" - assert tenant_162_config['translate_to_zh'] == False, "租户162翻译应该关闭" - print("✓ 租户162配置正确(翻译关闭)") + idx = 
tenant_162_config.get("index_languages") or [] + assert "zh" in idx and "en" in idx, "租户162 index_languages 应包含 zh, en" + print("✓ 租户162配置正确(index_languages 含 zh, en)") - # 测试其他租户 + # 测试租户1 tenant_1_config = tenant_config_loader.get_tenant_config("1") print(f"租户1配置: {tenant_1_config}") - assert tenant_1_config['translate_to_en'] == True, "租户1应该启用英文翻译" - print("✓ 租户1配置正确(翻译开启)") + idx1 = tenant_1_config.get("index_languages") or [] + assert "zh" in idx1 and "en" in idx1, "租户1 index_languages 应包含 zh, en" + assert tenant_1_config.get("primary_language") == "zh", "租户1 主语言为 zh" + print("✓ 租户1配置正确(index_languages 含 zh, en,主语言 zh)") return True except Exception as e: @@ -118,13 +124,12 @@ def test_full_indexing(tenant_id: str = "162"): print(f" 标题 (中文): {title_obj.get('zh', 'N/A') if isinstance(title_obj, dict) else 'N/A'}") print(f" 标题 (英文): {title_obj.get('en', 'N/A') if isinstance(title_obj, dict) else 'N/A'}") - # 检查租户162的翻译状态 + # 租户162 index_languages [zh, en],应有 title.en if tenant_id == "162": - # 租户162翻译应该关闭:只写入主语言,不应出现 title.en - if isinstance(title_obj, dict) and title_obj.get("en") is None: - print(f" ✓ 翻译已关闭(title.en为空)") + if isinstance(title_obj, dict) and title_obj.get("en"): + print(f" ✓ 多语言索引正常(title.en 已填充)") else: - print(f" ⚠ 警告:翻译应该关闭,但title.en有值: {title_obj.get('en') if isinstance(title_obj, dict) else None}") + print(f" ⚠ 警告:租户162 配置 [zh,en],但 title.en 为空") return True @@ -199,12 +204,12 @@ def test_incremental_indexing(tenant_id: str = "162"): print(f" SKU数量: {len(doc.get('skus', []))}") print(f" 规格数量: {len(doc.get('specifications', []))}") - # 检查租户162的翻译状态 + # 租户162 配置了 index_languages [zh, en],应有 title.en if tenant_id == "162": - if isinstance(title_obj, dict) and title_obj.get("en") is None: - print(f" ✓ 翻译已关闭(title.en为空)") + if isinstance(title_obj, dict) and title_obj.get("en"): + print(f" ✓ 多语言索引正常(title.en 已填充)") else: - print(f" ⚠ 警告:翻译应该关闭,但title.en有值: {title_obj.get('en') if isinstance(title_obj, dict) else None}") + print(f" ⚠ 警告:租户162 配置 
[zh,en],但 title.en 为空") return True @@ -298,11 +303,13 @@ def test_document_transformer(): print(f" title.en: {title_obj.get('en') if isinstance(title_obj, dict) else None}") print(f" SKU数量: {len(doc.get('skus', []))}") - # 验证租户162翻译关闭 - if isinstance(title_obj, dict) and title_obj.get("en") is None: - print(f" ✓ 翻译已关闭(符合租户162配置)") + # 租户162 index_languages [zh, en],主语言 zh,应有 zh(原文)与 en(翻译) + if isinstance(title_obj, dict) and title_obj.get("zh") and title_obj.get("en"): + print(f" ✓ 多语言字段正确(zh + en)") + elif isinstance(title_obj, dict) and title_obj.get("zh"): + print(f" ⚠ 仅有 zh(若未配置翻译或翻译未调用可接受)") else: - print(f" ⚠ 警告:翻译应该关闭") + print(f" ⚠ 未发现预期多语言字段") return True else: diff --git a/query/query_parser.py b/query/query_parser.py index 28416d9..b4f32e9 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -58,8 +58,7 @@ class ParsedQuery: "rewritten_query": self.rewritten_query, "detected_language": self.detected_language, "translations": self.translations, - "domain": self.domain, - "has_vector": self.query_vector is not None + "domain": self.domain } return result @@ -228,23 +227,16 @@ class QueryParser: translations = {} translation_futures = {} try: - # 根据租户配置决定翻译目标语言 + # 根据租户配置的 index_languages 决定翻译目标语言 from config.tenant_config_loader import get_tenant_config_loader tenant_loader = get_tenant_config_loader() tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default") + index_langs = tenant_cfg.get("index_languages") or ["en", "zh"] - translate_to_zh = bool(tenant_cfg.get("translate_to_zh")) - translate_to_en = bool(tenant_cfg.get("translate_to_en")) + target_langs_for_translation = [lang for lang in index_langs if lang != detected_lang] - target_langs_for_translation = [] - if translate_to_zh: - target_langs_for_translation.append('zh') - if translate_to_en: - target_langs_for_translation.append('en') - - # 如果该租户未开启任何翻译方向,则直接跳过翻译阶段 if target_langs_for_translation: - target_langs = [lang for lang in target_langs_for_translation if 
detected_lang != lang] + target_langs = target_langs_for_translation if target_langs: # Use e-commerce context for better disambiguation @@ -254,8 +246,8 @@ class QueryParser: self.config.query_config.translation_prompts.get('default_zh') # Determine if we need to wait for translation results - # If detected_lang is neither 'en' nor 'zh', we must wait for translation - need_wait_translation = detected_lang not in ['en', 'zh'] + # If detected_lang is not in index_languages, we must wait for translation + need_wait_translation = detected_lang not in index_langs if need_wait_translation: # Use async method that returns Futures, so we can wait for results diff --git a/query/translator.py b/query/translator.py index 4130329..117b234 100644 --- a/query/translator.py +++ b/query/translator.py @@ -792,6 +792,18 @@ class Translator: # The user can configure a glossary for better results return translated_text + def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool: + """True if shop language matches index language (use source, no translate).""" + if not shop_lang_lower or not lang_code: + return False + if shop_lang_lower == lang_code: + return True + if lang_code == "zh" and "zh" in shop_lang_lower: + return True + if lang_code == "en" and "en" in shop_lang_lower: + return True + return False + def translate_for_indexing( self, text: str, @@ -799,76 +811,55 @@ class Translator: source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None, - translate_to_en: bool = True, - translate_to_zh: bool = True, + index_languages: Optional[List[str]] = None, ) -> Dict[str, Optional[str]]: """ - Translate text for indexing based on shop language and tenant configuration. 
- - Translation behavior: - - If translate_to_zh=True and shop language is not 'zh', translate to Chinese (zh) - - If translate_to_en=True and shop language is not 'en', translate to English (en) - - If both flags are False, no translation is performed (returns None for both) - + Translate text for indexing based on shop language and tenant index_languages. + + For each language in index_languages: use source text if shop language matches, + otherwise translate to that language. + Args: text: Text to translate - shop_language: Shop's configured language (e.g., 'zh', 'en', 'ru') - source_lang: Source language code (optional, auto-detect if None) + shop_language: Shop primary language (e.g. 'zh', 'en', 'ru') + source_lang: Source language code (optional) context: Additional context for translation (optional) - prompt: Translation prompt/instruction (optional) - translate_to_en: Whether to translate to English (from tenant_config) - translate_to_zh: Whether to translate to Chinese (from tenant_config) - + prompt: Translation prompt (optional) + index_languages: Languages to index (from tenant_config). Default ["en", "zh"]. + Returns: - Dictionary with 'zh' and 'en' keys containing translated text (or None if not needed/not enabled) - Example: {'zh': '中文翻译', 'en': 'English translation'} or {'zh': None, 'en': None} + Dict keyed by each index_language with translated or source text (or None). 
""" + langs = index_languages if index_languages else ["en", "zh"] + results = {lang: None for lang in langs} if not text or not text.strip(): - return {'zh': None, 'en': None} - - # Skip translation for symbol-only queries + return results if re.match(r'^[\d\s_-]+$', text): logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'") - return {'zh': None, 'en': None} - - results = {'zh': None, 'en': None} - shop_lang_lower = shop_language.lower() if shop_language else "" - - # Determine which languages need translation based on tenant configuration - targets = [] - if translate_to_zh and "zh" not in shop_lang_lower: - targets.append("zh") - if translate_to_en and "en" not in shop_lang_lower: - targets.append("en") - - # If shop language is already zh and en, no translation needed - if not targets: - # Use original text for both languages - if "zh" in shop_lang_lower: - results['zh'] = text - if "en" in shop_lang_lower: - results['en'] = text return results - - # Translate to each target language + + shop_lang_lower = (shop_language or "").strip().lower() + targets = [] + for lang in langs: + if self._shop_lang_matches(shop_lang_lower, lang): + results[lang] = text + else: + targets.append(lang) + for target_lang in targets: - # Check cache first cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) if cached: results[target_lang] = cached logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}") continue - - # Translate synchronously for indexing (we need the result immediately) translated = self.translate( text, target_lang=target_lang, source_lang=source_lang or shop_language, context=context, - prompt=prompt + prompt=prompt, ) results[target_lang] = translated - return results def get_translation_needs( diff --git a/search/searcher.py b/search/searcher.py index 98b6033..9e81206 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -165,7 +165,8 @@ class Searcher: # 
根据租户配置决定翻译开关(离线/在线统一) tenant_loader = get_tenant_config_loader() tenant_cfg = tenant_loader.get_tenant_config(tenant_id) - enable_translation = bool(tenant_cfg.get("translate_to_en") or tenant_cfg.get("translate_to_zh")) + index_langs = tenant_cfg.get("index_languages") or [] + enable_translation = len(index_langs) > 0 enable_embedding = self.config.query_config.enable_text_embedding enable_rerank = False # Temporarily disabled -- libgit2 0.21.2