From 2739b28126a5621b1d03d7eb40dfa4f433d45579 Mon Sep 17 00:00:00 2001 From: tangwang Date: Tue, 6 Jan 2026 19:59:36 +0800 Subject: [PATCH] 多语言索引调整 --- api/result_formatter.py | 23 +++++++---------------- config/config.yaml | 4 ++-- config/config_loader.py | 4 ++-- config/tenant_config_loader.py | 4 ++-- indexer/document_transformer.py | 4 ++-- search/es_query_builder.py | 2 +- search/searcher.py | 6 +++--- 7 files changed, 19 insertions(+), 28 deletions(-) diff --git a/api/result_formatter.py b/api/result_formatter.py index 56e2b1e..88cf61e 100644 --- a/api/result_formatter.py +++ b/api/result_formatter.py @@ -13,7 +13,7 @@ class ResultFormatter: def format_search_results( es_hits: List[Dict[str, Any]], max_score: float = 1.0, - language: str = "zh", + language: str = "en", sku_filter_dimension: Optional[List[str]] = None ) -> List[SpuResult]: """ @@ -27,24 +27,17 @@ class ResultFormatter: List of SpuResult objects """ results = [] - lang = (language or "zh").lower() + lang = (language or "en").lower() if lang not in ("zh", "en"): lang = "en" def pick_lang_field(src: Dict[str, Any], base: str) -> Optional[str]: - """ - 从多语言对象字段中按语言选择一个值: - - 新结构: {base: {"zh": "...", "en": "...", ...}} - - 兼容旧结构: {base_zh: "...", base_en: "..."} - 若目标语言缺失则回退到另一种。 - """ + """从多语言对象字段中按语言选择一个值:{base: {"zh": "...", "en": "...", ...}}""" obj = src.get(base) - if isinstance(obj, dict): - zh_val = obj.get("zh") - en_val = obj.get("en") - else: - zh_val = src.get(f"{base}_zh") - en_val = src.get(f"{base}_en") + if not isinstance(obj, dict): + return None + zh_val = obj.get("zh") + en_val = obj.get("en") if lang == "zh": return zh_val or en_val return en_val or zh_val @@ -319,12 +312,10 @@ class ResultFormatter: is_selected = (name, value_bucket['key']) in selected_specs # 使用 reverse_nested 的 product_count 统计产品数量(而不是规格条目数量) - # 如果没有 product_count(兼容旧格式),回退到 doc_count product_count_agg = value_bucket.get('product_count', {}) if product_count_agg and 'doc_count' in product_count_agg: count = product_count_agg['doc_count'] else: - # 回退到 doc_count(兼容旧格式,但这不是我们想要的计数方式) count = value_bucket.get('doc_count', 0) value = FacetValue( diff --git a/config/config.yaml b/config/config.yaml index 25eefbe..7e96f73 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -83,7 +83,7 @@ query_config: supported_languages: - "zh" - "en" - default_language: "zh" + default_language: "en" # 功能开关(翻译开关由tenant_config控制) enable_text_embedding: true @@ -153,7 +153,7 @@ spu_config: tenant_config: # 默认配置(未配置的租户使用此配置) default: - primary_language: "zh" + primary_language: "en" translate_to_en: true translate_to_zh: false # 租户特定配置 diff --git a/config/config_loader.py b/config/config_loader.py index d0be6ed..2c8585b 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -29,7 +29,7 @@ class IndexConfig: class QueryConfig: """Configuration for query processing.""" supported_languages: List[str] = field(default_factory=lambda: ["zh", "en"]) - default_language: str = "zh" + default_language: str = "en" # Feature flags enable_text_embedding: bool = True @@ -231,7 +231,7 @@ class ConfigLoader: query_config = QueryConfig( supported_languages=query_config_data.get("supported_languages") or ["zh", "en"], - default_language=query_config_data.get("default_language") or "zh", + default_language=query_config_data.get("default_language") or "en", enable_text_embedding=query_config_data.get("enable_text_embedding", True), enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), rewrite_dictionary=rewrite_dictionary, diff --git a/config/tenant_config_loader.py b/config/tenant_config_loader.py index e10bf82..70c4598 100644 --- a/config/tenant_config_loader.py +++ b/config/tenant_config_loader.py @@ -39,7 +39,7 @@ class TenantConfigLoader: # 返回默认配置 self._config = { "default": { - "primary_language": "zh", + "primary_language": "en", "translate_to_en": True, "translate_to_zh": False }, @@ -66,7 +66,7 @@ class TenantConfigLoader: else: logger.debug(f"Tenant {tenant_id} not found in config, using default") return config.get("default", { - "primary_language": "zh", + "primary_language": "en", "translate_to_en": True, "translate_to_zh": False }) diff --git a/indexer/document_transformer.py b/indexer/document_transformer.py index a7a87b6..840fddb 100644 --- a/indexer/document_transformer.py +++ b/indexer/document_transformer.py @@ -87,7 +87,7 @@ class SPUDocumentTransformer: logger.error(f"SPU {spu_id} has no title, this may cause search issues") # 获取租户配置 - primary_lang = self.tenant_config.get('primary_language', 'zh') + primary_lang = self.tenant_config.get('primary_language', 'en') # 文本字段处理(使用translator的内部逻辑自动处理多语言翻译) self._fill_text_fields(doc, spu_row, primary_lang) @@ -283,7 +283,7 @@ class SPUDocumentTransformer: # - 当商品的类目ID在映射中不存在时,视为“不合法类目”,整条类目相关字段都不写入(当成没有类目) # - 仅记录错误日志,不阻塞索引流程 - primary_lang = self.tenant_config.get('primary_language', 'zh') + primary_lang = self.tenant_config.get('primary_language', 'en') if pd.notna(spu_row.get('category_path')): category_path = str(spu_row['category_path']) diff --git a/search/es_query_builder.py b/search/es_query_builder.py index eacc4fa..81d60dd 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -26,7 +26,7 @@ class ESQueryBuilder: source_fields: Optional[List[str]] = None, function_score_config: Optional[FunctionScoreConfig] = None, enable_multilang_search: bool = True, - default_language: str = "zh", + default_language: str = "en", knn_boost: float = 0.25 ): """ diff --git a/search/searcher.py b/search/searcher.py index e83b79a..66799d0 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -135,7 +135,7 @@ class Searcher: sort_by: Optional[str] = None, sort_order: Optional[str] = "desc", debug: bool = False, - language: str = "zh", + language: str = "en", sku_filter_dimension: Optional[List[str]] = None, ) -> SearchResult: """ @@ -275,7 +275,7 @@ class Searcher: try: # Generate tenant-specific index name index_name = get_tenant_index_name(tenant_id) - index_name = "search_products" + # index_name = "search_products" # No longer need to add tenant_id to filters since each tenant has its own index @@ -556,7 +556,7 @@ class Searcher: formatted_results = ResultFormatter.format_search_results( es_hits, max_score, - language="zh", # Default language for image search + language="en", # Default language for image search sku_filter_dimension=None # Image search doesn't support SKU filtering ) -- libgit2 0.21.2