支持多语言查询.md 6.63 KB

本次修改尚未全部完成,以下为目前已有的改动(diff)。

diff --git a/HighLevelDesign.md b/HighLevelDesign.md index 397a9f7..3e728c9 100644 --- a/HighLevelDesign.md +++ b/HighLevelDesign.md @@ -112,10 +112,9 @@ if response.status_code == 200: 支持多种匹配方式,如AND、OR、RANK、ANDNOT以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。

default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。 -多语言搜索: -对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。

暂时具体实现为 bm25()+0.2*text_embedding_relevance(也就是knn检索表达式的打分) +bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。

 bm25打分(base_query):
 "multi_match": {
     "query": search_query,
diff --git a/config/config_loader.py b/config/config_loader.py
index 8df15b3..f3fcaa3 100644
--- a/config/config_loader.py
+++ b/config/config_loader.py
@@ -27,6 +27,9 @@ class IndexConfig:
     boost: float = 1.0
     example: Optional[str] = None

+    # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]}
+    language_field_mapping: Optional[Dict[str, List[str]]] = None
+

 @dataclass
 class RankingConfig:
@@ -66,8 +69,6 @@ class CustomerConfig:

     # Database settings
     mysql_config: Dict[str, Any]
-    main_table: str = "shoplazza_product_sku"
-    extension_table: Optional[str] = None

     # Field definitions
     fields: List[FieldConfig]
@@ -86,6 +87,10 @@ class CustomerConfig:

     # ES index settings
     es_index_name: str
+
+    # Optional fields with defaults
+    main_table: str = "shoplazza_product_sku"
+    extension_table: Optional[str] = None
     es_settings: Dict[str, Any] = field(default_factory=dict)


@@ -228,13 +233,17 @@ class ConfigLoader:
         if analyzer_str not in ANALYZER_MAP:
             raise ConfigurationError(f"Unknown analyzer: {analyzer_str}")

+        # Parse language field mapping if present
+        language_field_mapping = index_data.get("language_field_mapping")
+
         return IndexConfig(
             name=index_data["name"],
             label=index_data.get("label", index_data["name"]),
             fields=index_data["fields"],
             analyzer=ANALYZER_MAP[analyzer_str],
             boost=index_data.get("boost", 1.0),
-            example=index_data.get("example")
+            example=index_data.get("example"),
+            language_field_mapping=language_field_mapping
         )

     def validate_config(self, config: CustomerConfig) -> List[str]:
@@ -360,11 +369,16 @@ class ConfigLoader:

     def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
         """Convert IndexConfig to dictionary."""
-        return {
+        result = {
             "name": index.name,
             "label": index.label,
             "fields": index.fields,
             "analyzer": index.analyzer.value,
             "boost": index.boost,
             "example": index.example
-        }
\ No newline at end of file
+        }
+
+        if index.language_field_mapping:
+            result["language_field_mapping"] = index.language_field_mapping
+
+        return result
\ No newline at end of file
diff --git a/config/schema/customer1_config.yaml b/config/schema/customer1_config.yaml
index bfe2e53..84e9ba1 100644
--- a/config/schema/customer1_config.yaml
+++ b/config/schema/customer1_config.yaml
@@ -177,6 +177,15 @@ indexes:
     analyzer: "chinese_ecommerce"
     boost: 1.0
     example: 'query=default:"消防套"'
+    language_field_mapping:
+      zh:
+        - "name"
+        - "categoryName"
+        - "brandName"
+      en:
+        - "enSpuName"
+      ru:
+        - "ruSkuName"

   - name: "title"
     label: "标题索引"
@@ -187,6 +196,13 @@ indexes:
     analyzer: "chinese_ecommerce"
     boost: 2.0
     example: 'query=title:"芭比娃娃"'
+    language_field_mapping:
+      zh:
+        - "name"
+      en:
+        - "enSpuName"
+      ru:
+        - "ruSkuName"

   - name: "category"
     label: "类目索引"
diff --git a/search/searcher.py b/search/searcher.py
index a7088ec..0a798ed 100644
--- a/search/searcher.py
+++ b/search/searcher.py
@@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery
 from indexer import MappingGenerator
 from .boolean_parser import BooleanParser, QueryNode
 from .es_query_builder import ESQueryBuilder
+from .multilang_query_builder import MultiLanguageQueryBuilder
 from .ranking_engine import RankingEngine


@@ -86,10 +87,10 @@ class Searcher:
         self.text_embedding_field = mapping_gen.get_text_embedding_field()
         self.image_embedding_field = mapping_gen.get_image_embedding_field()

-        # Query builder
-        self.query_builder = ESQueryBuilder(
+        # Query builder - use multi-language version
+        self.query_builder = MultiLanguageQueryBuilder(
+            config=config,
             index_name=config.es_index_name,
-            match_fields=self.match_fields,
             text_embedding_field=self.text_embedding_field,
             image_embedding_field=self.image_embedding_field
         )
@@ -144,11 +145,10 @@ class Searcher:
             query_text = parsed_query.rewritten_query
             print(f"[Searcher] Parsed boolean expression: {query_node}")

-        # Step 3: Build ES query
-        es_query = self.query_builder.build_query(
-            query_text=query_text,
+        # Step 3: Build ES query using multi-language builder
+        es_query = self.query_builder.build_multilang_query(
+            parsed_query=parsed_query,
             query_vector=parsed_query.query_vector if enable_embedding else None,
-            query_node=query_node,
             filters=filters,
             size=size,
             from_=from_,
@@ -325,6 +325,15 @@ class Searcher:
             query_info={'image_url': image_url, 'search_type': 'image_similarity'}
         )

+    def get_domain_summary(self) -> Dict[str, Any]:
+        """
+        Get summary of all configured domains.
+
+        Returns:
+            Dictionary with domain information
+        """
+        return self.query_builder.get_domain_summary()
+
     def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
         """
         Get single document by ID.