支持多语言查询.md 6.63 KB

这次修改尚未全部完成,以下是目前已完成部分的 diff。

diff --git a/HighLevelDesign.md b/HighLevelDesign.md index 397a9f7..3e728c9 100644 --- a/HighLevelDesign.md +++ b/HighLevelDesign.md @@ -112,10 +112,9 @@ if response.status_code == 200: 支持多种匹配方式,如AND、OR、RANK、ANDNOT以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。

default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。 -多语言搜索: -对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。

暂时具体实现为 bm25()+0.2*text_embedding_relevence(也就是knn检索表达式的打分) +bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 bm25打分(base_query): "multi_match": { "query": search_query, diff --git a/config/config_loader.py b/config/config_loader.py index 8df15b3..f3fcaa3 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -27,6 +27,9 @@ class IndexConfig: boost: float = 1.0 example: Optional[str] = None

  • # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]}
  • language_field_mapping: Optional[Dict[str, List[str]]] = None +

@dataclass class RankingConfig: @@ -66,8 +69,6 @@ class CustomerConfig:

 # Database settings
 mysql_config: Dict[str, Any]
  • main_table: str = "shoplazza_product_sku"
  • extension_table: Optional[str] = None

    # Field definitions fields: List[FieldConfig] @@ -86,6 +87,10 @@ class CustomerConfig:

    # ES index settings es_index_name: str +

  • # Optional fields with defaults

  • main_table: str = "shoplazza_product_sku"

  • extension_table: Optional[str] = None es_settings: Dict[str, Any] = field(default_factory=dict)

@@ -228,13 +233,17 @@ class ConfigLoader: if analyzer_str not in ANALYZER_MAP: raise ConfigurationError(f"Unknown analyzer: {analyzer_str}")

  • # Parse language field mapping if present
  • language_field_mapping = index_data.get("language_field_mapping") + return IndexConfig( name=index_data["name"], label=index_data.get("label", index_data["name"]), fields=index_data["fields"], analyzer=ANALYZER_MAP[analyzer_str], boost=index_data.get("boost", 1.0),
  • example=index_data.get("example")
  • example=index_data.get("example"),
  •        language_field_mapping=language_field_mapping
     )
    

    def validate_config(self, config: CustomerConfig) -> List[str]: @@ -360,11 +369,16 @@ class ConfigLoader:

    def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: """Convert IndexConfig to dictionary."""

  •    return {
    
  •    result = {
         "name": index.name,
         "label": index.label,
         "fields": index.fields,
         "analyzer": index.analyzer.value,
         "boost": index.boost,
         "example": index.example
    
  •    }
    

    \ No newline at end of file

  •    }
    

    +

  •    if index.language_field_mapping:
    
  •        result["language_field_mapping"] = index.language_field_mapping
    

    +

  •    return result
    

    \ No newline at end of file diff --git a/config/schema/customer1_config.yaml b/config/schema/customer1_config.yaml index bfe2e53..84e9ba1 100644 --- a/config/schema/customer1_config.yaml +++ b/config/schema/customer1_config.yaml @@ -177,6 +177,15 @@ indexes: analyzer: "chinese_ecommerce" boost: 1.0 example: 'query=default:"消防套"'

  • language_field_mapping:

  •  zh:
    
  •    - "name"
    
  •    - "categoryName"
    
  •    - "brandName"
    
  •  en:
    
  •    - "enSpuName"
    
  •  ru:
    
  •    - "ruSkuName"
    
    • name: "title" label: "标题索引" @@ -187,6 +196,13 @@ indexes: analyzer: "chinese_ecommerce" boost: 2.0 example: 'query=title:"芭比娃娃"'
  • language_field_mapping:

  •  zh:
    
  •    - "name"
    
  •  en:
    
  •    - "enSpuName"
    
  •  ru:
    
  •    - "ruSkuName"
    
    • name: "category" label: "类目索引" diff --git a/search/searcher.py b/search/searcher.py index a7088ec..0a798ed 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery from indexer import MappingGenerator from .boolean_parser import BooleanParser, QueryNode from .es_query_builder import ESQueryBuilder +from .multilang_query_builder import MultiLanguageQueryBuilder from .ranking_engine import RankingEngine

@@ -86,10 +87,10 @@ class Searcher: self.text_embedding_field = mapping_gen.get_text_embedding_field() self.image_embedding_field = mapping_gen.get_image_embedding_field()

  • # Query builder
  • self.query_builder = ESQueryBuilder(
  • # Query builder - use multi-language version
  • self.query_builder = MultiLanguageQueryBuilder(
  • config=config, index_name=config.es_index_name,
  •        match_fields=self.match_fields,
         text_embedding_field=self.text_embedding_field,
         image_embedding_field=self.image_embedding_field
     )
    

    @@ -144,11 +145,10 @@ class Searcher: query_text = parsed_query.rewritten_query print(f"[Searcher] Parsed boolean expression: {query_node}")

  •    # Step 3: Build ES query
    
  •    es_query = self.query_builder.build_query(
    
  •        query_text=query_text,
    
  •    # Step 3: Build ES query using multi-language builder
    
  •    es_query = self.query_builder.build_multilang_query(
    
  •        parsed_query=parsed_query,
         query_vector=parsed_query.query_vector if enable_embedding else None,
    
  •        query_node=query_node,
         filters=filters,
         size=size,
         from_=from_,
    

    @@ -325,6 +325,15 @@ class Searcher: query_info={'image_url': image_url, 'search_type': 'image_similarity'} )

  • def get_domain_summary(self) -> Dict[str, Any]:

  •    """
    
  •    Get summary of all configured domains.
    

    +

  •    Returns:
    
  •        Dictionary with domain information
    
  •    """
    
  •    return self.query_builder.get_domain_summary()
    

    + def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]: """ Get single document by ID.