这次修改没改完。 diff --git a/HighLevelDesign.md b/HighLevelDesign.md index 397a9f7..3e728c9 100644 --- a/HighLevelDesign.md +++ b/HighLevelDesign.md @@ -112,10 +112,9 @@ if response.status_code == 200: 支持多种匹配方式,如AND、OR、RANK、NOTAND以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。 default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。 -多语言搜索: -对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 暂时具体实现为 bm25()+0.2*text_embedding_relevence(也就是knn检索表达式的打分) +bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 ```text bm25打分(base_query): "multi_match": { "query": search_query, diff --git a/config/config_loader.py b/config/config_loader.py index 8df15b3..f3fcaa3 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -27,6 +27,9 @@ class IndexConfig: boost: float = 1.0 example: Optional[str] = None + # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]} + language_field_mapping: Optional[Dict[str, List[str]]] = None + @dataclass class RankingConfig: @@ -66,8 +69,6 @@ class TenantConfig: # Database settings mysql_config: Dict[str, Any] - main_table: str = "shoplazza_product_sku" - extension_table: Optional[str] = None # Field definitions fields: List[FieldConfig] @@ -86,6 +87,10 @@ class TenantConfig: # ES index settings es_index_name: str + + # Optional fields with defaults + main_table: str = "shoplazza_product_sku" + extension_table: Optional[str] = None es_settings: Dict[str, Any] = field(default_factory=dict) @@ -228,13 +233,17 @@ class ConfigLoader: if analyzer_str not in ANALYZER_MAP: raise ConfigurationError(f"Unknown analyzer: {analyzer_str}") + # Parse language field mapping if present + language_field_mapping = index_data.get("language_field_mapping") + return IndexConfig( name=index_data["name"], label=index_data.get("label", index_data["name"]), fields=index_data["fields"], analyzer=ANALYZER_MAP[analyzer_str], boost=index_data.get("boost", 1.0), - example=index_data.get("example") + example=index_data.get("example"), + language_field_mapping=language_field_mapping ) def validate_config(self, config: TenantConfig) -> List[str]: @@ -360,11 +369,16 @@ class ConfigLoader: def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: """Convert IndexConfig to dictionary.""" - return { + result = { "name": index.name, "label": index.label, "fields": index.fields, "analyzer": index.analyzer.value, "boost": index.boost, "example": index.example - } \ No newline at end of file + } + + if index.language_field_mapping: + result["language_field_mapping"] = index.language_field_mapping + + return result \ No newline at end of file diff --git a/config/schema/tenant1_config.yaml b/config/schema/tenant1_config.yaml index bfe2e53..84e9ba1 100644 --- a/config/schema/tenant1_config.yaml +++ b/config/schema/tenant1_config.yaml @@ -177,6 +177,15 @@ indexes: analyzer: "chinese_ecommerce" boost: 1.0 example: 'query=default:"消防套"' + language_field_mapping: + zh: + - "name" + - "categoryName" + - "brandName" + en: + - "enSpuName" + ru: + - "ruSkuName" - name: "title" label: "标题索引" @@ -187,6 +196,13 @@ indexes: analyzer: "chinese_ecommerce" boost: 2.0 example: 'query=title:"芭比娃娃"' + language_field_mapping: + zh: + - "name" + en: + - "enSpuName" + ru: + - "ruSkuName" - name: "category" label: "类目索引" diff --git a/search/searcher.py b/search/searcher.py index a7088ec..0a798ed 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery from indexer import MappingGenerator from .boolean_parser import BooleanParser, QueryNode from .es_query_builder import ESQueryBuilder +from .multilang_query_builder import MultiLanguageQueryBuilder from .ranking_engine import RankingEngine @@ -86,10 +87,10 @@ class Searcher: self.text_embedding_field = mapping_gen.get_text_embedding_field() self.image_embedding_field = mapping_gen.get_image_embedding_field() - # Query builder - self.query_builder = ESQueryBuilder( + # Query builder - use multi-language version + self.query_builder = MultiLanguageQueryBuilder( + config=config, index_name=config.es_index_name, - match_fields=self.match_fields, text_embedding_field=self.text_embedding_field, image_embedding_field=self.image_embedding_field ) @@ -144,11 +145,10 @@ class Searcher: query_text = parsed_query.rewritten_query print(f"[Searcher] Parsed boolean expression: {query_node}") - # Step 3: Build ES query - es_query = self.query_builder.build_query( - query_text=query_text, + # Step 3: Build ES query using multi-language builder + es_query = self.query_builder.build_multilang_query( + parsed_query=parsed_query, query_vector=parsed_query.query_vector if enable_embedding else None, - query_node=query_node, filters=filters, size=size, from_=from_, @@ -325,6 +325,15 @@ class Searcher: query_info={'image_url': image_url, 'search_type': 'image_similarity'} ) + def get_domain_summary(self) -> Dict[str, Any]: + """ + Get summary of all configured domains. + + Returns: + Dictionary with domain information + """ + return self.query_builder.get_domain_summary() + def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]: """ Get single document by ID. ```