这次修改没改完。
diff --git a/HighLevelDesign.md b/HighLevelDesign.md index 397a9f7..3e728c9 100644 --- a/HighLevelDesign.md +++ b/HighLevelDesign.md @@ -112,10 +112,9 @@ if response.status_code == 200: 支持多种匹配方式,如AND、OR、RANK、ANDNOT以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。
default域的相关性由代码单独计算,经过特定的深度定制优化,暂时不做配置化。 -多语言搜索: -对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。
暂时具体实现为 bm25()+0.2*text_embedding_relevance(也就是knn检索表达式的打分) +bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 bm25打分(base_query): "multi_match": { "query": search_query, diff --git a/config/config_loader.py b/config/config_loader.py index 8df15b3..f3fcaa3 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -27,6 +27,9 @@ class IndexConfig: boost: float = 1.0 example: Optional[str] = None
- # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]}
- language_field_mapping: Optional[Dict[str, List[str]]] = None +
@dataclass class RankingConfig: @@ -66,8 +69,6 @@ class CustomerConfig:
# Database settings
mysql_config: Dict[str, Any]
- main_table: str = "shoplazza_product_sku"
extension_table: Optional[str] = None
# Field definitions fields: List[FieldConfig] @@ -86,6 +87,10 @@ class CustomerConfig:
# ES index settings es_index_name: str +
# Optional fields with defaults
main_table: str = "shoplazza_product_sku"
extension_table: Optional[str] = None es_settings: Dict[str, Any] = field(default_factory=dict)
@@ -228,13 +233,17 @@ class ConfigLoader: if analyzer_str not in ANALYZER_MAP: raise ConfigurationError(f"Unknown analyzer: {analyzer_str}")
- # Parse language field mapping if present
- language_field_mapping = index_data.get("language_field_mapping") + return IndexConfig( name=index_data["name"], label=index_data.get("label", index_data["name"]), fields=index_data["fields"], analyzer=ANALYZER_MAP[analyzer_str], boost=index_data.get("boost", 1.0),
- example=index_data.get("example")
- example=index_data.get("example"),
language_field_mapping=language_field_mapping )def validate_config(self, config: CustomerConfig) -> List[str]: @@ -360,11 +369,16 @@ class ConfigLoader:
def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: """Convert IndexConfig to dictionary."""
return {result = { "name": index.name, "label": index.label, "fields": index.fields, "analyzer": index.analyzer.value, "boost": index.boost, "example": index.example}\ No newline at end of file
}+
if index.language_field_mapping:result["language_field_mapping"] = index.language_field_mapping+
return result\ No newline at end of file diff --git a/config/schema/customer1_config.yaml b/config/schema/customer1_config.yaml index bfe2e53..84e9ba1 100644 --- a/config/schema/customer1_config.yaml +++ b/config/schema/customer1_config.yaml @@ -177,6 +177,15 @@ indexes: analyzer: "chinese_ecommerce" boost: 1.0 example: 'query=default:"消防套"'
language_field_mapping:
zh:- "name"- "categoryName"- "brandName"en:- "enSpuName"ru:- "ruSkuName"- name: "title" label: "标题索引" @@ -187,6 +196,13 @@ indexes: analyzer: "chinese_ecommerce" boost: 2.0 example: 'query=title:"芭比娃娃"'
language_field_mapping:
zh:- "name"en:- "enSpuName"ru:- "ruSkuName"- name: "category" label: "类目索引" diff --git a/search/searcher.py b/search/searcher.py index a7088ec..0a798ed 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery from indexer import MappingGenerator from .boolean_parser import BooleanParser, QueryNode from .es_query_builder import ESQueryBuilder +from .multilang_query_builder import MultiLanguageQueryBuilder from .ranking_engine import RankingEngine
@@ -86,10 +87,10 @@ class Searcher: self.text_embedding_field = mapping_gen.get_text_embedding_field() self.image_embedding_field = mapping_gen.get_image_embedding_field()
- # Query builder
- self.query_builder = ESQueryBuilder(
- # Query builder - use multi-language version
- self.query_builder = MultiLanguageQueryBuilder(
- config=config, index_name=config.es_index_name,
match_fields=self.match_fields, text_embedding_field=self.text_embedding_field, image_embedding_field=self.image_embedding_field )@@ -144,11 +145,10 @@ class Searcher: query_text = parsed_query.rewritten_query print(f"[Searcher] Parsed boolean expression: {query_node}")
# Step 3: Build ES queryes_query = self.query_builder.build_query(query_text=query_text,# Step 3: Build ES query using multi-language builderes_query = self.query_builder.build_multilang_query(parsed_query=parsed_query, query_vector=parsed_query.query_vector if enable_embedding else None,query_node=query_node, filters=filters, size=size, from_=from_,@@ -325,6 +325,15 @@ class Searcher: query_info={'image_url': image_url, 'search_type': 'image_similarity'} )
def get_domain_summary(self) -> Dict[str, Any]:
"""Get summary of all configured domains.+
Returns:Dictionary with domain information"""return self.query_builder.get_domain_summary()+ def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]: """ Get single document by ID.