支持多语言查询.md
6.63 KB
本次修改尚未完成,以下为目前已做改动的 diff。
diff --git a/HighLevelDesign.md b/HighLevelDesign.md
index 397a9f7..3e728c9 100644
--- a/HighLevelDesign.md
+++ b/HighLevelDesign.md
@@ -112,10 +112,9 @@ if response.status_code == 200:
支持多种匹配方式,如AND、OR、RANK、NOTAND以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。
default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。
-多语言搜索:
-对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。
暂时具体实现为 bm25()+0.2*text_embedding_relevence(也就是knn检索表达式的打分)
+bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。
bm25打分(base_query):
"multi_match": {
"query": search_query,
diff --git a/config/config_loader.py b/config/config_loader.py
index 8df15b3..f3fcaa3 100644
--- a/config/config_loader.py
+++ b/config/config_loader.py
@@ -27,6 +27,9 @@ class IndexConfig:
boost: float = 1.0
example: Optional[str] = None
+ # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]}
+ language_field_mapping: Optional[Dict[str, List[str]]] = None
+
@dataclass
class RankingConfig:
@@ -66,8 +69,6 @@ class CustomerConfig:
# Database settings
mysql_config: Dict[str, Any]
- main_table: str = "shoplazza_product_sku"
- extension_table: Optional[str] = None
# Field definitions
fields: List[FieldConfig]
@@ -86,6 +87,10 @@ class CustomerConfig:
# ES index settings
es_index_name: str
+
+ # Optional fields with defaults
+ main_table: str = "shoplazza_product_sku"
+ extension_table: Optional[str] = None
es_settings: Dict[str, Any] = field(default_factory=dict)
@@ -228,13 +233,17 @@ class ConfigLoader:
if analyzer_str not in ANALYZER_MAP:
raise ConfigurationError(f"Unknown analyzer: {analyzer_str}")
+ # Parse language field mapping if present
+ language_field_mapping = index_data.get("language_field_mapping")
+
return IndexConfig(
name=index_data["name"],
label=index_data.get("label", index_data["name"]),
fields=index_data["fields"],
analyzer=ANALYZER_MAP[analyzer_str],
boost=index_data.get("boost", 1.0),
- example=index_data.get("example")
+ example=index_data.get("example"),
+ language_field_mapping=language_field_mapping
)
def validate_config(self, config: CustomerConfig) -> List[str]:
@@ -360,11 +369,16 @@ class ConfigLoader:
def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
"""Convert IndexConfig to dictionary."""
- return {
+ result = {
"name": index.name,
"label": index.label,
"fields": index.fields,
"analyzer": index.analyzer.value,
"boost": index.boost,
"example": index.example
- }
\ No newline at end of file
+ }
+
+ if index.language_field_mapping:
+ result["language_field_mapping"] = index.language_field_mapping
+
+ return result
\ No newline at end of file
diff --git a/config/schema/customer1_config.yaml b/config/schema/customer1_config.yaml
index bfe2e53..84e9ba1 100644
--- a/config/schema/customer1_config.yaml
+++ b/config/schema/customer1_config.yaml
@@ -177,6 +177,15 @@ indexes:
analyzer: "chinese_ecommerce"
boost: 1.0
example: 'query=default:"消防套"'
+ language_field_mapping:
+ zh:
+ - "name"
+ - "categoryName"
+ - "brandName"
+ en:
+ - "enSpuName"
+ ru:
+ - "ruSkuName"
- name: "title"
label: "标题索引"
@@ -187,6 +196,13 @@ indexes:
analyzer: "chinese_ecommerce"
boost: 2.0
example: 'query=title:"芭比娃娃"'
+ language_field_mapping:
+ zh:
+ - "name"
+ en:
+ - "enSpuName"
+ ru:
+ - "ruSkuName"
- name: "category"
label: "类目索引"
diff --git a/search/searcher.py b/search/searcher.py
index a7088ec..0a798ed 100644
--- a/search/searcher.py
+++ b/search/searcher.py
@@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery
from indexer import MappingGenerator
from .boolean_parser import BooleanParser, QueryNode
from .es_query_builder import ESQueryBuilder
+from .multilang_query_builder import MultiLanguageQueryBuilder
from .ranking_engine import RankingEngine
@@ -86,10 +87,10 @@ class Searcher:
self.text_embedding_field = mapping_gen.get_text_embedding_field()
self.image_embedding_field = mapping_gen.get_image_embedding_field()
- # Query builder
- self.query_builder = ESQueryBuilder(
+ # Query builder - use multi-language version
+ self.query_builder = MultiLanguageQueryBuilder(
+ config=config,
index_name=config.es_index_name,
- match_fields=self.match_fields,
text_embedding_field=self.text_embedding_field,
image_embedding_field=self.image_embedding_field
)
@@ -144,11 +145,10 @@ class Searcher:
query_text = parsed_query.rewritten_query
print(f"[Searcher] Parsed boolean expression: {query_node}")
- # Step 3: Build ES query
- es_query = self.query_builder.build_query(
- query_text=query_text,
+ # Step 3: Build ES query using multi-language builder
+ es_query = self.query_builder.build_multilang_query(
+ parsed_query=parsed_query,
query_vector=parsed_query.query_vector if enable_embedding else None,
- query_node=query_node,
filters=filters,
size=size,
from_=from_,
@@ -325,6 +325,15 @@ class Searcher:
query_info={'image_url': image_url, 'search_type': 'image_similarity'}
)
+ def get_domain_summary(self) -> Dict[str, Any]:
+ """
+ Get summary of all configured domains.
+
+ Returns:
+ Dictionary with domain information
+ """
+ return self.query_builder.get_domain_summary()
+
def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
"""
Get single document by ID.