b926f678
tangwang
多语言查询
|
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
|
bm25打分(base_query):
"multi_match": {
"query": search_query,
diff --git a/config/config_loader.py b/config/config_loader.py
index 8df15b3..f3fcaa3 100644
--- a/config/config_loader.py
+++ b/config/config_loader.py
@@ -27,6 +27,9 @@ class IndexConfig:
boost: float = 1.0
example: Optional[str] = None
+ # Optional multi-language field mapping (language code -> list of field names), e.g. {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]}
+ language_field_mapping: Optional[Dict[str, List[str]]] = None
+
@dataclass
class RankingConfig:
@@ -66,8 +69,6 @@ class CustomerConfig:
# Database settings
mysql_config: Dict[str, Any]
- main_table: str = "shoplazza_product_sku"
- extension_table: Optional[str] = None
# Field definitions
fields: List[FieldConfig]
@@ -86,6 +87,10 @@ class CustomerConfig:
# ES index settings
es_index_name: str
+
+ # Optional fields with defaults
+ main_table: str = "shoplazza_product_sku"
+ extension_table: Optional[str] = None
es_settings: Dict[str, Any] = field(default_factory=dict)
@@ -228,13 +233,17 @@ class ConfigLoader:
if analyzer_str not in ANALYZER_MAP:
raise ConfigurationError(f"Unknown analyzer: {analyzer_str}")
+ # Read the optional language field mapping as-is (None when the key is absent)
+ language_field_mapping = index_data.get("language_field_mapping")
+
return IndexConfig(
name=index_data["name"],
label=index_data.get("label", index_data["name"]),
fields=index_data["fields"],
analyzer=ANALYZER_MAP[analyzer_str],
boost=index_data.get("boost", 1.0),
- example=index_data.get("example")
+ example=index_data.get("example"),
+ language_field_mapping=language_field_mapping
)
def validate_config(self, config: CustomerConfig) -> List[str]:
@@ -360,11 +369,16 @@ class ConfigLoader:
def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
"""Convert IndexConfig to dictionary."""
- return {
+ result = {
"name": index.name,
"label": index.label,
"fields": index.fields,
"analyzer": index.analyzer.value,
"boost": index.boost,
"example": index.example
- }
\ No newline at end of file
+ }
+
+ if index.language_field_mapping:
+ result["language_field_mapping"] = index.language_field_mapping
+
+ return result
\ No newline at end of file
diff --git a/config/schema/customer1_config.yaml b/config/schema/customer1_config.yaml
index bfe2e53..84e9ba1 100644
--- a/config/schema/customer1_config.yaml
+++ b/config/schema/customer1_config.yaml
@@ -177,6 +177,15 @@ indexes:
analyzer: "chinese_ecommerce"
boost: 1.0
example: 'query=default:"消防套"'
+ language_field_mapping:
+ zh:
+ - "name"
+ - "categoryName"
+ - "brandName"
+ en:
+ - "enSpuName"
+ ru:
+ - "ruSkuName"
- name: "title"
label: "标题索引"
@@ -187,6 +196,13 @@ indexes:
analyzer: "chinese_ecommerce"
boost: 2.0
example: 'query=title:"芭比娃娃"'
+ language_field_mapping:
+ zh:
+ - "name"
+ en:
+ - "enSpuName"
+ ru:
+ - "ruSkuName"
- name: "category"
label: "类目索引"
diff --git a/search/searcher.py b/search/searcher.py
index a7088ec..0a798ed 100644
--- a/search/searcher.py
+++ b/search/searcher.py
@@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery
from indexer import MappingGenerator
from .boolean_parser import BooleanParser, QueryNode
from .es_query_builder import ESQueryBuilder
+from .multilang_query_builder import MultiLanguageQueryBuilder
from .ranking_engine import RankingEngine
@@ -86,10 +87,10 @@ class Searcher:
self.text_embedding_field = mapping_gen.get_text_embedding_field()
self.image_embedding_field = mapping_gen.get_image_embedding_field()
- # Query builder
- self.query_builder = ESQueryBuilder(
+ # Query builder - use multi-language version
+ self.query_builder = MultiLanguageQueryBuilder(
+ config=config,
index_name=config.es_index_name,
- match_fields=self.match_fields,
text_embedding_field=self.text_embedding_field,
image_embedding_field=self.image_embedding_field
)
@@ -144,11 +145,10 @@ class Searcher:
query_text = parsed_query.rewritten_query
print(f"[Searcher] Parsed boolean expression: {query_node}")
- # Step 3: Build ES query
- es_query = self.query_builder.build_query(
- query_text=query_text,
+ # Step 3: Build ES query using multi-language builder
+ es_query = self.query_builder.build_multilang_query(
+ parsed_query=parsed_query,
query_vector=parsed_query.query_vector if enable_embedding else None,
- query_node=query_node,
filters=filters,
size=size,
from_=from_,
@@ -325,6 +325,15 @@ class Searcher:
query_info={'image_url': image_url, 'search_type': 'image_similarity'}
)
+ def get_domain_summary(self) -> Dict[str, Any]:
+ """
+ Get summary of all configured domains.
+
+ Returns:
+ Dictionary with domain information
+ """
+ return self.query_builder.get_domain_summary()
+
def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
"""
Get single document by ID.
|