Commit f0d020c3ae325deb3921307bcd2480d628cc5a7a

Authored by tangwang
1 parent 577ec972

多语言查询改为只支持中英文两种,查询结构为 filters and (text_recall or embedding_recall),然后 function_score 支持新鲜度等提权字段

1. 前端传递的过滤条件永远是要起作用的
2. 召回模块包括文本相关性召回(中英文字段都使用)和向量召回,两者相互补充。
3. 套用 function_score 以支持两种打分融合和各种提权字段
4. 只需要build_query 这一层。

实际操作:
1. 架构简化
移除了 MultiLanguageQueryBuilder 层级
只保留单层的 ESQueryBuilder.build_query 方法
2. 查询结构重构
实现了 filters and (text_recall or embedding_recall) 结构:
前端过滤条件:永远起作用(放在 filter 中)
文本召回:同时搜索中英文字段(multi_match 覆盖 title_zh/en, brief_zh/en 等)
向量召回:KNN 查询(独立参数,ES 会自动合并)
Function_score:包装召回部分,支持提权字段配置
3. 文本匹配字段更新
在 DEFAULT_MATCH_FIELDS 中添加了中英文字段:
中文:title_zh, brief_zh, description_zh, vendor_zh, category_path_zh, category_name_zh
英文:title_en, brief_en, description_en, vendor_en, category_path_en, category_name_en
语言无关:tags
4. Function_score 框架保留
保留了 function_score 配置框架(FUNCTION_SCORE_CONFIG)
支持 filter_weight、field_value_factor、decay 等提权函数
可以从配置中扩展提权字段
5. 测试验证
所有功能测试通过:
基本文本搜索
带过滤条件的搜索
范围过滤
分面搜索
英文查询
search/es_query_builder.py
@@ -2,11 +2,16 @@ @@ -2,11 +2,16 @@
2 Elasticsearch query builder. 2 Elasticsearch query builder.
3 3
4 Converts parsed queries and search parameters into ES DSL queries. 4 Converts parsed queries and search parameters into ES DSL queries.
  5 +
  6 +Simplified architecture:
  7 +- filters and (text_recall or embedding_recall)
  8 +- function_score wrapper for boosting fields
5 """ 9 """
6 10
7 from typing import Dict, Any, List, Optional, Union 11 from typing import Dict, Any, List, Optional, Union
8 import numpy as np 12 import numpy as np
9 from .boolean_parser import QueryNode 13 from .boolean_parser import QueryNode
  14 +from .query_config import FUNCTION_SCORE_CONFIG
10 15
11 16
12 class ESQueryBuilder: 17 class ESQueryBuilder:
@@ -51,14 +56,20 @@ class ESQueryBuilder: @@ -51,14 +56,20 @@ class ESQueryBuilder:
51 min_score: Optional[float] = None 56 min_score: Optional[float] = None
52 ) -> Dict[str, Any]: 57 ) -> Dict[str, Any]:
53 """ 58 """
54 - Build complete ES query (重构版). 59 + Build complete ES query (简化版).
  60 +
  61 + 结构:filters and (text_recall or embedding_recall)
  62 + - filters: 前端传递的过滤条件永远起作用
  63 + - text_recall: 文本相关性召回(中英文字段都用)
  64 + - embedding_recall: 向量召回(KNN)
  65 + - function_score: 包装召回部分,支持提权字段
55 66
56 Args: 67 Args:
57 query_text: Query text for BM25 matching 68 query_text: Query text for BM25 matching
58 query_vector: Query embedding for KNN search 69 query_vector: Query embedding for KNN search
59 query_node: Parsed boolean expression tree 70 query_node: Parsed boolean expression tree
60 - filters: Exact match filters  
61 - range_filters: Range filters for numeric fields 71 + filters: Exact match filters (always applied)
  72 + range_filters: Range filters for numeric fields (always applied)
62 size: Number of results 73 size: Number of results
63 from_: Offset for pagination 74 from_: Offset for pagination
64 enable_knn: Whether to use KNN search 75 enable_knn: Whether to use KNN search
@@ -80,44 +91,161 @@ class ESQueryBuilder: @@ -80,44 +91,161 @@ class ESQueryBuilder:
80 "includes": self.source_fields 91 "includes": self.source_fields
81 } 92 }
82 93
83 - # Build main query  
84 - if query_node and query_node.operator != 'TERM':  
85 - # Complex boolean query  
86 - query_clause = self._build_boolean_query(query_node)  
87 - else:  
88 - # Simple text query  
89 - query_clause = self._build_text_query(query_text)  
90 -  
91 - # Add filters if provided  
92 - if filters or range_filters:  
93 - filter_clauses = self._build_filters(filters, range_filters) 94 + # 1. Build recall queries (text or embedding)
  95 + recall_clauses = []
  96 +
  97 + # Text recall (always include if query_text exists)
  98 + if query_text:
  99 + if query_node and query_node.operator != 'TERM':
  100 + # Complex boolean query
  101 + text_query = self._build_boolean_query(query_node)
  102 + else:
  103 + # Simple text query
  104 + text_query = self._build_text_query(query_text)
  105 + recall_clauses.append(text_query)
  106 +
  107 + # Embedding recall (KNN - separate from query, handled below)
  108 + has_embedding = enable_knn and query_vector is not None and self.text_embedding_field
  109 +
  110 + # 2. Build filter clauses (always applied)
  111 + filter_clauses = self._build_filters(filters, range_filters)
  112 +
  113 + # 3. Build main query structure: filters and recall
  114 + if recall_clauses:
  115 + # Combine text recalls with OR logic (if multiple)
  116 + if len(recall_clauses) == 1:
  117 + recall_query = recall_clauses[0]
  118 + else:
  119 + recall_query = {
  120 + "bool": {
  121 + "should": recall_clauses,
  122 + "minimum_should_match": 1
  123 + }
  124 + }
  125 +
  126 + # Wrap recall with function_score for boosting
  127 + recall_query = self._wrap_with_function_score(recall_query)
  128 +
  129 + # Combine filters and recall
94 if filter_clauses: 130 if filter_clauses:
95 es_query["query"] = { 131 es_query["query"] = {
96 "bool": { 132 "bool": {
97 - "must": [query_clause], 133 + "must": [recall_query],
98 "filter": filter_clauses 134 "filter": filter_clauses
99 } 135 }
100 } 136 }
101 else: 137 else:
102 - es_query["query"] = query_clause 138 + es_query["query"] = recall_query
103 else: 139 else:
104 - es_query["query"] = query_clause 140 + # No recall queries, only filters (match_all filtered)
  141 + if filter_clauses:
  142 + es_query["query"] = {
  143 + "bool": {
  144 + "must": [{"match_all": {}}],
  145 + "filter": filter_clauses
  146 + }
  147 + }
  148 + else:
  149 + es_query["query"] = {"match_all": {}}
105 150
106 - # Add KNN search if enabled and vector provided  
107 - if enable_knn and query_vector is not None and self.text_embedding_field: 151 + # 4. Add KNN search if enabled (separate from query, ES will combine)
  152 + if has_embedding:
108 knn_clause = { 153 knn_clause = {
109 "field": self.text_embedding_field, 154 "field": self.text_embedding_field,
110 "query_vector": query_vector.tolist(), 155 "query_vector": query_vector.tolist(),
111 "k": knn_k, 156 "k": knn_k,
112 - "num_candidates": knn_num_candidates 157 + "num_candidates": knn_num_candidates,
  158 + "boost": 0.2 # Lower boost for embedding recall
113 } 159 }
114 es_query["knn"] = knn_clause 160 es_query["knn"] = knn_clause
115 161
116 - # Add minimum score filter 162 + # 5. Add minimum score filter
117 if min_score is not None: 163 if min_score is not None:
118 es_query["min_score"] = min_score 164 es_query["min_score"] = min_score
119 165
120 return es_query 166 return es_query
  167 +
  168 + def _wrap_with_function_score(self, query: Dict[str, Any]) -> Dict[str, Any]:
  169 + """
  170 + Wrap query with function_score for boosting fields.
  171 +
  172 + Args:
  173 + query: Base query to wrap
  174 +
  175 + Returns:
  176 + Function score query or original query if no functions configured
  177 + """
  178 + functions = self._build_score_functions()
  179 +
  180 + # If no functions configured, return original query
  181 + if not functions:
  182 + return query
  183 +
  184 + # Build function_score query
  185 + function_score_query = {
  186 + "function_score": {
  187 + "query": query,
  188 + "functions": functions,
  189 + "score_mode": FUNCTION_SCORE_CONFIG.get("score_mode", "sum"),
  190 + "boost_mode": FUNCTION_SCORE_CONFIG.get("boost_mode", "multiply")
  191 + }
  192 + }
  193 +
  194 + return function_score_query
  195 +
  196 + def _build_score_functions(self) -> List[Dict[str, Any]]:
  197 + """
  198 + Build function_score functions from config.
  199 +
  200 + Returns:
  201 + List of function score functions
  202 + """
  203 + functions = []
  204 + config_functions = FUNCTION_SCORE_CONFIG.get("functions", [])
  205 +
  206 + for func_config in config_functions:
  207 + func_type = func_config.get("type")
  208 +
  209 + if func_type == "filter_weight":
  210 + # Filter + Weight
  211 + functions.append({
  212 + "filter": func_config["filter"],
  213 + "weight": func_config.get("weight", 1.0)
  214 + })
  215 +
  216 + elif func_type == "field_value_factor":
  217 + # Field Value Factor
  218 + functions.append({
  219 + "field_value_factor": {
  220 + "field": func_config["field"],
  221 + "factor": func_config.get("factor", 1.0),
  222 + "modifier": func_config.get("modifier", "none"),
  223 + "missing": func_config.get("missing", 1.0)
  224 + }
  225 + })
  226 +
  227 + elif func_type == "decay":
  228 + # Decay Function (gauss/exp/linear)
  229 + decay_func = func_config.get("function", "gauss")
  230 + field = func_config["field"]
  231 +
  232 + decay_params = {
  233 + "origin": func_config.get("origin", "now"),
  234 + "scale": func_config["scale"]
  235 + }
  236 +
  237 + if "offset" in func_config:
  238 + decay_params["offset"] = func_config["offset"]
  239 + if "decay" in func_config:
  240 + decay_params["decay"] = func_config["decay"]
  241 +
  242 + functions.append({
  243 + decay_func: {
  244 + field: decay_params
  245 + }
  246 + })
  247 +
  248 + return functions
121 249
122 def _build_text_query(self, query_text: str) -> Dict[str, Any]: 250 def _build_text_query(self, query_text: str) -> Dict[str, Any]:
123 """ 251 """
@@ -235,11 +363,19 @@ class ESQueryBuilder: @@ -235,11 +363,19 @@ class ESQueryBuilder:
235 "term": {field: value} 363 "term": {field: value}
236 }) 364 })
237 365
238 - # 2. 处理范围过滤(RangeFilter Pydantic 模型 366 + # 2. 处理范围过滤(支持 RangeFilter Pydantic 模型或字典
239 if range_filters: 367 if range_filters:
240 for field, range_filter in range_filters.items(): 368 for field, range_filter in range_filters.items():
241 - # 将 RangeFilter 模型转换为字典  
242 - range_dict = range_filter.model_dump(exclude_none=True) 369 + # 支持 Pydantic 模型或字典格式
  370 + if hasattr(range_filter, 'model_dump'):
  371 + # Pydantic 模型
  372 + range_dict = range_filter.model_dump(exclude_none=True)
  373 + elif isinstance(range_filter, dict):
  374 + # 已经是字典格式
  375 + range_dict = {k: v for k, v in range_filter.items() if v is not None}
  376 + else:
  377 + # 其他格式,跳过
  378 + continue
243 379
244 if range_dict: 380 if range_dict:
245 filter_clauses.append({ 381 filter_clauses.append({
search/multilang_query_builder.py deleted
@@ -1,459 +0,0 @@ @@ -1,459 +0,0 @@
1 -"""  
2 -Multi-language query builder for handling domain-specific searches.  
3 -  
4 -This module extends the ESQueryBuilder to support multi-language field mappings,  
5 -allowing queries to be routed to appropriate language-specific fields while  
6 -maintaining a unified external interface.  
7 -"""  
8 -  
9 -from typing import Dict, Any, List, Optional  
10 -import numpy as np  
11 -import logging  
12 -import re  
13 -  
14 -from query import ParsedQuery  
15 -from .es_query_builder import ESQueryBuilder  
16 -from .query_config import DEFAULT_MATCH_FIELDS, DOMAIN_FIELDS, FUNCTION_SCORE_CONFIG  
17 -  
18 -logger = logging.getLogger(__name__)  
19 -  
20 -  
21 -class MultiLanguageQueryBuilder(ESQueryBuilder):  
22 - """  
23 - Enhanced query builder with multi-language support.  
24 -  
25 - Handles routing queries to appropriate language-specific fields based on:  
26 - 1. Detected query language  
27 - 2. Available translations  
28 - 3. Domain configuration (language_field_mapping)  
29 - """  
30 -  
31 - def __init__(  
32 - self,  
33 - index_name: str,  
34 - match_fields: Optional[List[str]] = None,  
35 - text_embedding_field: Optional[str] = None,  
36 - image_embedding_field: Optional[str] = None,  
37 - source_fields: Optional[List[str]] = None  
38 - ):  
39 - """  
40 - Initialize multi-language query builder.  
41 -  
42 - Args:  
43 - index_name: ES index name  
44 - match_fields: Fields to search for text matching (default: from query_config)  
45 - text_embedding_field: Field name for text embeddings  
46 - image_embedding_field: Field name for image embeddings  
47 - source_fields: Fields to return in search results (_source includes)  
48 - """  
49 - self.function_score_config = FUNCTION_SCORE_CONFIG  
50 -  
51 - # Use provided match_fields or default  
52 - if match_fields is None:  
53 - match_fields = DEFAULT_MATCH_FIELDS  
54 -  
55 - super().__init__(  
56 - index_name=index_name,  
57 - match_fields=match_fields,  
58 - text_embedding_field=text_embedding_field,  
59 - image_embedding_field=image_embedding_field,  
60 - source_fields=source_fields  
61 - )  
62 -  
63 - # Build domain configurations from query_config  
64 - self.domain_configs = DOMAIN_FIELDS  
65 -  
66 - def _get_domain_fields(self, domain_name: str) -> List[str]:  
67 - """Get fields for a specific domain with boost notation."""  
68 - return self.domain_configs.get(domain_name, DEFAULT_MATCH_FIELDS)  
69 -  
70 - def build_multilang_query(  
71 - self,  
72 - parsed_query: ParsedQuery,  
73 - query_vector: Optional[np.ndarray] = None,  
74 - query_node: Optional[Any] = None,  
75 - filters: Optional[Dict[str, Any]] = None,  
76 - range_filters: Optional[Dict[str, Any]] = None,  
77 - size: int = 10,  
78 - from_: int = 0,  
79 - enable_knn: bool = True,  
80 - knn_k: int = 50,  
81 - knn_num_candidates: int = 200,  
82 - min_score: Optional[float] = None  
83 - ) -> Dict[str, Any]:  
84 - """  
85 - Build ES query with multi-language support (简化版).  
86 -  
87 - Args:  
88 - parsed_query: Parsed query with language info and translations  
89 - query_vector: Query embedding for KNN search  
90 - filters: Exact match filters  
91 - range_filters: Range filters for numeric fields  
92 - size: Number of results  
93 - from_: Offset for pagination  
94 - enable_knn: Whether to use KNN search  
95 - knn_k: K value for KNN  
96 - knn_num_candidates: Number of candidates for KNN  
97 - min_score: Minimum score threshold  
98 -  
99 - Returns:  
100 - ES query DSL dictionary  
101 - """  
102 - # 1. 根据域选择匹配字段(默认域使用 DEFAULT_MATCH_FIELDS)  
103 - domain = parsed_query.domain or "default"  
104 - domain_fields = self.domain_configs.get(domain) or DEFAULT_MATCH_FIELDS  
105 -  
106 - # 2. 临时切换 match_fields,复用基类 build_query 逻辑  
107 - original_match_fields = self.match_fields  
108 - self.match_fields = domain_fields  
109 - try:  
110 - return super().build_query(  
111 - query_text=parsed_query.rewritten_query or parsed_query.normalized_query,  
112 - query_vector=query_vector,  
113 - query_node=query_node,  
114 - filters=filters,  
115 - range_filters=range_filters,  
116 - size=size,  
117 - from_=from_,  
118 - enable_knn=enable_knn,  
119 - knn_k=knn_k,  
120 - knn_num_candidates=knn_num_candidates,  
121 - min_score=min_score  
122 - )  
123 - finally:  
124 - # 恢复原始配置,避免影响后续查询  
125 - self.match_fields = original_match_fields  
126 -  
127 - def _build_score_functions(self) -> List[Dict[str, Any]]:  
128 - """  
129 - 从配置构建 function_score 的打分函数列表  
130 -  
131 - Returns:  
132 - 打分函数列表(ES原生格式)  
133 - """  
134 - if not self.function_score_config or not self.function_score_config.functions:  
135 - return []  
136 -  
137 - functions = []  
138 -  
139 - for func_config in self.function_score_config.functions:  
140 - func_type = func_config.get('type')  
141 -  
142 - if func_type == 'filter_weight':  
143 - # Filter + Weight  
144 - functions.append({  
145 - "filter": func_config['filter'],  
146 - "weight": func_config.get('weight', 1.0)  
147 - })  
148 -  
149 - elif func_type == 'field_value_factor':  
150 - # Field Value Factor  
151 - functions.append({  
152 - "field_value_factor": {  
153 - "field": func_config['field'],  
154 - "factor": func_config.get('factor', 1.0),  
155 - "modifier": func_config.get('modifier', 'none'),  
156 - "missing": func_config.get('missing', 1.0)  
157 - }  
158 - })  
159 -  
160 - elif func_type == 'decay':  
161 - # Decay Function (gauss/exp/linear)  
162 - decay_func = func_config.get('function', 'gauss')  
163 - field = func_config['field']  
164 -  
165 - decay_params = {  
166 - "origin": func_config.get('origin', 'now'),  
167 - "scale": func_config['scale']  
168 - }  
169 -  
170 - if 'offset' in func_config:  
171 - decay_params['offset'] = func_config['offset']  
172 - if 'decay' in func_config:  
173 - decay_params['decay'] = func_config['decay']  
174 -  
175 - functions.append({  
176 - decay_func: {  
177 - field: decay_params  
178 - }  
179 - })  
180 -  
181 - return functions  
182 -  
183 - def _build_multilang_text_query(  
184 - self,  
185 - parsed_query: ParsedQuery,  
186 - domain_config: Dict[str, Any]  
187 - ) -> Dict[str, Any]:  
188 - """  
189 - Build text query with multi-language field routing.  
190 -  
191 - Args:  
192 - parsed_query: Parsed query with language info  
193 - domain_config: Domain configuration  
194 -  
195 - Returns:  
196 - ES query clause  
197 - """  
198 - if not domain_config.language_field_mapping:  
199 - # No multi-language mapping, use all fields with default analyzer  
200 - fields_with_boost = []  
201 - for field_name in domain_config.fields:  
202 - field = self._get_field_by_name(field_name)  
203 - if field and field.boost != 1.0:  
204 - fields_with_boost.append(f"{field_name}^{field.boost}")  
205 - else:  
206 - fields_with_boost.append(field_name)  
207 -  
208 - return {  
209 - "multi_match": {  
210 - "query": parsed_query.rewritten_query,  
211 - "fields": fields_with_boost,  
212 - "minimum_should_match": "67%",  
213 - "tie_breaker": 0.9,  
214 - "boost": domain_config.boost,  
215 - "_name": f"{domain_config.name}_query"  
216 - }  
217 - }  
218 -  
219 - # Multi-language mapping exists - build targeted queries  
220 - should_clauses = []  
221 - available_languages = set(domain_config.language_field_mapping.keys())  
222 -  
223 - # 1. Query in detected language (if it exists in mapping)  
224 - detected_lang = parsed_query.detected_language  
225 - if detected_lang in available_languages:  
226 - target_fields = domain_config.language_field_mapping[detected_lang]  
227 - fields_with_boost = self._apply_field_boosts(target_fields)  
228 -  
229 - should_clauses.append({  
230 - "multi_match": {  
231 - "query": parsed_query.rewritten_query,  
232 - "fields": fields_with_boost,  
233 - "minimum_should_match": "67%",  
234 - "tie_breaker": 0.9,  
235 - "boost": domain_config.boost * 1.5, # Higher boost for detected language  
236 - "_name": f"{domain_config.name}_{detected_lang}_query"  
237 - }  
238 - })  
239 - logger.debug(f"Added query for detected language '{detected_lang}'")  
240 -  
241 - # 2. Query in translated languages (only for languages in mapping)  
242 - for lang, translation in parsed_query.translations.items():  
243 - # Only use translations for languages that exist in the mapping  
244 - if lang in available_languages and translation and translation.strip():  
245 - target_fields = domain_config.language_field_mapping[lang]  
246 - fields_with_boost = self._apply_field_boosts(target_fields)  
247 -  
248 - should_clauses.append({  
249 - "multi_match": {  
250 - "query": translation,  
251 - "fields": fields_with_boost,  
252 - "minimum_should_match": "67%",  
253 - "tie_breaker": 0.9,  
254 - "boost": domain_config.boost,  
255 - "_name": f"{domain_config.name}_{lang}_translated_query"  
256 - }  
257 - })  
258 - logger.debug(f"Added translated query for language '{lang}'")  
259 -  
260 - # 3. Fallback: query all fields in mapping if no language-specific query was built  
261 - if not should_clauses:  
262 - logger.debug("No language mapping matched, using all fields from mapping")  
263 - # Use all fields from all languages in the mapping  
264 - all_mapped_fields = []  
265 - for lang_fields in domain_config.language_field_mapping.values():  
266 - all_mapped_fields.extend(lang_fields)  
267 - # Remove duplicates while preserving order  
268 - unique_fields = list(dict.fromkeys(all_mapped_fields))  
269 - fields_with_boost = self._apply_field_boosts(unique_fields)  
270 -  
271 - should_clauses.append({  
272 - "multi_match": {  
273 - "query": parsed_query.rewritten_query,  
274 - "fields": fields_with_boost,  
275 - "minimum_should_match": "67%",  
276 - "tie_breaker": 0.9,  
277 - "boost": domain_config.boost * 0.8, # Lower boost for fallback  
278 - "_name": f"{domain_config.name}_fallback_query"  
279 - }  
280 - })  
281 -  
282 - if len(should_clauses) == 1:  
283 - return should_clauses[0]  
284 - else:  
285 - return {  
286 - "bool": {  
287 - "should": should_clauses,  
288 - "minimum_should_match": 1  
289 - }  
290 - }  
291 -  
292 - def _apply_field_boosts(self, field_names: List[str]) -> List[str]:  
293 - """Apply boost values to field names."""  
294 - result = []  
295 - for field_name in field_names:  
296 - field = self._get_field_by_name(field_name)  
297 - if field and field.boost != 1.0:  
298 - result.append(f"{field_name}^{field.boost}")  
299 - else:  
300 - result.append(field_name)  
301 - return result  
302 -  
303 - def _build_boolean_query_from_tuple(self, node) -> Dict[str, Any]:  
304 - """  
305 - Build query from boolean expression tuple.  
306 -  
307 - Args:  
308 - node: Boolean expression tuple (operator, terms...)  
309 -  
310 - Returns:  
311 - ES query clause  
312 - """  
313 - if not node:  
314 - return {"match_all": {}}  
315 -  
316 - # Handle different node types from boolean parser  
317 - if hasattr(node, 'operator'):  
318 - # QueryNode object  
319 - operator = node.operator  
320 - terms = node.terms if hasattr(node, 'terms') else None  
321 -  
322 - # For TERM nodes, check if there's a value  
323 - if operator == 'TERM' and hasattr(node, 'value') and node.value:  
324 - terms = node.value  
325 - elif isinstance(node, tuple) and len(node) > 0:  
326 - # Tuple format from boolean parser  
327 - if hasattr(node[0], 'operator'):  
328 - # Nested tuple with QueryNode  
329 - operator = node[0].operator  
330 - terms = node[0].terms  
331 - elif isinstance(node[0], str):  
332 - # Simple tuple like ('TERM', 'field:value')  
333 - operator = node[0]  
334 - terms = node[1] if len(node) > 1 else ''  
335 - else:  
336 - # Complex tuple like (OR( TERM(...), TERM(...) ), score)  
337 - if hasattr(node[0], '__class__') and hasattr(node[0], '__name__'):  
338 - # Constructor call like OR(...)  
339 - operator = node[0].__name__  
340 - elif str(node[0]).startswith('('):  
341 - # String representation of constructor call  
342 - match = re.match(r'(\w+)\(', str(node[0]))  
343 - if match:  
344 - operator = match.group(1)  
345 - else:  
346 - return {"match_all": {}}  
347 - else:  
348 - operator = str(node[0])  
349 -  
350 - # Extract terms from nested structure  
351 - terms = []  
352 - if len(node) > 1 and isinstance(node[1], tuple):  
353 - terms = node[1]  
354 - else:  
355 - return {"match_all": {}}  
356 -  
357 -  
358 - if operator == 'TERM':  
359 - # Leaf node - handle field:query format  
360 - if isinstance(terms, str) and ':' in terms:  
361 - field, value = terms.split(':', 1)  
362 - return {  
363 - "term": {  
364 - field: value  
365 - }  
366 - }  
367 - elif isinstance(terms, str):  
368 - # Simple text term - create match query  
369 - return {  
370 - "multi_match": {  
371 - "query": terms,  
372 - "fields": self.match_fields,  
373 - "type": "best_fields",  
374 - "operator": "AND"  
375 - }  
376 - }  
377 - else:  
378 - # Invalid TERM node - return empty match  
379 - return {  
380 - "match_none": {}  
381 - }  
382 -  
383 - elif operator == 'OR':  
384 - # Any term must match  
385 - should_clauses = []  
386 - if terms:  
387 - for term in terms:  
388 - clause = self._build_boolean_query_from_tuple(term)  
389 - if clause and clause.get("match_none") is None:  
390 - should_clauses.append(clause)  
391 -  
392 - if should_clauses:  
393 - return {  
394 - "bool": {  
395 - "should": should_clauses,  
396 - "minimum_should_match": 1  
397 - }  
398 - }  
399 - else:  
400 - return {"match_none": {}}  
401 -  
402 - elif operator == 'AND':  
403 - # All terms must match  
404 - must_clauses = []  
405 - if terms:  
406 - for term in terms:  
407 - clause = self._build_boolean_query_from_tuple(term)  
408 - if clause and clause.get("match_none") is None:  
409 - must_clauses.append(clause)  
410 -  
411 - if must_clauses:  
412 - return {  
413 - "bool": {  
414 - "must": must_clauses  
415 - }  
416 - }  
417 - else:  
418 - return {"match_none": {}}  
419 -  
420 - elif operator == 'ANDNOT':  
421 - # First term must match, second must not  
422 - if len(terms) >= 2:  
423 - return {  
424 - "bool": {  
425 - "must": [self._build_boolean_query_from_tuple(terms[0])],  
426 - "must_not": [self._build_boolean_query_from_tuple(terms[1])]  
427 - }  
428 - }  
429 - else:  
430 - return self._build_boolean_query_from_tuple(terms[0])  
431 -  
432 - elif operator == 'RANK':  
433 - # Like OR but for ranking (all terms contribute to score)  
434 - should_clauses = []  
435 - for term in terms:  
436 - should_clauses.append(self._build_boolean_query_from_tuple(term))  
437 - return {  
438 - "bool": {  
439 - "should": should_clauses  
440 - }  
441 - }  
442 -  
443 - else:  
444 - # Unknown operator  
445 - return {"match_all": {}}  
446 -  
447 - def get_domain_summary(self) -> Dict[str, Any]:  
448 - """Get summary of all configured domains."""  
449 - summary = {}  
450 - for domain_name, domain_config in self.domain_configs.items():  
451 - summary[domain_name] = {  
452 - "label": domain_config.label,  
453 - "fields": domain_config.fields,  
454 - "analyzer": domain_config.analyzer.value,  
455 - "boost": domain_config.boost,  
456 - "has_multilang_mapping": domain_config.language_field_mapping is not None,  
457 - "supported_languages": list(domain_config.language_field_mapping.keys()) if domain_config.language_field_mapping else []  
458 - }  
459 - return summary  
460 \ No newline at end of file 0 \ No newline at end of file
search/query_config.py
@@ -17,14 +17,24 @@ TEXT_EMBEDDING_FIELD = "title_embedding" @@ -17,14 +17,24 @@ TEXT_EMBEDDING_FIELD = "title_embedding"
17 IMAGE_EMBEDDING_FIELD = "image_embedding" 17 IMAGE_EMBEDDING_FIELD = "image_embedding"
18 18
19 # Default match fields for text search (with boost) 19 # Default match fields for text search (with boost)
  20 +# 文本召回:同时搜索中英文字段,两者相互补充
20 DEFAULT_MATCH_FIELDS = [ 21 DEFAULT_MATCH_FIELDS = [
  22 + # 中文字段
21 "title_zh^3.0", 23 "title_zh^3.0",
22 "brief_zh^1.5", 24 "brief_zh^1.5",
23 "description_zh^1.0", 25 "description_zh^1.0",
24 "vendor_zh^1.5", 26 "vendor_zh^1.5",
25 - "tags^1.0",  
26 "category_path_zh^1.5", 27 "category_path_zh^1.5",
27 - "category_name_zh^1.5" 28 + "category_name_zh^1.5",
  29 + # 英文字段
  30 + "title_en^3.0",
  31 + "brief_en^1.5",
  32 + "description_en^1.0",
  33 + "vendor_en^1.5",
  34 + "category_path_en^1.5",
  35 + "category_name_en^1.5",
  36 + # 语言无关字段
  37 + "tags^1.0",
28 ] 38 ]
29 39
30 # Domain-specific match fields 40 # Domain-specific match fields
search/searcher.py
@@ -13,7 +13,6 @@ from query import QueryParser, ParsedQuery @@ -13,7 +13,6 @@ from query import QueryParser, ParsedQuery
13 from embeddings import CLIPImageEncoder 13 from embeddings import CLIPImageEncoder
14 from .boolean_parser import BooleanParser, QueryNode 14 from .boolean_parser import BooleanParser, QueryNode
15 from .es_query_builder import ESQueryBuilder 15 from .es_query_builder import ESQueryBuilder
16 -from .multilang_query_builder import MultiLanguageQueryBuilder  
17 from .rerank_engine import RerankEngine 16 from .rerank_engine import RerankEngine
18 from .query_config import ( 17 from .query_config import (
19 DEFAULT_INDEX_NAME, 18 DEFAULT_INDEX_NAME,
@@ -112,8 +111,8 @@ class Searcher: @@ -112,8 +111,8 @@ class Searcher:
112 self.text_embedding_field = TEXT_EMBEDDING_FIELD 111 self.text_embedding_field = TEXT_EMBEDDING_FIELD
113 self.image_embedding_field = IMAGE_EMBEDDING_FIELD 112 self.image_embedding_field = IMAGE_EMBEDDING_FIELD
114 113
115 - # Query builder - use multi-language version  
116 - self.query_builder = MultiLanguageQueryBuilder( 114 + # Query builder - simplified single-layer architecture
  115 + self.query_builder = ESQueryBuilder(
117 index_name=index_name, 116 index_name=index_name,
118 match_fields=self.match_fields, 117 match_fields=self.match_fields,
119 text_embedding_field=self.text_embedding_field, 118 text_embedding_field=self.text_embedding_field,
@@ -274,8 +273,8 @@ class Searcher: @@ -274,8 +273,8 @@ class Searcher:
274 filters = {} 273 filters = {}
275 filters['tenant_id'] = tenant_id 274 filters['tenant_id'] = tenant_id
276 275
277 - es_query = self.query_builder.build_multilang_query(  
278 - parsed_query=parsed_query, 276 + es_query = self.query_builder.build_query(
  277 + query_text=parsed_query.rewritten_query or parsed_query.normalized_query,
279 query_vector=parsed_query.query_vector if enable_embedding else None, 278 query_vector=parsed_query.query_vector if enable_embedding else None,
280 query_node=query_node, 279 query_node=query_node,
281 filters=filters, 280 filters=filters,