Commit f0d020c3ae325deb3921307bcd2480d628cc5a7a

Authored by tangwang
1 parent 577ec972

多语言查询改为只支持中英文两种;查询结构为 filters and (text_recall or embedding_recall);function_score 支持新鲜度等提权字段

1. 前端传递的过滤条件必须始终生效。
2. 召回模块包括文本相关性召回(中英文字段都参与)和向量召回,两者相互补充。
3. 套用 function_score,以支持两种打分的融合和各种提权字段。
4. 只需要 build_query 这一层。

实际操作:
1. 架构简化
移除了 MultiLanguageQueryBuilder 层级
只保留单层的 ESQueryBuilder.build_query 方法
2. 查询结构重构
实现了 filters and (text_recall or embedding_recall) 结构:
前端过滤条件:永远起作用(放在 filter 中)
文本召回:同时搜索中英文字段(multi_match 覆盖 title_zh/en, brief_zh/en 等)
向量召回:KNN 查询(作为请求顶层的独立 knn 参数传入,由 ES 将其结果与 query 的结果合并)
Function_score:包装召回部分,支持提权字段配置
3. 文本匹配字段更新
DEFAULT_MATCH_FIELDS 现同时覆盖中英文字段(中文字段原已存在,本次新增英文字段,并将 tags 移至列表末尾):
中文:title_zh, brief_zh, description_zh, vendor_zh, category_path_zh, category_name_zh
英文:title_en, brief_en, description_en, vendor_en, category_path_en, category_name_en
语言无关:tags
4. Function_score 框架保留
保留了 function_score 配置框架(FUNCTION_SCORE_CONFIG)
支持 filter_weight、field_value_factor、decay 等提权函数
可以从配置中扩展提权字段
5. 测试验证
所有功能测试通过:
基本文本搜索
带过滤条件的搜索
范围过滤
分面搜索
英文查询
search/es_query_builder.py
... ... @@ -2,11 +2,16 @@
2 2 Elasticsearch query builder.
3 3  
4 4 Converts parsed queries and search parameters into ES DSL queries.
  5 +
  6 +Simplified architecture:
  7 +- filters and (text_recall or embedding_recall)
  8 +- function_score wrapper for boosting fields
5 9 """
6 10  
7 11 from typing import Dict, Any, List, Optional, Union
8 12 import numpy as np
9 13 from .boolean_parser import QueryNode
  14 +from .query_config import FUNCTION_SCORE_CONFIG
10 15  
11 16  
12 17 class ESQueryBuilder:
... ... @@ -51,14 +56,20 @@ class ESQueryBuilder:
51 56 min_score: Optional[float] = None
52 57 ) -> Dict[str, Any]:
53 58 """
54   - Build complete ES query (重构版).
  59 + Build complete ES query (简化版).
  60 +
  61 + 结构:filters and (text_recall or embedding_recall)
  62 + - filters: 前端传递的过滤条件永远起作用
  63 + - text_recall: 文本相关性召回(中英文字段都用)
  64 + - embedding_recall: 向量召回(KNN)
  65 + - function_score: 包装召回部分,支持提权字段
55 66  
56 67 Args:
57 68 query_text: Query text for BM25 matching
58 69 query_vector: Query embedding for KNN search
59 70 query_node: Parsed boolean expression tree
60   - filters: Exact match filters
61   - range_filters: Range filters for numeric fields
  71 + filters: Exact match filters (always applied)
  72 + range_filters: Range filters for numeric fields (always applied)
62 73 size: Number of results
63 74 from_: Offset for pagination
64 75 enable_knn: Whether to use KNN search
... ... @@ -80,44 +91,161 @@ class ESQueryBuilder:
80 91 "includes": self.source_fields
81 92 }
82 93  
83   - # Build main query
84   - if query_node and query_node.operator != 'TERM':
85   - # Complex boolean query
86   - query_clause = self._build_boolean_query(query_node)
87   - else:
88   - # Simple text query
89   - query_clause = self._build_text_query(query_text)
90   -
91   - # Add filters if provided
92   - if filters or range_filters:
93   - filter_clauses = self._build_filters(filters, range_filters)
  94 + # 1. Build recall queries (text or embedding)
  95 + recall_clauses = []
  96 +
  97 + # Text recall (always include if query_text exists)
  98 + if query_text:
  99 + if query_node and query_node.operator != 'TERM':
  100 + # Complex boolean query
  101 + text_query = self._build_boolean_query(query_node)
  102 + else:
  103 + # Simple text query
  104 + text_query = self._build_text_query(query_text)
  105 + recall_clauses.append(text_query)
  106 +
  107 + # Embedding recall (KNN - separate from query, handled below)
  108 + has_embedding = enable_knn and query_vector is not None and self.text_embedding_field
  109 +
  110 + # 2. Build filter clauses (always applied)
  111 + filter_clauses = self._build_filters(filters, range_filters)
  112 +
  113 + # 3. Build main query structure: filters and recall
  114 + if recall_clauses:
  115 + # Combine text recalls with OR logic (if multiple)
  116 + if len(recall_clauses) == 1:
  117 + recall_query = recall_clauses[0]
  118 + else:
  119 + recall_query = {
  120 + "bool": {
  121 + "should": recall_clauses,
  122 + "minimum_should_match": 1
  123 + }
  124 + }
  125 +
  126 + # Wrap recall with function_score for boosting
  127 + recall_query = self._wrap_with_function_score(recall_query)
  128 +
  129 + # Combine filters and recall
94 130 if filter_clauses:
95 131 es_query["query"] = {
96 132 "bool": {
97   - "must": [query_clause],
  133 + "must": [recall_query],
98 134 "filter": filter_clauses
99 135 }
100 136 }
101 137 else:
102   - es_query["query"] = query_clause
  138 + es_query["query"] = recall_query
103 139 else:
104   - es_query["query"] = query_clause
  140 + # No recall queries, only filters (match_all filtered)
  141 + if filter_clauses:
  142 + es_query["query"] = {
  143 + "bool": {
  144 + "must": [{"match_all": {}}],
  145 + "filter": filter_clauses
  146 + }
  147 + }
  148 + else:
  149 + es_query["query"] = {"match_all": {}}
105 150  
106   - # Add KNN search if enabled and vector provided
107   - if enable_knn and query_vector is not None and self.text_embedding_field:
  151 + # 4. Add KNN search if enabled (separate from query, ES will combine)
  152 + if has_embedding:
108 153 knn_clause = {
109 154 "field": self.text_embedding_field,
110 155 "query_vector": query_vector.tolist(),
111 156 "k": knn_k,
112   - "num_candidates": knn_num_candidates
  157 + "num_candidates": knn_num_candidates,
  158 + "boost": 0.2 # Lower boost for embedding recall
113 159 }
114 160 es_query["knn"] = knn_clause
115 161  
116   - # Add minimum score filter
  162 + # 5. Add minimum score filter
117 163 if min_score is not None:
118 164 es_query["min_score"] = min_score
119 165  
120 166 return es_query
  167 +
  168 + def _wrap_with_function_score(self, query: Dict[str, Any]) -> Dict[str, Any]:
  169 + """
  170 + Wrap query with function_score for boosting fields.
  171 +
  172 + Args:
  173 + query: Base query to wrap
  174 +
  175 + Returns:
  176 + Function score query or original query if no functions configured
  177 + """
  178 + functions = self._build_score_functions()
  179 +
  180 + # If no functions configured, return original query
  181 + if not functions:
  182 + return query
  183 +
  184 + # Build function_score query
  185 + function_score_query = {
  186 + "function_score": {
  187 + "query": query,
  188 + "functions": functions,
  189 + "score_mode": FUNCTION_SCORE_CONFIG.get("score_mode", "sum"),
  190 + "boost_mode": FUNCTION_SCORE_CONFIG.get("boost_mode", "multiply")
  191 + }
  192 + }
  193 +
  194 + return function_score_query
  195 +
  196 + def _build_score_functions(self) -> List[Dict[str, Any]]:
  197 + """
  198 + Build function_score functions from config.
  199 +
  200 + Returns:
  201 + List of function score functions
  202 + """
  203 + functions = []
  204 + config_functions = FUNCTION_SCORE_CONFIG.get("functions", [])
  205 +
  206 + for func_config in config_functions:
  207 + func_type = func_config.get("type")
  208 +
  209 + if func_type == "filter_weight":
  210 + # Filter + Weight
  211 + functions.append({
  212 + "filter": func_config["filter"],
  213 + "weight": func_config.get("weight", 1.0)
  214 + })
  215 +
  216 + elif func_type == "field_value_factor":
  217 + # Field Value Factor
  218 + functions.append({
  219 + "field_value_factor": {
  220 + "field": func_config["field"],
  221 + "factor": func_config.get("factor", 1.0),
  222 + "modifier": func_config.get("modifier", "none"),
  223 + "missing": func_config.get("missing", 1.0)
  224 + }
  225 + })
  226 +
  227 + elif func_type == "decay":
  228 + # Decay Function (gauss/exp/linear)
  229 + decay_func = func_config.get("function", "gauss")
  230 + field = func_config["field"]
  231 +
  232 + decay_params = {
  233 + "origin": func_config.get("origin", "now"),
  234 + "scale": func_config["scale"]
  235 + }
  236 +
  237 + if "offset" in func_config:
  238 + decay_params["offset"] = func_config["offset"]
  239 + if "decay" in func_config:
  240 + decay_params["decay"] = func_config["decay"]
  241 +
  242 + functions.append({
  243 + decay_func: {
  244 + field: decay_params
  245 + }
  246 + })
  247 +
  248 + return functions
121 249  
122 250 def _build_text_query(self, query_text: str) -> Dict[str, Any]:
123 251 """
... ... @@ -235,11 +363,19 @@ class ESQueryBuilder:
235 363 "term": {field: value}
236 364 })
237 365  
238   - # 2. 处理范围过滤(RangeFilter Pydantic 模型
  366 + # 2. 处理范围过滤(支持 RangeFilter Pydantic 模型或字典
239 367 if range_filters:
240 368 for field, range_filter in range_filters.items():
241   - # 将 RangeFilter 模型转换为字典
242   - range_dict = range_filter.model_dump(exclude_none=True)
  369 + # 支持 Pydantic 模型或字典格式
  370 + if hasattr(range_filter, 'model_dump'):
  371 + # Pydantic 模型
  372 + range_dict = range_filter.model_dump(exclude_none=True)
  373 + elif isinstance(range_filter, dict):
  374 + # 已经是字典格式
  375 + range_dict = {k: v for k, v in range_filter.items() if v is not None}
  376 + else:
  377 + # 其他格式,跳过
  378 + continue
243 379  
244 380 if range_dict:
245 381 filter_clauses.append({
... ...
search/multilang_query_builder.py deleted
... ... @@ -1,459 +0,0 @@
1   -"""
2   -Multi-language query builder for handling domain-specific searches.
3   -
4   -This module extends the ESQueryBuilder to support multi-language field mappings,
5   -allowing queries to be routed to appropriate language-specific fields while
6   -maintaining a unified external interface.
7   -"""
8   -
9   -from typing import Dict, Any, List, Optional
10   -import numpy as np
11   -import logging
12   -import re
13   -
14   -from query import ParsedQuery
15   -from .es_query_builder import ESQueryBuilder
16   -from .query_config import DEFAULT_MATCH_FIELDS, DOMAIN_FIELDS, FUNCTION_SCORE_CONFIG
17   -
18   -logger = logging.getLogger(__name__)
19   -
20   -
21   -class MultiLanguageQueryBuilder(ESQueryBuilder):
22   - """
23   - Enhanced query builder with multi-language support.
24   -
25   - Handles routing queries to appropriate language-specific fields based on:
26   - 1. Detected query language
27   - 2. Available translations
28   - 3. Domain configuration (language_field_mapping)
29   - """
30   -
31   - def __init__(
32   - self,
33   - index_name: str,
34   - match_fields: Optional[List[str]] = None,
35   - text_embedding_field: Optional[str] = None,
36   - image_embedding_field: Optional[str] = None,
37   - source_fields: Optional[List[str]] = None
38   - ):
39   - """
40   - Initialize multi-language query builder.
41   -
42   - Args:
43   - index_name: ES index name
44   - match_fields: Fields to search for text matching (default: from query_config)
45   - text_embedding_field: Field name for text embeddings
46   - image_embedding_field: Field name for image embeddings
47   - source_fields: Fields to return in search results (_source includes)
48   - """
49   - self.function_score_config = FUNCTION_SCORE_CONFIG
50   -
51   - # Use provided match_fields or default
52   - if match_fields is None:
53   - match_fields = DEFAULT_MATCH_FIELDS
54   -
55   - super().__init__(
56   - index_name=index_name,
57   - match_fields=match_fields,
58   - text_embedding_field=text_embedding_field,
59   - image_embedding_field=image_embedding_field,
60   - source_fields=source_fields
61   - )
62   -
63   - # Build domain configurations from query_config
64   - self.domain_configs = DOMAIN_FIELDS
65   -
66   - def _get_domain_fields(self, domain_name: str) -> List[str]:
67   - """Get fields for a specific domain with boost notation."""
68   - return self.domain_configs.get(domain_name, DEFAULT_MATCH_FIELDS)
69   -
70   - def build_multilang_query(
71   - self,
72   - parsed_query: ParsedQuery,
73   - query_vector: Optional[np.ndarray] = None,
74   - query_node: Optional[Any] = None,
75   - filters: Optional[Dict[str, Any]] = None,
76   - range_filters: Optional[Dict[str, Any]] = None,
77   - size: int = 10,
78   - from_: int = 0,
79   - enable_knn: bool = True,
80   - knn_k: int = 50,
81   - knn_num_candidates: int = 200,
82   - min_score: Optional[float] = None
83   - ) -> Dict[str, Any]:
84   - """
85   - Build ES query with multi-language support (简化版).
86   -
87   - Args:
88   - parsed_query: Parsed query with language info and translations
89   - query_vector: Query embedding for KNN search
90   - filters: Exact match filters
91   - range_filters: Range filters for numeric fields
92   - size: Number of results
93   - from_: Offset for pagination
94   - enable_knn: Whether to use KNN search
95   - knn_k: K value for KNN
96   - knn_num_candidates: Number of candidates for KNN
97   - min_score: Minimum score threshold
98   -
99   - Returns:
100   - ES query DSL dictionary
101   - """
102   - # 1. 根据域选择匹配字段(默认域使用 DEFAULT_MATCH_FIELDS)
103   - domain = parsed_query.domain or "default"
104   - domain_fields = self.domain_configs.get(domain) or DEFAULT_MATCH_FIELDS
105   -
106   - # 2. 临时切换 match_fields,复用基类 build_query 逻辑
107   - original_match_fields = self.match_fields
108   - self.match_fields = domain_fields
109   - try:
110   - return super().build_query(
111   - query_text=parsed_query.rewritten_query or parsed_query.normalized_query,
112   - query_vector=query_vector,
113   - query_node=query_node,
114   - filters=filters,
115   - range_filters=range_filters,
116   - size=size,
117   - from_=from_,
118   - enable_knn=enable_knn,
119   - knn_k=knn_k,
120   - knn_num_candidates=knn_num_candidates,
121   - min_score=min_score
122   - )
123   - finally:
124   - # 恢复原始配置,避免影响后续查询
125   - self.match_fields = original_match_fields
126   -
127   - def _build_score_functions(self) -> List[Dict[str, Any]]:
128   - """
129   - 从配置构建 function_score 的打分函数列表
130   -
131   - Returns:
132   - 打分函数列表(ES原生格式)
133   - """
134   - if not self.function_score_config or not self.function_score_config.functions:
135   - return []
136   -
137   - functions = []
138   -
139   - for func_config in self.function_score_config.functions:
140   - func_type = func_config.get('type')
141   -
142   - if func_type == 'filter_weight':
143   - # Filter + Weight
144   - functions.append({
145   - "filter": func_config['filter'],
146   - "weight": func_config.get('weight', 1.0)
147   - })
148   -
149   - elif func_type == 'field_value_factor':
150   - # Field Value Factor
151   - functions.append({
152   - "field_value_factor": {
153   - "field": func_config['field'],
154   - "factor": func_config.get('factor', 1.0),
155   - "modifier": func_config.get('modifier', 'none'),
156   - "missing": func_config.get('missing', 1.0)
157   - }
158   - })
159   -
160   - elif func_type == 'decay':
161   - # Decay Function (gauss/exp/linear)
162   - decay_func = func_config.get('function', 'gauss')
163   - field = func_config['field']
164   -
165   - decay_params = {
166   - "origin": func_config.get('origin', 'now'),
167   - "scale": func_config['scale']
168   - }
169   -
170   - if 'offset' in func_config:
171   - decay_params['offset'] = func_config['offset']
172   - if 'decay' in func_config:
173   - decay_params['decay'] = func_config['decay']
174   -
175   - functions.append({
176   - decay_func: {
177   - field: decay_params
178   - }
179   - })
180   -
181   - return functions
182   -
183   - def _build_multilang_text_query(
184   - self,
185   - parsed_query: ParsedQuery,
186   - domain_config: Dict[str, Any]
187   - ) -> Dict[str, Any]:
188   - """
189   - Build text query with multi-language field routing.
190   -
191   - Args:
192   - parsed_query: Parsed query with language info
193   - domain_config: Domain configuration
194   -
195   - Returns:
196   - ES query clause
197   - """
198   - if not domain_config.language_field_mapping:
199   - # No multi-language mapping, use all fields with default analyzer
200   - fields_with_boost = []
201   - for field_name in domain_config.fields:
202   - field = self._get_field_by_name(field_name)
203   - if field and field.boost != 1.0:
204   - fields_with_boost.append(f"{field_name}^{field.boost}")
205   - else:
206   - fields_with_boost.append(field_name)
207   -
208   - return {
209   - "multi_match": {
210   - "query": parsed_query.rewritten_query,
211   - "fields": fields_with_boost,
212   - "minimum_should_match": "67%",
213   - "tie_breaker": 0.9,
214   - "boost": domain_config.boost,
215   - "_name": f"{domain_config.name}_query"
216   - }
217   - }
218   -
219   - # Multi-language mapping exists - build targeted queries
220   - should_clauses = []
221   - available_languages = set(domain_config.language_field_mapping.keys())
222   -
223   - # 1. Query in detected language (if it exists in mapping)
224   - detected_lang = parsed_query.detected_language
225   - if detected_lang in available_languages:
226   - target_fields = domain_config.language_field_mapping[detected_lang]
227   - fields_with_boost = self._apply_field_boosts(target_fields)
228   -
229   - should_clauses.append({
230   - "multi_match": {
231   - "query": parsed_query.rewritten_query,
232   - "fields": fields_with_boost,
233   - "minimum_should_match": "67%",
234   - "tie_breaker": 0.9,
235   - "boost": domain_config.boost * 1.5, # Higher boost for detected language
236   - "_name": f"{domain_config.name}_{detected_lang}_query"
237   - }
238   - })
239   - logger.debug(f"Added query for detected language '{detected_lang}'")
240   -
241   - # 2. Query in translated languages (only for languages in mapping)
242   - for lang, translation in parsed_query.translations.items():
243   - # Only use translations for languages that exist in the mapping
244   - if lang in available_languages and translation and translation.strip():
245   - target_fields = domain_config.language_field_mapping[lang]
246   - fields_with_boost = self._apply_field_boosts(target_fields)
247   -
248   - should_clauses.append({
249   - "multi_match": {
250   - "query": translation,
251   - "fields": fields_with_boost,
252   - "minimum_should_match": "67%",
253   - "tie_breaker": 0.9,
254   - "boost": domain_config.boost,
255   - "_name": f"{domain_config.name}_{lang}_translated_query"
256   - }
257   - })
258   - logger.debug(f"Added translated query for language '{lang}'")
259   -
260   - # 3. Fallback: query all fields in mapping if no language-specific query was built
261   - if not should_clauses:
262   - logger.debug("No language mapping matched, using all fields from mapping")
263   - # Use all fields from all languages in the mapping
264   - all_mapped_fields = []
265   - for lang_fields in domain_config.language_field_mapping.values():
266   - all_mapped_fields.extend(lang_fields)
267   - # Remove duplicates while preserving order
268   - unique_fields = list(dict.fromkeys(all_mapped_fields))
269   - fields_with_boost = self._apply_field_boosts(unique_fields)
270   -
271   - should_clauses.append({
272   - "multi_match": {
273   - "query": parsed_query.rewritten_query,
274   - "fields": fields_with_boost,
275   - "minimum_should_match": "67%",
276   - "tie_breaker": 0.9,
277   - "boost": domain_config.boost * 0.8, # Lower boost for fallback
278   - "_name": f"{domain_config.name}_fallback_query"
279   - }
280   - })
281   -
282   - if len(should_clauses) == 1:
283   - return should_clauses[0]
284   - else:
285   - return {
286   - "bool": {
287   - "should": should_clauses,
288   - "minimum_should_match": 1
289   - }
290   - }
291   -
292   - def _apply_field_boosts(self, field_names: List[str]) -> List[str]:
293   - """Apply boost values to field names."""
294   - result = []
295   - for field_name in field_names:
296   - field = self._get_field_by_name(field_name)
297   - if field and field.boost != 1.0:
298   - result.append(f"{field_name}^{field.boost}")
299   - else:
300   - result.append(field_name)
301   - return result
302   -
303   - def _build_boolean_query_from_tuple(self, node) -> Dict[str, Any]:
304   - """
305   - Build query from boolean expression tuple.
306   -
307   - Args:
308   - node: Boolean expression tuple (operator, terms...)
309   -
310   - Returns:
311   - ES query clause
312   - """
313   - if not node:
314   - return {"match_all": {}}
315   -
316   - # Handle different node types from boolean parser
317   - if hasattr(node, 'operator'):
318   - # QueryNode object
319   - operator = node.operator
320   - terms = node.terms if hasattr(node, 'terms') else None
321   -
322   - # For TERM nodes, check if there's a value
323   - if operator == 'TERM' and hasattr(node, 'value') and node.value:
324   - terms = node.value
325   - elif isinstance(node, tuple) and len(node) > 0:
326   - # Tuple format from boolean parser
327   - if hasattr(node[0], 'operator'):
328   - # Nested tuple with QueryNode
329   - operator = node[0].operator
330   - terms = node[0].terms
331   - elif isinstance(node[0], str):
332   - # Simple tuple like ('TERM', 'field:value')
333   - operator = node[0]
334   - terms = node[1] if len(node) > 1 else ''
335   - else:
336   - # Complex tuple like (OR( TERM(...), TERM(...) ), score)
337   - if hasattr(node[0], '__class__') and hasattr(node[0], '__name__'):
338   - # Constructor call like OR(...)
339   - operator = node[0].__name__
340   - elif str(node[0]).startswith('('):
341   - # String representation of constructor call
342   - match = re.match(r'(\w+)\(', str(node[0]))
343   - if match:
344   - operator = match.group(1)
345   - else:
346   - return {"match_all": {}}
347   - else:
348   - operator = str(node[0])
349   -
350   - # Extract terms from nested structure
351   - terms = []
352   - if len(node) > 1 and isinstance(node[1], tuple):
353   - terms = node[1]
354   - else:
355   - return {"match_all": {}}
356   -
357   -
358   - if operator == 'TERM':
359   - # Leaf node - handle field:query format
360   - if isinstance(terms, str) and ':' in terms:
361   - field, value = terms.split(':', 1)
362   - return {
363   - "term": {
364   - field: value
365   - }
366   - }
367   - elif isinstance(terms, str):
368   - # Simple text term - create match query
369   - return {
370   - "multi_match": {
371   - "query": terms,
372   - "fields": self.match_fields,
373   - "type": "best_fields",
374   - "operator": "AND"
375   - }
376   - }
377   - else:
378   - # Invalid TERM node - return empty match
379   - return {
380   - "match_none": {}
381   - }
382   -
383   - elif operator == 'OR':
384   - # Any term must match
385   - should_clauses = []
386   - if terms:
387   - for term in terms:
388   - clause = self._build_boolean_query_from_tuple(term)
389   - if clause and clause.get("match_none") is None:
390   - should_clauses.append(clause)
391   -
392   - if should_clauses:
393   - return {
394   - "bool": {
395   - "should": should_clauses,
396   - "minimum_should_match": 1
397   - }
398   - }
399   - else:
400   - return {"match_none": {}}
401   -
402   - elif operator == 'AND':
403   - # All terms must match
404   - must_clauses = []
405   - if terms:
406   - for term in terms:
407   - clause = self._build_boolean_query_from_tuple(term)
408   - if clause and clause.get("match_none") is None:
409   - must_clauses.append(clause)
410   -
411   - if must_clauses:
412   - return {
413   - "bool": {
414   - "must": must_clauses
415   - }
416   - }
417   - else:
418   - return {"match_none": {}}
419   -
420   - elif operator == 'ANDNOT':
421   - # First term must match, second must not
422   - if len(terms) >= 2:
423   - return {
424   - "bool": {
425   - "must": [self._build_boolean_query_from_tuple(terms[0])],
426   - "must_not": [self._build_boolean_query_from_tuple(terms[1])]
427   - }
428   - }
429   - else:
430   - return self._build_boolean_query_from_tuple(terms[0])
431   -
432   - elif operator == 'RANK':
433   - # Like OR but for ranking (all terms contribute to score)
434   - should_clauses = []
435   - for term in terms:
436   - should_clauses.append(self._build_boolean_query_from_tuple(term))
437   - return {
438   - "bool": {
439   - "should": should_clauses
440   - }
441   - }
442   -
443   - else:
444   - # Unknown operator
445   - return {"match_all": {}}
446   -
447   - def get_domain_summary(self) -> Dict[str, Any]:
448   - """Get summary of all configured domains."""
449   - summary = {}
450   - for domain_name, domain_config in self.domain_configs.items():
451   - summary[domain_name] = {
452   - "label": domain_config.label,
453   - "fields": domain_config.fields,
454   - "analyzer": domain_config.analyzer.value,
455   - "boost": domain_config.boost,
456   - "has_multilang_mapping": domain_config.language_field_mapping is not None,
457   - "supported_languages": list(domain_config.language_field_mapping.keys()) if domain_config.language_field_mapping else []
458   - }
459   - return summary
460 0 \ No newline at end of file
search/query_config.py
... ... @@ -17,14 +17,24 @@ TEXT_EMBEDDING_FIELD = "title_embedding"
17 17 IMAGE_EMBEDDING_FIELD = "image_embedding"
18 18  
19 19 # Default match fields for text search (with boost)
  20 +# 文本召回:同时搜索中英文字段,两者相互补充
20 21 DEFAULT_MATCH_FIELDS = [
  22 + # 中文字段
21 23 "title_zh^3.0",
22 24 "brief_zh^1.5",
23 25 "description_zh^1.0",
24 26 "vendor_zh^1.5",
25   - "tags^1.0",
26 27 "category_path_zh^1.5",
27   - "category_name_zh^1.5"
  28 + "category_name_zh^1.5",
  29 + # 英文字段
  30 + "title_en^3.0",
  31 + "brief_en^1.5",
  32 + "description_en^1.0",
  33 + "vendor_en^1.5",
  34 + "category_path_en^1.5",
  35 + "category_name_en^1.5",
  36 + # 语言无关字段
  37 + "tags^1.0",
28 38 ]
29 39  
30 40 # Domain-specific match fields
... ...
search/searcher.py
... ... @@ -13,7 +13,6 @@ from query import QueryParser, ParsedQuery
13 13 from embeddings import CLIPImageEncoder
14 14 from .boolean_parser import BooleanParser, QueryNode
15 15 from .es_query_builder import ESQueryBuilder
16   -from .multilang_query_builder import MultiLanguageQueryBuilder
17 16 from .rerank_engine import RerankEngine
18 17 from .query_config import (
19 18 DEFAULT_INDEX_NAME,
... ... @@ -112,8 +111,8 @@ class Searcher:
112 111 self.text_embedding_field = TEXT_EMBEDDING_FIELD
113 112 self.image_embedding_field = IMAGE_EMBEDDING_FIELD
114 113  
115   - # Query builder - use multi-language version
116   - self.query_builder = MultiLanguageQueryBuilder(
  114 + # Query builder - simplified single-layer architecture
  115 + self.query_builder = ESQueryBuilder(
117 116 index_name=index_name,
118 117 match_fields=self.match_fields,
119 118 text_embedding_field=self.text_embedding_field,
... ... @@ -274,8 +273,8 @@ class Searcher:
274 273 filters = {}
275 274 filters['tenant_id'] = tenant_id
276 275  
277   - es_query = self.query_builder.build_multilang_query(
278   - parsed_query=parsed_query,
  276 + es_query = self.query_builder.build_query(
  277 + query_text=parsed_query.rewritten_query or parsed_query.normalized_query,
279 278 query_vector=parsed_query.query_vector if enable_embedding else None,
280 279 query_node=query_node,
281 280 filters=filters,
... ...