Commit f0d020c3ae325deb3921307bcd2480d628cc5a7a
1 parent
577ec972
多语言查询改为只支持中英文两种,查询结构为 filters and (text_recall or embedding_recall),然后 function_score 支持新鲜度等提权字段
1. 前端传递的过滤条件永远是要起作用的。
2. 然后召回模块包括文本相关性召回(中英文字段都用)和向量召回,两者相互补充。
3. 套用 function_score 以支持两种打分融合和各种提权字段。
4. 只需要 build_query 这一层。

实际操作:
1. 架构简化
   - 移除了 MultiLanguageQueryBuilder 层级
   - 只保留单层的 ESQueryBuilder.build_query 方法
2. 查询结构重构
   - 实现了 filters and (text_recall or embedding_recall) 结构:
     - 前端过滤条件:永远起作用(放在 filter 中)
     - 文本召回:同时搜索中英文字段(multi_match 覆盖 title_zh/en, brief_zh/en 等)
     - 向量召回:KNN 查询(独立参数,ES 会自动合并)
     - Function_score:包装召回部分,支持提权字段配置
3. 文本匹配字段更新
   - 在 DEFAULT_MATCH_FIELDS 中添加了中英文字段:
     - 中文:title_zh, brief_zh, description_zh, vendor_zh, category_path_zh, category_name_zh
     - 英文:title_en, brief_en, description_en, vendor_en, category_path_en, category_name_en
     - 语言无关:tags
4. Function_score 框架保留
   - 保留了 function_score 配置框架(FUNCTION_SCORE_CONFIG)
   - 支持 filter_weight、field_value_factor、decay 等提权函数
   - 可以从配置中扩展提权字段
5. 测试验证
   - 所有功能测试通过:基本文本搜索、带过滤条件的搜索、范围过滤、分面搜索、英文查询
Showing 4 changed files with 176 additions and 490 deletions
Show diff stats
search/es_query_builder.py
| @@ -2,11 +2,16 @@ | @@ -2,11 +2,16 @@ | ||
| 2 | Elasticsearch query builder. | 2 | Elasticsearch query builder. |
| 3 | 3 | ||
| 4 | Converts parsed queries and search parameters into ES DSL queries. | 4 | Converts parsed queries and search parameters into ES DSL queries. |
| 5 | + | ||
| 6 | +Simplified architecture: | ||
| 7 | +- filters and (text_recall or embedding_recall) | ||
| 8 | +- function_score wrapper for boosting fields | ||
| 5 | """ | 9 | """ |
| 6 | 10 | ||
| 7 | from typing import Dict, Any, List, Optional, Union | 11 | from typing import Dict, Any, List, Optional, Union |
| 8 | import numpy as np | 12 | import numpy as np |
| 9 | from .boolean_parser import QueryNode | 13 | from .boolean_parser import QueryNode |
| 14 | +from .query_config import FUNCTION_SCORE_CONFIG | ||
| 10 | 15 | ||
| 11 | 16 | ||
| 12 | class ESQueryBuilder: | 17 | class ESQueryBuilder: |
| @@ -51,14 +56,20 @@ class ESQueryBuilder: | @@ -51,14 +56,20 @@ class ESQueryBuilder: | ||
| 51 | min_score: Optional[float] = None | 56 | min_score: Optional[float] = None |
| 52 | ) -> Dict[str, Any]: | 57 | ) -> Dict[str, Any]: |
| 53 | """ | 58 | """ |
| 54 | - Build complete ES query (重构版). | 59 | + Build complete ES query (简化版). |
| 60 | + | ||
| 61 | + 结构:filters and (text_recall or embedding_recall) | ||
| 62 | + - filters: 前端传递的过滤条件永远起作用 | ||
| 63 | + - text_recall: 文本相关性召回(中英文字段都用) | ||
| 64 | + - embedding_recall: 向量召回(KNN) | ||
| 65 | + - function_score: 包装召回部分,支持提权字段 | ||
| 55 | 66 | ||
| 56 | Args: | 67 | Args: |
| 57 | query_text: Query text for BM25 matching | 68 | query_text: Query text for BM25 matching |
| 58 | query_vector: Query embedding for KNN search | 69 | query_vector: Query embedding for KNN search |
| 59 | query_node: Parsed boolean expression tree | 70 | query_node: Parsed boolean expression tree |
| 60 | - filters: Exact match filters | ||
| 61 | - range_filters: Range filters for numeric fields | 71 | + filters: Exact match filters (always applied) |
| 72 | + range_filters: Range filters for numeric fields (always applied) | ||
| 62 | size: Number of results | 73 | size: Number of results |
| 63 | from_: Offset for pagination | 74 | from_: Offset for pagination |
| 64 | enable_knn: Whether to use KNN search | 75 | enable_knn: Whether to use KNN search |
| @@ -80,44 +91,161 @@ class ESQueryBuilder: | @@ -80,44 +91,161 @@ class ESQueryBuilder: | ||
| 80 | "includes": self.source_fields | 91 | "includes": self.source_fields |
| 81 | } | 92 | } |
| 82 | 93 | ||
| 83 | - # Build main query | ||
| 84 | - if query_node and query_node.operator != 'TERM': | ||
| 85 | - # Complex boolean query | ||
| 86 | - query_clause = self._build_boolean_query(query_node) | ||
| 87 | - else: | ||
| 88 | - # Simple text query | ||
| 89 | - query_clause = self._build_text_query(query_text) | ||
| 90 | - | ||
| 91 | - # Add filters if provided | ||
| 92 | - if filters or range_filters: | ||
| 93 | - filter_clauses = self._build_filters(filters, range_filters) | 94 | + # 1. Build recall queries (text or embedding) |
| 95 | + recall_clauses = [] | ||
| 96 | + | ||
| 97 | + # Text recall (always include if query_text exists) | ||
| 98 | + if query_text: | ||
| 99 | + if query_node and query_node.operator != 'TERM': | ||
| 100 | + # Complex boolean query | ||
| 101 | + text_query = self._build_boolean_query(query_node) | ||
| 102 | + else: | ||
| 103 | + # Simple text query | ||
| 104 | + text_query = self._build_text_query(query_text) | ||
| 105 | + recall_clauses.append(text_query) | ||
| 106 | + | ||
| 107 | + # Embedding recall (KNN - separate from query, handled below) | ||
| 108 | + has_embedding = enable_knn and query_vector is not None and self.text_embedding_field | ||
| 109 | + | ||
| 110 | + # 2. Build filter clauses (always applied) | ||
| 111 | + filter_clauses = self._build_filters(filters, range_filters) | ||
| 112 | + | ||
| 113 | + # 3. Build main query structure: filters and recall | ||
| 114 | + if recall_clauses: | ||
| 115 | + # Combine text recalls with OR logic (if multiple) | ||
| 116 | + if len(recall_clauses) == 1: | ||
| 117 | + recall_query = recall_clauses[0] | ||
| 118 | + else: | ||
| 119 | + recall_query = { | ||
| 120 | + "bool": { | ||
| 121 | + "should": recall_clauses, | ||
| 122 | + "minimum_should_match": 1 | ||
| 123 | + } | ||
| 124 | + } | ||
| 125 | + | ||
| 126 | + # Wrap recall with function_score for boosting | ||
| 127 | + recall_query = self._wrap_with_function_score(recall_query) | ||
| 128 | + | ||
| 129 | + # Combine filters and recall | ||
| 94 | if filter_clauses: | 130 | if filter_clauses: |
| 95 | es_query["query"] = { | 131 | es_query["query"] = { |
| 96 | "bool": { | 132 | "bool": { |
| 97 | - "must": [query_clause], | 133 | + "must": [recall_query], |
| 98 | "filter": filter_clauses | 134 | "filter": filter_clauses |
| 99 | } | 135 | } |
| 100 | } | 136 | } |
| 101 | else: | 137 | else: |
| 102 | - es_query["query"] = query_clause | 138 | + es_query["query"] = recall_query |
| 103 | else: | 139 | else: |
| 104 | - es_query["query"] = query_clause | 140 | + # No recall queries, only filters (match_all filtered) |
| 141 | + if filter_clauses: | ||
| 142 | + es_query["query"] = { | ||
| 143 | + "bool": { | ||
| 144 | + "must": [{"match_all": {}}], | ||
| 145 | + "filter": filter_clauses | ||
| 146 | + } | ||
| 147 | + } | ||
| 148 | + else: | ||
| 149 | + es_query["query"] = {"match_all": {}} | ||
| 105 | 150 | ||
| 106 | - # Add KNN search if enabled and vector provided | ||
| 107 | - if enable_knn and query_vector is not None and self.text_embedding_field: | 151 | + # 4. Add KNN search if enabled (separate from query, ES will combine) |
| 152 | + if has_embedding: | ||
| 108 | knn_clause = { | 153 | knn_clause = { |
| 109 | "field": self.text_embedding_field, | 154 | "field": self.text_embedding_field, |
| 110 | "query_vector": query_vector.tolist(), | 155 | "query_vector": query_vector.tolist(), |
| 111 | "k": knn_k, | 156 | "k": knn_k, |
| 112 | - "num_candidates": knn_num_candidates | 157 | + "num_candidates": knn_num_candidates, |
| 158 | + "boost": 0.2 # Lower boost for embedding recall | ||
| 113 | } | 159 | } |
| 114 | es_query["knn"] = knn_clause | 160 | es_query["knn"] = knn_clause |
| 115 | 161 | ||
| 116 | - # Add minimum score filter | 162 | + # 5. Add minimum score filter |
| 117 | if min_score is not None: | 163 | if min_score is not None: |
| 118 | es_query["min_score"] = min_score | 164 | es_query["min_score"] = min_score |
| 119 | 165 | ||
| 120 | return es_query | 166 | return es_query |
| 167 | + | ||
| 168 | + def _wrap_with_function_score(self, query: Dict[str, Any]) -> Dict[str, Any]: | ||
| 169 | + """ | ||
| 170 | + Wrap query with function_score for boosting fields. | ||
| 171 | + | ||
| 172 | + Args: | ||
| 173 | + query: Base query to wrap | ||
| 174 | + | ||
| 175 | + Returns: | ||
| 176 | + Function score query or original query if no functions configured | ||
| 177 | + """ | ||
| 178 | + functions = self._build_score_functions() | ||
| 179 | + | ||
| 180 | + # If no functions configured, return original query | ||
| 181 | + if not functions: | ||
| 182 | + return query | ||
| 183 | + | ||
| 184 | + # Build function_score query | ||
| 185 | + function_score_query = { | ||
| 186 | + "function_score": { | ||
| 187 | + "query": query, | ||
| 188 | + "functions": functions, | ||
| 189 | + "score_mode": FUNCTION_SCORE_CONFIG.get("score_mode", "sum"), | ||
| 190 | + "boost_mode": FUNCTION_SCORE_CONFIG.get("boost_mode", "multiply") | ||
| 191 | + } | ||
| 192 | + } | ||
| 193 | + | ||
| 194 | + return function_score_query | ||
| 195 | + | ||
| 196 | + def _build_score_functions(self) -> List[Dict[str, Any]]: | ||
| 197 | + """ | ||
| 198 | + Build function_score functions from config. | ||
| 199 | + | ||
| 200 | + Returns: | ||
| 201 | + List of function score functions | ||
| 202 | + """ | ||
| 203 | + functions = [] | ||
| 204 | + config_functions = FUNCTION_SCORE_CONFIG.get("functions", []) | ||
| 205 | + | ||
| 206 | + for func_config in config_functions: | ||
| 207 | + func_type = func_config.get("type") | ||
| 208 | + | ||
| 209 | + if func_type == "filter_weight": | ||
| 210 | + # Filter + Weight | ||
| 211 | + functions.append({ | ||
| 212 | + "filter": func_config["filter"], | ||
| 213 | + "weight": func_config.get("weight", 1.0) | ||
| 214 | + }) | ||
| 215 | + | ||
| 216 | + elif func_type == "field_value_factor": | ||
| 217 | + # Field Value Factor | ||
| 218 | + functions.append({ | ||
| 219 | + "field_value_factor": { | ||
| 220 | + "field": func_config["field"], | ||
| 221 | + "factor": func_config.get("factor", 1.0), | ||
| 222 | + "modifier": func_config.get("modifier", "none"), | ||
| 223 | + "missing": func_config.get("missing", 1.0) | ||
| 224 | + } | ||
| 225 | + }) | ||
| 226 | + | ||
| 227 | + elif func_type == "decay": | ||
| 228 | + # Decay Function (gauss/exp/linear) | ||
| 229 | + decay_func = func_config.get("function", "gauss") | ||
| 230 | + field = func_config["field"] | ||
| 231 | + | ||
| 232 | + decay_params = { | ||
| 233 | + "origin": func_config.get("origin", "now"), | ||
| 234 | + "scale": func_config["scale"] | ||
| 235 | + } | ||
| 236 | + | ||
| 237 | + if "offset" in func_config: | ||
| 238 | + decay_params["offset"] = func_config["offset"] | ||
| 239 | + if "decay" in func_config: | ||
| 240 | + decay_params["decay"] = func_config["decay"] | ||
| 241 | + | ||
| 242 | + functions.append({ | ||
| 243 | + decay_func: { | ||
| 244 | + field: decay_params | ||
| 245 | + } | ||
| 246 | + }) | ||
| 247 | + | ||
| 248 | + return functions | ||
| 121 | 249 | ||
| 122 | def _build_text_query(self, query_text: str) -> Dict[str, Any]: | 250 | def _build_text_query(self, query_text: str) -> Dict[str, Any]: |
| 123 | """ | 251 | """ |
| @@ -235,11 +363,19 @@ class ESQueryBuilder: | @@ -235,11 +363,19 @@ class ESQueryBuilder: | ||
| 235 | "term": {field: value} | 363 | "term": {field: value} |
| 236 | }) | 364 | }) |
| 237 | 365 | ||
| 238 | - # 2. 处理范围过滤(RangeFilter Pydantic 模型) | 366 | + # 2. 处理范围过滤(支持 RangeFilter Pydantic 模型或字典) |
| 239 | if range_filters: | 367 | if range_filters: |
| 240 | for field, range_filter in range_filters.items(): | 368 | for field, range_filter in range_filters.items(): |
| 241 | - # 将 RangeFilter 模型转换为字典 | ||
| 242 | - range_dict = range_filter.model_dump(exclude_none=True) | 369 | + # 支持 Pydantic 模型或字典格式 |
| 370 | + if hasattr(range_filter, 'model_dump'): | ||
| 371 | + # Pydantic 模型 | ||
| 372 | + range_dict = range_filter.model_dump(exclude_none=True) | ||
| 373 | + elif isinstance(range_filter, dict): | ||
| 374 | + # 已经是字典格式 | ||
| 375 | + range_dict = {k: v for k, v in range_filter.items() if v is not None} | ||
| 376 | + else: | ||
| 377 | + # 其他格式,跳过 | ||
| 378 | + continue | ||
| 243 | 379 | ||
| 244 | if range_dict: | 380 | if range_dict: |
| 245 | filter_clauses.append({ | 381 | filter_clauses.append({ |
search/multilang_query_builder.py deleted
| @@ -1,459 +0,0 @@ | @@ -1,459 +0,0 @@ | ||
| 1 | -""" | ||
| 2 | -Multi-language query builder for handling domain-specific searches. | ||
| 3 | - | ||
| 4 | -This module extends the ESQueryBuilder to support multi-language field mappings, | ||
| 5 | -allowing queries to be routed to appropriate language-specific fields while | ||
| 6 | -maintaining a unified external interface. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from typing import Dict, Any, List, Optional | ||
| 10 | -import numpy as np | ||
| 11 | -import logging | ||
| 12 | -import re | ||
| 13 | - | ||
| 14 | -from query import ParsedQuery | ||
| 15 | -from .es_query_builder import ESQueryBuilder | ||
| 16 | -from .query_config import DEFAULT_MATCH_FIELDS, DOMAIN_FIELDS, FUNCTION_SCORE_CONFIG | ||
| 17 | - | ||
| 18 | -logger = logging.getLogger(__name__) | ||
| 19 | - | ||
| 20 | - | ||
| 21 | -class MultiLanguageQueryBuilder(ESQueryBuilder): | ||
| 22 | - """ | ||
| 23 | - Enhanced query builder with multi-language support. | ||
| 24 | - | ||
| 25 | - Handles routing queries to appropriate language-specific fields based on: | ||
| 26 | - 1. Detected query language | ||
| 27 | - 2. Available translations | ||
| 28 | - 3. Domain configuration (language_field_mapping) | ||
| 29 | - """ | ||
| 30 | - | ||
| 31 | - def __init__( | ||
| 32 | - self, | ||
| 33 | - index_name: str, | ||
| 34 | - match_fields: Optional[List[str]] = None, | ||
| 35 | - text_embedding_field: Optional[str] = None, | ||
| 36 | - image_embedding_field: Optional[str] = None, | ||
| 37 | - source_fields: Optional[List[str]] = None | ||
| 38 | - ): | ||
| 39 | - """ | ||
| 40 | - Initialize multi-language query builder. | ||
| 41 | - | ||
| 42 | - Args: | ||
| 43 | - index_name: ES index name | ||
| 44 | - match_fields: Fields to search for text matching (default: from query_config) | ||
| 45 | - text_embedding_field: Field name for text embeddings | ||
| 46 | - image_embedding_field: Field name for image embeddings | ||
| 47 | - source_fields: Fields to return in search results (_source includes) | ||
| 48 | - """ | ||
| 49 | - self.function_score_config = FUNCTION_SCORE_CONFIG | ||
| 50 | - | ||
| 51 | - # Use provided match_fields or default | ||
| 52 | - if match_fields is None: | ||
| 53 | - match_fields = DEFAULT_MATCH_FIELDS | ||
| 54 | - | ||
| 55 | - super().__init__( | ||
| 56 | - index_name=index_name, | ||
| 57 | - match_fields=match_fields, | ||
| 58 | - text_embedding_field=text_embedding_field, | ||
| 59 | - image_embedding_field=image_embedding_field, | ||
| 60 | - source_fields=source_fields | ||
| 61 | - ) | ||
| 62 | - | ||
| 63 | - # Build domain configurations from query_config | ||
| 64 | - self.domain_configs = DOMAIN_FIELDS | ||
| 65 | - | ||
| 66 | - def _get_domain_fields(self, domain_name: str) -> List[str]: | ||
| 67 | - """Get fields for a specific domain with boost notation.""" | ||
| 68 | - return self.domain_configs.get(domain_name, DEFAULT_MATCH_FIELDS) | ||
| 69 | - | ||
| 70 | - def build_multilang_query( | ||
| 71 | - self, | ||
| 72 | - parsed_query: ParsedQuery, | ||
| 73 | - query_vector: Optional[np.ndarray] = None, | ||
| 74 | - query_node: Optional[Any] = None, | ||
| 75 | - filters: Optional[Dict[str, Any]] = None, | ||
| 76 | - range_filters: Optional[Dict[str, Any]] = None, | ||
| 77 | - size: int = 10, | ||
| 78 | - from_: int = 0, | ||
| 79 | - enable_knn: bool = True, | ||
| 80 | - knn_k: int = 50, | ||
| 81 | - knn_num_candidates: int = 200, | ||
| 82 | - min_score: Optional[float] = None | ||
| 83 | - ) -> Dict[str, Any]: | ||
| 84 | - """ | ||
| 85 | - Build ES query with multi-language support (简化版). | ||
| 86 | - | ||
| 87 | - Args: | ||
| 88 | - parsed_query: Parsed query with language info and translations | ||
| 89 | - query_vector: Query embedding for KNN search | ||
| 90 | - filters: Exact match filters | ||
| 91 | - range_filters: Range filters for numeric fields | ||
| 92 | - size: Number of results | ||
| 93 | - from_: Offset for pagination | ||
| 94 | - enable_knn: Whether to use KNN search | ||
| 95 | - knn_k: K value for KNN | ||
| 96 | - knn_num_candidates: Number of candidates for KNN | ||
| 97 | - min_score: Minimum score threshold | ||
| 98 | - | ||
| 99 | - Returns: | ||
| 100 | - ES query DSL dictionary | ||
| 101 | - """ | ||
| 102 | - # 1. 根据域选择匹配字段(默认域使用 DEFAULT_MATCH_FIELDS) | ||
| 103 | - domain = parsed_query.domain or "default" | ||
| 104 | - domain_fields = self.domain_configs.get(domain) or DEFAULT_MATCH_FIELDS | ||
| 105 | - | ||
| 106 | - # 2. 临时切换 match_fields,复用基类 build_query 逻辑 | ||
| 107 | - original_match_fields = self.match_fields | ||
| 108 | - self.match_fields = domain_fields | ||
| 109 | - try: | ||
| 110 | - return super().build_query( | ||
| 111 | - query_text=parsed_query.rewritten_query or parsed_query.normalized_query, | ||
| 112 | - query_vector=query_vector, | ||
| 113 | - query_node=query_node, | ||
| 114 | - filters=filters, | ||
| 115 | - range_filters=range_filters, | ||
| 116 | - size=size, | ||
| 117 | - from_=from_, | ||
| 118 | - enable_knn=enable_knn, | ||
| 119 | - knn_k=knn_k, | ||
| 120 | - knn_num_candidates=knn_num_candidates, | ||
| 121 | - min_score=min_score | ||
| 122 | - ) | ||
| 123 | - finally: | ||
| 124 | - # 恢复原始配置,避免影响后续查询 | ||
| 125 | - self.match_fields = original_match_fields | ||
| 126 | - | ||
| 127 | - def _build_score_functions(self) -> List[Dict[str, Any]]: | ||
| 128 | - """ | ||
| 129 | - 从配置构建 function_score 的打分函数列表 | ||
| 130 | - | ||
| 131 | - Returns: | ||
| 132 | - 打分函数列表(ES原生格式) | ||
| 133 | - """ | ||
| 134 | - if not self.function_score_config or not self.function_score_config.functions: | ||
| 135 | - return [] | ||
| 136 | - | ||
| 137 | - functions = [] | ||
| 138 | - | ||
| 139 | - for func_config in self.function_score_config.functions: | ||
| 140 | - func_type = func_config.get('type') | ||
| 141 | - | ||
| 142 | - if func_type == 'filter_weight': | ||
| 143 | - # Filter + Weight | ||
| 144 | - functions.append({ | ||
| 145 | - "filter": func_config['filter'], | ||
| 146 | - "weight": func_config.get('weight', 1.0) | ||
| 147 | - }) | ||
| 148 | - | ||
| 149 | - elif func_type == 'field_value_factor': | ||
| 150 | - # Field Value Factor | ||
| 151 | - functions.append({ | ||
| 152 | - "field_value_factor": { | ||
| 153 | - "field": func_config['field'], | ||
| 154 | - "factor": func_config.get('factor', 1.0), | ||
| 155 | - "modifier": func_config.get('modifier', 'none'), | ||
| 156 | - "missing": func_config.get('missing', 1.0) | ||
| 157 | - } | ||
| 158 | - }) | ||
| 159 | - | ||
| 160 | - elif func_type == 'decay': | ||
| 161 | - # Decay Function (gauss/exp/linear) | ||
| 162 | - decay_func = func_config.get('function', 'gauss') | ||
| 163 | - field = func_config['field'] | ||
| 164 | - | ||
| 165 | - decay_params = { | ||
| 166 | - "origin": func_config.get('origin', 'now'), | ||
| 167 | - "scale": func_config['scale'] | ||
| 168 | - } | ||
| 169 | - | ||
| 170 | - if 'offset' in func_config: | ||
| 171 | - decay_params['offset'] = func_config['offset'] | ||
| 172 | - if 'decay' in func_config: | ||
| 173 | - decay_params['decay'] = func_config['decay'] | ||
| 174 | - | ||
| 175 | - functions.append({ | ||
| 176 | - decay_func: { | ||
| 177 | - field: decay_params | ||
| 178 | - } | ||
| 179 | - }) | ||
| 180 | - | ||
| 181 | - return functions | ||
| 182 | - | ||
| 183 | - def _build_multilang_text_query( | ||
| 184 | - self, | ||
| 185 | - parsed_query: ParsedQuery, | ||
| 186 | - domain_config: Dict[str, Any] | ||
| 187 | - ) -> Dict[str, Any]: | ||
| 188 | - """ | ||
| 189 | - Build text query with multi-language field routing. | ||
| 190 | - | ||
| 191 | - Args: | ||
| 192 | - parsed_query: Parsed query with language info | ||
| 193 | - domain_config: Domain configuration | ||
| 194 | - | ||
| 195 | - Returns: | ||
| 196 | - ES query clause | ||
| 197 | - """ | ||
| 198 | - if not domain_config.language_field_mapping: | ||
| 199 | - # No multi-language mapping, use all fields with default analyzer | ||
| 200 | - fields_with_boost = [] | ||
| 201 | - for field_name in domain_config.fields: | ||
| 202 | - field = self._get_field_by_name(field_name) | ||
| 203 | - if field and field.boost != 1.0: | ||
| 204 | - fields_with_boost.append(f"{field_name}^{field.boost}") | ||
| 205 | - else: | ||
| 206 | - fields_with_boost.append(field_name) | ||
| 207 | - | ||
| 208 | - return { | ||
| 209 | - "multi_match": { | ||
| 210 | - "query": parsed_query.rewritten_query, | ||
| 211 | - "fields": fields_with_boost, | ||
| 212 | - "minimum_should_match": "67%", | ||
| 213 | - "tie_breaker": 0.9, | ||
| 214 | - "boost": domain_config.boost, | ||
| 215 | - "_name": f"{domain_config.name}_query" | ||
| 216 | - } | ||
| 217 | - } | ||
| 218 | - | ||
| 219 | - # Multi-language mapping exists - build targeted queries | ||
| 220 | - should_clauses = [] | ||
| 221 | - available_languages = set(domain_config.language_field_mapping.keys()) | ||
| 222 | - | ||
| 223 | - # 1. Query in detected language (if it exists in mapping) | ||
| 224 | - detected_lang = parsed_query.detected_language | ||
| 225 | - if detected_lang in available_languages: | ||
| 226 | - target_fields = domain_config.language_field_mapping[detected_lang] | ||
| 227 | - fields_with_boost = self._apply_field_boosts(target_fields) | ||
| 228 | - | ||
| 229 | - should_clauses.append({ | ||
| 230 | - "multi_match": { | ||
| 231 | - "query": parsed_query.rewritten_query, | ||
| 232 | - "fields": fields_with_boost, | ||
| 233 | - "minimum_should_match": "67%", | ||
| 234 | - "tie_breaker": 0.9, | ||
| 235 | - "boost": domain_config.boost * 1.5, # Higher boost for detected language | ||
| 236 | - "_name": f"{domain_config.name}_{detected_lang}_query" | ||
| 237 | - } | ||
| 238 | - }) | ||
| 239 | - logger.debug(f"Added query for detected language '{detected_lang}'") | ||
| 240 | - | ||
| 241 | - # 2. Query in translated languages (only for languages in mapping) | ||
| 242 | - for lang, translation in parsed_query.translations.items(): | ||
| 243 | - # Only use translations for languages that exist in the mapping | ||
| 244 | - if lang in available_languages and translation and translation.strip(): | ||
| 245 | - target_fields = domain_config.language_field_mapping[lang] | ||
| 246 | - fields_with_boost = self._apply_field_boosts(target_fields) | ||
| 247 | - | ||
| 248 | - should_clauses.append({ | ||
| 249 | - "multi_match": { | ||
| 250 | - "query": translation, | ||
| 251 | - "fields": fields_with_boost, | ||
| 252 | - "minimum_should_match": "67%", | ||
| 253 | - "tie_breaker": 0.9, | ||
| 254 | - "boost": domain_config.boost, | ||
| 255 | - "_name": f"{domain_config.name}_{lang}_translated_query" | ||
| 256 | - } | ||
| 257 | - }) | ||
| 258 | - logger.debug(f"Added translated query for language '{lang}'") | ||
| 259 | - | ||
| 260 | - # 3. Fallback: query all fields in mapping if no language-specific query was built | ||
| 261 | - if not should_clauses: | ||
| 262 | - logger.debug("No language mapping matched, using all fields from mapping") | ||
| 263 | - # Use all fields from all languages in the mapping | ||
| 264 | - all_mapped_fields = [] | ||
| 265 | - for lang_fields in domain_config.language_field_mapping.values(): | ||
| 266 | - all_mapped_fields.extend(lang_fields) | ||
| 267 | - # Remove duplicates while preserving order | ||
| 268 | - unique_fields = list(dict.fromkeys(all_mapped_fields)) | ||
| 269 | - fields_with_boost = self._apply_field_boosts(unique_fields) | ||
| 270 | - | ||
| 271 | - should_clauses.append({ | ||
| 272 | - "multi_match": { | ||
| 273 | - "query": parsed_query.rewritten_query, | ||
| 274 | - "fields": fields_with_boost, | ||
| 275 | - "minimum_should_match": "67%", | ||
| 276 | - "tie_breaker": 0.9, | ||
| 277 | - "boost": domain_config.boost * 0.8, # Lower boost for fallback | ||
| 278 | - "_name": f"{domain_config.name}_fallback_query" | ||
| 279 | - } | ||
| 280 | - }) | ||
| 281 | - | ||
| 282 | - if len(should_clauses) == 1: | ||
| 283 | - return should_clauses[0] | ||
| 284 | - else: | ||
| 285 | - return { | ||
| 286 | - "bool": { | ||
| 287 | - "should": should_clauses, | ||
| 288 | - "minimum_should_match": 1 | ||
| 289 | - } | ||
| 290 | - } | ||
| 291 | - | ||
| 292 | - def _apply_field_boosts(self, field_names: List[str]) -> List[str]: | ||
| 293 | - """Apply boost values to field names.""" | ||
| 294 | - result = [] | ||
| 295 | - for field_name in field_names: | ||
| 296 | - field = self._get_field_by_name(field_name) | ||
| 297 | - if field and field.boost != 1.0: | ||
| 298 | - result.append(f"{field_name}^{field.boost}") | ||
| 299 | - else: | ||
| 300 | - result.append(field_name) | ||
| 301 | - return result | ||
| 302 | - | ||
| 303 | - def _build_boolean_query_from_tuple(self, node) -> Dict[str, Any]: | ||
| 304 | - """ | ||
| 305 | - Build query from boolean expression tuple. | ||
| 306 | - | ||
| 307 | - Args: | ||
| 308 | - node: Boolean expression tuple (operator, terms...) | ||
| 309 | - | ||
| 310 | - Returns: | ||
| 311 | - ES query clause | ||
| 312 | - """ | ||
| 313 | - if not node: | ||
| 314 | - return {"match_all": {}} | ||
| 315 | - | ||
| 316 | - # Handle different node types from boolean parser | ||
| 317 | - if hasattr(node, 'operator'): | ||
| 318 | - # QueryNode object | ||
| 319 | - operator = node.operator | ||
| 320 | - terms = node.terms if hasattr(node, 'terms') else None | ||
| 321 | - | ||
| 322 | - # For TERM nodes, check if there's a value | ||
| 323 | - if operator == 'TERM' and hasattr(node, 'value') and node.value: | ||
| 324 | - terms = node.value | ||
| 325 | - elif isinstance(node, tuple) and len(node) > 0: | ||
| 326 | - # Tuple format from boolean parser | ||
| 327 | - if hasattr(node[0], 'operator'): | ||
| 328 | - # Nested tuple with QueryNode | ||
| 329 | - operator = node[0].operator | ||
| 330 | - terms = node[0].terms | ||
| 331 | - elif isinstance(node[0], str): | ||
| 332 | - # Simple tuple like ('TERM', 'field:value') | ||
| 333 | - operator = node[0] | ||
| 334 | - terms = node[1] if len(node) > 1 else '' | ||
| 335 | - else: | ||
| 336 | - # Complex tuple like (OR( TERM(...), TERM(...) ), score) | ||
| 337 | - if hasattr(node[0], '__class__') and hasattr(node[0], '__name__'): | ||
| 338 | - # Constructor call like OR(...) | ||
| 339 | - operator = node[0].__name__ | ||
| 340 | - elif str(node[0]).startswith('('): | ||
| 341 | - # String representation of constructor call | ||
| 342 | - match = re.match(r'(\w+)\(', str(node[0])) | ||
| 343 | - if match: | ||
| 344 | - operator = match.group(1) | ||
| 345 | - else: | ||
| 346 | - return {"match_all": {}} | ||
| 347 | - else: | ||
| 348 | - operator = str(node[0]) | ||
| 349 | - | ||
| 350 | - # Extract terms from nested structure | ||
| 351 | - terms = [] | ||
| 352 | - if len(node) > 1 and isinstance(node[1], tuple): | ||
| 353 | - terms = node[1] | ||
| 354 | - else: | ||
| 355 | - return {"match_all": {}} | ||
| 356 | - | ||
| 357 | - | ||
| 358 | - if operator == 'TERM': | ||
| 359 | - # Leaf node - handle field:query format | ||
| 360 | - if isinstance(terms, str) and ':' in terms: | ||
| 361 | - field, value = terms.split(':', 1) | ||
| 362 | - return { | ||
| 363 | - "term": { | ||
| 364 | - field: value | ||
| 365 | - } | ||
| 366 | - } | ||
| 367 | - elif isinstance(terms, str): | ||
| 368 | - # Simple text term - create match query | ||
| 369 | - return { | ||
| 370 | - "multi_match": { | ||
| 371 | - "query": terms, | ||
| 372 | - "fields": self.match_fields, | ||
| 373 | - "type": "best_fields", | ||
| 374 | - "operator": "AND" | ||
| 375 | - } | ||
| 376 | - } | ||
| 377 | - else: | ||
| 378 | - # Invalid TERM node - return empty match | ||
| 379 | - return { | ||
| 380 | - "match_none": {} | ||
| 381 | - } | ||
| 382 | - | ||
| 383 | - elif operator == 'OR': | ||
| 384 | - # Any term must match | ||
| 385 | - should_clauses = [] | ||
| 386 | - if terms: | ||
| 387 | - for term in terms: | ||
| 388 | - clause = self._build_boolean_query_from_tuple(term) | ||
| 389 | - if clause and clause.get("match_none") is None: | ||
| 390 | - should_clauses.append(clause) | ||
| 391 | - | ||
| 392 | - if should_clauses: | ||
| 393 | - return { | ||
| 394 | - "bool": { | ||
| 395 | - "should": should_clauses, | ||
| 396 | - "minimum_should_match": 1 | ||
| 397 | - } | ||
| 398 | - } | ||
| 399 | - else: | ||
| 400 | - return {"match_none": {}} | ||
| 401 | - | ||
| 402 | - elif operator == 'AND': | ||
| 403 | - # All terms must match | ||
| 404 | - must_clauses = [] | ||
| 405 | - if terms: | ||
| 406 | - for term in terms: | ||
| 407 | - clause = self._build_boolean_query_from_tuple(term) | ||
| 408 | - if clause and clause.get("match_none") is None: | ||
| 409 | - must_clauses.append(clause) | ||
| 410 | - | ||
| 411 | - if must_clauses: | ||
| 412 | - return { | ||
| 413 | - "bool": { | ||
| 414 | - "must": must_clauses | ||
| 415 | - } | ||
| 416 | - } | ||
| 417 | - else: | ||
| 418 | - return {"match_none": {}} | ||
| 419 | - | ||
| 420 | - elif operator == 'ANDNOT': | ||
| 421 | - # First term must match, second must not | ||
| 422 | - if len(terms) >= 2: | ||
| 423 | - return { | ||
| 424 | - "bool": { | ||
| 425 | - "must": [self._build_boolean_query_from_tuple(terms[0])], | ||
| 426 | - "must_not": [self._build_boolean_query_from_tuple(terms[1])] | ||
| 427 | - } | ||
| 428 | - } | ||
| 429 | - else: | ||
| 430 | - return self._build_boolean_query_from_tuple(terms[0]) | ||
| 431 | - | ||
| 432 | - elif operator == 'RANK': | ||
| 433 | - # Like OR but for ranking (all terms contribute to score) | ||
| 434 | - should_clauses = [] | ||
| 435 | - for term in terms: | ||
| 436 | - should_clauses.append(self._build_boolean_query_from_tuple(term)) | ||
| 437 | - return { | ||
| 438 | - "bool": { | ||
| 439 | - "should": should_clauses | ||
| 440 | - } | ||
| 441 | - } | ||
| 442 | - | ||
| 443 | - else: | ||
| 444 | - # Unknown operator | ||
| 445 | - return {"match_all": {}} | ||
| 446 | - | ||
| 447 | - def get_domain_summary(self) -> Dict[str, Any]: | ||
| 448 | - """Get summary of all configured domains.""" | ||
| 449 | - summary = {} | ||
| 450 | - for domain_name, domain_config in self.domain_configs.items(): | ||
| 451 | - summary[domain_name] = { | ||
| 452 | - "label": domain_config.label, | ||
| 453 | - "fields": domain_config.fields, | ||
| 454 | - "analyzer": domain_config.analyzer.value, | ||
| 455 | - "boost": domain_config.boost, | ||
| 456 | - "has_multilang_mapping": domain_config.language_field_mapping is not None, | ||
| 457 | - "supported_languages": list(domain_config.language_field_mapping.keys()) if domain_config.language_field_mapping else [] | ||
| 458 | - } | ||
| 459 | - return summary | ||
| 460 | \ No newline at end of file | 0 | \ No newline at end of file |
search/query_config.py
| @@ -17,14 +17,24 @@ TEXT_EMBEDDING_FIELD = "title_embedding" | @@ -17,14 +17,24 @@ TEXT_EMBEDDING_FIELD = "title_embedding" | ||
| 17 | IMAGE_EMBEDDING_FIELD = "image_embedding" | 17 | IMAGE_EMBEDDING_FIELD = "image_embedding" |
| 18 | 18 | ||
| 19 | # Default match fields for text search (with boost) | 19 | # Default match fields for text search (with boost) |
| 20 | +# 文本召回:同时搜索中英文字段,两者相互补充 | ||
| 20 | DEFAULT_MATCH_FIELDS = [ | 21 | DEFAULT_MATCH_FIELDS = [ |
| 22 | + # 中文字段 | ||
| 21 | "title_zh^3.0", | 23 | "title_zh^3.0", |
| 22 | "brief_zh^1.5", | 24 | "brief_zh^1.5", |
| 23 | "description_zh^1.0", | 25 | "description_zh^1.0", |
| 24 | "vendor_zh^1.5", | 26 | "vendor_zh^1.5", |
| 25 | - "tags^1.0", | ||
| 26 | "category_path_zh^1.5", | 27 | "category_path_zh^1.5", |
| 27 | - "category_name_zh^1.5" | 28 | + "category_name_zh^1.5", |
| 29 | + # 英文字段 | ||
| 30 | + "title_en^3.0", | ||
| 31 | + "brief_en^1.5", | ||
| 32 | + "description_en^1.0", | ||
| 33 | + "vendor_en^1.5", | ||
| 34 | + "category_path_en^1.5", | ||
| 35 | + "category_name_en^1.5", | ||
| 36 | + # 语言无关字段 | ||
| 37 | + "tags^1.0", | ||
| 28 | ] | 38 | ] |
| 29 | 39 | ||
| 30 | # Domain-specific match fields | 40 | # Domain-specific match fields |
search/searcher.py
| @@ -13,7 +13,6 @@ from query import QueryParser, ParsedQuery | @@ -13,7 +13,6 @@ from query import QueryParser, ParsedQuery | ||
| 13 | from embeddings import CLIPImageEncoder | 13 | from embeddings import CLIPImageEncoder |
| 14 | from .boolean_parser import BooleanParser, QueryNode | 14 | from .boolean_parser import BooleanParser, QueryNode |
| 15 | from .es_query_builder import ESQueryBuilder | 15 | from .es_query_builder import ESQueryBuilder |
| 16 | -from .multilang_query_builder import MultiLanguageQueryBuilder | ||
| 17 | from .rerank_engine import RerankEngine | 16 | from .rerank_engine import RerankEngine |
| 18 | from .query_config import ( | 17 | from .query_config import ( |
| 19 | DEFAULT_INDEX_NAME, | 18 | DEFAULT_INDEX_NAME, |
| @@ -112,8 +111,8 @@ class Searcher: | @@ -112,8 +111,8 @@ class Searcher: | ||
| 112 | self.text_embedding_field = TEXT_EMBEDDING_FIELD | 111 | self.text_embedding_field = TEXT_EMBEDDING_FIELD |
| 113 | self.image_embedding_field = IMAGE_EMBEDDING_FIELD | 112 | self.image_embedding_field = IMAGE_EMBEDDING_FIELD |
| 114 | 113 | ||
| 115 | - # Query builder - use multi-language version | ||
| 116 | - self.query_builder = MultiLanguageQueryBuilder( | 114 | + # Query builder - simplified single-layer architecture |
| 115 | + self.query_builder = ESQueryBuilder( | ||
| 117 | index_name=index_name, | 116 | index_name=index_name, |
| 118 | match_fields=self.match_fields, | 117 | match_fields=self.match_fields, |
| 119 | text_embedding_field=self.text_embedding_field, | 118 | text_embedding_field=self.text_embedding_field, |
| @@ -274,8 +273,8 @@ class Searcher: | @@ -274,8 +273,8 @@ class Searcher: | ||
| 274 | filters = {} | 273 | filters = {} |
| 275 | filters['tenant_id'] = tenant_id | 274 | filters['tenant_id'] = tenant_id |
| 276 | 275 | ||
| 277 | - es_query = self.query_builder.build_multilang_query( | ||
| 278 | - parsed_query=parsed_query, | 276 | + es_query = self.query_builder.build_query( |
| 277 | + query_text=parsed_query.rewritten_query or parsed_query.normalized_query, | ||
| 279 | query_vector=parsed_query.query_vector if enable_embedding else None, | 278 | query_vector=parsed_query.query_vector if enable_embedding else None, |
| 280 | query_node=query_node, | 279 | query_node=query_node, |
| 281 | filters=filters, | 280 | filters=filters, |