Commit f0d020c3ae325deb3921307bcd2480d628cc5a7a
1 parent
577ec972
多语言查询改为只支持中英文两种,查询结构为 filters and (text_recall or embedding_recall),并通过 function_score 支持新鲜度等提权字段
设计要点:
1. 前端传递的过滤条件永远要起作用。
2. 召回模块包括文本相关性召回(中英文字段都使用)和向量召回,两者相互补充。
3. 套用 function_score 以支持两种打分融合和各种提权字段。
4. 只需要 build_query 这一层。

实际操作:
1. 架构简化
   - 移除了 MultiLanguageQueryBuilder 层级
   - 只保留单层的 ESQueryBuilder.build_query 方法
2. 查询结构重构
   实现了 filters and (text_recall or embedding_recall) 结构:
   - 前端过滤条件:永远起作用(放在 filter 中)
   - 文本召回:同时搜索中英文字段(multi_match 覆盖 title_zh/en, brief_zh/en 等)
   - 向量召回:KNN 查询(独立参数,ES 会自动合并)
   - function_score:包装召回部分,支持提权字段配置
3. 文本匹配字段更新
   在 DEFAULT_MATCH_FIELDS 中添加了中英文字段:
   - 中文:title_zh, brief_zh, description_zh, vendor_zh, category_path_zh, category_name_zh
   - 英文:title_en, brief_en, description_en, vendor_en, category_path_en, category_name_en
   - 语言无关:tags
4. function_score 框架保留
   - 保留了 function_score 配置框架(FUNCTION_SCORE_CONFIG)
   - 支持 filter_weight、field_value_factor、decay 等提权函数
   - 可以从配置中扩展提权字段
5. 测试验证
   所有功能测试通过:
   - 基本文本搜索
   - 带过滤条件的搜索
   - 范围过滤
   - 分面搜索
   - 英文查询
Showing
4 changed files
with
176 additions
and
490 deletions
Show diff stats
search/es_query_builder.py
| ... | ... | @@ -2,11 +2,16 @@ |
| 2 | 2 | Elasticsearch query builder. |
| 3 | 3 | |
| 4 | 4 | Converts parsed queries and search parameters into ES DSL queries. |
| 5 | + | |
| 6 | +Simplified architecture: | |
| 7 | +- filters and (text_recall or embedding_recall) | |
| 8 | +- function_score wrapper for boosting fields | |
| 5 | 9 | """ |
| 6 | 10 | |
| 7 | 11 | from typing import Dict, Any, List, Optional, Union |
| 8 | 12 | import numpy as np |
| 9 | 13 | from .boolean_parser import QueryNode |
| 14 | +from .query_config import FUNCTION_SCORE_CONFIG | |
| 10 | 15 | |
| 11 | 16 | |
| 12 | 17 | class ESQueryBuilder: |
| ... | ... | @@ -51,14 +56,20 @@ class ESQueryBuilder: |
| 51 | 56 | min_score: Optional[float] = None |
| 52 | 57 | ) -> Dict[str, Any]: |
| 53 | 58 | """ |
| 54 | - Build complete ES query (重构版). | |
| 59 | + Build complete ES query (简化版). | |
| 60 | + | |
| 61 | + 结构:filters and (text_recall or embedding_recall) | |
| 62 | + - filters: 前端传递的过滤条件永远起作用 | |
| 63 | + - text_recall: 文本相关性召回(中英文字段都用) | |
| 64 | + - embedding_recall: 向量召回(KNN) | |
| 65 | + - function_score: 包装召回部分,支持提权字段 | |
| 55 | 66 | |
| 56 | 67 | Args: |
| 57 | 68 | query_text: Query text for BM25 matching |
| 58 | 69 | query_vector: Query embedding for KNN search |
| 59 | 70 | query_node: Parsed boolean expression tree |
| 60 | - filters: Exact match filters | |
| 61 | - range_filters: Range filters for numeric fields | |
| 71 | + filters: Exact match filters (always applied) | |
| 72 | + range_filters: Range filters for numeric fields (always applied) | |
| 62 | 73 | size: Number of results |
| 63 | 74 | from_: Offset for pagination |
| 64 | 75 | enable_knn: Whether to use KNN search |
| ... | ... | @@ -80,44 +91,161 @@ class ESQueryBuilder: |
| 80 | 91 | "includes": self.source_fields |
| 81 | 92 | } |
| 82 | 93 | |
| 83 | - # Build main query | |
| 84 | - if query_node and query_node.operator != 'TERM': | |
| 85 | - # Complex boolean query | |
| 86 | - query_clause = self._build_boolean_query(query_node) | |
| 87 | - else: | |
| 88 | - # Simple text query | |
| 89 | - query_clause = self._build_text_query(query_text) | |
| 90 | - | |
| 91 | - # Add filters if provided | |
| 92 | - if filters or range_filters: | |
| 93 | - filter_clauses = self._build_filters(filters, range_filters) | |
| 94 | + # 1. Build recall queries (text or embedding) | |
| 95 | + recall_clauses = [] | |
| 96 | + | |
| 97 | + # Text recall (always include if query_text exists) | |
| 98 | + if query_text: | |
| 99 | + if query_node and query_node.operator != 'TERM': | |
| 100 | + # Complex boolean query | |
| 101 | + text_query = self._build_boolean_query(query_node) | |
| 102 | + else: | |
| 103 | + # Simple text query | |
| 104 | + text_query = self._build_text_query(query_text) | |
| 105 | + recall_clauses.append(text_query) | |
| 106 | + | |
| 107 | + # Embedding recall (KNN - separate from query, handled below) | |
| 108 | + has_embedding = enable_knn and query_vector is not None and self.text_embedding_field | |
| 109 | + | |
| 110 | + # 2. Build filter clauses (always applied) | |
| 111 | + filter_clauses = self._build_filters(filters, range_filters) | |
| 112 | + | |
| 113 | + # 3. Build main query structure: filters and recall | |
| 114 | + if recall_clauses: | |
| 115 | + # Combine text recalls with OR logic (if multiple) | |
| 116 | + if len(recall_clauses) == 1: | |
| 117 | + recall_query = recall_clauses[0] | |
| 118 | + else: | |
| 119 | + recall_query = { | |
| 120 | + "bool": { | |
| 121 | + "should": recall_clauses, | |
| 122 | + "minimum_should_match": 1 | |
| 123 | + } | |
| 124 | + } | |
| 125 | + | |
| 126 | + # Wrap recall with function_score for boosting | |
| 127 | + recall_query = self._wrap_with_function_score(recall_query) | |
| 128 | + | |
| 129 | + # Combine filters and recall | |
| 94 | 130 | if filter_clauses: |
| 95 | 131 | es_query["query"] = { |
| 96 | 132 | "bool": { |
| 97 | - "must": [query_clause], | |
| 133 | + "must": [recall_query], | |
| 98 | 134 | "filter": filter_clauses |
| 99 | 135 | } |
| 100 | 136 | } |
| 101 | 137 | else: |
| 102 | - es_query["query"] = query_clause | |
| 138 | + es_query["query"] = recall_query | |
| 103 | 139 | else: |
| 104 | - es_query["query"] = query_clause | |
| 140 | + # No recall queries, only filters (match_all filtered) | |
| 141 | + if filter_clauses: | |
| 142 | + es_query["query"] = { | |
| 143 | + "bool": { | |
| 144 | + "must": [{"match_all": {}}], | |
| 145 | + "filter": filter_clauses | |
| 146 | + } | |
| 147 | + } | |
| 148 | + else: | |
| 149 | + es_query["query"] = {"match_all": {}} | |
| 105 | 150 | |
| 106 | - # Add KNN search if enabled and vector provided | |
| 107 | - if enable_knn and query_vector is not None and self.text_embedding_field: | |
| 151 | + # 4. Add KNN search if enabled (separate from query, ES will combine) | |
| 152 | + if has_embedding: | |
| 108 | 153 | knn_clause = { |
| 109 | 154 | "field": self.text_embedding_field, |
| 110 | 155 | "query_vector": query_vector.tolist(), |
| 111 | 156 | "k": knn_k, |
| 112 | - "num_candidates": knn_num_candidates | |
| 157 | + "num_candidates": knn_num_candidates, | |
| 158 | + "boost": 0.2 # Lower boost for embedding recall | |
| 113 | 159 | } |
| 114 | 160 | es_query["knn"] = knn_clause |
| 115 | 161 | |
| 116 | - # Add minimum score filter | |
| 162 | + # 5. Add minimum score filter | |
| 117 | 163 | if min_score is not None: |
| 118 | 164 | es_query["min_score"] = min_score |
| 119 | 165 | |
| 120 | 166 | return es_query |
| 167 | + | |
| 168 | + def _wrap_with_function_score(self, query: Dict[str, Any]) -> Dict[str, Any]: | |
| 169 | + """ | |
| 170 | + Wrap query with function_score for boosting fields. | |
| 171 | + | |
| 172 | + Args: | |
| 173 | + query: Base query to wrap | |
| 174 | + | |
| 175 | + Returns: | |
| 176 | + Function score query or original query if no functions configured | |
| 177 | + """ | |
| 178 | + functions = self._build_score_functions() | |
| 179 | + | |
| 180 | + # If no functions configured, return original query | |
| 181 | + if not functions: | |
| 182 | + return query | |
| 183 | + | |
| 184 | + # Build function_score query | |
| 185 | + function_score_query = { | |
| 186 | + "function_score": { | |
| 187 | + "query": query, | |
| 188 | + "functions": functions, | |
| 189 | + "score_mode": FUNCTION_SCORE_CONFIG.get("score_mode", "sum"), | |
| 190 | + "boost_mode": FUNCTION_SCORE_CONFIG.get("boost_mode", "multiply") | |
| 191 | + } | |
| 192 | + } | |
| 193 | + | |
| 194 | + return function_score_query | |
| 195 | + | |
| 196 | + def _build_score_functions(self) -> List[Dict[str, Any]]: | |
| 197 | + """ | |
| 198 | + Build function_score functions from config. | |
| 199 | + | |
| 200 | + Returns: | |
| 201 | + List of function score functions | |
| 202 | + """ | |
| 203 | + functions = [] | |
| 204 | + config_functions = FUNCTION_SCORE_CONFIG.get("functions", []) | |
| 205 | + | |
| 206 | + for func_config in config_functions: | |
| 207 | + func_type = func_config.get("type") | |
| 208 | + | |
| 209 | + if func_type == "filter_weight": | |
| 210 | + # Filter + Weight | |
| 211 | + functions.append({ | |
| 212 | + "filter": func_config["filter"], | |
| 213 | + "weight": func_config.get("weight", 1.0) | |
| 214 | + }) | |
| 215 | + | |
| 216 | + elif func_type == "field_value_factor": | |
| 217 | + # Field Value Factor | |
| 218 | + functions.append({ | |
| 219 | + "field_value_factor": { | |
| 220 | + "field": func_config["field"], | |
| 221 | + "factor": func_config.get("factor", 1.0), | |
| 222 | + "modifier": func_config.get("modifier", "none"), | |
| 223 | + "missing": func_config.get("missing", 1.0) | |
| 224 | + } | |
| 225 | + }) | |
| 226 | + | |
| 227 | + elif func_type == "decay": | |
| 228 | + # Decay Function (gauss/exp/linear) | |
| 229 | + decay_func = func_config.get("function", "gauss") | |
| 230 | + field = func_config["field"] | |
| 231 | + | |
| 232 | + decay_params = { | |
| 233 | + "origin": func_config.get("origin", "now"), | |
| 234 | + "scale": func_config["scale"] | |
| 235 | + } | |
| 236 | + | |
| 237 | + if "offset" in func_config: | |
| 238 | + decay_params["offset"] = func_config["offset"] | |
| 239 | + if "decay" in func_config: | |
| 240 | + decay_params["decay"] = func_config["decay"] | |
| 241 | + | |
| 242 | + functions.append({ | |
| 243 | + decay_func: { | |
| 244 | + field: decay_params | |
| 245 | + } | |
| 246 | + }) | |
| 247 | + | |
| 248 | + return functions | |
| 121 | 249 | |
| 122 | 250 | def _build_text_query(self, query_text: str) -> Dict[str, Any]: |
| 123 | 251 | """ |
| ... | ... | @@ -235,11 +363,19 @@ class ESQueryBuilder: |
| 235 | 363 | "term": {field: value} |
| 236 | 364 | }) |
| 237 | 365 | |
| 238 | - # 2. 处理范围过滤(RangeFilter Pydantic 模型) | |
| 366 | + # 2. 处理范围过滤(支持 RangeFilter Pydantic 模型或字典) | |
| 239 | 367 | if range_filters: |
| 240 | 368 | for field, range_filter in range_filters.items(): |
| 241 | - # 将 RangeFilter 模型转换为字典 | |
| 242 | - range_dict = range_filter.model_dump(exclude_none=True) | |
| 369 | + # 支持 Pydantic 模型或字典格式 | |
| 370 | + if hasattr(range_filter, 'model_dump'): | |
| 371 | + # Pydantic 模型 | |
| 372 | + range_dict = range_filter.model_dump(exclude_none=True) | |
| 373 | + elif isinstance(range_filter, dict): | |
| 374 | + # 已经是字典格式 | |
| 375 | + range_dict = {k: v for k, v in range_filter.items() if v is not None} | |
| 376 | + else: | |
| 377 | + # 其他格式,跳过 | |
| 378 | + continue | |
| 243 | 379 | |
| 244 | 380 | if range_dict: |
| 245 | 381 | filter_clauses.append({ | ... | ... |
search/multilang_query_builder.py deleted
| ... | ... | @@ -1,459 +0,0 @@ |
| 1 | -""" | |
| 2 | -Multi-language query builder for handling domain-specific searches. | |
| 3 | - | |
| 4 | -This module extends the ESQueryBuilder to support multi-language field mappings, | |
| 5 | -allowing queries to be routed to appropriate language-specific fields while | |
| 6 | -maintaining a unified external interface. | |
| 7 | -""" | |
| 8 | - | |
| 9 | -from typing import Dict, Any, List, Optional | |
| 10 | -import numpy as np | |
| 11 | -import logging | |
| 12 | -import re | |
| 13 | - | |
| 14 | -from query import ParsedQuery | |
| 15 | -from .es_query_builder import ESQueryBuilder | |
| 16 | -from .query_config import DEFAULT_MATCH_FIELDS, DOMAIN_FIELDS, FUNCTION_SCORE_CONFIG | |
| 17 | - | |
| 18 | -logger = logging.getLogger(__name__) | |
| 19 | - | |
| 20 | - | |
| 21 | -class MultiLanguageQueryBuilder(ESQueryBuilder): | |
| 22 | - """ | |
| 23 | - Enhanced query builder with multi-language support. | |
| 24 | - | |
| 25 | - Handles routing queries to appropriate language-specific fields based on: | |
| 26 | - 1. Detected query language | |
| 27 | - 2. Available translations | |
| 28 | - 3. Domain configuration (language_field_mapping) | |
| 29 | - """ | |
| 30 | - | |
| 31 | - def __init__( | |
| 32 | - self, | |
| 33 | - index_name: str, | |
| 34 | - match_fields: Optional[List[str]] = None, | |
| 35 | - text_embedding_field: Optional[str] = None, | |
| 36 | - image_embedding_field: Optional[str] = None, | |
| 37 | - source_fields: Optional[List[str]] = None | |
| 38 | - ): | |
| 39 | - """ | |
| 40 | - Initialize multi-language query builder. | |
| 41 | - | |
| 42 | - Args: | |
| 43 | - index_name: ES index name | |
| 44 | - match_fields: Fields to search for text matching (default: from query_config) | |
| 45 | - text_embedding_field: Field name for text embeddings | |
| 46 | - image_embedding_field: Field name for image embeddings | |
| 47 | - source_fields: Fields to return in search results (_source includes) | |
| 48 | - """ | |
| 49 | - self.function_score_config = FUNCTION_SCORE_CONFIG | |
| 50 | - | |
| 51 | - # Use provided match_fields or default | |
| 52 | - if match_fields is None: | |
| 53 | - match_fields = DEFAULT_MATCH_FIELDS | |
| 54 | - | |
| 55 | - super().__init__( | |
| 56 | - index_name=index_name, | |
| 57 | - match_fields=match_fields, | |
| 58 | - text_embedding_field=text_embedding_field, | |
| 59 | - image_embedding_field=image_embedding_field, | |
| 60 | - source_fields=source_fields | |
| 61 | - ) | |
| 62 | - | |
| 63 | - # Build domain configurations from query_config | |
| 64 | - self.domain_configs = DOMAIN_FIELDS | |
| 65 | - | |
| 66 | - def _get_domain_fields(self, domain_name: str) -> List[str]: | |
| 67 | - """Get fields for a specific domain with boost notation.""" | |
| 68 | - return self.domain_configs.get(domain_name, DEFAULT_MATCH_FIELDS) | |
| 69 | - | |
| 70 | - def build_multilang_query( | |
| 71 | - self, | |
| 72 | - parsed_query: ParsedQuery, | |
| 73 | - query_vector: Optional[np.ndarray] = None, | |
| 74 | - query_node: Optional[Any] = None, | |
| 75 | - filters: Optional[Dict[str, Any]] = None, | |
| 76 | - range_filters: Optional[Dict[str, Any]] = None, | |
| 77 | - size: int = 10, | |
| 78 | - from_: int = 0, | |
| 79 | - enable_knn: bool = True, | |
| 80 | - knn_k: int = 50, | |
| 81 | - knn_num_candidates: int = 200, | |
| 82 | - min_score: Optional[float] = None | |
| 83 | - ) -> Dict[str, Any]: | |
| 84 | - """ | |
| 85 | - Build ES query with multi-language support (简化版). | |
| 86 | - | |
| 87 | - Args: | |
| 88 | - parsed_query: Parsed query with language info and translations | |
| 89 | - query_vector: Query embedding for KNN search | |
| 90 | - filters: Exact match filters | |
| 91 | - range_filters: Range filters for numeric fields | |
| 92 | - size: Number of results | |
| 93 | - from_: Offset for pagination | |
| 94 | - enable_knn: Whether to use KNN search | |
| 95 | - knn_k: K value for KNN | |
| 96 | - knn_num_candidates: Number of candidates for KNN | |
| 97 | - min_score: Minimum score threshold | |
| 98 | - | |
| 99 | - Returns: | |
| 100 | - ES query DSL dictionary | |
| 101 | - """ | |
| 102 | - # 1. 根据域选择匹配字段(默认域使用 DEFAULT_MATCH_FIELDS) | |
| 103 | - domain = parsed_query.domain or "default" | |
| 104 | - domain_fields = self.domain_configs.get(domain) or DEFAULT_MATCH_FIELDS | |
| 105 | - | |
| 106 | - # 2. 临时切换 match_fields,复用基类 build_query 逻辑 | |
| 107 | - original_match_fields = self.match_fields | |
| 108 | - self.match_fields = domain_fields | |
| 109 | - try: | |
| 110 | - return super().build_query( | |
| 111 | - query_text=parsed_query.rewritten_query or parsed_query.normalized_query, | |
| 112 | - query_vector=query_vector, | |
| 113 | - query_node=query_node, | |
| 114 | - filters=filters, | |
| 115 | - range_filters=range_filters, | |
| 116 | - size=size, | |
| 117 | - from_=from_, | |
| 118 | - enable_knn=enable_knn, | |
| 119 | - knn_k=knn_k, | |
| 120 | - knn_num_candidates=knn_num_candidates, | |
| 121 | - min_score=min_score | |
| 122 | - ) | |
| 123 | - finally: | |
| 124 | - # 恢复原始配置,避免影响后续查询 | |
| 125 | - self.match_fields = original_match_fields | |
| 126 | - | |
| 127 | - def _build_score_functions(self) -> List[Dict[str, Any]]: | |
| 128 | - """ | |
| 129 | - 从配置构建 function_score 的打分函数列表 | |
| 130 | - | |
| 131 | - Returns: | |
| 132 | - 打分函数列表(ES原生格式) | |
| 133 | - """ | |
| 134 | - if not self.function_score_config or not self.function_score_config.functions: | |
| 135 | - return [] | |
| 136 | - | |
| 137 | - functions = [] | |
| 138 | - | |
| 139 | - for func_config in self.function_score_config.functions: | |
| 140 | - func_type = func_config.get('type') | |
| 141 | - | |
| 142 | - if func_type == 'filter_weight': | |
| 143 | - # Filter + Weight | |
| 144 | - functions.append({ | |
| 145 | - "filter": func_config['filter'], | |
| 146 | - "weight": func_config.get('weight', 1.0) | |
| 147 | - }) | |
| 148 | - | |
| 149 | - elif func_type == 'field_value_factor': | |
| 150 | - # Field Value Factor | |
| 151 | - functions.append({ | |
| 152 | - "field_value_factor": { | |
| 153 | - "field": func_config['field'], | |
| 154 | - "factor": func_config.get('factor', 1.0), | |
| 155 | - "modifier": func_config.get('modifier', 'none'), | |
| 156 | - "missing": func_config.get('missing', 1.0) | |
| 157 | - } | |
| 158 | - }) | |
| 159 | - | |
| 160 | - elif func_type == 'decay': | |
| 161 | - # Decay Function (gauss/exp/linear) | |
| 162 | - decay_func = func_config.get('function', 'gauss') | |
| 163 | - field = func_config['field'] | |
| 164 | - | |
| 165 | - decay_params = { | |
| 166 | - "origin": func_config.get('origin', 'now'), | |
| 167 | - "scale": func_config['scale'] | |
| 168 | - } | |
| 169 | - | |
| 170 | - if 'offset' in func_config: | |
| 171 | - decay_params['offset'] = func_config['offset'] | |
| 172 | - if 'decay' in func_config: | |
| 173 | - decay_params['decay'] = func_config['decay'] | |
| 174 | - | |
| 175 | - functions.append({ | |
| 176 | - decay_func: { | |
| 177 | - field: decay_params | |
| 178 | - } | |
| 179 | - }) | |
| 180 | - | |
| 181 | - return functions | |
| 182 | - | |
| 183 | - def _build_multilang_text_query( | |
| 184 | - self, | |
| 185 | - parsed_query: ParsedQuery, | |
| 186 | - domain_config: Dict[str, Any] | |
| 187 | - ) -> Dict[str, Any]: | |
| 188 | - """ | |
| 189 | - Build text query with multi-language field routing. | |
| 190 | - | |
| 191 | - Args: | |
| 192 | - parsed_query: Parsed query with language info | |
| 193 | - domain_config: Domain configuration | |
| 194 | - | |
| 195 | - Returns: | |
| 196 | - ES query clause | |
| 197 | - """ | |
| 198 | - if not domain_config.language_field_mapping: | |
| 199 | - # No multi-language mapping, use all fields with default analyzer | |
| 200 | - fields_with_boost = [] | |
| 201 | - for field_name in domain_config.fields: | |
| 202 | - field = self._get_field_by_name(field_name) | |
| 203 | - if field and field.boost != 1.0: | |
| 204 | - fields_with_boost.append(f"{field_name}^{field.boost}") | |
| 205 | - else: | |
| 206 | - fields_with_boost.append(field_name) | |
| 207 | - | |
| 208 | - return { | |
| 209 | - "multi_match": { | |
| 210 | - "query": parsed_query.rewritten_query, | |
| 211 | - "fields": fields_with_boost, | |
| 212 | - "minimum_should_match": "67%", | |
| 213 | - "tie_breaker": 0.9, | |
| 214 | - "boost": domain_config.boost, | |
| 215 | - "_name": f"{domain_config.name}_query" | |
| 216 | - } | |
| 217 | - } | |
| 218 | - | |
| 219 | - # Multi-language mapping exists - build targeted queries | |
| 220 | - should_clauses = [] | |
| 221 | - available_languages = set(domain_config.language_field_mapping.keys()) | |
| 222 | - | |
| 223 | - # 1. Query in detected language (if it exists in mapping) | |
| 224 | - detected_lang = parsed_query.detected_language | |
| 225 | - if detected_lang in available_languages: | |
| 226 | - target_fields = domain_config.language_field_mapping[detected_lang] | |
| 227 | - fields_with_boost = self._apply_field_boosts(target_fields) | |
| 228 | - | |
| 229 | - should_clauses.append({ | |
| 230 | - "multi_match": { | |
| 231 | - "query": parsed_query.rewritten_query, | |
| 232 | - "fields": fields_with_boost, | |
| 233 | - "minimum_should_match": "67%", | |
| 234 | - "tie_breaker": 0.9, | |
| 235 | - "boost": domain_config.boost * 1.5, # Higher boost for detected language | |
| 236 | - "_name": f"{domain_config.name}_{detected_lang}_query" | |
| 237 | - } | |
| 238 | - }) | |
| 239 | - logger.debug(f"Added query for detected language '{detected_lang}'") | |
| 240 | - | |
| 241 | - # 2. Query in translated languages (only for languages in mapping) | |
| 242 | - for lang, translation in parsed_query.translations.items(): | |
| 243 | - # Only use translations for languages that exist in the mapping | |
| 244 | - if lang in available_languages and translation and translation.strip(): | |
| 245 | - target_fields = domain_config.language_field_mapping[lang] | |
| 246 | - fields_with_boost = self._apply_field_boosts(target_fields) | |
| 247 | - | |
| 248 | - should_clauses.append({ | |
| 249 | - "multi_match": { | |
| 250 | - "query": translation, | |
| 251 | - "fields": fields_with_boost, | |
| 252 | - "minimum_should_match": "67%", | |
| 253 | - "tie_breaker": 0.9, | |
| 254 | - "boost": domain_config.boost, | |
| 255 | - "_name": f"{domain_config.name}_{lang}_translated_query" | |
| 256 | - } | |
| 257 | - }) | |
| 258 | - logger.debug(f"Added translated query for language '{lang}'") | |
| 259 | - | |
| 260 | - # 3. Fallback: query all fields in mapping if no language-specific query was built | |
| 261 | - if not should_clauses: | |
| 262 | - logger.debug("No language mapping matched, using all fields from mapping") | |
| 263 | - # Use all fields from all languages in the mapping | |
| 264 | - all_mapped_fields = [] | |
| 265 | - for lang_fields in domain_config.language_field_mapping.values(): | |
| 266 | - all_mapped_fields.extend(lang_fields) | |
| 267 | - # Remove duplicates while preserving order | |
| 268 | - unique_fields = list(dict.fromkeys(all_mapped_fields)) | |
| 269 | - fields_with_boost = self._apply_field_boosts(unique_fields) | |
| 270 | - | |
| 271 | - should_clauses.append({ | |
| 272 | - "multi_match": { | |
| 273 | - "query": parsed_query.rewritten_query, | |
| 274 | - "fields": fields_with_boost, | |
| 275 | - "minimum_should_match": "67%", | |
| 276 | - "tie_breaker": 0.9, | |
| 277 | - "boost": domain_config.boost * 0.8, # Lower boost for fallback | |
| 278 | - "_name": f"{domain_config.name}_fallback_query" | |
| 279 | - } | |
| 280 | - }) | |
| 281 | - | |
| 282 | - if len(should_clauses) == 1: | |
| 283 | - return should_clauses[0] | |
| 284 | - else: | |
| 285 | - return { | |
| 286 | - "bool": { | |
| 287 | - "should": should_clauses, | |
| 288 | - "minimum_should_match": 1 | |
| 289 | - } | |
| 290 | - } | |
| 291 | - | |
| 292 | - def _apply_field_boosts(self, field_names: List[str]) -> List[str]: | |
| 293 | - """Apply boost values to field names.""" | |
| 294 | - result = [] | |
| 295 | - for field_name in field_names: | |
| 296 | - field = self._get_field_by_name(field_name) | |
| 297 | - if field and field.boost != 1.0: | |
| 298 | - result.append(f"{field_name}^{field.boost}") | |
| 299 | - else: | |
| 300 | - result.append(field_name) | |
| 301 | - return result | |
| 302 | - | |
| 303 | - def _build_boolean_query_from_tuple(self, node) -> Dict[str, Any]: | |
| 304 | - """ | |
| 305 | - Build query from boolean expression tuple. | |
| 306 | - | |
| 307 | - Args: | |
| 308 | - node: Boolean expression tuple (operator, terms...) | |
| 309 | - | |
| 310 | - Returns: | |
| 311 | - ES query clause | |
| 312 | - """ | |
| 313 | - if not node: | |
| 314 | - return {"match_all": {}} | |
| 315 | - | |
| 316 | - # Handle different node types from boolean parser | |
| 317 | - if hasattr(node, 'operator'): | |
| 318 | - # QueryNode object | |
| 319 | - operator = node.operator | |
| 320 | - terms = node.terms if hasattr(node, 'terms') else None | |
| 321 | - | |
| 322 | - # For TERM nodes, check if there's a value | |
| 323 | - if operator == 'TERM' and hasattr(node, 'value') and node.value: | |
| 324 | - terms = node.value | |
| 325 | - elif isinstance(node, tuple) and len(node) > 0: | |
| 326 | - # Tuple format from boolean parser | |
| 327 | - if hasattr(node[0], 'operator'): | |
| 328 | - # Nested tuple with QueryNode | |
| 329 | - operator = node[0].operator | |
| 330 | - terms = node[0].terms | |
| 331 | - elif isinstance(node[0], str): | |
| 332 | - # Simple tuple like ('TERM', 'field:value') | |
| 333 | - operator = node[0] | |
| 334 | - terms = node[1] if len(node) > 1 else '' | |
| 335 | - else: | |
| 336 | - # Complex tuple like (OR( TERM(...), TERM(...) ), score) | |
| 337 | - if hasattr(node[0], '__class__') and hasattr(node[0], '__name__'): | |
| 338 | - # Constructor call like OR(...) | |
| 339 | - operator = node[0].__name__ | |
| 340 | - elif str(node[0]).startswith('('): | |
| 341 | - # String representation of constructor call | |
| 342 | - match = re.match(r'(\w+)\(', str(node[0])) | |
| 343 | - if match: | |
| 344 | - operator = match.group(1) | |
| 345 | - else: | |
| 346 | - return {"match_all": {}} | |
| 347 | - else: | |
| 348 | - operator = str(node[0]) | |
| 349 | - | |
| 350 | - # Extract terms from nested structure | |
| 351 | - terms = [] | |
| 352 | - if len(node) > 1 and isinstance(node[1], tuple): | |
| 353 | - terms = node[1] | |
| 354 | - else: | |
| 355 | - return {"match_all": {}} | |
| 356 | - | |
| 357 | - | |
| 358 | - if operator == 'TERM': | |
| 359 | - # Leaf node - handle field:query format | |
| 360 | - if isinstance(terms, str) and ':' in terms: | |
| 361 | - field, value = terms.split(':', 1) | |
| 362 | - return { | |
| 363 | - "term": { | |
| 364 | - field: value | |
| 365 | - } | |
| 366 | - } | |
| 367 | - elif isinstance(terms, str): | |
| 368 | - # Simple text term - create match query | |
| 369 | - return { | |
| 370 | - "multi_match": { | |
| 371 | - "query": terms, | |
| 372 | - "fields": self.match_fields, | |
| 373 | - "type": "best_fields", | |
| 374 | - "operator": "AND" | |
| 375 | - } | |
| 376 | - } | |
| 377 | - else: | |
| 378 | - # Invalid TERM node - return empty match | |
| 379 | - return { | |
| 380 | - "match_none": {} | |
| 381 | - } | |
| 382 | - | |
| 383 | - elif operator == 'OR': | |
| 384 | - # Any term must match | |
| 385 | - should_clauses = [] | |
| 386 | - if terms: | |
| 387 | - for term in terms: | |
| 388 | - clause = self._build_boolean_query_from_tuple(term) | |
| 389 | - if clause and clause.get("match_none") is None: | |
| 390 | - should_clauses.append(clause) | |
| 391 | - | |
| 392 | - if should_clauses: | |
| 393 | - return { | |
| 394 | - "bool": { | |
| 395 | - "should": should_clauses, | |
| 396 | - "minimum_should_match": 1 | |
| 397 | - } | |
| 398 | - } | |
| 399 | - else: | |
| 400 | - return {"match_none": {}} | |
| 401 | - | |
| 402 | - elif operator == 'AND': | |
| 403 | - # All terms must match | |
| 404 | - must_clauses = [] | |
| 405 | - if terms: | |
| 406 | - for term in terms: | |
| 407 | - clause = self._build_boolean_query_from_tuple(term) | |
| 408 | - if clause and clause.get("match_none") is None: | |
| 409 | - must_clauses.append(clause) | |
| 410 | - | |
| 411 | - if must_clauses: | |
| 412 | - return { | |
| 413 | - "bool": { | |
| 414 | - "must": must_clauses | |
| 415 | - } | |
| 416 | - } | |
| 417 | - else: | |
| 418 | - return {"match_none": {}} | |
| 419 | - | |
| 420 | - elif operator == 'ANDNOT': | |
| 421 | - # First term must match, second must not | |
| 422 | - if len(terms) >= 2: | |
| 423 | - return { | |
| 424 | - "bool": { | |
| 425 | - "must": [self._build_boolean_query_from_tuple(terms[0])], | |
| 426 | - "must_not": [self._build_boolean_query_from_tuple(terms[1])] | |
| 427 | - } | |
| 428 | - } | |
| 429 | - else: | |
| 430 | - return self._build_boolean_query_from_tuple(terms[0]) | |
| 431 | - | |
| 432 | - elif operator == 'RANK': | |
| 433 | - # Like OR but for ranking (all terms contribute to score) | |
| 434 | - should_clauses = [] | |
| 435 | - for term in terms: | |
| 436 | - should_clauses.append(self._build_boolean_query_from_tuple(term)) | |
| 437 | - return { | |
| 438 | - "bool": { | |
| 439 | - "should": should_clauses | |
| 440 | - } | |
| 441 | - } | |
| 442 | - | |
| 443 | - else: | |
| 444 | - # Unknown operator | |
| 445 | - return {"match_all": {}} | |
| 446 | - | |
| 447 | - def get_domain_summary(self) -> Dict[str, Any]: | |
| 448 | - """Get summary of all configured domains.""" | |
| 449 | - summary = {} | |
| 450 | - for domain_name, domain_config in self.domain_configs.items(): | |
| 451 | - summary[domain_name] = { | |
| 452 | - "label": domain_config.label, | |
| 453 | - "fields": domain_config.fields, | |
| 454 | - "analyzer": domain_config.analyzer.value, | |
| 455 | - "boost": domain_config.boost, | |
| 456 | - "has_multilang_mapping": domain_config.language_field_mapping is not None, | |
| 457 | - "supported_languages": list(domain_config.language_field_mapping.keys()) if domain_config.language_field_mapping else [] | |
| 458 | - } | |
| 459 | - return summary | |
| 460 | 0 | \ No newline at end of file |
search/query_config.py
| ... | ... | @@ -17,14 +17,24 @@ TEXT_EMBEDDING_FIELD = "title_embedding" |
| 17 | 17 | IMAGE_EMBEDDING_FIELD = "image_embedding" |
| 18 | 18 | |
| 19 | 19 | # Default match fields for text search (with boost) |
| 20 | +# 文本召回:同时搜索中英文字段,两者相互补充 | |
| 20 | 21 | DEFAULT_MATCH_FIELDS = [ |
| 22 | + # 中文字段 | |
| 21 | 23 | "title_zh^3.0", |
| 22 | 24 | "brief_zh^1.5", |
| 23 | 25 | "description_zh^1.0", |
| 24 | 26 | "vendor_zh^1.5", |
| 25 | - "tags^1.0", | |
| 26 | 27 | "category_path_zh^1.5", |
| 27 | - "category_name_zh^1.5" | |
| 28 | + "category_name_zh^1.5", | |
| 29 | + # 英文字段 | |
| 30 | + "title_en^3.0", | |
| 31 | + "brief_en^1.5", | |
| 32 | + "description_en^1.0", | |
| 33 | + "vendor_en^1.5", | |
| 34 | + "category_path_en^1.5", | |
| 35 | + "category_name_en^1.5", | |
| 36 | + # 语言无关字段 | |
| 37 | + "tags^1.0", | |
| 28 | 38 | ] |
| 29 | 39 | |
| 30 | 40 | # Domain-specific match fields | ... | ... |
search/searcher.py
| ... | ... | @@ -13,7 +13,6 @@ from query import QueryParser, ParsedQuery |
| 13 | 13 | from embeddings import CLIPImageEncoder |
| 14 | 14 | from .boolean_parser import BooleanParser, QueryNode |
| 15 | 15 | from .es_query_builder import ESQueryBuilder |
| 16 | -from .multilang_query_builder import MultiLanguageQueryBuilder | |
| 17 | 16 | from .rerank_engine import RerankEngine |
| 18 | 17 | from .query_config import ( |
| 19 | 18 | DEFAULT_INDEX_NAME, |
| ... | ... | @@ -112,8 +111,8 @@ class Searcher: |
| 112 | 111 | self.text_embedding_field = TEXT_EMBEDDING_FIELD |
| 113 | 112 | self.image_embedding_field = IMAGE_EMBEDDING_FIELD |
| 114 | 113 | |
| 115 | - # Query builder - use multi-language version | |
| 116 | - self.query_builder = MultiLanguageQueryBuilder( | |
| 114 | + # Query builder - simplified single-layer architecture | |
| 115 | + self.query_builder = ESQueryBuilder( | |
| 117 | 116 | index_name=index_name, |
| 118 | 117 | match_fields=self.match_fields, |
| 119 | 118 | text_embedding_field=self.text_embedding_field, |
| ... | ... | @@ -274,8 +273,8 @@ class Searcher: |
| 274 | 273 | filters = {} |
| 275 | 274 | filters['tenant_id'] = tenant_id |
| 276 | 275 | |
| 277 | - es_query = self.query_builder.build_multilang_query( | |
| 278 | - parsed_query=parsed_query, | |
| 276 | + es_query = self.query_builder.build_query( | |
| 277 | + query_text=parsed_query.rewritten_query or parsed_query.normalized_query, | |
| 279 | 278 | query_vector=parsed_query.query_vector if enable_embedding else None, |
| 280 | 279 | query_node=query_node, |
| 281 | 280 | filters=filters, | ... | ... |