Commit ea118f2b13c9ac648298f2f588f3bb1fb86acef1

Authored by tangwang
1 parent 985d7fe3

build_query:根据 query_tokens 调整 KNN 参数:

短查询(token_count ≤ 2):knn_k=30, num_candidates=100, boost=0.15(默认 boost 0.25 × 0.6)
中等查询(token_count 3–4):knn_k=50, num_candidates=200, boost=0.25(默认 boost,不缩放)
长查询(token_count ≥ 5):knn_k=80, num_candidates=300, boost=0.35(默认 boost 0.25 × 1.4)
docs/基础配置指南.md
@@ -78,7 +78,7 @@ @@ -78,7 +78,7 @@
78 78
79 - **filters**: 前端传递的过滤条件(永远起作用) 79 - **filters**: 前端传递的过滤条件(永远起作用)
80 - **text_recall**: 文本相关性召回(同时搜索中英文字段) 80 - **text_recall**: 文本相关性召回(同时搜索中英文字段)
81 -- **embedding_recall**: 向量召回(KNN,使用 `title_embedding`) 81 +- **embedding_recall**: 向量召回(KNN,使用 `title_embedding`),根据 query_tokens 自适应调整 k、num_candidates、boost(详见 `docs/相关性检索优化说明.md` 3.6 节)
82 - **function_score**: 包装召回部分,支持提权字段 82 - **function_score**: 包装召回部分,支持提权字段
83 83
84 ### Function Score 配置 84 ### Function Score 配置
docs/相关性检索优化说明.md
@@ -105,7 +105,7 @@ @@ -105,7 +105,7 @@
105 - 支持跨语言检索 105 - 支持跨语言检索
106 106
107 #### 3.3 短语查询(phrase_query) 107 #### 3.3 短语查询(phrase_query)
108 -- 针对短查询(token_count >= 2 且 is_short_query 108 +- 针对短查询(token_count >= 2 且 is_short;is_short 由 query_tokens 的数量和查询文本(是否带引号、长度、是否含空格)推导)
109 - 使用 `type: "phrase"` 进行精确短语匹配 109 - 使用 `type: "phrase"` 进行精确短语匹配
110 - 支持 slop(允许词序调整) 110 - 支持 slop(允许词序调整)
111 111
@@ -118,6 +118,21 @@ @@ -118,6 +118,21 @@
118 - 当前已禁用(参考实现中也是 False) 118 - 当前已禁用(参考实现中也是 False)
119 - 未来可根据需要启用 119 - 未来可根据需要启用
120 120
  121 +#### 3.6 KNN 向量召回自适应策略(query_tokens)
  122 +
  123 +根据 `query_tokens`(HanLP 分词后的 token 列表)的长度(即 token_count)动态调整 KNN 的召回数量和权重:
  124 +
  125 +| 查询类型 | token_count | knn_k | num_candidates | boost 系数 |
  126 +|---------|-------------|-------|----------------|------------|
  127 +| 短查询 | ≤ 2 | 30 | 100 | 0.6× 默认 |
  128 +| 中等查询| 3~4 | 50 | 200 | 1.0× 默认 |
  129 +| 长查询 | ≥ 5 | 80 | 300 | 1.4× 默认 |
  130 +
  131 +**策略说明**:
  132 +- **短查询**:BM25 对精确匹配更有效,降低 KNN 召回和权重,避免语义召回干扰
  133 +- **长查询**:语义搜索更有利,提高 KNN 召回和权重,增强语义理解能力
  134 +- 默认 boost 由 `config.query_config.knn_boost` 配置(通常 0.25)
  135 +
121 ### 4. 字段映射优化 136 ### 4. 字段映射优化
122 137
123 新增 `_get_match_fields()` 方法,支持: 138 新增 `_get_match_fields()` 方法,支持:
@@ -186,8 +201,7 @@ result = searcher.search( @@ -186,8 +201,7 @@ result = searcher.search(
186 parsed_query = query_parser.parse("戏水动物") 201 parsed_query = query_parser.parse("戏水动物")
187 print(f"关键词: {parsed_query.keywords}") 202 print(f"关键词: {parsed_query.keywords}")
188 print(f"Token数: {parsed_query.token_count}") 203 print(f"Token数: {parsed_query.token_count}")
189 -print(f"短查询: {parsed_query.is_short_query}")  
190 -print(f"长查询: {parsed_query.is_long_query}") 204 +print(f"query_tokens: {parsed_query.query_tokens}")
191 ``` 205 ```
192 206
193 ## 性能考虑 207 ## 性能考虑
docs/系统设计文档.md
@@ -527,6 +527,7 @@ laptop AND (gaming OR professional) ANDNOT cheap @@ -527,6 +527,7 @@ laptop AND (gaming OR professional) ANDNOT cheap
527 } 527 }
528 } 528 }
529 ``` 529 ```
  530 +> **KNN 自适应策略**:`k`、`num_candidates`、`boost` 会根据 `query_tokens` 动态调整:短查询(≤2 token)减少召回和权重,长查询(≥5 token)增加召回和权重。详见 `docs/相关性检索优化说明.md` 3.6 节。
530 531
531 #### 实现模块 532 #### 实现模块
532 - `search/es_query_builder.py` - ES 查询构建器(单层架构,`build_query` 方法) 533 - `search/es_query_builder.py` - ES 查询构建器(单层架构,`build_query` 方法)
query/query_parser.py
@@ -34,8 +34,7 @@ class ParsedQuery: @@ -34,8 +34,7 @@ class ParsedQuery:
34 domain: str = "default", 34 domain: str = "default",
35 keywords: str = "", 35 keywords: str = "",
36 token_count: int = 0, 36 token_count: int = 0,
37 - is_short_query: bool = False,  
38 - is_long_query: bool = False 37 + query_tokens: Optional[List[str]] = None
39 ): 38 ):
40 self.original_query = original_query 39 self.original_query = original_query
41 self.query_normalized = query_normalized 40 self.query_normalized = query_normalized
@@ -47,8 +46,7 @@ class ParsedQuery: @@ -47,8 +46,7 @@ class ParsedQuery:
47 # Query analysis fields 46 # Query analysis fields
48 self.keywords = keywords 47 self.keywords = keywords
49 self.token_count = token_count 48 self.token_count = token_count
50 - self.is_short_query = is_short_query  
51 - self.is_long_query = is_long_query 49 + self.query_tokens = query_tokens or []
52 50
53 def to_dict(self) -> Dict[str, Any]: 51 def to_dict(self) -> Dict[str, Any]:
54 """Convert to dictionary representation.""" 52 """Convert to dictionary representation."""
@@ -144,13 +142,11 @@ class QueryParser: @@ -144,13 +142,11 @@ class QueryParser:
144 """Get token count using HanLP.""" 142 """Get token count using HanLP."""
145 tok_result = self._tok(query) 143 tok_result = self._tok(query)
146 return len(tok_result) if tok_result else 0 144 return len(tok_result) if tok_result else 0
147 -  
148 - def _analyze_query_type(self, query: str, token_count: int) -> tuple:  
149 - """Analyze query type: (is_short_query, is_long_query)."""  
150 - is_quoted = query.startswith('"') and query.endswith('"')  
151 - is_short = is_quoted or ((token_count <= 2 or len(query) <= 4) and ' ' not in query)  
152 - is_long = token_count >= 4  
153 - return is_short, is_long 145 +
  146 + def _get_query_tokens(self, query: str) -> List[str]:
  147 + """Get token list using HanLP."""
  148 + tok_result = self._tok(query)
  149 + return [x[0] for x in tok_result] if tok_result else []
154 150
155 def parse( 151 def parse(
156 self, 152 self,
@@ -294,18 +290,17 @@ class QueryParser: @@ -294,18 +290,17 @@ class QueryParser:
294 if context: 290 if context:
295 context.add_warning(error_msg) 291 context.add_warning(error_msg)
296 292
297 - # Stage 5: Query analysis (keywords, token count, query type) 293 + # Stage 5: Query analysis (keywords, token count, query_tokens)
298 keywords = self._extract_keywords(query_text) 294 keywords = self._extract_keywords(query_text)
299 - token_count = self._get_token_count(query_text)  
300 - is_short_query, is_long_query = self._analyze_query_type(query_text, token_count) 295 + query_tokens = self._get_query_tokens(query_text)
  296 + token_count = len(query_tokens)
301 297
302 log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " 298 log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | "
303 - f"Short query: {is_short_query} | Long query: {is_long_query}") 299 + f"Query tokens: {query_tokens}")
304 if context: 300 if context:
305 context.store_intermediate_result('keywords', keywords) 301 context.store_intermediate_result('keywords', keywords)
306 context.store_intermediate_result('token_count', token_count) 302 context.store_intermediate_result('token_count', token_count)
307 - context.store_intermediate_result('is_short_query', is_short_query)  
308 - context.store_intermediate_result('is_long_query', is_long_query) 303 + context.store_intermediate_result('query_tokens', query_tokens)
309 304
310 # Stage 6: Text embedding (only for non-short queries) - async execution 305 # Stage 6: Text embedding (only for non-short queries) - async execution
311 query_vector = None 306 query_vector = None
@@ -401,8 +396,7 @@ class QueryParser: @@ -401,8 +396,7 @@ class QueryParser:
401 domain=domain, 396 domain=domain,
402 keywords=keywords, 397 keywords=keywords,
403 token_count=token_count, 398 token_count=token_count,
404 - is_short_query=is_short_query,  
405 - is_long_query=is_long_query 399 + query_tokens=query_tokens
406 ) 400 )
407 401
408 if context and hasattr(context, 'logger'): 402 if context and hasattr(context, 'logger'):
search/es_query_builder.py
@@ -218,13 +218,28 @@ class ESQueryBuilder: @@ -218,13 +218,28 @@ class ESQueryBuilder:
218 es_query["query"] = {"match_all": {}} 218 es_query["query"] = {"match_all": {}}
219 219
220 # 4. Add KNN search if enabled (separate from query, ES will combine) 220 # 4. Add KNN search if enabled (separate from query, ES will combine)
  221 + # Adjust KNN k, num_candidates, boost by query_tokens (short query: less KNN; long: more)
221 if has_embedding: 222 if has_embedding:
  223 + knn_boost = self.knn_boost
  224 + if parsed_query:
  225 + query_tokens = getattr(parsed_query, 'query_tokens', None) or []
  226 + token_count = len(query_tokens)
  227 + if token_count <= 2:
  228 + knn_k, knn_num_candidates = 30, 100
  229 + knn_boost = self.knn_boost * 0.6 # Lower weight for short queries
  230 + elif token_count >= 5:
  231 + knn_k, knn_num_candidates = 80, 300
  232 + knn_boost = self.knn_boost * 1.4 # Higher weight for long queries
  233 + else:
  234 + knn_k, knn_num_candidates = 50, 200
  235 + else:
  236 + knn_k, knn_num_candidates = 50, 200
222 knn_clause = { 237 knn_clause = {
223 "field": self.text_embedding_field, 238 "field": self.text_embedding_field,
224 "query_vector": query_vector.tolist(), 239 "query_vector": query_vector.tolist(),
225 "k": knn_k, 240 "k": knn_k,
226 "num_candidates": knn_num_candidates, 241 "num_candidates": knn_num_candidates,
227 - "boost": self.knn_boost # Lower boost for embedding recall 242 + "boost": knn_boost
228 } 243 }
229 es_query["knn"] = knn_clause 244 es_query["knn"] = knn_clause
230 245
@@ -430,9 +445,8 @@ class ESQueryBuilder: @@ -430,9 +445,8 @@ class ESQueryBuilder:
430 translations = {} 445 translations = {}
431 language = self.default_language 446 language = self.default_language
432 keywords = "" 447 keywords = ""
  448 + query_tokens = []
433 token_count = 0 449 token_count = 0
434 - is_short_query = False  
435 - is_long_query = False  
436 450
437 if parsed_query: 451 if parsed_query:
438 translations = parsed_query.translations or {} 452 translations = parsed_query.translations or {}
@@ -443,16 +457,14 @@ class ESQueryBuilder: @@ -443,16 +457,14 @@ class ESQueryBuilder:
443 else: 457 else:
444 language = detected_lang 458 language = detected_lang
445 keywords = getattr(parsed_query, 'keywords', '') or "" 459 keywords = getattr(parsed_query, 'keywords', '') or ""
446 - token_count = getattr(parsed_query, 'token_count', 0) or 0  
447 - is_short_query = getattr(parsed_query, 'is_short_query', False)  
448 - is_long_query = getattr(parsed_query, 'is_long_query', False) 460 + query_tokens = getattr(parsed_query, 'query_tokens', None) or []
  461 + token_count = len(query_tokens) or getattr(parsed_query, 'token_count', 0) or 0
449 462
450 # Get match fields for the detected language 463 # Get match fields for the detected language
451 match_fields, core_fields = self._get_match_fields(language) 464 match_fields, core_fields = self._get_match_fields(language)
452 465
453 # Tie breaker values 466 # Tie breaker values
454 tie_breaker_base_query = 0.9 467 tie_breaker_base_query = 0.9
455 - tie_breaker_long_query = 0.9  
456 tie_breaker_keywords = 0.9 468 tie_breaker_keywords = 0.9
457 469
458 # 1. Base query - main query with AND operator 470 # 1. Base query - main query with AND operator
@@ -496,9 +508,7 @@ class ESQueryBuilder: @@ -496,9 +508,7 @@ class ESQueryBuilder:
496 "_name": "base_query_trans_en" 508 "_name": "base_query_trans_en"
497 } 509 }
498 }) 510 })
499 -  
500 - # 3. Long query - add a query with lower minimum_should_match  
501 - # Currently disabled (False condition in reference) 511 +
502 if False and is_long_query: 512 if False and is_long_query:
503 boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9) 513 boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9)
504 minimum_should_match = "70%" 514 minimum_should_match = "70%"
@@ -512,10 +522,13 @@ class ESQueryBuilder: @@ -512,10 +522,13 @@ class ESQueryBuilder:
512 "_name": "long_query" 522 "_name": "long_query"
513 } 523 }
514 }) 524 })
515 -  
516 - # 4. Short query - add phrase query 525 +
  526 + # 3. Short query - add phrase query (derived from query_tokens)
  527 + # is_short: quoted or ((token_count <= 2 or len <= 4) and no space)
517 ENABLE_PHRASE_QUERY = True 528 ENABLE_PHRASE_QUERY = True
518 - if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short_query: 529 + is_quoted = query_text.startswith('"') and query_text.endswith('"')
  530 + is_short = is_quoted or ((token_count <= 2 or len(query_text) <= 4) and ' ' not in query_text)
  531 + if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short:
519 query_length = len(query_text) 532 query_length = len(query_text)
520 slop = 0 if query_length < 3 else 1 if query_length < 5 else 2 533 slop = 0 if query_length < 3 else 1 if query_length < 5 else 2
521 should_clauses.append({ 534 should_clauses.append({
@@ -529,7 +542,7 @@ class ESQueryBuilder: @@ -529,7 +542,7 @@ class ESQueryBuilder:
529 } 542 }
530 }) 543 })
531 544
532 - # 5. Keywords query - extracted nouns from query 545 + # 4. Keywords query - extracted nouns from query
533 elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text): 546 elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text):
534 should_clauses.append({ 547 should_clauses.append({
535 "multi_match": { 548 "multi_match": {