Commit ea118f2b13c9ac648298f2f588f3bb1fb86acef1

Authored by tangwang
1 parent 985d7fe3

build_query:根据 query_tokens 调整 KNN 参数:

短查询(token_count ≤ 2):knn_k=30, num_candidates=100, boost=0.15(默认 boost 0.25 × 0.6)
中等查询(token_count 3–4):knn_k=50, num_candidates=200, boost=0.25(默认值,不调整)
长查询(token_count ≥ 5):knn_k=80, num_candidates=300, boost=0.35(默认 boost 0.25 × 1.4)
docs/基础配置指南.md
... ... @@ -78,7 +78,7 @@
78 78  
79 79 - **filters**: 前端传递的过滤条件(永远起作用)
80 80 - **text_recall**: 文本相关性召回(同时搜索中英文字段)
81   -- **embedding_recall**: 向量召回(KNN,使用 `title_embedding`)
  81 +- **embedding_recall**: 向量召回(KNN,使用 `title_embedding`),根据 query_tokens 自适应调整 k、num_candidates、boost(详见 `docs/相关性检索优化说明.md` 3.6 节)
82 82 - **function_score**: 包装召回部分,支持提权字段
83 83  
84 84 ### Function Score 配置
... ...
docs/相关性检索优化说明.md
... ... @@ -105,7 +105,7 @@
105 105 - 支持跨语言检索
106 106  
107 107 #### 3.3 短语查询(phrase_query)
108   -- 针对短查询(token_count >= 2 且 is_short_query
  108 +- 针对短查询(token_count >= 2 且 is_short;is_short 由 query_tokens 的数量与查询文本推导)
109 109 - 使用 `type: "phrase"` 进行精确短语匹配
110 110 - 支持 slop(允许词序调整)
111 111  
... ... @@ -118,6 +118,21 @@
118 118 - 当前已禁用(参考实现中也是 False)
119 119 - 未来可根据需要启用
120 120  
  121 +#### 3.6 KNN 向量召回自适应策略(query_tokens)
  122 +
  123 +根据 `query_tokens`(HanLP 分词后的 token 列表)的长度(即 token_count)动态调整 KNN 的召回数量和权重:
  124 +
  125 +| 查询类型 | token_count | knn_k | num_candidates | boost 系数 |
  126 +|---------|-------------|-------|----------------|------------|
  127 +| 短查询 | ≤ 2 | 30 | 100 | 0.6× 默认 |
  128 +| 中等查询 | 3~4 | 50 | 200 | 1.0× 默认 |
  129 +| 长查询 | ≥ 5 | 80 | 300 | 1.4× 默认 |
  130 +
  131 +**策略说明**:
  132 +- **短查询**:BM25 对精确匹配更有效,降低 KNN 召回和权重,避免语义召回干扰
  133 +- **长查询**:语义搜索更有利,提高 KNN 召回和权重,增强语义理解能力
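  134 +- 默认 boost 由 `config.query_config.knn_boost` 配置(通常 0.25);未传入 parsed_query 时使用中等查询参数(k=50、num_candidates=200)且 boost 不作调整;`query_tokens` 为空时 token_count 为 0,按短查询处理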
  135 +
121 136 ### 4. 字段映射优化
122 137  
123 138 新增 `_get_match_fields()` 方法,支持:
... ... @@ -186,8 +201,7 @@ result = searcher.search(
186 201 parsed_query = query_parser.parse("戏水动物")
187 202 print(f"关键词: {parsed_query.keywords}")
188 203 print(f"Token数: {parsed_query.token_count}")
189   -print(f"短查询: {parsed_query.is_short_query}")
190   -print(f"长查询: {parsed_query.is_long_query}")
  204 +print(f"query_tokens: {parsed_query.query_tokens}")
191 205 ```
192 206  
193 207 ## 性能考虑
... ...
docs/系统设计文档.md
... ... @@ -527,6 +527,7 @@ laptop AND (gaming OR professional) ANDNOT cheap
527 527 }
528 528 }
529 529 ```
  530 +> **KNN 自适应策略**:`k`、`num_candidates`、`boost` 会根据 `query_tokens` 的数量动态调整:短查询(≤2 token)减少召回和权重,中等查询(3~4 token)使用默认值,长查询(≥5 token)增加召回和权重。详见 `docs/相关性检索优化说明.md` 3.6 节。
530 531  
531 532 #### 实现模块
532 533 - `search/es_query_builder.py` - ES 查询构建器(单层架构,`build_query` 方法)
... ...
query/query_parser.py
... ... @@ -34,8 +34,7 @@ class ParsedQuery:
34 34 domain: str = "default",
35 35 keywords: str = "",
36 36 token_count: int = 0,
37   - is_short_query: bool = False,
38   - is_long_query: bool = False
  37 + query_tokens: Optional[List[str]] = None
39 38 ):
40 39 self.original_query = original_query
41 40 self.query_normalized = query_normalized
... ... @@ -47,8 +46,7 @@ class ParsedQuery:
47 46 # Query analysis fields
48 47 self.keywords = keywords
49 48 self.token_count = token_count
50   - self.is_short_query = is_short_query
51   - self.is_long_query = is_long_query
  49 + self.query_tokens = query_tokens or []
52 50  
53 51 def to_dict(self) -> Dict[str, Any]:
54 52 """Convert to dictionary representation."""
... ... @@ -144,13 +142,11 @@ class QueryParser:
144 142 """Get token count using HanLP."""
145 143 tok_result = self._tok(query)
146 144 return len(tok_result) if tok_result else 0
147   -
148   - def _analyze_query_type(self, query: str, token_count: int) -> tuple:
149   - """Analyze query type: (is_short_query, is_long_query)."""
150   - is_quoted = query.startswith('"') and query.endswith('"')
151   - is_short = is_quoted or ((token_count <= 2 or len(query) <= 4) and ' ' not in query)
152   - is_long = token_count >= 4
153   - return is_short, is_long
  145 +
  146 + def _get_query_tokens(self, query: str) -> List[str]:
  147 + """Get token list using HanLP."""
  148 + tok_result = self._tok(query)
  149 + return [x[0] for x in tok_result] if tok_result else []
154 150  
155 151 def parse(
156 152 self,
... ... @@ -294,18 +290,17 @@ class QueryParser:
294 290 if context:
295 291 context.add_warning(error_msg)
296 292  
297   - # Stage 5: Query analysis (keywords, token count, query type)
  293 + # Stage 5: Query analysis (keywords, token count, query_tokens)
298 294 keywords = self._extract_keywords(query_text)
299   - token_count = self._get_token_count(query_text)
300   - is_short_query, is_long_query = self._analyze_query_type(query_text, token_count)
  295 + query_tokens = self._get_query_tokens(query_text)
  296 + token_count = len(query_tokens)
301 297  
302 298 log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | "
303   - f"Short query: {is_short_query} | Long query: {is_long_query}")
  299 + f"Query tokens: {query_tokens}")
304 300 if context:
305 301 context.store_intermediate_result('keywords', keywords)
306 302 context.store_intermediate_result('token_count', token_count)
307   - context.store_intermediate_result('is_short_query', is_short_query)
308   - context.store_intermediate_result('is_long_query', is_long_query)
  303 + context.store_intermediate_result('query_tokens', query_tokens)
309 304  
310 305 # Stage 6: Text embedding (only for non-short queries) - async execution
311 306 query_vector = None
... ... @@ -401,8 +396,7 @@ class QueryParser:
401 396 domain=domain,
402 397 keywords=keywords,
403 398 token_count=token_count,
404   - is_short_query=is_short_query,
405   - is_long_query=is_long_query
  399 + query_tokens=query_tokens
406 400 )
407 401  
408 402 if context and hasattr(context, 'logger'):
... ...
search/es_query_builder.py
... ... @@ -218,13 +218,28 @@ class ESQueryBuilder:
218 218 es_query["query"] = {"match_all": {}}
219 219  
220 220 # 4. Add KNN search if enabled (separate from query, ES will combine)
  221 + # Adjust KNN k, num_candidates, boost by query_tokens (short query: less KNN; long: more)
221 222 if has_embedding:
  223 + knn_boost = self.knn_boost
  224 + if parsed_query:
  225 + query_tokens = getattr(parsed_query, 'query_tokens', None) or []
  226 + token_count = len(query_tokens)
  227 + if token_count <= 2:
  228 + knn_k, knn_num_candidates = 30, 100
  229 + knn_boost = self.knn_boost * 0.6 # Lower weight for short queries
  230 + elif token_count >= 5:
  231 + knn_k, knn_num_candidates = 80, 300
  232 + knn_boost = self.knn_boost * 1.4 # Higher weight for long queries
  233 + else:
  234 + knn_k, knn_num_candidates = 50, 200
  235 + else:
  236 + knn_k, knn_num_candidates = 50, 200
222 237 knn_clause = {
223 238 "field": self.text_embedding_field,
224 239 "query_vector": query_vector.tolist(),
225 240 "k": knn_k,
226 241 "num_candidates": knn_num_candidates,
227   - "boost": self.knn_boost # Lower boost for embedding recall
  242 + "boost": knn_boost
228 243 }
229 244 es_query["knn"] = knn_clause
230 245  
... ... @@ -430,9 +445,8 @@ class ESQueryBuilder:
430 445 translations = {}
431 446 language = self.default_language
432 447 keywords = ""
  448 + query_tokens = []
433 449 token_count = 0
434   - is_short_query = False
435   - is_long_query = False
436 450  
437 451 if parsed_query:
438 452 translations = parsed_query.translations or {}
... ... @@ -443,16 +457,14 @@ class ESQueryBuilder:
443 457 else:
444 458 language = detected_lang
445 459 keywords = getattr(parsed_query, 'keywords', '') or ""
446   - token_count = getattr(parsed_query, 'token_count', 0) or 0
447   - is_short_query = getattr(parsed_query, 'is_short_query', False)
448   - is_long_query = getattr(parsed_query, 'is_long_query', False)
  460 + query_tokens = getattr(parsed_query, 'query_tokens', None) or []
  461 + token_count = len(query_tokens) or getattr(parsed_query, 'token_count', 0) or 0
449 462  
450 463 # Get match fields for the detected language
451 464 match_fields, core_fields = self._get_match_fields(language)
452 465  
453 466 # Tie breaker values
454 467 tie_breaker_base_query = 0.9
455   - tie_breaker_long_query = 0.9
456 468 tie_breaker_keywords = 0.9
457 469  
458 470 # 1. Base query - main query with AND operator
... ... @@ -496,9 +508,7 @@ class ESQueryBuilder:
496 508 "_name": "base_query_trans_en"
497 509 }
498 510 })
499   -
500   - # 3. Long query - add a query with lower minimum_should_match
501   - # Currently disabled (False condition in reference)
  511 +
502 512 if False and is_long_query:
503 513 boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9)
504 514 minimum_should_match = "70%"
... ... @@ -512,10 +522,13 @@ class ESQueryBuilder:
512 522 "_name": "long_query"
513 523 }
514 524 })
515   -
516   - # 4. Short query - add phrase query
  525 +
  526 + # 3. Short query - add phrase query (derived from query_tokens)
  527 + # is_short: quoted or ((token_count <= 2 or len <= 4) and no space)
517 528 ENABLE_PHRASE_QUERY = True
518   - if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short_query:
  529 + is_quoted = query_text.startswith('"') and query_text.endswith('"')
  530 + is_short = is_quoted or ((token_count <= 2 or len(query_text) <= 4) and ' ' not in query_text)
  531 + if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short:
519 532 query_length = len(query_text)
520 533 slop = 0 if query_length < 3 else 1 if query_length < 5 else 2
521 534 should_clauses.append({
... ... @@ -529,7 +542,7 @@ class ESQueryBuilder:
529 542 }
530 543 })
531 544  
532   - # 5. Keywords query - extracted nouns from query
  545 + # 4. Keywords query - extracted nouns from query
533 546 elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text):
534 547 should_clauses.append({
535 548 "multi_match": {
... ...