Commit ea118f2b13c9ac648298f2f588f3bb1fb86acef1
1 parent
985d7fe3
build_query:根据 query_tokens 调整 KNN 参数:
短查询(token_count ≤ 2):knn_k=30, num_candidates=100, boost=0.15(默认权重 0.25 的 0.6 倍)
中等查询(token_count 3–4):knn_k=50, num_candidates=200, boost=0.25(默认)
长查询(token_count ≥ 5):knn_k=80, num_candidates=300, boost=0.35(默认权重 0.25 的 1.4 倍)
Showing
5 changed files
with
59 additions
and
37 deletions
Show diff stats
docs/基础配置指南.md
| ... | ... | @@ -78,7 +78,7 @@ |
| 78 | 78 | |
| 79 | 79 | - **filters**: 前端传递的过滤条件(永远起作用) |
| 80 | 80 | - **text_recall**: 文本相关性召回(同时搜索中英文字段) |
| 81 | -- **embedding_recall**: 向量召回(KNN,使用 `title_embedding`) | |
| 81 | +- **embedding_recall**: 向量召回(KNN,使用 `title_embedding`),根据 query_tokens 自适应调整 k、num_candidates、boost(详见 `docs/相关性检索优化说明.md` 3.6 节) | |
| 82 | 82 | - **function_score**: 包装召回部分,支持提权字段 |
| 83 | 83 | |
| 84 | 84 | ### Function Score 配置 | ... | ... |
docs/相关性检索优化说明.md
| ... | ... | @@ -105,7 +105,7 @@ |
| 105 | 105 | - 支持跨语言检索 |
| 106 | 106 | |
| 107 | 107 | #### 3.3 短语查询(phrase_query) |
| 108 | -- 针对短查询(token_count >= 2 且 is_short_query) | |
| 108 | +- 针对短查询(token_count >= 2 且 is_short,由 query_tokens 推导) | |
| 109 | 109 | - 使用 `type: "phrase"` 进行精确短语匹配 |
| 110 | 110 | - 支持 slop(允许词序调整) |
| 111 | 111 | |
| ... | ... | @@ -118,6 +118,21 @@ |
| 118 | 118 | - 当前已禁用(参考实现中也是 False) |
| 119 | 119 | - 未来可根据需要启用 |
| 120 | 120 | |
| 121 | +#### 3.6 KNN 向量召回自适应策略(query_tokens) | |
| 122 | + | |
| 123 | +根据 `query_tokens`(HanLP 分词后的 token 数量)动态调整 KNN 的召回数量和权重: | |
| 124 | + | |
| 125 | +| 查询类型 | token_count | knn_k | num_candidates | boost 系数 | | |
| 126 | +|---------|-------------|-------|----------------|------------| | |
| 127 | +| 短查询 | ≤ 2 | 30 | 100 | 0.6× 默认 | | |
| 128 | +| 中等查询 | 3~4 | 50 | 200 | 1.0× 默认 | | |
| 129 | +| 长查询 | ≥ 5 | 80 | 300 | 1.4× 默认 | | |
| 130 | + | |
| 131 | +**策略说明**: | |
| 132 | +- **短查询**:BM25 对精确匹配更有效,降低 KNN 召回和权重,避免语义召回干扰 | |
| 133 | +- **长查询**:语义搜索更有利,提高 KNN 召回和权重,增强语义理解能力 | |
| 134 | +- 默认 boost 由 `config.query_config.knn_boost` 配置(通常 0.25) | |
| 135 | + | |
| 121 | 136 | ### 4. 字段映射优化 |
| 122 | 137 | |
| 123 | 138 | 新增 `_get_match_fields()` 方法,支持: |
| ... | ... | @@ -186,8 +201,7 @@ result = searcher.search( |
| 186 | 201 | parsed_query = query_parser.parse("戏水动物") |
| 187 | 202 | print(f"关键词: {parsed_query.keywords}") |
| 188 | 203 | print(f"Token数: {parsed_query.token_count}") |
| 189 | -print(f"短查询: {parsed_query.is_short_query}") | |
| 190 | -print(f"长查询: {parsed_query.is_long_query}") | |
| 204 | +print(f"query_tokens: {parsed_query.query_tokens}") | |
| 191 | 205 | ``` |
| 192 | 206 | |
| 193 | 207 | ## 性能考虑 | ... | ... |
docs/系统设计文档.md
| ... | ... | @@ -527,6 +527,7 @@ laptop AND (gaming OR professional) ANDNOT cheap |
| 527 | 527 | } |
| 528 | 528 | } |
| 529 | 529 | ``` |
| 530 | +> **KNN 自适应策略**:`k`、`num_candidates`、`boost` 会根据 `query_tokens` 动态调整:短查询(≤2 token)减少召回和权重,长查询(≥5 token)增加召回和权重。详见 `docs/相关性检索优化说明.md` 3.6 节。 | |
| 530 | 531 | |
| 531 | 532 | #### 实现模块 |
| 532 | 533 | - `search/es_query_builder.py` - ES 查询构建器(单层架构,`build_query` 方法) | ... | ... |
query/query_parser.py
| ... | ... | @@ -34,8 +34,7 @@ class ParsedQuery: |
| 34 | 34 | domain: str = "default", |
| 35 | 35 | keywords: str = "", |
| 36 | 36 | token_count: int = 0, |
| 37 | - is_short_query: bool = False, | |
| 38 | - is_long_query: bool = False | |
| 37 | + query_tokens: Optional[List[str]] = None | |
| 39 | 38 | ): |
| 40 | 39 | self.original_query = original_query |
| 41 | 40 | self.query_normalized = query_normalized |
| ... | ... | @@ -47,8 +46,7 @@ class ParsedQuery: |
| 47 | 46 | # Query analysis fields |
| 48 | 47 | self.keywords = keywords |
| 49 | 48 | self.token_count = token_count |
| 50 | - self.is_short_query = is_short_query | |
| 51 | - self.is_long_query = is_long_query | |
| 49 | + self.query_tokens = query_tokens or [] | |
| 52 | 50 | |
| 53 | 51 | def to_dict(self) -> Dict[str, Any]: |
| 54 | 52 | """Convert to dictionary representation.""" |
| ... | ... | @@ -144,13 +142,11 @@ class QueryParser: |
| 144 | 142 | """Get token count using HanLP.""" |
| 145 | 143 | tok_result = self._tok(query) |
| 146 | 144 | return len(tok_result) if tok_result else 0 |
| 147 | - | |
| 148 | - def _analyze_query_type(self, query: str, token_count: int) -> tuple: | |
| 149 | - """Analyze query type: (is_short_query, is_long_query).""" | |
| 150 | - is_quoted = query.startswith('"') and query.endswith('"') | |
| 151 | - is_short = is_quoted or ((token_count <= 2 or len(query) <= 4) and ' ' not in query) | |
| 152 | - is_long = token_count >= 4 | |
| 153 | - return is_short, is_long | |
| 145 | + | |
| 146 | + def _get_query_tokens(self, query: str) -> List[str]: | |
| 147 | + """Get token list using HanLP.""" | |
| 148 | + tok_result = self._tok(query) | |
| 149 | + return [x[0] for x in tok_result] if tok_result else [] | |
| 154 | 150 | |
| 155 | 151 | def parse( |
| 156 | 152 | self, |
| ... | ... | @@ -294,18 +290,17 @@ class QueryParser: |
| 294 | 290 | if context: |
| 295 | 291 | context.add_warning(error_msg) |
| 296 | 292 | |
| 297 | - # Stage 5: Query analysis (keywords, token count, query type) | |
| 293 | + # Stage 5: Query analysis (keywords, token count, query_tokens) | |
| 298 | 294 | keywords = self._extract_keywords(query_text) |
| 299 | - token_count = self._get_token_count(query_text) | |
| 300 | - is_short_query, is_long_query = self._analyze_query_type(query_text, token_count) | |
| 295 | + query_tokens = self._get_query_tokens(query_text) | |
| 296 | + token_count = len(query_tokens) | |
| 301 | 297 | |
| 302 | 298 | log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " |
| 303 | - f"Short query: {is_short_query} | Long query: {is_long_query}") | |
| 299 | + f"Query tokens: {query_tokens}") | |
| 304 | 300 | if context: |
| 305 | 301 | context.store_intermediate_result('keywords', keywords) |
| 306 | 302 | context.store_intermediate_result('token_count', token_count) |
| 307 | - context.store_intermediate_result('is_short_query', is_short_query) | |
| 308 | - context.store_intermediate_result('is_long_query', is_long_query) | |
| 303 | + context.store_intermediate_result('query_tokens', query_tokens) | |
| 309 | 304 | |
| 310 | 305 | # Stage 6: Text embedding (only for non-short queries) - async execution |
| 311 | 306 | query_vector = None |
| ... | ... | @@ -401,8 +396,7 @@ class QueryParser: |
| 401 | 396 | domain=domain, |
| 402 | 397 | keywords=keywords, |
| 403 | 398 | token_count=token_count, |
| 404 | - is_short_query=is_short_query, | |
| 405 | - is_long_query=is_long_query | |
| 399 | + query_tokens=query_tokens | |
| 406 | 400 | ) |
| 407 | 401 | |
| 408 | 402 | if context and hasattr(context, 'logger'): | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -218,13 +218,28 @@ class ESQueryBuilder: |
| 218 | 218 | es_query["query"] = {"match_all": {}} |
| 219 | 219 | |
| 220 | 220 | # 4. Add KNN search if enabled (separate from query, ES will combine) |
| 221 | + # Adjust KNN k, num_candidates, boost by query_tokens (short query: less KNN; long: more) | |
| 221 | 222 | if has_embedding: |
| 223 | + knn_boost = self.knn_boost | |
| 224 | + if parsed_query: | |
| 225 | + query_tokens = getattr(parsed_query, 'query_tokens', None) or [] | |
| 226 | + token_count = len(query_tokens) | |
| 227 | + if token_count <= 2: | |
| 228 | + knn_k, knn_num_candidates = 30, 100 | |
| 229 | + knn_boost = self.knn_boost * 0.6 # Lower weight for short queries | |
| 230 | + elif token_count >= 5: | |
| 231 | + knn_k, knn_num_candidates = 80, 300 | |
| 232 | + knn_boost = self.knn_boost * 1.4 # Higher weight for long queries | |
| 233 | + else: | |
| 234 | + knn_k, knn_num_candidates = 50, 200 | |
| 235 | + else: | |
| 236 | + knn_k, knn_num_candidates = 50, 200 | |
| 222 | 237 | knn_clause = { |
| 223 | 238 | "field": self.text_embedding_field, |
| 224 | 239 | "query_vector": query_vector.tolist(), |
| 225 | 240 | "k": knn_k, |
| 226 | 241 | "num_candidates": knn_num_candidates, |
| 227 | - "boost": self.knn_boost # Lower boost for embedding recall | |
| 242 | + "boost": knn_boost | |
| 228 | 243 | } |
| 229 | 244 | es_query["knn"] = knn_clause |
| 230 | 245 | |
| ... | ... | @@ -430,9 +445,8 @@ class ESQueryBuilder: |
| 430 | 445 | translations = {} |
| 431 | 446 | language = self.default_language |
| 432 | 447 | keywords = "" |
| 448 | + query_tokens = [] | |
| 433 | 449 | token_count = 0 |
| 434 | - is_short_query = False | |
| 435 | - is_long_query = False | |
| 436 | 450 | |
| 437 | 451 | if parsed_query: |
| 438 | 452 | translations = parsed_query.translations or {} |
| ... | ... | @@ -443,16 +457,14 @@ class ESQueryBuilder: |
| 443 | 457 | else: |
| 444 | 458 | language = detected_lang |
| 445 | 459 | keywords = getattr(parsed_query, 'keywords', '') or "" |
| 446 | - token_count = getattr(parsed_query, 'token_count', 0) or 0 | |
| 447 | - is_short_query = getattr(parsed_query, 'is_short_query', False) | |
| 448 | - is_long_query = getattr(parsed_query, 'is_long_query', False) | |
| 460 | + query_tokens = getattr(parsed_query, 'query_tokens', None) or [] | |
| 461 | + token_count = len(query_tokens) or getattr(parsed_query, 'token_count', 0) or 0 | |
| 449 | 462 | |
| 450 | 463 | # Get match fields for the detected language |
| 451 | 464 | match_fields, core_fields = self._get_match_fields(language) |
| 452 | 465 | |
| 453 | 466 | # Tie breaker values |
| 454 | 467 | tie_breaker_base_query = 0.9 |
| 455 | - tie_breaker_long_query = 0.9 | |
| 456 | 468 | tie_breaker_keywords = 0.9 |
| 457 | 469 | |
| 458 | 470 | # 1. Base query - main query with AND operator |
| ... | ... | @@ -496,9 +508,7 @@ class ESQueryBuilder: |
| 496 | 508 | "_name": "base_query_trans_en" |
| 497 | 509 | } |
| 498 | 510 | }) |
| 499 | - | |
| 500 | - # 3. Long query - add a query with lower minimum_should_match | |
| 501 | - # Currently disabled (False condition in reference) | |
| 511 | + | |
| 502 | 512 | if False and is_long_query: |
| 503 | 513 | boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9) |
| 504 | 514 | minimum_should_match = "70%" |
| ... | ... | @@ -512,10 +522,13 @@ class ESQueryBuilder: |
| 512 | 522 | "_name": "long_query" |
| 513 | 523 | } |
| 514 | 524 | }) |
| 515 | - | |
| 516 | - # 4. Short query - add phrase query | |
| 525 | + | |
| 526 | + # 3. Short query - add phrase query (derived from query_tokens) | |
| 527 | + # is_short: quoted or ((token_count <= 2 or len <= 4) and no space) | |
| 517 | 528 | ENABLE_PHRASE_QUERY = True |
| 518 | - if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short_query: | |
| 529 | + is_quoted = query_text.startswith('"') and query_text.endswith('"') | |
| 530 | + is_short = is_quoted or ((token_count <= 2 or len(query_text) <= 4) and ' ' not in query_text) | |
| 531 | + if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short: | |
| 519 | 532 | query_length = len(query_text) |
| 520 | 533 | slop = 0 if query_length < 3 else 1 if query_length < 5 else 2 |
| 521 | 534 | should_clauses.append({ |
| ... | ... | @@ -529,7 +542,7 @@ class ESQueryBuilder: |
| 529 | 542 | } |
| 530 | 543 | }) |
| 531 | 544 | |
| 532 | - # 5. Keywords query - extracted nouns from query | |
| 545 | + # 4. Keywords query - extracted nouns from query | |
| 533 | 546 | elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text): |
| 534 | 547 | should_clauses.append({ |
| 535 | 548 | "multi_match": { | ... | ... |