Commit ea118f2b13c9ac648298f2f588f3bb1fb86acef1
1 parent
985d7fe3
build_query:根据 query_tokens 调整 KNN 参数:
短查询(token_count ≤ 2):knn_k=30, num_candidates=100, boost=0.15(约原权重 0.6) 中等查询(3–4):knn_k=50, num_candidates=200, boost=0.25(默认) 长查询(token_count ≥ 5):knn_k=80, num_candidates=300, boost=0.35(约原权重 1.4)
Showing
5 changed files
with
59 additions
and
37 deletions
Show diff stats
docs/基础配置指南.md
| @@ -78,7 +78,7 @@ | @@ -78,7 +78,7 @@ | ||
| 78 | 78 | ||
| 79 | - **filters**: 前端传递的过滤条件(永远起作用) | 79 | - **filters**: 前端传递的过滤条件(永远起作用) |
| 80 | - **text_recall**: 文本相关性召回(同时搜索中英文字段) | 80 | - **text_recall**: 文本相关性召回(同时搜索中英文字段) |
| 81 | -- **embedding_recall**: 向量召回(KNN,使用 `title_embedding`) | 81 | +- **embedding_recall**: 向量召回(KNN,使用 `title_embedding`),根据 query_tokens 自适应调整 k、num_candidates、boost(详见 `docs/相关性检索优化说明.md` 3.6 节) |
| 82 | - **function_score**: 包装召回部分,支持提权字段 | 82 | - **function_score**: 包装召回部分,支持提权字段 |
| 83 | 83 | ||
| 84 | ### Function Score 配置 | 84 | ### Function Score 配置 |
docs/相关性检索优化说明.md
| @@ -105,7 +105,7 @@ | @@ -105,7 +105,7 @@ | ||
| 105 | - 支持跨语言检索 | 105 | - 支持跨语言检索 |
| 106 | 106 | ||
| 107 | #### 3.3 短语查询(phrase_query) | 107 | #### 3.3 短语查询(phrase_query) |
| 108 | -- 针对短查询(token_count >= 2 且 is_short_query) | 108 | +- 针对短查询(token_count >= 2 且 is_short,由 query_tokens 推导) |
| 109 | - 使用 `type: "phrase"` 进行精确短语匹配 | 109 | - 使用 `type: "phrase"` 进行精确短语匹配 |
| 110 | - 支持 slop(允许词序调整) | 110 | - 支持 slop(允许词序调整) |
| 111 | 111 | ||
| @@ -118,6 +118,21 @@ | @@ -118,6 +118,21 @@ | ||
| 118 | - 当前已禁用(参考实现中也是 False) | 118 | - 当前已禁用(参考实现中也是 False) |
| 119 | - 未来可根据需要启用 | 119 | - 未来可根据需要启用 |
| 120 | 120 | ||
| 121 | +#### 3.6 KNN 向量召回自适应策略(query_tokens) | ||
| 122 | + | ||
| 123 | +根据 `query_tokens`(HanLP 分词后的 token 数量)动态调整 KNN 的召回数量和权重: | ||
| 124 | + | ||
| 125 | +| 查询类型 | token_count | knn_k | num_candidates | boost 系数 | | ||
| 126 | +|---------|-------------|-------|----------------|------------| | ||
| 127 | +| 短查询 | ≤ 2 | 30 | 100 | 0.6× 默认 | | ||
| 128 | +| 中等查询 | 3~4 | 50 | 200 | 1.0× 默认 | ||
| 129 | +| 长查询 | ≥ 5 | 80 | 300 | 1.4× 默认 | | ||
| 130 | + | ||
| 131 | +**策略说明**: | ||
| 132 | +- **短查询**:BM25 对精确匹配更有效,降低 KNN 召回和权重,避免语义召回干扰 | ||
| 133 | +- **长查询**:语义搜索更有利,提高 KNN 召回和权重,增强语义理解能力 | ||
| 134 | +- 默认 boost 由 `config.query_config.knn_boost` 配置(通常 0.25) | ||
| 135 | + | ||
| 121 | ### 4. 字段映射优化 | 136 | ### 4. 字段映射优化 |
| 122 | 137 | ||
| 123 | 新增 `_get_match_fields()` 方法,支持: | 138 | 新增 `_get_match_fields()` 方法,支持: |
| @@ -186,8 +201,7 @@ result = searcher.search( | @@ -186,8 +201,7 @@ result = searcher.search( | ||
| 186 | parsed_query = query_parser.parse("戏水动物") | 201 | parsed_query = query_parser.parse("戏水动物") |
| 187 | print(f"关键词: {parsed_query.keywords}") | 202 | print(f"关键词: {parsed_query.keywords}") |
| 188 | print(f"Token数: {parsed_query.token_count}") | 203 | print(f"Token数: {parsed_query.token_count}") |
| 189 | -print(f"短查询: {parsed_query.is_short_query}") | ||
| 190 | -print(f"长查询: {parsed_query.is_long_query}") | 204 | +print(f"query_tokens: {parsed_query.query_tokens}") |
| 191 | ``` | 205 | ``` |
| 192 | 206 | ||
| 193 | ## 性能考虑 | 207 | ## 性能考虑 |
docs/系统设计文档.md
| @@ -527,6 +527,7 @@ laptop AND (gaming OR professional) ANDNOT cheap | @@ -527,6 +527,7 @@ laptop AND (gaming OR professional) ANDNOT cheap | ||
| 527 | } | 527 | } |
| 528 | } | 528 | } |
| 529 | ``` | 529 | ``` |
| 530 | +> **KNN 自适应策略**:`k`、`num_candidates`、`boost` 会根据 `query_tokens` 动态调整:短查询(≤2 token)减少召回和权重,长查询(≥5 token)增加召回和权重。详见 `docs/相关性检索优化说明.md` 3.6 节。 | ||
| 530 | 531 | ||
| 531 | #### 实现模块 | 532 | #### 实现模块 |
| 532 | - `search/es_query_builder.py` - ES 查询构建器(单层架构,`build_query` 方法) | 533 | - `search/es_query_builder.py` - ES 查询构建器(单层架构,`build_query` 方法) |
query/query_parser.py
| @@ -34,8 +34,7 @@ class ParsedQuery: | @@ -34,8 +34,7 @@ class ParsedQuery: | ||
| 34 | domain: str = "default", | 34 | domain: str = "default", |
| 35 | keywords: str = "", | 35 | keywords: str = "", |
| 36 | token_count: int = 0, | 36 | token_count: int = 0, |
| 37 | - is_short_query: bool = False, | ||
| 38 | - is_long_query: bool = False | 37 | + query_tokens: Optional[List[str]] = None |
| 39 | ): | 38 | ): |
| 40 | self.original_query = original_query | 39 | self.original_query = original_query |
| 41 | self.query_normalized = query_normalized | 40 | self.query_normalized = query_normalized |
| @@ -47,8 +46,7 @@ class ParsedQuery: | @@ -47,8 +46,7 @@ class ParsedQuery: | ||
| 47 | # Query analysis fields | 46 | # Query analysis fields |
| 48 | self.keywords = keywords | 47 | self.keywords = keywords |
| 49 | self.token_count = token_count | 48 | self.token_count = token_count |
| 50 | - self.is_short_query = is_short_query | ||
| 51 | - self.is_long_query = is_long_query | 49 | + self.query_tokens = query_tokens or [] |
| 52 | 50 | ||
| 53 | def to_dict(self) -> Dict[str, Any]: | 51 | def to_dict(self) -> Dict[str, Any]: |
| 54 | """Convert to dictionary representation.""" | 52 | """Convert to dictionary representation.""" |
| @@ -144,13 +142,11 @@ class QueryParser: | @@ -144,13 +142,11 @@ class QueryParser: | ||
| 144 | """Get token count using HanLP.""" | 142 | """Get token count using HanLP.""" |
| 145 | tok_result = self._tok(query) | 143 | tok_result = self._tok(query) |
| 146 | return len(tok_result) if tok_result else 0 | 144 | return len(tok_result) if tok_result else 0 |
| 147 | - | ||
| 148 | - def _analyze_query_type(self, query: str, token_count: int) -> tuple: | ||
| 149 | - """Analyze query type: (is_short_query, is_long_query).""" | ||
| 150 | - is_quoted = query.startswith('"') and query.endswith('"') | ||
| 151 | - is_short = is_quoted or ((token_count <= 2 or len(query) <= 4) and ' ' not in query) | ||
| 152 | - is_long = token_count >= 4 | ||
| 153 | - return is_short, is_long | 145 | + |
| 146 | + def _get_query_tokens(self, query: str) -> List[str]: | ||
| 147 | + """Get token list using HanLP.""" | ||
| 148 | + tok_result = self._tok(query) | ||
| 149 | + return [x[0] for x in tok_result] if tok_result else [] | ||
| 154 | 150 | ||
| 155 | def parse( | 151 | def parse( |
| 156 | self, | 152 | self, |
| @@ -294,18 +290,17 @@ class QueryParser: | @@ -294,18 +290,17 @@ class QueryParser: | ||
| 294 | if context: | 290 | if context: |
| 295 | context.add_warning(error_msg) | 291 | context.add_warning(error_msg) |
| 296 | 292 | ||
| 297 | - # Stage 5: Query analysis (keywords, token count, query type) | 293 | + # Stage 5: Query analysis (keywords, token count, query_tokens) |
| 298 | keywords = self._extract_keywords(query_text) | 294 | keywords = self._extract_keywords(query_text) |
| 299 | - token_count = self._get_token_count(query_text) | ||
| 300 | - is_short_query, is_long_query = self._analyze_query_type(query_text, token_count) | 295 | + query_tokens = self._get_query_tokens(query_text) |
| 296 | + token_count = len(query_tokens) | ||
| 301 | 297 | ||
| 302 | log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " | 298 | log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " |
| 303 | - f"Short query: {is_short_query} | Long query: {is_long_query}") | 299 | + f"Query tokens: {query_tokens}") |
| 304 | if context: | 300 | if context: |
| 305 | context.store_intermediate_result('keywords', keywords) | 301 | context.store_intermediate_result('keywords', keywords) |
| 306 | context.store_intermediate_result('token_count', token_count) | 302 | context.store_intermediate_result('token_count', token_count) |
| 307 | - context.store_intermediate_result('is_short_query', is_short_query) | ||
| 308 | - context.store_intermediate_result('is_long_query', is_long_query) | 303 | + context.store_intermediate_result('query_tokens', query_tokens) |
| 309 | 304 | ||
| 310 | # Stage 6: Text embedding (only for non-short queries) - async execution | 305 | # Stage 6: Text embedding (only for non-short queries) - async execution |
| 311 | query_vector = None | 306 | query_vector = None |
| @@ -401,8 +396,7 @@ class QueryParser: | @@ -401,8 +396,7 @@ class QueryParser: | ||
| 401 | domain=domain, | 396 | domain=domain, |
| 402 | keywords=keywords, | 397 | keywords=keywords, |
| 403 | token_count=token_count, | 398 | token_count=token_count, |
| 404 | - is_short_query=is_short_query, | ||
| 405 | - is_long_query=is_long_query | 399 | + query_tokens=query_tokens |
| 406 | ) | 400 | ) |
| 407 | 401 | ||
| 408 | if context and hasattr(context, 'logger'): | 402 | if context and hasattr(context, 'logger'): |
search/es_query_builder.py
| @@ -218,13 +218,28 @@ class ESQueryBuilder: | @@ -218,13 +218,28 @@ class ESQueryBuilder: | ||
| 218 | es_query["query"] = {"match_all": {}} | 218 | es_query["query"] = {"match_all": {}} |
| 219 | 219 | ||
| 220 | # 4. Add KNN search if enabled (separate from query, ES will combine) | 220 | # 4. Add KNN search if enabled (separate from query, ES will combine) |
| 221 | + # Adjust KNN k, num_candidates, boost by query_tokens (short query: less KNN; long: more) | ||
| 221 | if has_embedding: | 222 | if has_embedding: |
| 223 | + knn_boost = self.knn_boost | ||
| 224 | + if parsed_query: | ||
| 225 | + query_tokens = getattr(parsed_query, 'query_tokens', None) or [] | ||
| 226 | + token_count = len(query_tokens) | ||
| 227 | + if token_count <= 2: | ||
| 228 | + knn_k, knn_num_candidates = 30, 100 | ||
| 229 | + knn_boost = self.knn_boost * 0.6 # Lower weight for short queries | ||
| 230 | + elif token_count >= 5: | ||
| 231 | + knn_k, knn_num_candidates = 80, 300 | ||
| 232 | + knn_boost = self.knn_boost * 1.4 # Higher weight for long queries | ||
| 233 | + else: | ||
| 234 | + knn_k, knn_num_candidates = 50, 200 | ||
| 235 | + else: | ||
| 236 | + knn_k, knn_num_candidates = 50, 200 | ||
| 222 | knn_clause = { | 237 | knn_clause = { |
| 223 | "field": self.text_embedding_field, | 238 | "field": self.text_embedding_field, |
| 224 | "query_vector": query_vector.tolist(), | 239 | "query_vector": query_vector.tolist(), |
| 225 | "k": knn_k, | 240 | "k": knn_k, |
| 226 | "num_candidates": knn_num_candidates, | 241 | "num_candidates": knn_num_candidates, |
| 227 | - "boost": self.knn_boost # Lower boost for embedding recall | 242 | + "boost": knn_boost |
| 228 | } | 243 | } |
| 229 | es_query["knn"] = knn_clause | 244 | es_query["knn"] = knn_clause |
| 230 | 245 | ||
| @@ -430,9 +445,8 @@ class ESQueryBuilder: | @@ -430,9 +445,8 @@ class ESQueryBuilder: | ||
| 430 | translations = {} | 445 | translations = {} |
| 431 | language = self.default_language | 446 | language = self.default_language |
| 432 | keywords = "" | 447 | keywords = "" |
| 448 | + query_tokens = [] | ||
| 433 | token_count = 0 | 449 | token_count = 0 |
| 434 | - is_short_query = False | ||
| 435 | - is_long_query = False | ||
| 436 | 450 | ||
| 437 | if parsed_query: | 451 | if parsed_query: |
| 438 | translations = parsed_query.translations or {} | 452 | translations = parsed_query.translations or {} |
| @@ -443,16 +457,14 @@ class ESQueryBuilder: | @@ -443,16 +457,14 @@ class ESQueryBuilder: | ||
| 443 | else: | 457 | else: |
| 444 | language = detected_lang | 458 | language = detected_lang |
| 445 | keywords = getattr(parsed_query, 'keywords', '') or "" | 459 | keywords = getattr(parsed_query, 'keywords', '') or "" |
| 446 | - token_count = getattr(parsed_query, 'token_count', 0) or 0 | ||
| 447 | - is_short_query = getattr(parsed_query, 'is_short_query', False) | ||
| 448 | - is_long_query = getattr(parsed_query, 'is_long_query', False) | 460 | + query_tokens = getattr(parsed_query, 'query_tokens', None) or [] |
| 461 | + token_count = len(query_tokens) or getattr(parsed_query, 'token_count', 0) or 0 | ||
| 449 | 462 | ||
| 450 | # Get match fields for the detected language | 463 | # Get match fields for the detected language |
| 451 | match_fields, core_fields = self._get_match_fields(language) | 464 | match_fields, core_fields = self._get_match_fields(language) |
| 452 | 465 | ||
| 453 | # Tie breaker values | 466 | # Tie breaker values |
| 454 | tie_breaker_base_query = 0.9 | 467 | tie_breaker_base_query = 0.9 |
| 455 | - tie_breaker_long_query = 0.9 | ||
| 456 | tie_breaker_keywords = 0.9 | 468 | tie_breaker_keywords = 0.9 |
| 457 | 469 | ||
| 458 | # 1. Base query - main query with AND operator | 470 | # 1. Base query - main query with AND operator |
| @@ -496,9 +508,7 @@ class ESQueryBuilder: | @@ -496,9 +508,7 @@ class ESQueryBuilder: | ||
| 496 | "_name": "base_query_trans_en" | 508 | "_name": "base_query_trans_en" |
| 497 | } | 509 | } |
| 498 | }) | 510 | }) |
| 499 | - | ||
| 500 | - # 3. Long query - add a query with lower minimum_should_match | ||
| 501 | - # Currently disabled (False condition in reference) | 511 | + |
| 502 | if False and is_long_query: | 512 | if False and is_long_query: |
| 503 | boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9) | 513 | boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9) |
| 504 | minimum_should_match = "70%" | 514 | minimum_should_match = "70%" |
| @@ -512,10 +522,13 @@ class ESQueryBuilder: | @@ -512,10 +522,13 @@ class ESQueryBuilder: | ||
| 512 | "_name": "long_query" | 522 | "_name": "long_query" |
| 513 | } | 523 | } |
| 514 | }) | 524 | }) |
| 515 | - | ||
| 516 | - # 4. Short query - add phrase query | 525 | + |
| 526 | + # 3. Short query - add phrase query (derived from query_tokens) | ||
| 527 | + # is_short: quoted or ((token_count <= 2 or len <= 4) and no space) | ||
| 517 | ENABLE_PHRASE_QUERY = True | 528 | ENABLE_PHRASE_QUERY = True |
| 518 | - if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short_query: | 529 | + is_quoted = query_text.startswith('"') and query_text.endswith('"') |
| 530 | + is_short = is_quoted or ((token_count <= 2 or len(query_text) <= 4) and ' ' not in query_text) | ||
| 531 | + if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short: | ||
| 519 | query_length = len(query_text) | 532 | query_length = len(query_text) |
| 520 | slop = 0 if query_length < 3 else 1 if query_length < 5 else 2 | 533 | slop = 0 if query_length < 3 else 1 if query_length < 5 else 2 |
| 521 | should_clauses.append({ | 534 | should_clauses.append({ |
| @@ -529,7 +542,7 @@ class ESQueryBuilder: | @@ -529,7 +542,7 @@ class ESQueryBuilder: | ||
| 529 | } | 542 | } |
| 530 | }) | 543 | }) |
| 531 | 544 | ||
| 532 | - # 5. Keywords query - extracted nouns from query | 545 | + # 4. Keywords query - extracted nouns from query |
| 533 | elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text): | 546 | elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text): |
| 534 | should_clauses.append({ | 547 | should_clauses.append({ |
| 535 | "multi_match": { | 548 | "multi_match": { |