From ea118f2b13c9ac648298f2f588f3bb1fb86acef1 Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 11 Feb 2026 18:41:00 +0800 Subject: [PATCH] build_query:根据 query_tokens 调整 KNN 参数: 短查询(token_count ≤ 2):knn_k=30, num_candidates=100, boost=0.15(约原权重 0.6) 中等查询(3–4):knn_k=50, num_candidates=200, boost=0.25(默认) 长查询(token_count ≥ 5):knn_k=80, num_candidates=300, boost=0.35(约原权重 1.4) --- docs/基础配置指南.md | 2 +- docs/相关性检索优化说明.md | 20 +++++++++++++++++--- docs/系统设计文档.md | 1 + query/query_parser.py | 32 +++++++++++++------------------- search/es_query_builder.py | 41 +++++++++++++++++++++++++++-------------- 5 files changed, 59 insertions(+), 37 deletions(-) diff --git a/docs/基础配置指南.md b/docs/基础配置指南.md index 6e060fe..f9d1148 100644 --- a/docs/基础配置指南.md +++ b/docs/基础配置指南.md @@ -78,7 +78,7 @@ - **filters**: 前端传递的过滤条件(永远起作用) - **text_recall**: 文本相关性召回(同时搜索中英文字段) -- **embedding_recall**: 向量召回(KNN,使用 `title_embedding`) +- **embedding_recall**: 向量召回(KNN,使用 `title_embedding`),根据 query_tokens 自适应调整 k、num_candidates、boost(详见 `docs/相关性检索优化说明.md` 3.6 节) - **function_score**: 包装召回部分,支持提权字段 ### Function Score 配置 diff --git a/docs/相关性检索优化说明.md b/docs/相关性检索优化说明.md index f3eeccf..75c94b3 100644 --- a/docs/相关性检索优化说明.md +++ b/docs/相关性检索优化说明.md @@ -105,7 +105,7 @@ - 支持跨语言检索 #### 3.3 短语查询(phrase_query) -- 针对短查询(token_count >= 2 且 is_short_query) +- 针对短查询(token_count >= 2 且 is_short,由 query_tokens 推导) - 使用 `type: "phrase"` 进行精确短语匹配 - 支持 slop(允许词序调整) @@ -118,6 +118,21 @@ - 当前已禁用(参考实现中也是 False) - 未来可根据需要启用 +#### 3.6 KNN 向量召回自适应策略(query_tokens) + +根据 `query_tokens`(HanLP 分词后的 token 数量)动态调整 KNN 的召回数量和权重: + +| 查询类型 | token_count | knn_k | num_candidates | boost 系数 | +|---------|-------------|-------|----------------|------------| +| 短查询 | ≤ 2 | 30 | 100 | 0.6× 默认 | +| 中等查询| 3~4 | 50 | 200 | 1.0× 默认 | +| 长查询 | ≥ 5 | 80 | 300 | 1.4× 默认 | + +**策略说明**: +- **短查询**:BM25 对精确匹配更有效,降低 KNN 召回和权重,避免语义召回干扰 +- **长查询**:语义搜索更有利,提高 KNN 召回和权重,增强语义理解能力 +- 默认 boost 由 `config.query_config.knn_boost` 配置(通常 0.25) + ### 4. 字段映射优化 新增 `_get_match_fields()` 方法,支持: @@ -186,8 +201,7 @@ result = searcher.search( parsed_query = query_parser.parse("戏水动物") print(f"关键词: {parsed_query.keywords}") print(f"Token数: {parsed_query.token_count}") -print(f"短查询: {parsed_query.is_short_query}") -print(f"长查询: {parsed_query.is_long_query}") +print(f"query_tokens: {parsed_query.query_tokens}") ``` ## 性能考虑 diff --git a/docs/系统设计文档.md b/docs/系统设计文档.md index 1b9890d..2be372f 100644 --- a/docs/系统设计文档.md +++ b/docs/系统设计文档.md @@ -527,6 +527,7 @@ laptop AND (gaming OR professional) ANDNOT cheap } } ``` +> **KNN 自适应策略**:`k`、`num_candidates`、`boost` 会根据 `query_tokens` 动态调整:短查询(≤2 token)减少召回和权重,长查询(≥5 token)增加召回和权重。详见 `docs/相关性检索优化说明.md` 3.6 节。 #### 实现模块 - `search/es_query_builder.py` - ES 查询构建器(单层架构,`build_query` 方法) diff --git a/query/query_parser.py b/query/query_parser.py index 992c3c5..452d1de 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -34,8 +34,7 @@ class ParsedQuery: domain: str = "default", keywords: str = "", token_count: int = 0, - is_short_query: bool = False, - is_long_query: bool = False + query_tokens: Optional[List[str]] = None ): self.original_query = original_query self.query_normalized = query_normalized @@ -47,8 +46,7 @@ class ParsedQuery: # Query analysis fields self.keywords = keywords self.token_count = token_count - self.is_short_query = is_short_query - self.is_long_query = is_long_query + self.query_tokens = query_tokens or [] def to_dict(self) -> Dict[str, Any]: """Convert to dictionary representation.""" @@ -144,13 +142,11 @@ class QueryParser: """Get token count using HanLP.""" tok_result = self._tok(query) return len(tok_result) if tok_result else 0 - - def _analyze_query_type(self, query: str, token_count: int) -> tuple: - """Analyze query type: (is_short_query, is_long_query).""" - is_quoted = query.startswith('"') and query.endswith('"') - is_short = is_quoted or ((token_count <= 2 or len(query) <= 4) and ' ' not in query) - is_long = token_count >= 4 - return is_short, is_long + + def _get_query_tokens(self, query: str) -> List[str]: + """Get token list using HanLP.""" + tok_result = self._tok(query) + return [x[0] for x in tok_result] if tok_result else [] def parse( self, @@ -294,18 +290,17 @@ class QueryParser: if context: context.add_warning(error_msg) - # Stage 5: Query analysis (keywords, token count, query type) + # Stage 5: Query analysis (keywords, token count, query_tokens) keywords = self._extract_keywords(query_text) - token_count = self._get_token_count(query_text) - is_short_query, is_long_query = self._analyze_query_type(query_text, token_count) + query_tokens = self._get_query_tokens(query_text) + token_count = len(query_tokens) log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " - f"Short query: {is_short_query} | Long query: {is_long_query}") + f"Query tokens: {query_tokens}") if context: context.store_intermediate_result('keywords', keywords) context.store_intermediate_result('token_count', token_count) - context.store_intermediate_result('is_short_query', is_short_query) - context.store_intermediate_result('is_long_query', is_long_query) + context.store_intermediate_result('query_tokens', query_tokens) # Stage 6: Text embedding (only for non-short queries) - async execution query_vector = None @@ -401,8 +396,7 @@ class QueryParser: domain=domain, keywords=keywords, token_count=token_count, - is_short_query=is_short_query, - is_long_query=is_long_query + query_tokens=query_tokens ) if context and hasattr(context, 'logger'): diff --git a/search/es_query_builder.py b/search/es_query_builder.py index fce5261..68278ee 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -218,13 +218,28 @@ class ESQueryBuilder: es_query["query"] = {"match_all": {}} # 4. Add KNN search if enabled (separate from query, ES will combine) + # Adjust KNN k, num_candidates, boost by query_tokens (short query: less KNN; long: more) if has_embedding: + knn_boost = self.knn_boost + if parsed_query: + query_tokens = getattr(parsed_query, 'query_tokens', None) or [] + token_count = len(query_tokens) + if token_count <= 2: + knn_k, knn_num_candidates = 30, 100 + knn_boost = self.knn_boost * 0.6 # Lower weight for short queries + elif token_count >= 5: + knn_k, knn_num_candidates = 80, 300 + knn_boost = self.knn_boost * 1.4 # Higher weight for long queries + else: + knn_k, knn_num_candidates = 50, 200 + else: + knn_k, knn_num_candidates = 50, 200 knn_clause = { "field": self.text_embedding_field, "query_vector": query_vector.tolist(), "k": knn_k, "num_candidates": knn_num_candidates, - "boost": self.knn_boost # Lower boost for embedding recall + "boost": knn_boost } es_query["knn"] = knn_clause @@ -430,9 +445,8 @@ class ESQueryBuilder: translations = {} language = self.default_language keywords = "" + query_tokens = [] token_count = 0 - is_short_query = False - is_long_query = False if parsed_query: translations = parsed_query.translations or {} @@ -443,16 +457,14 @@ class ESQueryBuilder: else: language = detected_lang keywords = getattr(parsed_query, 'keywords', '') or "" - token_count = getattr(parsed_query, 'token_count', 0) or 0 - is_short_query = getattr(parsed_query, 'is_short_query', False) - is_long_query = getattr(parsed_query, 'is_long_query', False) + query_tokens = getattr(parsed_query, 'query_tokens', None) or [] + token_count = len(query_tokens) or getattr(parsed_query, 'token_count', 0) or 0 # Get match fields for the detected language match_fields, core_fields = self._get_match_fields(language) # Tie breaker values tie_breaker_base_query = 0.9 - tie_breaker_long_query = 0.9 tie_breaker_keywords = 0.9 # 1. Base query - main query with AND operator @@ -496,9 +508,7 @@ class ESQueryBuilder: "_name": "base_query_trans_en" } }) - - # 3. Long query - add a query with lower minimum_should_match - # Currently disabled (False condition in reference) + if False and is_long_query: boost = 0.5 * pow(min(1.0, token_count / 10.0), 0.9) minimum_should_match = "70%" @@ -512,10 +522,13 @@ class ESQueryBuilder: "_name": "long_query" } }) - - # 4. Short query - add phrase query + + # 3. Short query - add phrase query (derived from query_tokens) + # is_short: quoted or ((token_count <= 2 or len <= 4) and no space) ENABLE_PHRASE_QUERY = True - if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short_query: + is_quoted = query_text.startswith('"') and query_text.endswith('"') + is_short = is_quoted or ((token_count <= 2 or len(query_text) <= 4) and ' ' not in query_text) + if ENABLE_PHRASE_QUERY and token_count >= 2 and is_short: query_length = len(query_text) slop = 0 if query_length < 3 else 1 if query_length < 5 else 2 should_clauses.append({ @@ -529,7 +542,7 @@ class ESQueryBuilder: } }) - # 5. Keywords query - extracted nouns from query + # 4. Keywords query - extracted nouns from query elif keywords and len(keywords.split()) <= 2 and 2 * len(keywords.replace(' ', '')) <= len(query_text): should_clauses.append({ "multi_match": { -- libgit2 0.21.2