Commit 814e352b28210fc835604282a1af188d53d81d63
1 parent
581dafae
乘法公式配置化
Showing 9 changed files with 168 additions and 227 deletions
Show diff stats
config/__init__.py
| @@ -8,6 +8,7 @@ from config.schema import ( | @@ -8,6 +8,7 @@ from config.schema import ( | ||
| 8 | IndexConfig, | 8 | IndexConfig, |
| 9 | QueryConfig, | 9 | QueryConfig, |
| 10 | RerankConfig, | 10 | RerankConfig, |
| 11 | + RerankFusionConfig, | ||
| 11 | SPUConfig, | 12 | SPUConfig, |
| 12 | SearchConfig, | 13 | SearchConfig, |
| 13 | ServicesConfig, | 14 | ServicesConfig, |
| @@ -36,6 +37,7 @@ __all__ = [ | @@ -36,6 +37,7 @@ __all__ = [ | ||
| 36 | "IndexConfig", | 37 | "IndexConfig", |
| 37 | "QueryConfig", | 38 | "QueryConfig", |
| 38 | "RerankConfig", | 39 | "RerankConfig", |
| 40 | + "RerankFusionConfig", | ||
| 39 | "SPUConfig", | 41 | "SPUConfig", |
| 40 | "SearchConfig", | 42 | "SearchConfig", |
| 41 | "ServicesConfig", | 43 | "ServicesConfig", |
config/config.yaml
| @@ -219,6 +219,14 @@ rerank: | @@ -219,6 +219,14 @@ rerank: | ||
| 219 | weight_ai: 0.6 | 219 | weight_ai: 0.6 |
| 220 | rerank_query_template: "{query}" | 220 | rerank_query_template: "{query}" |
| 221 | rerank_doc_template: "{title}" | 221 | rerank_doc_template: "{title}" |
| 222 | + # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项) | ||
| 223 | + fusion: | ||
| 224 | + rerank_bias: 0.00001 | ||
| 225 | + rerank_exponent: 1.0 | ||
| 226 | + text_bias: 0.1 | ||
| 227 | + text_exponent: 0.35 | ||
| 228 | + knn_bias: 0.6 | ||
| 229 | + knn_exponent: 0.2 | ||
| 222 | 230 | ||
| 223 | # 可扩展服务/provider 注册表(单一配置源) | 231 | # 可扩展服务/provider 注册表(单一配置源) |
| 224 | services: | 232 | services: |
config/loader.py
| @@ -37,6 +37,7 @@ from config.schema import ( | @@ -37,6 +37,7 @@ from config.schema import ( | ||
| 37 | ProductEnrichConfig, | 37 | ProductEnrichConfig, |
| 38 | RedisSettings, | 38 | RedisSettings, |
| 39 | RerankConfig, | 39 | RerankConfig, |
| 40 | + RerankFusionConfig, | ||
| 40 | RerankServiceConfig, | 41 | RerankServiceConfig, |
| 41 | RuntimeConfig, | 42 | RuntimeConfig, |
| 42 | SearchConfig, | 43 | SearchConfig, |
| @@ -393,6 +394,7 @@ class AppConfigLoader: | @@ -393,6 +394,7 @@ class AppConfigLoader: | ||
| 393 | 394 | ||
| 394 | function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {} | 395 | function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {} |
| 395 | rerank_cfg = raw.get("rerank") if isinstance(raw.get("rerank"), dict) else {} | 396 | rerank_cfg = raw.get("rerank") if isinstance(raw.get("rerank"), dict) else {} |
| 397 | + fusion_raw = rerank_cfg.get("fusion") if isinstance(rerank_cfg.get("fusion"), dict) else {} | ||
| 396 | spu_cfg = raw.get("spu_config") if isinstance(raw.get("spu_config"), dict) else {} | 398 | spu_cfg = raw.get("spu_config") if isinstance(raw.get("spu_config"), dict) else {} |
| 397 | 399 | ||
| 398 | return SearchConfig( | 400 | return SearchConfig( |
| @@ -412,6 +414,14 @@ class AppConfigLoader: | @@ -412,6 +414,14 @@ class AppConfigLoader: | ||
| 412 | weight_ai=float(rerank_cfg.get("weight_ai", 0.6)), | 414 | weight_ai=float(rerank_cfg.get("weight_ai", 0.6)), |
| 413 | rerank_query_template=str(rerank_cfg.get("rerank_query_template") or "{query}"), | 415 | rerank_query_template=str(rerank_cfg.get("rerank_query_template") or "{query}"), |
| 414 | rerank_doc_template=str(rerank_cfg.get("rerank_doc_template") or "{title}"), | 416 | rerank_doc_template=str(rerank_cfg.get("rerank_doc_template") or "{title}"), |
| 417 | + fusion=RerankFusionConfig( | ||
| 418 | + rerank_bias=float(fusion_raw.get("rerank_bias", 0.00001)), | ||
| 419 | + rerank_exponent=float(fusion_raw.get("rerank_exponent", 1.0)), | ||
| 420 | + text_bias=float(fusion_raw.get("text_bias", 0.1)), | ||
| 421 | + text_exponent=float(fusion_raw.get("text_exponent", 0.35)), | ||
| 422 | + knn_bias=float(fusion_raw.get("knn_bias", 0.6)), | ||
| 423 | + knn_exponent=float(fusion_raw.get("knn_exponent", 0.2)), | ||
| 424 | + ), | ||
| 415 | ), | 425 | ), |
| 416 | spu_config=SPUConfig( | 426 | spu_config=SPUConfig( |
| 417 | enabled=bool(spu_cfg.get("enabled", False)), | 427 | enabled=bool(spu_cfg.get("enabled", False)), |
config/schema.py
| @@ -91,6 +91,21 @@ class FunctionScoreConfig: | @@ -91,6 +91,21 @@ class FunctionScoreConfig: | ||
| 91 | 91 | ||
| 92 | 92 | ||
| 93 | @dataclass(frozen=True) | 93 | @dataclass(frozen=True) |
| 94 | +class RerankFusionConfig: | ||
| 95 | + """ | ||
| 96 | + Multiplicative fusion: fused = Π (max(score_i, 0) + bias_i) ** exponent_i | ||
| 97 | + for rerank / text / knn terms respectively. | ||
| 98 | + """ | ||
| 99 | + | ||
| 100 | + rerank_bias: float = 0.00001 | ||
| 101 | + rerank_exponent: float = 1.0 | ||
| 102 | + text_bias: float = 0.1 | ||
| 103 | + text_exponent: float = 0.35 | ||
| 104 | + knn_bias: float = 0.6 | ||
| 105 | + knn_exponent: float = 0.2 | ||
| 106 | + | ||
| 107 | + | ||
| 108 | +@dataclass(frozen=True) | ||
| 94 | class RerankConfig: | 109 | class RerankConfig: |
| 95 | """Search-time rerank configuration.""" | 110 | """Search-time rerank configuration.""" |
| 96 | 111 | ||
| @@ -101,6 +116,7 @@ class RerankConfig: | @@ -101,6 +116,7 @@ class RerankConfig: | ||
| 101 | weight_ai: float = 0.6 | 116 | weight_ai: float = 0.6 |
| 102 | rerank_query_template: str = "{query}" | 117 | rerank_query_template: str = "{query}" |
| 103 | rerank_doc_template: str = "{title}" | 118 | rerank_doc_template: str = "{title}" |
| 119 | + fusion: RerankFusionConfig = field(default_factory=RerankFusionConfig) | ||
| 104 | 120 | ||
| 105 | 121 | ||
| 106 | @dataclass(frozen=True) | 122 | @dataclass(frozen=True) |
frontend/static/js/app.js
| @@ -411,11 +411,6 @@ function displayResults(data) { | @@ -411,11 +411,6 @@ function displayResults(data) { | ||
| 411 | const esNorm = typeof debug.es_score_normalized === 'number' | 411 | const esNorm = typeof debug.es_score_normalized === 'number' |
| 412 | ? debug.es_score_normalized.toFixed(4) | 412 | ? debug.es_score_normalized.toFixed(4) |
| 413 | : (debug.es_score_normalized == null ? '' : String(debug.es_score_normalized)); | 413 | : (debug.es_score_normalized == null ? '' : String(debug.es_score_normalized)); |
| 414 | - | ||
| 415 | - const esNormMinMax = typeof debug.es_score_norm === 'number' | ||
| 416 | - ? debug.es_score_norm.toFixed(4) | ||
| 417 | - : (debug.es_score_norm == null ? '' : String(debug.es_score_norm)); | ||
| 418 | - | ||
| 419 | const rerankScore = typeof debug.rerank_score === 'number' | 414 | const rerankScore = typeof debug.rerank_score === 'number' |
| 420 | ? debug.rerank_score.toFixed(4) | 415 | ? debug.rerank_score.toFixed(4) |
| 421 | : (debug.rerank_score == null ? '' : String(debug.rerank_score)); | 416 | : (debug.rerank_score == null ? '' : String(debug.rerank_score)); |
| @@ -437,13 +432,28 @@ function displayResults(data) { | @@ -437,13 +432,28 @@ function displayResults(data) { | ||
| 437 | const resultJson = customStringify(result); | 432 | const resultJson = customStringify(result); |
| 438 | const rawUrl = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`; | 433 | const rawUrl = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`; |
| 439 | const rerankInputHtml = debug.rerank_input | 434 | const rerankInputHtml = debug.rerank_input |
| 440 | - ? `<details><summary>Rerank input</summary><pre style="background: #f5f5f5; padding: 10px; overflow: auto; max-height: 220px;">${escapeHtml(customStringify(debug.rerank_input))}</pre></details>` | 435 | + ? ` |
| 436 | + <details> | ||
| 437 | + <summary>Rerank input</summary> | ||
| 438 | + <pre style="background: #f5f5f5; padding: 10px; overflow: auto; max-height: 220px;">${escapeHtml(customStringify(debug.rerank_input))}</pre> | ||
| 439 | + </details> | ||
| 440 | + ` | ||
| 441 | : ''; | 441 | : ''; |
| 442 | const styleIntentHtml = debug.style_intent_sku | 442 | const styleIntentHtml = debug.style_intent_sku |
| 443 | - ? `<details><summary>Selected SKU</summary><pre style="background: #f5f5f5; padding: 10px; overflow: auto; max-height: 220px;">${escapeHtml(customStringify(debug.style_intent_sku))}</pre></details>` | 443 | + ? ` |
| 444 | + <details> | ||
| 445 | + <summary>Selected SKU</summary> | ||
| 446 | + <pre style="background: #f5f5f5; padding: 10px; overflow: auto; max-height: 220px;">${escapeHtml(customStringify(debug.style_intent_sku))}</pre> | ||
| 447 | + </details> | ||
| 448 | + ` | ||
| 444 | : ''; | 449 | : ''; |
| 445 | const matchedQueriesHtml = debug.matched_queries | 450 | const matchedQueriesHtml = debug.matched_queries |
| 446 | - ? `<details><summary>matched_queries</summary><pre style="background: #f5f5f5; padding: 10px; overflow: auto; max-height: 220px;">${escapeHtml(customStringify(debug.matched_queries))}</pre></details>` | 451 | + ? ` |
| 452 | + <details> | ||
| 453 | + <summary>matched_queries</summary> | ||
| 454 | + <pre style="background: #f5f5f5; padding: 10px; overflow: auto; max-height: 220px;">${escapeHtml(customStringify(debug.matched_queries))}</pre> | ||
| 455 | + </details> | ||
| 456 | + ` | ||
| 447 | : ''; | 457 | : ''; |
| 448 | 458 | ||
| 449 | debugHtml = ` | 459 | debugHtml = ` |
| @@ -453,9 +463,7 @@ function displayResults(data) { | @@ -453,9 +463,7 @@ function displayResults(data) { | ||
| 453 | <div class="product-debug-line">Position before rerank: ${escapeHtml(String(debug.initial_rank ?? ''))}</div> | 463 | <div class="product-debug-line">Position before rerank: ${escapeHtml(String(debug.initial_rank ?? ''))}</div> |
| 454 | <div class="product-debug-line">Position after rerank: ${escapeHtml(String(debug.final_rank ?? ''))}</div> | 464 | <div class="product-debug-line">Position after rerank: ${escapeHtml(String(debug.final_rank ?? ''))}</div> |
| 455 | <div class="product-debug-line">ES score: ${esScore}</div> | 465 | <div class="product-debug-line">ES score: ${esScore}</div> |
| 456 | - <div class="product-debug-line">ES normalized (score / initial ES max): ${esNorm}</div> | ||
| 457 | - <div class="product-debug-line">ES norm (min-max over initial ES window): ${esNormMinMax}</div> | ||
| 458 | - <div class="product-debug-line">ES score min/max: ${escapeHtml(String(debug.es_score_min ?? ''))} / ${escapeHtml(String(debug.es_score_max ?? ''))}</div> | 466 | + <div class="product-debug-line">ES normalized: ${esNorm}</div> |
| 459 | <div class="product-debug-line">Rerank score: ${rerankScore}</div> | 467 | <div class="product-debug-line">Rerank score: ${rerankScore}</div> |
| 460 | <div class="product-debug-line">rerank_factor: ${escapeHtml(String(debug.rerank_factor ?? ''))}</div> | 468 | <div class="product-debug-line">rerank_factor: ${escapeHtml(String(debug.rerank_factor ?? ''))}</div> |
| 461 | <div class="product-debug-line">text_score: ${escapeHtml(String(debug.text_score ?? ''))}</div> | 469 | <div class="product-debug-line">text_score: ${escapeHtml(String(debug.text_score ?? ''))}</div> |
| @@ -910,9 +918,6 @@ function displayDebugInfo(data) { | @@ -910,9 +918,6 @@ function displayDebugInfo(data) { | ||
| 910 | html += `<div>query_tokens: ${escapeHtml((debugInfo.query_analysis.query_tokens || []).join(', ') || 'N/A')}</div>`; | 918 | html += `<div>query_tokens: ${escapeHtml((debugInfo.query_analysis.query_tokens || []).join(', ') || 'N/A')}</div>`; |
| 911 | html += `<div>domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}</div>`; | 919 | html += `<div>domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}</div>`; |
| 912 | html += `<div>is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}</div>`; | 920 | html += `<div>is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}</div>`; |
| 913 | - if (debugInfo.query_analysis.query_vector_summary) { | ||
| 914 | - html += `<div>query_vector_summary: ${escapeHtml(customStringify(debugInfo.query_analysis.query_vector_summary))}</div>`; | ||
| 915 | - } | ||
| 916 | 921 | ||
| 917 | if (debugInfo.query_analysis.translations && Object.keys(debugInfo.query_analysis.translations).length > 0) { | 922 | if (debugInfo.query_analysis.translations && Object.keys(debugInfo.query_analysis.translations).length > 0) { |
| 918 | html += '<div>translations: '; | 923 | html += '<div>translations: '; |
| @@ -946,25 +951,21 @@ function displayDebugInfo(data) { | @@ -946,25 +951,21 @@ function displayDebugInfo(data) { | ||
| 946 | html += `<div>took_ms: ${debugInfo.es_response.took_ms}ms</div>`; | 951 | html += `<div>took_ms: ${debugInfo.es_response.took_ms}ms</div>`; |
| 947 | html += `<div>total_hits: ${debugInfo.es_response.total_hits}</div>`; | 952 | html += `<div>total_hits: ${debugInfo.es_response.total_hits}</div>`; |
| 948 | html += `<div>max_score: ${debugInfo.es_response.max_score?.toFixed(3) || 0}</div>`; | 953 | html += `<div>max_score: ${debugInfo.es_response.max_score?.toFixed(3) || 0}</div>`; |
| 949 | - html += `<div>initial_es_max_score: ${escapeHtml(String(debugInfo.es_response.initial_es_max_score ?? ''))}</div>`; | ||
| 950 | - html += `<div>initial_es_min_score: ${escapeHtml(String(debugInfo.es_response.initial_es_min_score ?? ''))}</div>`; | 954 | + html += `<div>es_score_normalization_factor: ${escapeHtml(String(debugInfo.es_response.es_score_normalization_factor ?? ''))}</div>`; |
| 951 | html += '</div>'; | 955 | html += '</div>'; |
| 952 | } | 956 | } |
| 953 | 957 | ||
| 954 | if (debugInfo.rerank) { | 958 | if (debugInfo.rerank) { |
| 955 | html += '<div style="margin-bottom: 15px;"><strong style="font-size: 14px;">Rerank:</strong>'; | 959 | html += '<div style="margin-bottom: 15px;"><strong style="font-size: 14px;">Rerank:</strong>'; |
| 956 | - html += `<div>requested: ${debugInfo.rerank.requested ? 'yes' : 'no'}</div>`; | ||
| 957 | - html += `<div>executed: ${debugInfo.rerank.executed ? 'yes' : 'no'}</div>`; | ||
| 958 | - html += `<div>in_rerank_window: ${debugInfo.rerank.in_rerank_window ? 'yes' : 'no'}</div>`; | ||
| 959 | - html += `<div>top_n: ${escapeHtml(String(debugInfo.rerank.top_n ?? ''))}</div>`; | ||
| 960 | html += `<div>query_template: ${escapeHtml(debugInfo.rerank.query_template || 'N/A')}</div>`; | 960 | html += `<div>query_template: ${escapeHtml(debugInfo.rerank.query_template || 'N/A')}</div>`; |
| 961 | html += `<div>doc_template: ${escapeHtml(debugInfo.rerank.doc_template || 'N/A')}</div>`; | 961 | html += `<div>doc_template: ${escapeHtml(debugInfo.rerank.doc_template || 'N/A')}</div>`; |
| 962 | - html += '</div>'; | ||
| 963 | - } | ||
| 964 | - | ||
| 965 | - if (debugInfo.page_fill) { | ||
| 966 | - html += '<div style="margin-bottom: 15px;"><strong style="font-size: 14px;">Page Fill:</strong>'; | ||
| 967 | - html += `<pre style="background: #f5f5f5; padding: 10px; overflow: auto; max-height: 220px;">${escapeHtml(customStringify(debugInfo.page_fill))}</pre>`; | 962 | + html += `<div>query_text: ${escapeHtml(debugInfo.rerank.query_text || 'N/A')}</div>`; |
| 963 | + html += `<div>docs: ${escapeHtml(String(debugInfo.rerank.docs ?? ''))}</div>`; | ||
| 964 | + html += `<div>top_n: ${escapeHtml(String(debugInfo.rerank.top_n ?? ''))}</div>`; | ||
| 965 | + if (debugInfo.rerank.fusion) { | ||
| 966 | + html += '<div>fusion:</div>'; | ||
| 967 | + html += `<pre style="background: #f5f5f5; padding: 10px; overflow: auto; max-height: 160px;">${escapeHtml(customStringify(debugInfo.rerank.fusion))}</pre>`; | ||
| 968 | + } | ||
| 968 | html += '</div>'; | 969 | html += '</div>'; |
| 969 | } | 970 | } |
| 970 | 971 | ||
| @@ -992,7 +993,7 @@ function displayDebugInfo(data) { | @@ -992,7 +993,7 @@ function displayDebugInfo(data) { | ||
| 992 | 993 | ||
| 993 | if (debugInfo.es_query_context) { | 994 | if (debugInfo.es_query_context) { |
| 994 | html += '<div style="margin-bottom: 15px;"><strong style="font-size: 14px;">ES Query Context:</strong>'; | 995 | html += '<div style="margin-bottom: 15px;"><strong style="font-size: 14px;">ES Query Context:</strong>'; |
| 995 | - html += `<pre style="background: #f5f5f5; padding: 10px; overflow: auto; max-height: 300px;">${escapeHtml(customStringify(debugInfo.es_query_context))}</pre>`; | 996 | + html += `<pre style="background: #f5f5f5; padding: 10px; overflow: auto; max-height: 240px;">${escapeHtml(customStringify(debugInfo.es_query_context))}</pre>`; |
| 996 | html += '</div>'; | 997 | html += '</div>'; |
| 997 | } | 998 | } |
| 998 | 999 |
search/rerank_client.py
| @@ -10,6 +10,7 @@ | @@ -10,6 +10,7 @@ | ||
| 10 | from typing import Dict, Any, List, Optional, Tuple | 10 | from typing import Dict, Any, List, Optional, Tuple |
| 11 | import logging | 11 | import logging |
| 12 | 12 | ||
| 13 | +from config.schema import RerankFusionConfig | ||
| 13 | from providers import create_rerank_provider | 14 | from providers import create_rerank_provider |
| 14 | 15 | ||
| 15 | logger = logging.getLogger(__name__) | 16 | logger = logging.getLogger(__name__) |
| @@ -176,17 +177,34 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa | @@ -176,17 +177,34 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa | ||
| 176 | } | 177 | } |
| 177 | 178 | ||
| 178 | 179 | ||
| 180 | +def _multiply_fusion_factors( | ||
| 181 | + rerank_score: float, | ||
| 182 | + text_score: float, | ||
| 183 | + knn_score: float, | ||
| 184 | + fusion: RerankFusionConfig, | ||
| 185 | +) -> Tuple[float, float, float, float]: | ||
| 186 | + """(rerank_factor, text_factor, knn_factor, fused).""" | ||
| 187 | + r = (max(rerank_score, 0.0) + fusion.rerank_bias) ** fusion.rerank_exponent | ||
| 188 | + t = (max(text_score, 0.0) + fusion.text_bias) ** fusion.text_exponent | ||
| 189 | + k = (max(knn_score, 0.0) + fusion.knn_bias) ** fusion.knn_exponent | ||
| 190 | + return r, t, k, r * t * k | ||
| 191 | + | ||
| 192 | + | ||
| 179 | def fuse_scores_and_resort( | 193 | def fuse_scores_and_resort( |
| 180 | es_hits: List[Dict[str, Any]], | 194 | es_hits: List[Dict[str, Any]], |
| 181 | rerank_scores: List[float], | 195 | rerank_scores: List[float], |
| 182 | weight_es: float = DEFAULT_WEIGHT_ES, | 196 | weight_es: float = DEFAULT_WEIGHT_ES, |
| 183 | weight_ai: float = DEFAULT_WEIGHT_AI, | 197 | weight_ai: float = DEFAULT_WEIGHT_AI, |
| 198 | + fusion: Optional[RerankFusionConfig] = None, | ||
| 184 | debug: bool = False, | 199 | debug: bool = False, |
| 185 | rerank_debug_rows: Optional[List[Dict[str, Any]]] = None, | 200 | rerank_debug_rows: Optional[List[Dict[str, Any]]] = None, |
| 186 | ) -> List[Dict[str, Any]]: | 201 | ) -> List[Dict[str, Any]]: |
| 187 | """ | 202 | """ |
| 188 | 将 ES 分数与重排分数按乘法公式融合(不修改原始 _score),并按融合分数降序重排。 | 203 | 将 ES 分数与重排分数按乘法公式融合(不修改原始 _score),并按融合分数降序重排。 |
| 189 | 204 | ||
| 205 | + 融合形式(由 ``fusion`` 配置 bias / exponent):: | ||
| 206 | + fused = (max(rerank,0)+b_r)^e_r * (max(text,0)+b_t)^e_t * (max(knn,0)+b_k)^e_k | ||
| 207 | + | ||
| 190 | 对每条 hit 会写入: | 208 | 对每条 hit 会写入: |
| 191 | - _original_score: 原始 ES 分数 | 209 | - _original_score: 原始 ES 分数 |
| 192 | - _rerank_score: 重排服务返回的分数 | 210 | - _rerank_score: 重排服务返回的分数 |
| @@ -199,40 +217,35 @@ def fuse_scores_and_resort( | @@ -199,40 +217,35 @@ def fuse_scores_and_resort( | ||
| 199 | rerank_scores: 与 es_hits 等长的重排分数列表 | 217 | rerank_scores: 与 es_hits 等长的重排分数列表 |
| 200 | weight_es: 兼容保留,当前未使用 | 218 | weight_es: 兼容保留,当前未使用 |
| 201 | weight_ai: 兼容保留,当前未使用 | 219 | weight_ai: 兼容保留,当前未使用 |
| 202 | - | ||
| 203 | - Returns: | ||
| 204 | - 每条文档的融合调试信息列表,用于 debug_info | ||
| 205 | """ | 220 | """ |
| 206 | n = len(es_hits) | 221 | n = len(es_hits) |
| 207 | if n == 0 or len(rerank_scores) != n: | 222 | if n == 0 or len(rerank_scores) != n: |
| 208 | return [] | 223 | return [] |
| 209 | 224 | ||
| 210 | - fused_debug: List[Dict[str, Any]] = [] | 225 | + f = fusion or RerankFusionConfig() |
| 226 | + fused_debug: List[Dict[str, Any]] = [] if debug else [] | ||
| 211 | 227 | ||
| 212 | for idx, hit in enumerate(es_hits): | 228 | for idx, hit in enumerate(es_hits): |
| 213 | es_score = _to_score(hit.get("_score")) | 229 | es_score = _to_score(hit.get("_score")) |
| 214 | - | ||
| 215 | - ai_score_raw = rerank_scores[idx] | ||
| 216 | - rerank_score = _to_score(ai_score_raw) | ||
| 217 | - | 230 | + rerank_score = _to_score(rerank_scores[idx]) |
| 218 | matched_queries = hit.get("matched_queries") | 231 | matched_queries = hit.get("matched_queries") |
| 219 | knn_score = _extract_named_query_score(matched_queries, "knn_query") | 232 | knn_score = _extract_named_query_score(matched_queries, "knn_query") |
| 220 | text_components = _collect_text_score_components(matched_queries, es_score) | 233 | text_components = _collect_text_score_components(matched_queries, es_score) |
| 221 | text_score = text_components["text_score"] | 234 | text_score = text_components["text_score"] |
| 222 | - rerank_factor = max(rerank_score, 0.0) + 0.00001 | ||
| 223 | - text_factor = (max(text_score, 0.0) + 0.1) ** 0.35 | ||
| 224 | - knn_factor = (max(knn_score, 0.0) + 0.6) ** 0.2 | ||
| 225 | - fused = rerank_factor * text_factor * knn_factor | 235 | + rerank_factor, text_factor, knn_factor, fused = _multiply_fusion_factors( |
| 236 | + rerank_score, text_score, knn_score, f | ||
| 237 | + ) | ||
| 226 | 238 | ||
| 227 | hit["_original_score"] = hit.get("_score") | 239 | hit["_original_score"] = hit.get("_score") |
| 228 | hit["_rerank_score"] = rerank_score | 240 | hit["_rerank_score"] = rerank_score |
| 229 | hit["_text_score"] = text_score | 241 | hit["_text_score"] = text_score |
| 230 | hit["_knn_score"] = knn_score | 242 | hit["_knn_score"] = knn_score |
| 231 | - hit["_text_source_score"] = text_components["source_score"] | ||
| 232 | - hit["_text_translation_score"] = text_components["translation_score"] | ||
| 233 | - hit["_text_primary_score"] = text_components["primary_text_score"] | ||
| 234 | - hit["_text_support_score"] = text_components["support_text_score"] | ||
| 235 | hit["_fused_score"] = fused | 243 | hit["_fused_score"] = fused |
| 244 | + if debug: | ||
| 245 | + hit["_text_source_score"] = text_components["source_score"] | ||
| 246 | + hit["_text_translation_score"] = text_components["translation_score"] | ||
| 247 | + hit["_text_primary_score"] = text_components["primary_text_score"] | ||
| 248 | + hit["_text_support_score"] = text_components["support_text_score"] | ||
| 236 | 249 | ||
| 237 | if debug: | 250 | if debug: |
| 238 | debug_entry = { | 251 | debug_entry = { |
| @@ -262,7 +275,6 @@ def fuse_scores_and_resort( | @@ -262,7 +275,6 @@ def fuse_scores_and_resort( | ||
| 262 | debug_entry["rerank_input"] = rerank_debug_rows[idx] | 275 | debug_entry["rerank_input"] = rerank_debug_rows[idx] |
| 263 | fused_debug.append(debug_entry) | 276 | fused_debug.append(debug_entry) |
| 264 | 277 | ||
| 265 | - # 按融合分数降序重排 | ||
| 266 | es_hits.sort( | 278 | es_hits.sort( |
| 267 | key=lambda h: h.get("_fused_score", h.get("_score", 0.0)), | 279 | key=lambda h: h.get("_fused_score", h.get("_score", 0.0)), |
| 268 | reverse=True, | 280 | reverse=True, |
| @@ -281,6 +293,7 @@ def run_rerank( | @@ -281,6 +293,7 @@ def run_rerank( | ||
| 281 | rerank_doc_template: str = "{title}", | 293 | rerank_doc_template: str = "{title}", |
| 282 | top_n: Optional[int] = None, | 294 | top_n: Optional[int] = None, |
| 283 | debug: bool = False, | 295 | debug: bool = False, |
| 296 | + fusion: Optional[RerankFusionConfig] = None, | ||
| 284 | ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]], List[Dict[str, Any]]]: | 297 | ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]], List[Dict[str, Any]]]: |
| 285 | """ | 298 | """ |
| 286 | 完整重排流程:从 es_response 取 hits -> 构造 docs -> 调服务 -> 融合分数并重排 -> 更新 max_score。 | 299 | 完整重排流程:从 es_response 取 hits -> 构造 docs -> 调服务 -> 融合分数并重排 -> 更新 max_score。 |
| @@ -314,6 +327,7 @@ def run_rerank( | @@ -314,6 +327,7 @@ def run_rerank( | ||
| 314 | scores, | 327 | scores, |
| 315 | weight_es=weight_es, | 328 | weight_es=weight_es, |
| 316 | weight_ai=weight_ai, | 329 | weight_ai=weight_ai, |
| 330 | + fusion=fusion, | ||
| 317 | debug=debug, | 331 | debug=debug, |
| 318 | rerank_debug_rows=rerank_debug_rows, | 332 | rerank_debug_rows=rerank_debug_rows, |
| 319 | ) | 333 | ) |
search/searcher.py
| @@ -4,9 +4,8 @@ Main Searcher module - executes search queries against Elasticsearch. | @@ -4,9 +4,8 @@ Main Searcher module - executes search queries against Elasticsearch. | ||
| 4 | Handles query parsing, ranking, and result formatting. | 4 | Handles query parsing, ranking, and result formatting. |
| 5 | """ | 5 | """ |
| 6 | 6 | ||
| 7 | -from typing import Dict, Any, List, Optional, Union, Tuple | ||
| 8 | -import os | ||
| 9 | -import time, json | 7 | +from typing import Dict, Any, List, Optional |
| 8 | +import json | ||
| 10 | import logging | 9 | import logging |
| 11 | import hashlib | 10 | import hashlib |
| 12 | from string import Formatter | 11 | from string import Formatter |
| @@ -20,7 +19,7 @@ from .sku_intent_selector import SkuSelectionDecision, StyleSkuSelector | @@ -20,7 +19,7 @@ from .sku_intent_selector import SkuSelectionDecision, StyleSkuSelector | ||
| 20 | from config import SearchConfig | 19 | from config import SearchConfig |
| 21 | from config.tenant_config_loader import get_tenant_config_loader | 20 | from config.tenant_config_loader import get_tenant_config_loader |
| 22 | from context.request_context import RequestContext, RequestContextStage | 21 | from context.request_context import RequestContext, RequestContextStage |
| 23 | -from api.models import FacetResult, FacetValue, FacetConfig | 22 | +from api.models import FacetResult, FacetConfig |
| 24 | from api.result_formatter import ResultFormatter | 23 | from api.result_formatter import ResultFormatter |
| 25 | from indexer.mapping_generator import get_tenant_index_name | 24 | from indexer.mapping_generator import get_tenant_index_name |
| 26 | 25 | ||
| @@ -259,13 +258,7 @@ class Searcher: | @@ -259,13 +258,7 @@ class Searcher: | ||
| 259 | if context is not None: | 258 | if context is not None: |
| 260 | context.start_stage(RequestContextStage.STYLE_SKU_PREPARE_HITS) | 259 | context.start_stage(RequestContextStage.STYLE_SKU_PREPARE_HITS) |
| 261 | try: | 260 | try: |
| 262 | - decisions = self.style_sku_selector.prepare_hits(es_hits, parsed_query) | ||
| 263 | - if decisions and context is not None: | ||
| 264 | - context.store_intermediate_result( | ||
| 265 | - "style_intent_sku_decisions", | ||
| 266 | - {doc_id: decision.to_dict() for doc_id, decision in decisions.items()}, | ||
| 267 | - ) | ||
| 268 | - return decisions | 261 | + return self.style_sku_selector.prepare_hits(es_hits, parsed_query) |
| 269 | finally: | 262 | finally: |
| 270 | if context is not None: | 263 | if context is not None: |
| 271 | context.end_stage(RequestContextStage.STYLE_SKU_PREPARE_HITS) | 264 | context.end_stage(RequestContextStage.STYLE_SKU_PREPARE_HITS) |
| @@ -339,21 +332,10 @@ class Searcher: | @@ -339,21 +332,10 @@ class Searcher: | ||
| 339 | in_rerank_window = do_rerank and (from_ + size) <= rerank_window | 332 | in_rerank_window = do_rerank and (from_ + size) <= rerank_window |
| 340 | es_fetch_from = 0 if in_rerank_window else from_ | 333 | es_fetch_from = 0 if in_rerank_window else from_ |
| 341 | es_fetch_size = rerank_window if in_rerank_window else size | 334 | es_fetch_size = rerank_window if in_rerank_window else size |
| 342 | - initial_es_positions: Dict[str, int] = {} | ||
| 343 | - initial_es_min_score: Optional[float] = None | ||
| 344 | - initial_es_max_score: Optional[float] = None | ||
| 345 | - page_fill_debug: Optional[Dict[str, Any]] = None | 335 | + |
| 336 | + es_score_normalization_factor: Optional[float] = None | ||
| 337 | + initial_ranks_by_doc: Dict[str, int] = {} | ||
| 346 | rerank_debug_info: Optional[Dict[str, Any]] = None | 338 | rerank_debug_info: Optional[Dict[str, Any]] = None |
| 347 | - if debug: | ||
| 348 | - rerank_debug_info = { | ||
| 349 | - "requested": do_rerank, | ||
| 350 | - "executed": False, | ||
| 351 | - "in_rerank_window": in_rerank_window, | ||
| 352 | - "rerank_window": rerank_window, | ||
| 353 | - "top_n": from_ + size, | ||
| 354 | - "query_template": effective_query_template, | ||
| 355 | - "doc_template": effective_doc_template, | ||
| 356 | - } | ||
| 357 | 339 | ||
| 358 | # Start timing | 340 | # Start timing |
| 359 | context.start_stage(RequestContextStage.TOTAL) | 341 | context.start_stage(RequestContextStage.TOTAL) |
| @@ -402,8 +384,8 @@ class Searcher: | @@ -402,8 +384,8 @@ class Searcher: | ||
| 402 | try: | 384 | try: |
| 403 | parsed_query = self.query_parser.parse( | 385 | parsed_query = self.query_parser.parse( |
| 404 | query, | 386 | query, |
| 405 | - tenant_id=tenant_id, | ||
| 406 | generate_vector=enable_embedding, | 387 | generate_vector=enable_embedding, |
| 388 | + tenant_id=tenant_id, | ||
| 407 | context=context, | 389 | context=context, |
| 408 | target_languages=index_langs if enable_translation else [], | 390 | target_languages=index_langs if enable_translation else [], |
| 409 | ) | 391 | ) |
| @@ -493,8 +475,6 @@ class Searcher: | @@ -493,8 +475,6 @@ class Searcher: | ||
| 493 | context.store_intermediate_result('es_query', es_query) | 475 | context.store_intermediate_result('es_query', es_query) |
| 494 | if in_rerank_window and rerank_prefetch_source is not None: | 476 | if in_rerank_window and rerank_prefetch_source is not None: |
| 495 | context.store_intermediate_result('es_query_rerank_prefetch_source', rerank_prefetch_source) | 477 | context.store_intermediate_result('es_query_rerank_prefetch_source', rerank_prefetch_source) |
| 496 | - context.store_intermediate_result('es_body_for_search', body_for_es) | ||
| 497 | - | ||
| 498 | # Serialize ES query to compute a compact size + stable digest for correlation | 478 | # Serialize ES query to compute a compact size + stable digest for correlation |
| 499 | es_query_compact = json.dumps(es_query_for_fetch, ensure_ascii=False, separators=(",", ":")) | 479 | es_query_compact = json.dumps(es_query_for_fetch, ensure_ascii=False, separators=(",", ":")) |
| 500 | es_query_digest = hashlib.sha256(es_query_compact.encode("utf-8")).hexdigest()[:16] | 480 | es_query_digest = hashlib.sha256(es_query_compact.encode("utf-8")).hexdigest()[:16] |
| @@ -548,35 +528,29 @@ class Searcher: | @@ -548,35 +528,29 @@ class Searcher: | ||
| 548 | # Store ES response in context | 528 | # Store ES response in context |
| 549 | context.store_intermediate_result('es_response', es_response) | 529 | context.store_intermediate_result('es_response', es_response) |
| 550 | if debug: | 530 | if debug: |
| 551 | - initial_hits = es_response.get('hits', {}).get('hits') or [] | ||
| 552 | - initial_scores: List[float] = [] | 531 | + initial_hits = es_response.get("hits", {}).get("hits") or [] |
| 553 | for rank, hit in enumerate(initial_hits, 1): | 532 | for rank, hit in enumerate(initial_hits, 1): |
| 554 | doc_id = hit.get("_id") | 533 | doc_id = hit.get("_id") |
| 555 | if doc_id is not None: | 534 | if doc_id is not None: |
| 556 | - initial_es_positions[str(doc_id)] = rank | ||
| 557 | - raw_score = hit.get("_score") | ||
| 558 | - try: | ||
| 559 | - if raw_score is not None: | ||
| 560 | - initial_scores.append(float(raw_score)) | ||
| 561 | - except (TypeError, ValueError): | ||
| 562 | - pass | ||
| 563 | - raw_max_score = es_response.get('hits', {}).get('max_score') | 535 | + initial_ranks_by_doc[str(doc_id)] = rank |
| 536 | + raw_initial_max_score = es_response.get("hits", {}).get("max_score") | ||
| 564 | try: | 537 | try: |
| 565 | - initial_es_max_score = float(raw_max_score) if raw_max_score is not None else None | 538 | + es_score_normalization_factor = float(raw_initial_max_score) if raw_initial_max_score is not None else None |
| 566 | except (TypeError, ValueError): | 539 | except (TypeError, ValueError): |
| 567 | - initial_es_max_score = None | ||
| 568 | - if initial_es_max_score is None and initial_scores: | ||
| 569 | - initial_es_max_score = max(initial_scores) | ||
| 570 | - initial_es_min_score = min(initial_scores) if initial_scores else None | 540 | + es_score_normalization_factor = None |
| 541 | + if es_score_normalization_factor is None and initial_hits: | ||
| 542 | + first_score = initial_hits[0].get("_score") | ||
| 543 | + try: | ||
| 544 | + es_score_normalization_factor = float(first_score) if first_score is not None else None | ||
| 545 | + except (TypeError, ValueError): | ||
| 546 | + es_score_normalization_factor = None | ||
| 571 | 547 | ||
| 572 | # Extract timing from ES response | 548 | # Extract timing from ES response |
| 573 | es_took = es_response.get('took', 0) | 549 | es_took = es_response.get('took', 0) |
| 574 | context.logger.info( | 550 | context.logger.info( |
| 575 | f"ES搜索完成 | 耗时: {es_took}ms | " | 551 | f"ES搜索完成 | 耗时: {es_took}ms | " |
| 576 | f"命中数: {es_response.get('hits', {}).get('total', {}).get('value', 0)} | " | 552 | f"命中数: {es_response.get('hits', {}).get('total', {}).get('value', 0)} | " |
| 577 | - f"最高分: {(es_response.get('hits', {}).get('max_score') or 0):.3f} | " | ||
| 578 | - f"detected_language={parsed_query.detected_language} | " | ||
| 579 | - f"translations={list((parsed_query.translations or {}).keys())}", | 553 | + f"最高分: {(es_response.get('hits', {}).get('max_score') or 0):.3f}", |
| 580 | extra={'reqid': context.reqid, 'uid': context.uid} | 554 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 581 | ) | 555 | ) |
| 582 | except Exception as e: | 556 | except Exception as e: |
| @@ -621,30 +595,31 @@ class Searcher: | @@ -621,30 +595,31 @@ class Searcher: | ||
| 621 | rerank_doc_template=effective_doc_template, | 595 | rerank_doc_template=effective_doc_template, |
| 622 | top_n=(from_ + size), | 596 | top_n=(from_ + size), |
| 623 | debug=debug, | 597 | debug=debug, |
| 598 | + fusion=rc.fusion, | ||
| 624 | ) | 599 | ) |
| 625 | 600 | ||
| 626 | if rerank_meta is not None: | 601 | if rerank_meta is not None: |
| 627 | - from config.services_config import get_rerank_service_url | ||
| 628 | - rerank_url = get_rerank_service_url() | ||
| 629 | - if debug and rerank_debug_info is not None: | ||
| 630 | - rerank_debug_info.update({ | ||
| 631 | - "executed": True, | ||
| 632 | - "service_url": rerank_url, | 602 | + if debug: |
| 603 | + from dataclasses import asdict | ||
| 604 | + from config.services_config import get_rerank_service_url | ||
| 605 | + rerank_debug_info = { | ||
| 606 | + "service_url": get_rerank_service_url(), | ||
| 607 | + "query_template": effective_query_template, | ||
| 608 | + "doc_template": effective_doc_template, | ||
| 633 | "query_text": str(effective_query_template).format_map({"query": rerank_query}), | 609 | "query_text": str(effective_query_template).format_map({"query": rerank_query}), |
| 634 | "docs": len(es_response.get("hits", {}).get("hits") or []), | 610 | "docs": len(es_response.get("hits", {}).get("hits") or []), |
| 611 | + "top_n": from_ + size, | ||
| 635 | "meta": rerank_meta, | 612 | "meta": rerank_meta, |
| 636 | - }) | 613 | + "fusion": asdict(rc.fusion), |
| 614 | + } | ||
| 637 | context.store_intermediate_result("rerank_scores", fused_debug) | 615 | context.store_intermediate_result("rerank_scores", fused_debug) |
| 638 | context.logger.info( | 616 | context.logger.info( |
| 639 | f"重排完成 | docs={len(es_response.get('hits', {}).get('hits') or [])} | " | 617 | f"重排完成 | docs={len(es_response.get('hits', {}).get('hits') or [])} | " |
| 640 | - f"top_n={from_ + size} | query_template={effective_query_template} | " | ||
| 641 | - f"doc_template={effective_doc_template} | meta={rerank_meta}", | 618 | + f"top_n={from_ + size} | meta={rerank_meta}", |
| 642 | extra={'reqid': context.reqid, 'uid': context.uid} | 619 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 643 | ) | 620 | ) |
| 644 | except Exception as e: | 621 | except Exception as e: |
| 645 | context.add_warning(f"Rerank failed: {e}") | 622 | context.add_warning(f"Rerank failed: {e}") |
| 646 | - if debug and rerank_debug_info is not None: | ||
| 647 | - rerank_debug_info["error"] = str(e) | ||
| 648 | context.logger.warning( | 623 | context.logger.warning( |
| 649 | f"调用重排服务失败 | error: {e}", | 624 | f"调用重排服务失败 | error: {e}", |
| 650 | extra={'reqid': context.reqid, 'uid': context.uid}, | 625 | extra={'reqid': context.reqid, 'uid': context.uid}, |
| @@ -707,13 +682,6 @@ class Searcher: | @@ -707,13 +682,6 @@ class Searcher: | ||
| 707 | ) | 682 | ) |
| 708 | if fill_took: | 683 | if fill_took: |
| 709 | es_response["took"] = int((es_response.get("took", 0) or 0) + fill_took) | 684 | es_response["took"] = int((es_response.get("took", 0) or 0) + fill_took) |
| 710 | - if debug: | ||
| 711 | - page_fill_debug = { | ||
| 712 | - "requested_ids": page_ids, | ||
| 713 | - "filled": filled, | ||
| 714 | - "fill_took_ms": fill_took, | ||
| 715 | - "response_source_spec": response_source_spec, | ||
| 716 | - } | ||
| 717 | context.logger.info( | 685 | context.logger.info( |
| 718 | f"分页详情回填 | ids={len(page_ids)} | filled={filled} | took={fill_took}ms", | 686 | f"分页详情回填 | ids={len(page_ids)} | filled={filled} | took={fill_took}ms", |
| 719 | extra={'reqid': context.reqid, 'uid': context.uid} | 687 | extra={'reqid': context.reqid, 'uid': context.uid} |
| @@ -781,8 +749,8 @@ class Searcher: | @@ -781,8 +749,8 @@ class Searcher: | ||
| 781 | # Build per-result debug info (per SPU) when debug mode is enabled | 749 | # Build per-result debug info (per SPU) when debug mode is enabled |
| 782 | per_result_debug = [] | 750 | per_result_debug = [] |
| 783 | if debug and es_hits and formatted_results: | 751 | if debug and es_hits and formatted_results: |
| 784 | - final_positions_by_doc = { | ||
| 785 | - str(hit.get("_id")): (from_ + rank) | 752 | + final_ranks_by_doc = { |
| 753 | + str(hit.get("_id")): from_ + rank | ||
| 786 | for rank, hit in enumerate(es_hits, 1) | 754 | for rank, hit in enumerate(es_hits, 1) |
| 787 | if hit.get("_id") is not None | 755 | if hit.get("_id") is not None |
| 788 | } | 756 | } |
| @@ -805,28 +773,11 @@ class Searcher: | @@ -805,28 +773,11 @@ class Searcher: | ||
| 805 | es_score = 0.0 | 773 | es_score = 0.0 |
| 806 | try: | 774 | try: |
| 807 | normalized = ( | 775 | normalized = ( |
| 808 | - float(es_score) / float(initial_es_max_score) | ||
| 809 | - if initial_es_max_score | ||
| 810 | - else None | 776 | + float(es_score) / float(es_score_normalization_factor) |
| 777 | + if es_score_normalization_factor else None | ||
| 811 | ) | 778 | ) |
| 812 | except (TypeError, ValueError, ZeroDivisionError): | 779 | except (TypeError, ValueError, ZeroDivisionError): |
| 813 | normalized = None | 780 | normalized = None |
| 814 | - try: | ||
| 815 | - es_score_norm = ( | ||
| 816 | - (float(es_score) - float(initial_es_min_score)) | ||
| 817 | - / (float(initial_es_max_score) - float(initial_es_min_score)) | ||
| 818 | - if initial_es_min_score is not None | ||
| 819 | - and initial_es_max_score is not None | ||
| 820 | - and float(initial_es_max_score) > float(initial_es_min_score) | ||
| 821 | - else ( | ||
| 822 | - 1.0 | ||
| 823 | - if initial_es_min_score is not None | ||
| 824 | - and initial_es_max_score is not None | ||
| 825 | - else None | ||
| 826 | - ) | ||
| 827 | - ) | ||
| 828 | - except (TypeError, ValueError, ZeroDivisionError): | ||
| 829 | - es_score_norm = None | ||
| 830 | 781 | ||
| 831 | title_multilingual = source.get("title") if isinstance(source.get("title"), dict) else None | 782 | title_multilingual = source.get("title") if isinstance(source.get("title"), dict) else None |
| 832 | brief_multilingual = source.get("brief") if isinstance(source.get("brief"), dict) else None | 783 | brief_multilingual = source.get("brief") if isinstance(source.get("brief"), dict) else None |
| @@ -836,11 +787,8 @@ class Searcher: | @@ -836,11 +787,8 @@ class Searcher: | ||
| 836 | "spu_id": spu.spu_id, | 787 | "spu_id": spu.spu_id, |
| 837 | "es_score": es_score, | 788 | "es_score": es_score, |
| 838 | "es_score_normalized": normalized, | 789 | "es_score_normalized": normalized, |
| 839 | - "es_score_norm": es_score_norm, | ||
| 840 | - "es_score_min": initial_es_min_score, | ||
| 841 | - "es_score_max": initial_es_max_score, | ||
| 842 | - "initial_rank": initial_es_positions.get(str(doc_id)) if doc_id is not None else None, | ||
| 843 | - "final_rank": final_positions_by_doc.get(str(doc_id)) if doc_id is not None else None, | 790 | + "initial_rank": initial_ranks_by_doc.get(str(doc_id)) if doc_id is not None else None, |
| 791 | + "final_rank": final_ranks_by_doc.get(str(doc_id)) if doc_id is not None else None, | ||
| 844 | "title_multilingual": title_multilingual, | 792 | "title_multilingual": title_multilingual, |
| 845 | "brief_multilingual": brief_multilingual, | 793 | "brief_multilingual": brief_multilingual, |
| 846 | "vendor_multilingual": vendor_multilingual, | 794 | "vendor_multilingual": vendor_multilingual, |
| @@ -908,12 +856,6 @@ class Searcher: | @@ -908,12 +856,6 @@ class Searcher: | ||
| 908 | # Collect debug information if requested | 856 | # Collect debug information if requested |
| 909 | debug_info = None | 857 | debug_info = None |
| 910 | if debug: | 858 | if debug: |
| 911 | - query_vector_summary = None | ||
| 912 | - if parsed_query.query_vector is not None: | ||
| 913 | - query_vector_summary = { | ||
| 914 | - "dims": int(len(parsed_query.query_vector)), | ||
| 915 | - "preview": [round(float(v), 6) for v in parsed_query.query_vector[:8].tolist()], | ||
| 916 | - } | ||
| 917 | debug_info = { | 859 | debug_info = { |
| 918 | "query_analysis": { | 860 | "query_analysis": { |
| 919 | "original_query": context.query_analysis.original_query, | 861 | "original_query": context.query_analysis.original_query, |
| @@ -923,7 +865,6 @@ class Searcher: | @@ -923,7 +865,6 @@ class Searcher: | ||
| 923 | "index_languages": index_langs, | 865 | "index_languages": index_langs, |
| 924 | "translations": context.query_analysis.translations, | 866 | "translations": context.query_analysis.translations, |
| 925 | "has_vector": context.query_analysis.query_vector is not None, | 867 | "has_vector": context.query_analysis.query_vector is not None, |
| 926 | - "query_vector_summary": query_vector_summary, | ||
| 927 | "query_tokens": getattr(parsed_query, "query_tokens", []), | 868 | "query_tokens": getattr(parsed_query, "query_tokens", []), |
| 928 | "is_simple_query": context.query_analysis.is_simple_query, | 869 | "is_simple_query": context.query_analysis.is_simple_query, |
| 929 | "domain": context.query_analysis.domain, | 870 | "domain": context.query_analysis.domain, |
| @@ -931,12 +872,6 @@ class Searcher: | @@ -931,12 +872,6 @@ class Searcher: | ||
| 931 | }, | 872 | }, |
| 932 | "es_query": context.get_intermediate_result('es_query', {}), | 873 | "es_query": context.get_intermediate_result('es_query', {}), |
| 933 | "es_query_context": { | 874 | "es_query_context": { |
| 934 | - "filters": filters, | ||
| 935 | - "range_filters": range_filters, | ||
| 936 | - "facets": [getattr(facet, "field", str(facet)) for facet in facets] if facets else [], | ||
| 937 | - "sort_by": sort_by, | ||
| 938 | - "sort_order": sort_order, | ||
| 939 | - "min_score": min_score, | ||
| 940 | "es_fetch_from": es_fetch_from, | 875 | "es_fetch_from": es_fetch_from, |
| 941 | "es_fetch_size": es_fetch_size, | 876 | "es_fetch_size": es_fetch_size, |
| 942 | "in_rerank_window": in_rerank_window, | 877 | "in_rerank_window": in_rerank_window, |
| @@ -948,11 +883,9 @@ class Searcher: | @@ -948,11 +883,9 @@ class Searcher: | ||
| 948 | "total_hits": total_value, | 883 | "total_hits": total_value, |
| 949 | "max_score": max_score, | 884 | "max_score": max_score, |
| 950 | "shards": es_response.get('_shards', {}), | 885 | "shards": es_response.get('_shards', {}), |
| 951 | - "initial_es_max_score": initial_es_max_score, | ||
| 952 | - "initial_es_min_score": initial_es_min_score, | 886 | + "es_score_normalization_factor": es_score_normalization_factor, |
| 953 | }, | 887 | }, |
| 954 | "rerank": rerank_debug_info, | 888 | "rerank": rerank_debug_info, |
| 955 | - "page_fill": page_fill_debug, | ||
| 956 | "feature_flags": context.metadata.get('feature_flags', {}), | 889 | "feature_flags": context.metadata.get('feature_flags', {}), |
| 957 | "stage_timings": { | 890 | "stage_timings": { |
| 958 | k: round(v, 2) for k, v in context.performance_metrics.stage_timings.items() | 891 | k: round(v, 2) for k, v in context.performance_metrics.stage_timings.items() |
| @@ -1126,76 +1059,3 @@ class Searcher: | @@ -1126,76 +1059,3 @@ class Searcher: | ||
| 1126 | logger.error(f"Failed to get document {doc_id} from tenant {tenant_id}: {e}", exc_info=True) | 1059 | logger.error(f"Failed to get document {doc_id} from tenant {tenant_id}: {e}", exc_info=True) |
| 1127 | return None | 1060 | return None |
| 1128 | 1061 | ||
| 1129 | - def _standardize_facets( | ||
| 1130 | - self, | ||
| 1131 | - es_aggregations: Dict[str, Any], | ||
| 1132 | - facet_configs: Optional[List[Union[str, Any]]], | ||
| 1133 | - current_filters: Optional[Dict[str, Any]] | ||
| 1134 | - ) -> Optional[List[FacetResult]]: | ||
| 1135 | - """ | ||
| 1136 | - 将 ES 聚合结果转换为标准化的分面格式(返回 Pydantic 模型)。 | ||
| 1137 | - | ||
| 1138 | - Args: | ||
| 1139 | - es_aggregations: ES 原始聚合结果 | ||
| 1140 | - facet_configs: 分面配置列表(str 或 FacetConfig) | ||
| 1141 | - current_filters: 当前应用的过滤器 | ||
| 1142 | - | ||
| 1143 | - Returns: | ||
| 1144 | - 标准化的分面结果列表(FacetResult 对象) | ||
| 1145 | - """ | ||
| 1146 | - if not es_aggregations or not facet_configs: | ||
| 1147 | - return None | ||
| 1148 | - | ||
| 1149 | - standardized_facets: List[FacetResult] = [] | ||
| 1150 | - | ||
| 1151 | - for config in facet_configs: | ||
| 1152 | - # 解析配置 | ||
| 1153 | - if isinstance(config, str): | ||
| 1154 | - field = config | ||
| 1155 | - facet_type = "terms" | ||
| 1156 | - else: | ||
| 1157 | - # FacetConfig 对象 | ||
| 1158 | - field = config.field | ||
| 1159 | - facet_type = config.type | ||
| 1160 | - | ||
| 1161 | - agg_name = f"{field}_facet" | ||
| 1162 | - | ||
| 1163 | - if agg_name not in es_aggregations: | ||
| 1164 | - continue | ||
| 1165 | - | ||
| 1166 | - agg_result = es_aggregations[agg_name] | ||
| 1167 | - | ||
| 1168 | - # 获取当前字段的选中值 | ||
| 1169 | - selected_values = set() | ||
| 1170 | - if current_filters and field in current_filters: | ||
| 1171 | - filter_value = current_filters[field] | ||
| 1172 | - if isinstance(filter_value, list): | ||
| 1173 | - selected_values = set(filter_value) | ||
| 1174 | - else: | ||
| 1175 | - selected_values = {filter_value} | ||
| 1176 | - | ||
| 1177 | - # 转换 buckets 为 FacetValue 对象 | ||
| 1178 | - facet_values: List[FacetValue] = [] | ||
| 1179 | - if 'buckets' in agg_result: | ||
| 1180 | - for bucket in agg_result['buckets']: | ||
| 1181 | - value = bucket.get('key') | ||
| 1182 | - count = bucket.get('doc_count', 0) | ||
| 1183 | - | ||
| 1184 | - facet_values.append(FacetValue( | ||
| 1185 | - value=value, | ||
| 1186 | - label=str(value), | ||
| 1187 | - count=count, | ||
| 1188 | - selected=value in selected_values | ||
| 1189 | - )) | ||
| 1190 | - | ||
| 1191 | - # 构建 FacetResult 对象 | ||
| 1192 | - facet_result = FacetResult( | ||
| 1193 | - field=field, | ||
| 1194 | - label=field, | ||
| 1195 | - type=facet_type, | ||
| 1196 | - values=facet_values | ||
| 1197 | - ) | ||
| 1198 | - | ||
| 1199 | - standardized_facets.append(facet_result) | ||
| 1200 | - | ||
| 1201 | - return standardized_facets if standardized_facets else None |
tests/test_rerank_client.py
| 1 | from math import isclose | 1 | from math import isclose |
| 2 | 2 | ||
| 3 | +from config.schema import RerankFusionConfig | ||
| 3 | from search.rerank_client import fuse_scores_and_resort | 4 | from search.rerank_client import fuse_scores_and_resort |
| 4 | 5 | ||
| 5 | 6 | ||
| @@ -88,3 +89,32 @@ def test_fuse_scores_and_resort_downweights_text_only_advantage(): | @@ -88,3 +89,32 @@ def test_fuse_scores_and_resort_downweights_text_only_advantage(): | ||
| 88 | fuse_scores_and_resort(hits, [0.72, 0.98]) | 89 | fuse_scores_and_resort(hits, [0.72, 0.98]) |
| 89 | 90 | ||
| 90 | assert [hit["_id"] for hit in hits] == ["rerank-better", "lexical-heavy"] | 91 | assert [hit["_id"] for hit in hits] == ["rerank-better", "lexical-heavy"] |
| 92 | + | ||
| 93 | + | ||
| 94 | +def test_fuse_scores_and_resort_uses_configurable_fusion_params(): | ||
| 95 | + hits = [ | ||
| 96 | + { | ||
| 97 | + "_id": "a", | ||
| 98 | + "_score": 1.0, | ||
| 99 | + "matched_queries": {"base_query": 2.0, "knn_query": 0.5}, | ||
| 100 | + }, | ||
| 101 | + { | ||
| 102 | + "_id": "b", | ||
| 103 | + "_score": 1.0, | ||
| 104 | + "matched_queries": {"base_query": 3.0, "knn_query": 0.0}, | ||
| 105 | + }, | ||
| 106 | + ] | ||
| 107 | + fusion = RerankFusionConfig( | ||
| 108 | + rerank_bias=0.0, | ||
| 109 | + rerank_exponent=1.0, | ||
| 110 | + text_bias=0.0, | ||
| 111 | + text_exponent=1.0, | ||
| 112 | + knn_bias=0.0, | ||
| 113 | + knn_exponent=1.0, | ||
| 114 | + ) | ||
| 115 | + fuse_scores_and_resort(hits, [1.0, 1.0], fusion=fusion) | ||
| 116 | + # b 的 knn 为 0 -> 融合为 0;a 为 1 * 2 * 0.5 | ||
| 117 | + assert [h["_id"] for h in hits] == ["a", "b"] | ||
| 118 | + by_id = {h["_id"]: h for h in hits} | ||
| 119 | + assert isclose(by_id["a"]["_fused_score"], 1.0, rel_tol=1e-9) | ||
| 120 | + assert isclose(by_id["b"]["_fused_score"], 0.0, rel_tol=1e-9) |
tests/test_search_rerank_window.py
| @@ -614,7 +614,7 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc | @@ -614,7 +614,7 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc | ||
| 614 | assert result.results[0].image_url == "https://img/blue.jpg" | 614 | assert result.results[0].image_url == "https://img/blue.jpg" |
| 615 | 615 | ||
| 616 | 616 | ||
| 617 | -def test_searcher_debug_info_includes_es_positions_and_context(monkeypatch): | 617 | +def test_searcher_debug_info_uses_initial_es_max_score_for_normalization(monkeypatch): |
| 618 | es_client = _FakeESClient(total_hits=3) | 618 | es_client = _FakeESClient(total_hits=3) |
| 619 | searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) | 619 | searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) |
| 620 | context = create_request_context(reqid="dbg", uid="u-dbg") | 620 | context = create_request_context(reqid="dbg", uid="u-dbg") |
| @@ -635,10 +635,10 @@ def test_searcher_debug_info_includes_es_positions_and_context(monkeypatch): | @@ -635,10 +635,10 @@ def test_searcher_debug_info_includes_es_positions_and_context(monkeypatch): | ||
| 635 | ) | 635 | ) |
| 636 | 636 | ||
| 637 | assert result.debug_info["query_analysis"]["index_languages"] == ["en", "zh"] | 637 | assert result.debug_info["query_analysis"]["index_languages"] == ["en", "zh"] |
| 638 | + assert result.debug_info["query_analysis"]["query_tokens"] == [] | ||
| 638 | assert result.debug_info["es_query_context"]["es_fetch_size"] == 2 | 639 | assert result.debug_info["es_query_context"]["es_fetch_size"] == 2 |
| 639 | - assert result.debug_info["es_response"]["initial_es_max_score"] == 3.0 | ||
| 640 | - assert result.debug_info["es_response"]["initial_es_min_score"] == 2.0 | 640 | + assert result.debug_info["es_response"]["es_score_normalization_factor"] == 3.0 |
| 641 | assert result.debug_info["per_result"][0]["initial_rank"] == 1 | 641 | assert result.debug_info["per_result"][0]["initial_rank"] == 1 |
| 642 | assert result.debug_info["per_result"][0]["final_rank"] == 1 | 642 | assert result.debug_info["per_result"][0]["final_rank"] == 1 |
| 643 | assert result.debug_info["per_result"][0]["es_score_normalized"] == 1.0 | 643 | assert result.debug_info["per_result"][0]["es_score_normalized"] == 1.0 |
| 644 | - assert result.debug_info["per_result"][1]["es_score_norm"] == 0.0 | 644 | + assert result.debug_info["per_result"][1]["es_score_normalized"] == 2.0 / 3.0 |