diff --git a/config/__init__.py b/config/__init__.py index 650f116..87aec28 100644 --- a/config/__init__.py +++ b/config/__init__.py @@ -8,6 +8,7 @@ from config.schema import ( IndexConfig, QueryConfig, RerankConfig, + RerankFusionConfig, SPUConfig, SearchConfig, ServicesConfig, @@ -36,6 +37,7 @@ __all__ = [ "IndexConfig", "QueryConfig", "RerankConfig", + "RerankFusionConfig", "SPUConfig", "SearchConfig", "ServicesConfig", diff --git a/config/config.yaml b/config/config.yaml index 5335ebc..dd92457 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -219,6 +219,14 @@ rerank: weight_ai: 0.6 rerank_query_template: "{query}" rerank_doc_template: "{title}" + # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项) + fusion: + rerank_bias: 0.00001 + rerank_exponent: 1.0 + text_bias: 0.1 + text_exponent: 0.35 + knn_bias: 0.6 + knn_exponent: 0.2 # 可扩展服务/provider 注册表(单一配置源) services: diff --git a/config/loader.py b/config/loader.py index 584a37d..3b36e67 100644 --- a/config/loader.py +++ b/config/loader.py @@ -37,6 +37,7 @@ from config.schema import ( ProductEnrichConfig, RedisSettings, RerankConfig, + RerankFusionConfig, RerankServiceConfig, RuntimeConfig, SearchConfig, @@ -393,6 +394,7 @@ class AppConfigLoader: function_score_cfg = raw.get("function_score") if isinstance(raw.get("function_score"), dict) else {} rerank_cfg = raw.get("rerank") if isinstance(raw.get("rerank"), dict) else {} + fusion_raw = rerank_cfg.get("fusion") if isinstance(rerank_cfg.get("fusion"), dict) else {} spu_cfg = raw.get("spu_config") if isinstance(raw.get("spu_config"), dict) else {} return SearchConfig( @@ -412,6 +414,14 @@ class AppConfigLoader: weight_ai=float(rerank_cfg.get("weight_ai", 0.6)), rerank_query_template=str(rerank_cfg.get("rerank_query_template") or "{query}"), rerank_doc_template=str(rerank_cfg.get("rerank_doc_template") or "{title}"), + fusion=RerankFusionConfig( + rerank_bias=float(fusion_raw.get("rerank_bias", 0.00001)), + rerank_exponent=float(fusion_raw.get("rerank_exponent", 1.0)), + text_bias=float(fusion_raw.get("text_bias", 0.1)), + text_exponent=float(fusion_raw.get("text_exponent", 0.35)), + knn_bias=float(fusion_raw.get("knn_bias", 0.6)), + knn_exponent=float(fusion_raw.get("knn_exponent", 0.2)), + ), ), spu_config=SPUConfig( enabled=bool(spu_cfg.get("enabled", False)), diff --git a/config/schema.py b/config/schema.py index 690c2b1..f28329f 100644 --- a/config/schema.py +++ b/config/schema.py @@ -91,6 +91,21 @@ class FunctionScoreConfig: @dataclass(frozen=True) +class RerankFusionConfig: + """ + Multiplicative fusion: fused = Π (max(score_i, 0) + bias_i) ** exponent_i + for rerank / text / knn terms respectively. + """ + + rerank_bias: float = 0.00001 + rerank_exponent: float = 1.0 + text_bias: float = 0.1 + text_exponent: float = 0.35 + knn_bias: float = 0.6 + knn_exponent: float = 0.2 + + +@dataclass(frozen=True) class RerankConfig: """Search-time rerank configuration.""" @@ -101,6 +116,7 @@ class RerankConfig: weight_ai: float = 0.6 rerank_query_template: str = "{query}" rerank_doc_template: str = "{title}" + fusion: RerankFusionConfig = field(default_factory=RerankFusionConfig) @dataclass(frozen=True) diff --git a/frontend/static/js/app.js b/frontend/static/js/app.js index 5466315..a8a0630 100644 --- a/frontend/static/js/app.js +++ b/frontend/static/js/app.js @@ -411,11 +411,6 @@ function displayResults(data) { const esNorm = typeof debug.es_score_normalized === 'number' ? debug.es_score_normalized.toFixed(4) : (debug.es_score_normalized == null ? '' : String(debug.es_score_normalized)); - - const esNormMinMax = typeof debug.es_score_norm === 'number' - ? debug.es_score_norm.toFixed(4) - : (debug.es_score_norm == null ? '' : String(debug.es_score_norm)); - const rerankScore = typeof debug.rerank_score === 'number' ? debug.rerank_score.toFixed(4) : (debug.rerank_score == null ? '' : String(debug.rerank_score)); @@ -437,13 +432,28 @@ function displayResults(data) { const resultJson = customStringify(result); const rawUrl = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`; const rerankInputHtml = debug.rerank_input - ? `
Rerank input
${escapeHtml(customStringify(debug.rerank_input))}
` + ? ` +
+ Rerank input +
${escapeHtml(customStringify(debug.rerank_input))}
+
+ ` : ''; const styleIntentHtml = debug.style_intent_sku - ? `
Selected SKU
${escapeHtml(customStringify(debug.style_intent_sku))}
` + ? ` +
+ Selected SKU +
${escapeHtml(customStringify(debug.style_intent_sku))}
+
+ ` : ''; const matchedQueriesHtml = debug.matched_queries - ? `
matched_queries
${escapeHtml(customStringify(debug.matched_queries))}
` + ? ` +
+ matched_queries +
${escapeHtml(customStringify(debug.matched_queries))}
+
+ ` : ''; debugHtml = ` @@ -453,9 +463,7 @@ function displayResults(data) {
Position before rerank: ${escapeHtml(String(debug.initial_rank ?? ''))}
Position after rerank: ${escapeHtml(String(debug.final_rank ?? ''))}
ES score: ${esScore}
-
ES normalized (score / initial ES max): ${esNorm}
-
ES norm (min-max over initial ES window): ${esNormMinMax}
-
ES score min/max: ${escapeHtml(String(debug.es_score_min ?? ''))} / ${escapeHtml(String(debug.es_score_max ?? ''))}
+
ES normalized: ${esNorm}
Rerank score: ${rerankScore}
rerank_factor: ${escapeHtml(String(debug.rerank_factor ?? ''))}
text_score: ${escapeHtml(String(debug.text_score ?? ''))}
@@ -910,9 +918,6 @@ function displayDebugInfo(data) { html += `
query_tokens: ${escapeHtml((debugInfo.query_analysis.query_tokens || []).join(', ') || 'N/A')}
`; html += `
domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}
`; html += `
is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}
`; - if (debugInfo.query_analysis.query_vector_summary) { - html += `
query_vector_summary: ${escapeHtml(customStringify(debugInfo.query_analysis.query_vector_summary))}
`; - } if (debugInfo.query_analysis.translations && Object.keys(debugInfo.query_analysis.translations).length > 0) { html += '
translations: '; @@ -946,25 +951,21 @@ function displayDebugInfo(data) { html += `
took_ms: ${debugInfo.es_response.took_ms}ms
`; html += `
total_hits: ${debugInfo.es_response.total_hits}
`; html += `
max_score: ${debugInfo.es_response.max_score?.toFixed(3) || 0}
`; - html += `
initial_es_max_score: ${escapeHtml(String(debugInfo.es_response.initial_es_max_score ?? ''))}
`; - html += `
initial_es_min_score: ${escapeHtml(String(debugInfo.es_response.initial_es_min_score ?? ''))}
`; + html += `
es_score_normalization_factor: ${escapeHtml(String(debugInfo.es_response.es_score_normalization_factor ?? ''))}
`; html += '
'; } if (debugInfo.rerank) { html += '
Rerank:'; - html += `
requested: ${debugInfo.rerank.requested ? 'yes' : 'no'}
`; - html += `
executed: ${debugInfo.rerank.executed ? 'yes' : 'no'}
`; - html += `
in_rerank_window: ${debugInfo.rerank.in_rerank_window ? 'yes' : 'no'}
`; - html += `
top_n: ${escapeHtml(String(debugInfo.rerank.top_n ?? ''))}
`; html += `
query_template: ${escapeHtml(debugInfo.rerank.query_template || 'N/A')}
`; html += `
doc_template: ${escapeHtml(debugInfo.rerank.doc_template || 'N/A')}
`; - html += '
'; - } - - if (debugInfo.page_fill) { - html += '
Page Fill:'; - html += `
${escapeHtml(customStringify(debugInfo.page_fill))}
`; + html += `
query_text: ${escapeHtml(debugInfo.rerank.query_text || 'N/A')}
`; + html += `
docs: ${escapeHtml(String(debugInfo.rerank.docs ?? ''))}
`; + html += `
top_n: ${escapeHtml(String(debugInfo.rerank.top_n ?? ''))}
`; + if (debugInfo.rerank.fusion) { + html += '
fusion:
'; + html += `
${escapeHtml(customStringify(debugInfo.rerank.fusion))}
`; + } html += '
'; } @@ -992,7 +993,7 @@ function displayDebugInfo(data) { if (debugInfo.es_query_context) { html += '
ES Query Context:'; - html += `
${escapeHtml(customStringify(debugInfo.es_query_context))}
`; + html += `
${escapeHtml(customStringify(debugInfo.es_query_context))}
`; html += '
'; } diff --git a/search/rerank_client.py b/search/rerank_client.py index ebddeb0..a5b7791 100644 --- a/search/rerank_client.py +++ b/search/rerank_client.py @@ -10,6 +10,7 @@ from typing import Dict, Any, List, Optional, Tuple import logging +from config.schema import RerankFusionConfig from providers import create_rerank_provider logger = logging.getLogger(__name__) @@ -176,17 +177,34 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa } +def _multiply_fusion_factors( + rerank_score: float, + text_score: float, + knn_score: float, + fusion: RerankFusionConfig, +) -> Tuple[float, float, float, float]: + """(rerank_factor, text_factor, knn_factor, fused).""" + r = (max(rerank_score, 0.0) + fusion.rerank_bias) ** fusion.rerank_exponent + t = (max(text_score, 0.0) + fusion.text_bias) ** fusion.text_exponent + k = (max(knn_score, 0.0) + fusion.knn_bias) ** fusion.knn_exponent + return r, t, k, r * t * k + + def fuse_scores_and_resort( es_hits: List[Dict[str, Any]], rerank_scores: List[float], weight_es: float = DEFAULT_WEIGHT_ES, weight_ai: float = DEFAULT_WEIGHT_AI, + fusion: Optional[RerankFusionConfig] = None, debug: bool = False, rerank_debug_rows: Optional[List[Dict[str, Any]]] = None, ) -> List[Dict[str, Any]]: """ 将 ES 分数与重排分数按乘法公式融合(不修改原始 _score),并按融合分数降序重排。 + 融合形式(由 ``fusion`` 配置 bias / exponent):: + fused = (max(rerank,0)+b_r)^e_r * (max(text,0)+b_t)^e_t * (max(knn,0)+b_k)^e_k + 对每条 hit 会写入: - _original_score: 原始 ES 分数 - _rerank_score: 重排服务返回的分数 @@ -199,40 +217,35 @@ def fuse_scores_and_resort( rerank_scores: 与 es_hits 等长的重排分数列表 weight_es: 兼容保留,当前未使用 weight_ai: 兼容保留,当前未使用 - - Returns: - 每条文档的融合调试信息列表,用于 debug_info """ n = len(es_hits) if n == 0 or len(rerank_scores) != n: return [] - fused_debug: List[Dict[str, Any]] = [] + f = fusion or RerankFusionConfig() + fused_debug: List[Dict[str, Any]] = [] if debug else [] for idx, hit in enumerate(es_hits): es_score = _to_score(hit.get("_score")) - - ai_score_raw = rerank_scores[idx] - rerank_score = _to_score(ai_score_raw) - + rerank_score = _to_score(rerank_scores[idx]) matched_queries = hit.get("matched_queries") knn_score = _extract_named_query_score(matched_queries, "knn_query") text_components = _collect_text_score_components(matched_queries, es_score) text_score = text_components["text_score"] - rerank_factor = max(rerank_score, 0.0) + 0.00001 - text_factor = (max(text_score, 0.0) + 0.1) ** 0.35 - knn_factor = (max(knn_score, 0.0) + 0.6) ** 0.2 - fused = rerank_factor * text_factor * knn_factor + rerank_factor, text_factor, knn_factor, fused = _multiply_fusion_factors( + rerank_score, text_score, knn_score, f + ) hit["_original_score"] = hit.get("_score") hit["_rerank_score"] = rerank_score hit["_text_score"] = text_score hit["_knn_score"] = knn_score - hit["_text_source_score"] = text_components["source_score"] - hit["_text_translation_score"] = text_components["translation_score"] - hit["_text_primary_score"] = text_components["primary_text_score"] - hit["_text_support_score"] = text_components["support_text_score"] hit["_fused_score"] = fused + if debug: + hit["_text_source_score"] = text_components["source_score"] + hit["_text_translation_score"] = text_components["translation_score"] + hit["_text_primary_score"] = text_components["primary_text_score"] + hit["_text_support_score"] = text_components["support_text_score"] if debug: debug_entry = { @@ -262,7 +275,6 @@ def fuse_scores_and_resort( debug_entry["rerank_input"] = rerank_debug_rows[idx] fused_debug.append(debug_entry) - # 按融合分数降序重排 es_hits.sort( key=lambda h: h.get("_fused_score", h.get("_score", 0.0)), reverse=True, @@ -281,6 +293,7 @@ def run_rerank( rerank_doc_template: str = "{title}", top_n: Optional[int] = None, debug: bool = False, + fusion: Optional[RerankFusionConfig] = None, ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]], List[Dict[str, Any]]]: """ 完整重排流程:从 es_response 取 hits -> 构造 docs -> 调服务 -> 融合分数并重排 -> 更新 max_score。 @@ -314,6 +327,7 @@ def run_rerank( scores, weight_es=weight_es, weight_ai=weight_ai, + fusion=fusion, debug=debug, rerank_debug_rows=rerank_debug_rows, ) diff --git a/search/searcher.py b/search/searcher.py index 7654a9d..e3e7138 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -4,9 +4,8 @@ Main Searcher module - executes search queries against Elasticsearch. Handles query parsing, ranking, and result formatting. """ -from typing import Dict, Any, List, Optional, Union, Tuple -import os -import time, json +from typing import Dict, Any, List, Optional +import json import logging import hashlib from string import Formatter @@ -20,7 +19,7 @@ from .sku_intent_selector import SkuSelectionDecision, StyleSkuSelector from config import SearchConfig from config.tenant_config_loader import get_tenant_config_loader from context.request_context import RequestContext, RequestContextStage -from api.models import FacetResult, FacetValue, FacetConfig +from api.models import FacetResult, FacetConfig from api.result_formatter import ResultFormatter from indexer.mapping_generator import get_tenant_index_name @@ -259,13 +258,7 @@ class Searcher: if context is not None: context.start_stage(RequestContextStage.STYLE_SKU_PREPARE_HITS) try: - decisions = self.style_sku_selector.prepare_hits(es_hits, parsed_query) - if decisions and context is not None: - context.store_intermediate_result( - "style_intent_sku_decisions", - {doc_id: decision.to_dict() for doc_id, decision in decisions.items()}, - ) - return decisions + return self.style_sku_selector.prepare_hits(es_hits, parsed_query) finally: if context is not None: context.end_stage(RequestContextStage.STYLE_SKU_PREPARE_HITS) @@ -339,21 +332,10 @@ class Searcher: in_rerank_window = do_rerank and (from_ + size) <= rerank_window es_fetch_from = 0 if in_rerank_window else from_ es_fetch_size = rerank_window if in_rerank_window else size - initial_es_positions: Dict[str, int] = {} - initial_es_min_score: Optional[float] = None - initial_es_max_score: Optional[float] = None - page_fill_debug: Optional[Dict[str, Any]] = None + + es_score_normalization_factor: Optional[float] = None + initial_ranks_by_doc: Dict[str, int] = {} rerank_debug_info: Optional[Dict[str, Any]] = None - if debug: - rerank_debug_info = { - "requested": do_rerank, - "executed": False, - "in_rerank_window": in_rerank_window, - "rerank_window": rerank_window, - "top_n": from_ + size, - "query_template": effective_query_template, - "doc_template": effective_doc_template, - } # Start timing context.start_stage(RequestContextStage.TOTAL) @@ -402,8 +384,8 @@ class Searcher: try: parsed_query = self.query_parser.parse( query, - tenant_id=tenant_id, generate_vector=enable_embedding, + tenant_id=tenant_id, context=context, target_languages=index_langs if enable_translation else [], ) @@ -493,8 +475,6 @@ class Searcher: context.store_intermediate_result('es_query', es_query) if in_rerank_window and rerank_prefetch_source is not None: context.store_intermediate_result('es_query_rerank_prefetch_source', rerank_prefetch_source) - context.store_intermediate_result('es_body_for_search', body_for_es) - # Serialize ES query to compute a compact size + stable digest for correlation es_query_compact = json.dumps(es_query_for_fetch, ensure_ascii=False, separators=(",", ":")) es_query_digest = hashlib.sha256(es_query_compact.encode("utf-8")).hexdigest()[:16] @@ -548,35 +528,29 @@ class Searcher: # Store ES response in context context.store_intermediate_result('es_response', es_response) if debug: - initial_hits = es_response.get('hits', {}).get('hits') or [] - initial_scores: List[float] = [] + initial_hits = es_response.get("hits", {}).get("hits") or [] for rank, hit in enumerate(initial_hits, 1): doc_id = hit.get("_id") if doc_id is not None: - initial_es_positions[str(doc_id)] = rank - raw_score = hit.get("_score") - try: - if raw_score is not None: - initial_scores.append(float(raw_score)) - except (TypeError, ValueError): - pass - raw_max_score = es_response.get('hits', {}).get('max_score') + initial_ranks_by_doc[str(doc_id)] = rank + raw_initial_max_score = es_response.get("hits", {}).get("max_score") try: - initial_es_max_score = float(raw_max_score) if raw_max_score is not None else None + es_score_normalization_factor = float(raw_initial_max_score) if raw_initial_max_score is not None else None except (TypeError, ValueError): - initial_es_max_score = None - if initial_es_max_score is None and initial_scores: - initial_es_max_score = max(initial_scores) - initial_es_min_score = min(initial_scores) if initial_scores else None + es_score_normalization_factor = None + if es_score_normalization_factor is None and initial_hits: + first_score = initial_hits[0].get("_score") + try: + es_score_normalization_factor = float(first_score) if first_score is not None else None + except (TypeError, ValueError): + es_score_normalization_factor = None # Extract timing from ES response es_took = es_response.get('took', 0) context.logger.info( f"ES搜索完成 | 耗时: {es_took}ms | " f"命中数: {es_response.get('hits', {}).get('total', {}).get('value', 0)} | " - f"最高分: {(es_response.get('hits', {}).get('max_score') or 0):.3f} | " - f"detected_language={parsed_query.detected_language} | " - f"translations={list((parsed_query.translations or {}).keys())}", + f"最高分: {(es_response.get('hits', {}).get('max_score') or 0):.3f}", extra={'reqid': context.reqid, 'uid': context.uid} ) except Exception as e: @@ -621,30 +595,31 @@ class Searcher: rerank_doc_template=effective_doc_template, top_n=(from_ + size), debug=debug, + fusion=rc.fusion, ) if rerank_meta is not None: - from config.services_config import get_rerank_service_url - rerank_url = get_rerank_service_url() - if debug and rerank_debug_info is not None: - rerank_debug_info.update({ - "executed": True, - "service_url": rerank_url, + if debug: + from dataclasses import asdict + from config.services_config import get_rerank_service_url + rerank_debug_info = { + "service_url": get_rerank_service_url(), + "query_template": effective_query_template, + "doc_template": effective_doc_template, "query_text": str(effective_query_template).format_map({"query": rerank_query}), "docs": len(es_response.get("hits", {}).get("hits") or []), + "top_n": from_ + size, "meta": rerank_meta, - }) + "fusion": asdict(rc.fusion), + } context.store_intermediate_result("rerank_scores", fused_debug) context.logger.info( f"重排完成 | docs={len(es_response.get('hits', {}).get('hits') or [])} | " - f"top_n={from_ + size} | query_template={effective_query_template} | " - f"doc_template={effective_doc_template} | meta={rerank_meta}", + f"top_n={from_ + size} | meta={rerank_meta}", extra={'reqid': context.reqid, 'uid': context.uid} ) except Exception as e: context.add_warning(f"Rerank failed: {e}") - if debug and rerank_debug_info is not None: - rerank_debug_info["error"] = str(e) context.logger.warning( f"调用重排服务失败 | error: {e}", extra={'reqid': context.reqid, 'uid': context.uid}, @@ -707,13 +682,6 @@ class Searcher: ) if fill_took: es_response["took"] = int((es_response.get("took", 0) or 0) + fill_took) - if debug: - page_fill_debug = { - "requested_ids": page_ids, - "filled": filled, - "fill_took_ms": fill_took, - "response_source_spec": response_source_spec, - } context.logger.info( f"分页详情回填 | ids={len(page_ids)} | filled={filled} | took={fill_took}ms", extra={'reqid': context.reqid, 'uid': context.uid} @@ -781,8 +749,8 @@ class Searcher: # Build per-result debug info (per SPU) when debug mode is enabled per_result_debug = [] if debug and es_hits and formatted_results: - final_positions_by_doc = { - str(hit.get("_id")): (from_ + rank) + final_ranks_by_doc = { + str(hit.get("_id")): from_ + rank for rank, hit in enumerate(es_hits, 1) if hit.get("_id") is not None } @@ -805,28 +773,11 @@ class Searcher: es_score = 0.0 try: normalized = ( - float(es_score) / float(initial_es_max_score) - if initial_es_max_score - else None + float(es_score) / float(es_score_normalization_factor) + if es_score_normalization_factor else None ) except (TypeError, ValueError, ZeroDivisionError): normalized = None - try: - es_score_norm = ( - (float(es_score) - float(initial_es_min_score)) - / (float(initial_es_max_score) - float(initial_es_min_score)) - if initial_es_min_score is not None - and initial_es_max_score is not None - and float(initial_es_max_score) > float(initial_es_min_score) - else ( - 1.0 - if initial_es_min_score is not None - and initial_es_max_score is not None - else None - ) - ) - except (TypeError, ValueError, ZeroDivisionError): - es_score_norm = None title_multilingual = source.get("title") if isinstance(source.get("title"), dict) else None brief_multilingual = source.get("brief") if isinstance(source.get("brief"), dict) else None @@ -836,11 +787,8 @@ class Searcher: "spu_id": spu.spu_id, "es_score": es_score, "es_score_normalized": normalized, - "es_score_norm": es_score_norm, - "es_score_min": initial_es_min_score, - "es_score_max": initial_es_max_score, - "initial_rank": initial_es_positions.get(str(doc_id)) if doc_id is not None else None, - "final_rank": final_positions_by_doc.get(str(doc_id)) if doc_id is not None else None, + "initial_rank": initial_ranks_by_doc.get(str(doc_id)) if doc_id is not None else None, + "final_rank": final_ranks_by_doc.get(str(doc_id)) if doc_id is not None else None, "title_multilingual": title_multilingual, "brief_multilingual": brief_multilingual, "vendor_multilingual": vendor_multilingual, @@ -908,12 +856,6 @@ class Searcher: # Collect debug information if requested debug_info = None if debug: - query_vector_summary = None - if parsed_query.query_vector is not None: - query_vector_summary = { - "dims": int(len(parsed_query.query_vector)), - "preview": [round(float(v), 6) for v in parsed_query.query_vector[:8].tolist()], - } debug_info = { "query_analysis": { "original_query": context.query_analysis.original_query, @@ -923,7 +865,6 @@ class Searcher: "index_languages": index_langs, "translations": context.query_analysis.translations, "has_vector": context.query_analysis.query_vector is not None, - "query_vector_summary": query_vector_summary, "query_tokens": getattr(parsed_query, "query_tokens", []), "is_simple_query": context.query_analysis.is_simple_query, "domain": context.query_analysis.domain, @@ -931,12 +872,6 @@ class Searcher: }, "es_query": context.get_intermediate_result('es_query', {}), "es_query_context": { - "filters": filters, - "range_filters": range_filters, - "facets": [getattr(facet, "field", str(facet)) for facet in facets] if facets else [], - "sort_by": sort_by, - "sort_order": sort_order, - "min_score": min_score, "es_fetch_from": es_fetch_from, "es_fetch_size": es_fetch_size, "in_rerank_window": in_rerank_window, @@ -948,11 +883,9 @@ class Searcher: "total_hits": total_value, "max_score": max_score, "shards": es_response.get('_shards', {}), - "initial_es_max_score": initial_es_max_score, - "initial_es_min_score": initial_es_min_score, + "es_score_normalization_factor": es_score_normalization_factor, }, "rerank": rerank_debug_info, - "page_fill": page_fill_debug, "feature_flags": context.metadata.get('feature_flags', {}), "stage_timings": { k: round(v, 2) for k, v in context.performance_metrics.stage_timings.items() @@ -1126,76 +1059,3 @@ class Searcher: logger.error(f"Failed to get document {doc_id} from tenant {tenant_id}: {e}", exc_info=True) return None - def _standardize_facets( - self, - es_aggregations: Dict[str, Any], - facet_configs: Optional[List[Union[str, Any]]], - current_filters: Optional[Dict[str, Any]] - ) -> Optional[List[FacetResult]]: - """ - 将 ES 聚合结果转换为标准化的分面格式(返回 Pydantic 模型)。 - - Args: - es_aggregations: ES 原始聚合结果 - facet_configs: 分面配置列表(str 或 FacetConfig) - current_filters: 当前应用的过滤器 - - Returns: - 标准化的分面结果列表(FacetResult 对象) - """ - if not es_aggregations or not facet_configs: - return None - - standardized_facets: List[FacetResult] = [] - - for config in facet_configs: - # 解析配置 - if isinstance(config, str): - field = config - facet_type = "terms" - else: - # FacetConfig 对象 - field = config.field - facet_type = config.type - - agg_name = f"{field}_facet" - - if agg_name not in es_aggregations: - continue - - agg_result = es_aggregations[agg_name] - - # 获取当前字段的选中值 - selected_values = set() - if current_filters and field in current_filters: - filter_value = current_filters[field] - if isinstance(filter_value, list): - selected_values = set(filter_value) - else: - selected_values = {filter_value} - - # 转换 buckets 为 FacetValue 对象 - facet_values: List[FacetValue] = [] - if 'buckets' in agg_result: - for bucket in agg_result['buckets']: - value = bucket.get('key') - count = bucket.get('doc_count', 0) - - facet_values.append(FacetValue( - value=value, - label=str(value), - count=count, - selected=value in selected_values - )) - - # 构建 FacetResult 对象 - facet_result = FacetResult( - field=field, - label=field, - type=facet_type, - values=facet_values - ) - - standardized_facets.append(facet_result) - - return standardized_facets if standardized_facets else None diff --git a/tests/test_rerank_client.py b/tests/test_rerank_client.py index bfca160..683606b 100644 --- a/tests/test_rerank_client.py +++ b/tests/test_rerank_client.py @@ -1,5 +1,6 @@ from math import isclose +from config.schema import RerankFusionConfig from search.rerank_client import fuse_scores_and_resort @@ -88,3 +89,32 @@ def test_fuse_scores_and_resort_downweights_text_only_advantage(): fuse_scores_and_resort(hits, [0.72, 0.98]) assert [hit["_id"] for hit in hits] == ["rerank-better", "lexical-heavy"] + + +def test_fuse_scores_and_resort_uses_configurable_fusion_params(): + hits = [ + { + "_id": "a", + "_score": 1.0, + "matched_queries": {"base_query": 2.0, "knn_query": 0.5}, + }, + { + "_id": "b", + "_score": 1.0, + "matched_queries": {"base_query": 3.0, "knn_query": 0.0}, + }, + ] + fusion = RerankFusionConfig( + rerank_bias=0.0, + rerank_exponent=1.0, + text_bias=0.0, + text_exponent=1.0, + knn_bias=0.0, + knn_exponent=1.0, + ) + fuse_scores_and_resort(hits, [1.0, 1.0], fusion=fusion) + # b 的 knn 为 0 -> 融合为 0;a 为 1 * 2 * 0.5 + assert [h["_id"] for h in hits] == ["a", "b"] + by_id = {h["_id"]: h for h in hits} + assert isclose(by_id["a"]["_fused_score"], 1.0, rel_tol=1e-9) + assert isclose(by_id["b"]["_fused_score"], 0.0, rel_tol=1e-9) diff --git a/tests/test_search_rerank_window.py b/tests/test_search_rerank_window.py index 1bff506..cfb798e 100644 --- a/tests/test_search_rerank_window.py +++ b/tests/test_search_rerank_window.py @@ -614,7 +614,7 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc assert result.results[0].image_url == "https://img/blue.jpg" -def test_searcher_debug_info_includes_es_positions_and_context(monkeypatch): +def test_searcher_debug_info_uses_initial_es_max_score_for_normalization(monkeypatch): es_client = _FakeESClient(total_hits=3) searcher = _build_searcher(_build_search_config(rerank_enabled=False), es_client) context = create_request_context(reqid="dbg", uid="u-dbg") @@ -635,10 +635,10 @@ def test_searcher_debug_info_includes_es_positions_and_context(monkeypatch): ) assert result.debug_info["query_analysis"]["index_languages"] == ["en", "zh"] + assert result.debug_info["query_analysis"]["query_tokens"] == [] assert result.debug_info["es_query_context"]["es_fetch_size"] == 2 - assert result.debug_info["es_response"]["initial_es_max_score"] == 3.0 - assert result.debug_info["es_response"]["initial_es_min_score"] == 2.0 + assert result.debug_info["es_response"]["es_score_normalization_factor"] == 3.0 assert result.debug_info["per_result"][0]["initial_rank"] == 1 assert result.debug_info["per_result"][0]["final_rank"] == 1 assert result.debug_info["per_result"][0]["es_score_normalized"] == 1.0 - assert result.debug_info["per_result"][1]["es_score_norm"] == 0.0 + assert result.debug_info["per_result"][1]["es_score_normalized"] == 2.0 / 3.0 -- libgit2 0.21.2