From 9df421edc66838b08a6693b98830c4c974583f14 Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 1 Apr 2026 20:05:22 +0800 Subject: [PATCH] 基于eval框架开始调参 --- config/config.yaml | 10 +++++++--- config/loader.py | 4 ++++ config/schema.py | 9 +++++++-- docs/常用查询 - ES.md | 3 +++ docs/相关性检索优化说明.md | 336 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------- scripts/evaluation/eval_framework/framework.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------- search/rerank_client.py | 34 ++++++++++++++++++++++++++-------- search/searcher.py | 25 ++++++++++++++++++++++--- tests/test_rerank_client.py | 32 ++++++++++++++++++++++++++++++++ tests/test_search_rerank_window.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 543 insertions(+), 46 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 5c659a8..0b2f9c2 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -285,6 +285,8 @@ coarse_rank: input_window: 700 output_window: 240 fusion: + es_bias: 0.1 + es_exponent: 0.05 text_bias: 0.1 text_exponent: 0.35 # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合) @@ -294,7 +296,7 @@ coarse_rank: knn_image_weight: 1.0 knn_tie_breaker: 0.1 knn_bias: 0.6 - knn_exponent: 0.0 + knn_exponent: 0.2 # 精排配置(轻量 reranker) fine_rank: @@ -317,11 +319,13 @@ rerank: rerank_doc_template: '{title}' service_profile: default - # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项) + # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(es / rerank / fine / text / knn) # 其中 knn_score 先做一层 dis_max: # max(knn_text_weight * text_knn, knn_image_weight * image_knn) # + knn_tie_breaker * 另一侧较弱信号 fusion: + es_bias: 0.1 + es_exponent: 0.05 rerank_bias: 1.0e-05 rerank_exponent: 1.15 fine_bias: 1.0e-05 @@ -334,7 +338,7 @@ rerank: knn_image_weight: 1.0 knn_tie_breaker: 0.1 knn_bias: 0.6 - knn_exponent: 0.0 + knn_exponent: 0.2 # 可扩展服务/provider 注册表(单一配置源) services: diff --git a/config/loader.py b/config/loader.py index c2332c6..5306f8c 100644 --- a/config/loader.py +++ b/config/loader.py @@ -578,6 +578,8 @@ class AppConfigLoader: input_window=int(coarse_rank_cfg.get("input_window", 700)), output_window=int(coarse_rank_cfg.get("output_window", 240)), fusion=CoarseRankFusionConfig( + es_bias=float(coarse_fusion_raw.get("es_bias", 0.1)), + es_exponent=float(coarse_fusion_raw.get("es_exponent", 0.0)), text_bias=float(coarse_fusion_raw.get("text_bias", 0.1)), text_exponent=float(coarse_fusion_raw.get("text_exponent", 0.35)), knn_text_weight=float(coarse_fusion_raw.get("knn_text_weight", 1.0)), @@ -617,6 +619,8 @@ class AppConfigLoader: else None ), fusion=RerankFusionConfig( + es_bias=float(fusion_raw.get("es_bias", 0.1)), + es_exponent=float(fusion_raw.get("es_exponent", 0.0)), rerank_bias=float(fusion_raw.get("rerank_bias", 0.00001)), rerank_exponent=float(fusion_raw.get("rerank_exponent", 1.0)), text_bias=float(fusion_raw.get("text_bias", 0.1)), diff --git a/config/schema.py b/config/schema.py index 8cf0466..cbd4328 100644 --- a/config/schema.py +++ b/config/schema.py @@ -105,9 +105,11 @@ class FunctionScoreConfig: class RerankFusionConfig: """ Multiplicative fusion: fused = Π (max(score_i, 0) + bias_i) ** exponent_i - for rerank / text / knn terms respectively. + for es / rerank / fine / text / knn terms respectively. """ + es_bias: float = 0.1 + es_exponent: float = 0.0 rerank_bias: float = 0.00001 rerank_exponent: float = 1.0 text_bias: float = 0.1 @@ -127,10 +129,13 @@ class RerankFusionConfig: class CoarseRankFusionConfig: """ Multiplicative fusion without model score: - fused = (max(text, 0) + text_bias) ** text_exponent + fused = (max(es, 0) + es_bias) ** es_exponent + * (max(text, 0) + text_bias) ** text_exponent * (max(knn, 0) + knn_bias) ** knn_exponent """ + es_bias: float = 0.1 + es_exponent: float = 0.0 text_bias: float = 0.1 text_exponent: float = 0.35 knn_text_weight: float = 1.0 diff --git a/docs/常用查询 - ES.md b/docs/常用查询 - ES.md index ed3927d..a582a8c 100644 --- a/docs/常用查询 - ES.md +++ b/docs/常用查询 - ES.md @@ -651,6 +651,9 @@ GET /search_products_tenant_170/_search ## 检查字段是否存在 +GET search_products_tenant_163/_mapping +GET search_products_tenant_163/_field_caps?fields=* + ```bash curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \ 'http://localhost:9200/search_products_tenant_163/_count' \ diff --git a/docs/相关性检索优化说明.md b/docs/相关性检索优化说明.md index 29d158c..58629ba 100644 --- a/docs/相关性检索优化说明.md +++ b/docs/相关性检索优化说明.md @@ -155,7 +155,7 @@ 这种分层让 parser 不再返回 ES 专用的“语言计划字段”,职责边界更清晰。 -## 8. 融合打分(Rerank + Text + KNN) +## 8. 融合打分(ES + Text + KNN + Model) 当前融合逻辑位于 `search/rerank_client.py`。 @@ -180,27 +180,83 @@ 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。 -### 8.2 最终融合公式 +### 8.2 向量相关性大分 + +向量不是两路分别进入最终公式,而是**先融合成一个统一的 `knn_score`**。 + +当前实现位于 `search/rerank_client.py` 的 `_collect_knn_score_components()`: + +1. `text_knn_score = matched_queries["knn_query"]` +2. `image_knn_score = matched_queries["image_knn_query"]` +3. 分别乘权重: + - `weighted_text_knn_score = knn_text_weight * text_knn_score` + - `weighted_image_knn_score = knn_image_weight * image_knn_score` +4. 再做一层 dismax 融合: + - `primary_knn_score = max(weighted_text_knn_score, weighted_image_knn_score)` + - `support_knn_score = 另一侧较弱信号` + - `knn_score = primary_knn_score + knn_tie_breaker * support_knn_score` + +当前默认配置在 [config.yaml](/data/saas-search/config/config.yaml) 中是: + +- `knn_text_weight = 1.0` +- `knn_image_weight = 1.0` +- `knn_tie_breaker = 0.1` + +也就是说: + +- 现在确实是“文本 KNN + 图片 KNN 先融合成一项 `knn_score`” +- 但**图片权重目前并没有略高于文本权重** +- 当前两路权重是相等的,只是通过 dismax 机制保留“主路 + 辅助路” + +如果业务上希望 image 语义更主导,可以把 `knn_image_weight` 调成略高于 `knn_text_weight`,例如 `1.1 ~ 1.3` 这一类小幅领先值,再观察 query 分布与 bad case。 + +### 8.3 各阶段融合公式 ```python -fused_score = ( - (rerank_score + 0.00001) * - (text_score + 0.1) ** 0.35 * - (knn_score + 0.6) ** 0.2 +coarse_score = ( + (es_score + es_bias) ** es_exponent + * (text_score + text_bias) ** text_exponent + * (knn_score + knn_bias) ** knn_exponent +) + +fine_stage_score = ( + (es_score + es_bias) ** es_exponent + * (fine_score + fine_bias) ** fine_exponent + * (text_score + text_bias) ** text_exponent + * (knn_score + knn_bias) ** knn_exponent + * style_boost +) + +final_score = ( + (es_score + es_bias) ** es_exponent + * (rerank_score + rerank_bias) ** rerank_exponent + * (fine_score + fine_bias) ** fine_exponent # 仅当 fine rank 打开且有分数时参与 + * (text_score + text_bias) ** text_exponent + * (knn_score + knn_bias) ** knn_exponent + * style_boost ) ``` -设计意图: +当前默认配置下: + +- `coarse`: `es_exponent=0.05`, `text_exponent=0.35`, `knn_exponent=0.2` +- `fine/final`: `es_exponent=0.05`, `text_exponent=0.25`, `knn_exponent=0.2` +- `final`: 额外有 `rerank_exponent=1.15` + +设计意图可以概括成: -- `rerank_score` 是主导信号 -- `text_score` 保留乘法增益,但通过较低指数避免词法高分过度放大 -- `knn_score` 保持弱参与,只作为语义召回补充 +- `es_score` 不再只做 debug,而是作为全阶段都保留的弱先验 +- `text_score` 是稳定主干信号 +- `knn_score` 是统一的语义信号入口 +- `fine_score` / `rerank_score` 是越往后越贵、越强的模型因子 +- `style_boost` 只在命中已选 SKU 时乘上去 -### 8.3 调试字段 +### 8.4 调试字段 开启 `debug=true` 后,`debug_info.per_result` 会暴露: - `es_score` +- `es_factor` - `rerank_score` - `text_score` - `text_source_score` @@ -261,10 +317,10 @@ sleep 3 1. Query 解析 2. ES 召回 -3. 粗排:只用 ES 内部文本/KNN 信号 +3. 粗排:ES 原始总分 + 文本大分 + 统一 KNN 大分 4. 款式 SKU 选择 + title suffix -5. 精排:轻量 reranker + 文本/KNN 融合 -6. 最终 rerank:重 reranker + fine score + 文本/KNN 融合 +5. 精排:轻量 reranker + ES/text/KNN 融合 +6. 最终 rerank:重 reranker + fine score + ES/text/KNN 融合 7. 分页、补全字段、格式化返回 主控代码在 [searcher.py](/data/saas-search/search/searcher.py),打分与 rerank 细节在 [rerank_client.py](/data/saas-search/search/rerank_client.py),配置定义在 [schema.py](/data/saas-search/config/schema.py) 和 [config.yaml](/data/saas-search/config/config.yaml)。 @@ -339,7 +395,8 @@ KNN 部分在 [es_query_builder.py:250](/data/saas-search/search/es_query_builde **Step 4:粗排** 粗排入口在 [searcher.py:638](/data/saas-search/search/searcher.py#L638),真正的打分在 [rerank_client.py:348](/data/saas-search/search/rerank_client.py#L348) 的 `coarse_resort_hits()`。 -粗排只看两类信号: +粗排现在看三类信号: +- `es_score` - `text_score` - `knn_score` @@ -362,9 +419,13 @@ KNN 部分在 [es_query_builder.py:250](/data/saas-search/search/es_query_builde - 分别乘自己的 weight - 取强的一路做主路 - 弱的一路按 `knn_tie_breaker` 做辅助 +- 产出一个统一的 `knn_score` -然后粗排融合公式在 [rerank_client.py:334](/data/saas-search/search/rerank_client.py#L334): -- `coarse_score = (text_score + text_bias)^text_exponent * (knn_score + knn_bias)^knn_exponent` +然后粗排融合公式在 [rerank_client.py:346](/data/saas-search/search/rerank_client.py#L346): +- `coarse_score = es_factor * text_factor * knn_factor` +- `es_factor = (es_score + es_bias)^es_exponent` +- `text_factor = (text_score + text_bias)^text_exponent` +- `knn_factor = (knn_score + knn_bias)^knn_exponent` 配置定义在 [schema.py:124](/data/saas-search/config/schema.py#L124) 和 [config.yaml:231](/data/saas-search/config/config.yaml#L231)。 @@ -398,9 +459,10 @@ KNN 部分在 [es_query_builder.py:250](/data/saas-search/search/es_query_builde 3. 不再只按 `fine_score` 排,而是按融合后的 `_fine_fused_score` 排 精排融合公式现在是: -- `fine_stage_score = fine_factor * text_factor * knn_factor * style_boost` +- `fine_stage_score = es_factor * fine_factor * text_factor * knn_factor * style_boost` 具体公共计算在 [rerank_client.py:286](/data/saas-search/search/rerank_client.py#L286) 的 `_compute_multiplicative_fusion()`: +- `es_factor = (es_score + es_bias)^es_exponent` - `fine_factor = (fine_score + fine_bias)^fine_exponent` - `text_factor = (text_score + text_bias)^text_exponent` - `knn_factor = (knn_score + knn_bias)^knn_exponent` @@ -423,9 +485,10 @@ KNN 部分在 [es_query_builder.py:250](/data/saas-search/search/es_query_builde 它和 fine rank 很像,但多了一个更重的模型分 `rerank_score`。 最终公式是: -- `final_score = rerank_factor * fine_factor * text_factor * knn_factor * style_boost` +- `final_score = es_factor * rerank_factor * fine_factor * text_factor * knn_factor * style_boost` 也就是: +- ES 原始总分也会继续保留到最终阶段 - fine rank 产生的 `fine_score` 不会丢 - 到最终 rerank 时,它会继续作为一个乘法项参与最终融合 @@ -468,9 +531,10 @@ KNN 部分在 [es_query_builder.py:250](/data/saas-search/search/es_query_builde - `final_page` 其中: -- coarse stage 主要保留 text/translation/knn 的拆分信号 +- coarse stage 保留 es/text/translation/knn 的拆分信号 - fine/rerank stage 现在都保留 `fusion_inputs`、`fusion_factors`、`fusion_summary` - `fusion_summary` 来自真实计算过程本身,见 [rerank_client.py:265](/data/saas-search/search/rerank_client.py#L265) +- 当 `fine_rank` 关闭时,`rerank.rank_change` 会继承 `coarse_rank` 作为上游阶段,不会错误地全部显示为 0 这点很重要,因为现在“实际排序逻辑”和“debug 展示逻辑”是同源的,不是两套各写一份。 @@ -486,6 +550,238 @@ KNN 部分在 [es_query_builder.py:250](/data/saas-search/search/es_query_builde 如果你愿意,我下一步可以继续按“一个具体 query 的真实流转样例”来讲,比如假设用户搜 `black dress`,我把它从 `parsed_query`、ES named queries、coarse/fine/final 的每个分数怎么出来,完整手推一遍。 +## 12. 值得优先探索的相关性实验方向 + +下面这些方向按我对当前 rank 体系的判断,优先级大致是“先做低风险高收益,再做结构性升级”。 + +### 12.1 Query 分桶,而不是所有 query 共用一套融合参数 + +当前问题: + +- 所有 query 基本共用同一套 exponent / bias +- 但“强词法 query”、“泛类目 query”、“风格词 query”、“图搜触发 query”、“中英混输 query”的最优信号配比通常不同 + +建议实验: + +- 先做轻量 query 分桶: + - 精准实体词 + - 泛类目词 + - 风格/属性词 + - 中英混输 + - 带强图片语义的 query +- 每个桶单独调: + - `text_translation_weight` + - `knn_text_weight / knn_image_weight` + - `es_exponent / text_exponent / knn_exponent` + +为什么值得先做: + +- 不改主架构 +- 容易上线灰度 +- 往往比“全局调一个 exponent”稳定得多 + +### 12.2 把 image KNN 设成略高于 text KNN,但只在合适 query 上生效 + +当前问题: + +- 现在 `knn_text_weight = 1.0`,`knn_image_weight = 1.0` +- 对鞋、服饰款式、图案、轮廓类 query,image embedding 往往比 text embedding 更接近用户真实意图 +- 但不是所有 query 都适合直接全局抬高 image 权重 + +建议实验: + +- 离线先试: + - `knn_image_weight = 1.1 / 1.2 / 1.3` + - `knn_text_weight = 1.0` +- 再进一步试 query gating: + - 若 query 命中款式词、形状词、鞋包词、图案词,则抬高 image weight + - 若 query 是明确品类词或强属性词,则维持中性 + +为什么我不建议一上来全局大幅抬高: + +- 会把一些“文本很明确,但图像泛相似”的结果抬上来 +- 容易让高视觉相似、低语义准确的商品误冲前排 + +### 12.3 不只融合“分数”,还要融合“排名证据” + +当前问题: + +- 现在所有阶段都高度依赖 score 级别的乘法融合 +- 不同信号源的 score 标度未必天然可比 +- reranker 分数、ES score、named query score、KNN score 的数值空间差异很大 + +建议实验: + +- 增加 rank-based 特征: + - `es_rank` + - `text_rank` + - `knn_rank` + - `rerank_rank` +- 试两类简单方法: + - RRF(Reciprocal Rank Fusion) + - score-rank 混合:先做 rank 融合,再乘少量 score 因子 + +为什么值得做: + +- 对异常 score 分布更稳 +- 对模型偶发极端分更鲁棒 +- 很适合拿来做基线对照 + +### 12.4 将 `base_query` 和 `translation_query` 从“单点 max”升级为“更完整的 lexical 证据” + +当前问题: + +- 文本大分现在只抓: + - `base_query` + - `max(base_query_trans_*)` +- 这很干净,但可能过于压缩文本证据 +- phrase 命中、best_fields 命中、多语言字段命中、字段质量差异,没有更细粒度地进入后续 rank + +建议实验: + +- 把 lexical 证据拆得更细: + - exact / phrase + - best_fields + - title 命中 + - category 命中 + - brand/vendor 命中 +- 后续不一定都入主公式,但可以先做 debug / feature log + +这样做的收益: + +- 更容易解释“为什么这条词法上明明更准却没排上来” +- 为后续 learning-to-rank 或规则门控准备特征 + +### 12.5 增加“类目先验”和“商品类型约束” + +当前问题: + +- 现在体系更偏“文本/向量相似度驱动” +- 对“牛仔裤 vs 连裤袜”这种 bad case,问题常常不只是分数融合,而是**商品类型约束太弱** + +建议实验: + +- query 侧先做轻量商品类型识别: + - 裙子 + - 裤子 + - 上衣 + - 鞋 +- doc 侧取: + - category_path + - taxonomy leaf + - 类目 embedding / one-hot +- 然后试: + - 作为 hard filter 候选约束 + - 作为 coarse/final 的 boost 因子 + - 作为 rerank 输入字段增强 + +这是我认为对明显 bad case 最有价值的一类结构性修复。 + +### 12.6 把“负证据”纳入体系,而不只是累加正证据 + +当前问题: + +- 当前乘法体系主要是在积累正向因子 +- 但很多错误结果不是“正向不够强”,而是“存在明显负证据” +- 例如 query 是“半身裙”,doc 却强命中“上衣”“打底衫”“连裤袜” + +建议实验: + +- 抽取轻量负词特征: + - 商品类型冲突词 + - 性别/人群冲突词 + - 长度/版型冲突词 +- 方式可以先很简单: + - penalty factor + - blacklist term penalty + - query-doc type mismatch penalty + +这是当前体系里非常缺的一块。 + +### 12.7 把 KNN 从“单一总分”升级为“多语义子通道” + +当前问题: + +- 现在 KNN 最终会被压成一个 `knn_score` +- 这对工程简单很好,但损失了“这条向量信号到底为什么相似”的信息 + +建议实验: + +- 分通道记录和使用: + - text semantic similarity + - image appearance similarity + - category-aware similarity + - style-aware similarity +- 即使最终仍合成一个总分,也建议先保留分通道特征 + +这样未来才能回答: + +- 这条结果是“外观像” +- 还是“描述语义像” +- 还是“类目像但款式不对” + +### 12.8 从纯手工公式,逐步过渡到轻量 LTR + +当前问题: + +- 目前公式已经比较清晰,但本质还是手工 feature engineering + 手工 exponent +- 一旦信号变多,靠手调很难长期维护 + +建议实验: + +- 先不引入复杂在线模型 +- 先做离线 LTR baseline: + - LambdaMART / XGBoost ranker + - 输入现成特征: + - es_score + - text_score + - text_source_score + - translation_score + - text_knn_score + - image_knn_score + - coarse_rank + - rerank_score + - category match + - style intent match + +为什么这一步值得准备: + +- 你们现在的 debug 字段已经很接近 feature log 了 +- 其实已经具备往 LTR 过渡的土壤 + +### 12.9 先把评估体系补齐,再谈大改 + +当前问题: + +- 很多相关性讨论容易停留在个例 +- 但融合改动经常存在 query 分布层面的 tradeoff + +建议实验配套: + +- 建立 query slice 指标: + - 鞋靴 + - 裙装 + - 裤装 + - 中英混输 + - 图像语义强 query + - 属性词强 query +- 每次实验至少看: + - overall + - top 1 + - top 3 + - slice breakdown + - bad case 回归集 + +### 12.10 我对当前体系的几个核心判断 + +1. 当前体系最大的优点不是公式本身,而是已经把信号拆成了可解释的层级,这非常适合继续做实验。 +2. 当前体系最大的短板不是“knn exponent 还不够准”,而是缺少 query 分桶、类目先验和负证据。 +3. 只调融合公式还能继续拿到一部分收益,但中期最值得投入的是: + - query-aware 参数 + - 类型/类目约束 + - score + rank 混合融合 + - 为 LTR 做特征沉淀 + ## reranker方面: diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index ba7c90e..5c8fcc6 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -272,6 +272,40 @@ class SearchEvaluationFramework: ranked.sort(key=lambda item: item["score"], reverse=True) return ranked + def _assign_fixed_rerank_scores( + self, + query: str, + spu_ids: Sequence[str], + *, + score: float, + force_refresh: bool = False, + ) -> Dict[str, float]: + """Persist a fixed rerank score for a deduplicated ``spu_id`` list.""" + normalized_ids: List[str] = [] + seen: set[str] = set() + for spu_id in spu_ids: + sid = str(spu_id or "").strip() + if not sid or sid in seen: + continue + seen.add(sid) + normalized_ids.append(sid) + if not normalized_ids: + return {} + + cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query) + to_store: Dict[str, float] = {} + for sid in normalized_ids: + if force_refresh or sid not in cached or float(cached[sid]) != float(score): + to_store[sid] = float(score) + if to_store: + self.store.upsert_rerank_scores( + self.tenant_id, + query, + to_store, + model_name="search_recall_pool_fixed", + ) + return {sid: float(score) for sid in normalized_ids} + def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]: if not docs: return [] @@ -631,12 +665,25 @@ class SearchEvaluationFramework: search_size = max(int(search_depth), int(search_recall_top_k)) search_payload = self.search_client.search(query=query, size=search_size, from_=0, language=language) search_results = list(search_payload.get("results") or []) - recall_n = min(int(search_recall_top_k), len(search_results)) - pool_search_docs = search_results[:recall_n] - pool_spu_ids = {str(d.get("spu_id")) for d in pool_search_docs if str(d.get("spu_id") or "").strip()} + search_result_spu_ids = [str(doc.get("spu_id") or "").strip() for doc in search_results] + recall_spu_ids: List[str] = [] + seen_recall_spu_ids: set[str] = set() + for spu_id in search_result_spu_ids[: int(search_recall_top_k)]: + if not spu_id or spu_id in seen_recall_spu_ids: + continue + seen_recall_spu_ids.add(spu_id) + recall_spu_ids.append(spu_id) + recall_n = len(recall_spu_ids) + pool_spu_ids = set(recall_spu_ids) corpus = self.corpus_docs(refresh=False) corpus_by_id = {str(d.get("spu_id")): d for d in corpus if str(d.get("spu_id") or "").strip()} + self._assign_fixed_rerank_scores( + query=query, + spu_ids=recall_spu_ids, + score=1.0, + force_refresh=force_refresh_rerank, + ) rerank_pending_n = sum( 1 @@ -697,12 +744,13 @@ class SearchEvaluationFramework: else: ordered_docs: List[Dict[str, Any]] = [] seen_ordered: set[str] = set() - for doc in pool_search_docs: - sid = str(doc.get("spu_id") or "") + for sid in recall_spu_ids: if not sid or sid in seen_ordered: continue seen_ordered.add(sid) - ordered_docs.append(corpus_by_id.get(sid, doc)) + doc = corpus_by_id.get(sid) + if doc is not None: + ordered_docs.append(doc) for item in ranked_outside: sid = str(item["spu_id"]) if sid in seen_ordered: @@ -730,9 +778,10 @@ class SearchEvaluationFramework: rerank_depth_effective = min(int(rerank_depth), len(ranked_outside)) search_labeled_results: List[Dict[str, Any]] = [] - for rank, doc in enumerate(search_results, start=1): - spu_id = str(doc.get("spu_id")) - in_pool = rank <= recall_n + for rank, search_doc in enumerate(search_results, start=1): + spu_id = str(search_doc.get("spu_id") or "") + doc = corpus_by_id.get(spu_id, search_doc) + in_pool = spu_id in pool_spu_ids search_labeled_results.append( { "rank": rank, @@ -998,4 +1047,3 @@ class SearchEvaluationFramework: output_json_path, ) return payload - diff --git a/search/rerank_client.py b/search/rerank_client.py index 4384e50..659b453 100644 --- a/search/rerank_client.py +++ b/search/rerank_client.py @@ -252,17 +252,18 @@ def _build_hit_signal_bundle( hit: Dict[str, Any], fusion: CoarseRankFusionConfig | RerankFusionConfig, ) -> Dict[str, Any]: - es_score = _to_score(hit.get("_score")) + raw_es_score = _to_score(hit.get("_raw_es_score", hit.get("_original_score", hit.get("_score")))) + hit["_raw_es_score"] = raw_es_score matched_queries = hit.get("matched_queries") text_components = _collect_text_score_components( matched_queries, - es_score, + raw_es_score, translation_weight=fusion.text_translation_weight, ) knn_components = _collect_knn_score_components(matched_queries, fusion) return { "doc_id": hit.get("_id"), - "es_score": es_score, + "es_score": raw_es_score, "matched_queries": matched_queries, "text_components": text_components, "knn_components": knn_components, @@ -294,6 +295,7 @@ def _build_formula_summary( def _compute_multiplicative_fusion( *, + es_score: float, text_score: float, knn_score: float, fusion: RerankFusionConfig, @@ -317,6 +319,7 @@ def _compute_multiplicative_fusion( } ) + _add_term("es_score", es_score, fusion.es_bias, fusion.es_exponent) _add_term("rerank_score", rerank_score, fusion.rerank_bias, fusion.rerank_exponent) _add_term("fine_score", fine_score, fusion.fine_bias, fusion.fine_exponent) _add_term("text_score", text_score, fusion.text_bias, fusion.text_exponent) @@ -341,13 +344,15 @@ def _compute_multiplicative_fusion( def _multiply_coarse_fusion_factors( + es_score: float, text_score: float, knn_score: float, fusion: CoarseRankFusionConfig, -) -> Tuple[float, float, float]: +) -> Tuple[float, float, float, float]: + es_factor = (max(es_score, 0.0) + fusion.es_bias) ** fusion.es_exponent text_factor = (max(text_score, 0.0) + fusion.text_bias) ** fusion.text_exponent knn_factor = (max(knn_score, 0.0) + fusion.knn_bias) ** fusion.knn_exponent - return text_factor, knn_factor, text_factor * knn_factor + return es_factor, text_factor, knn_factor, es_factor * text_factor * knn_factor def _has_selected_sku(hit: Dict[str, Any]) -> bool: @@ -359,7 +364,7 @@ def coarse_resort_hits( fusion: Optional[CoarseRankFusionConfig] = None, debug: bool = False, ) -> List[Dict[str, Any]]: - """Coarse rank with text/knn fusion only.""" + """Coarse rank with es/text/knn multiplicative fusion.""" if not es_hits: return [] @@ -373,7 +378,8 @@ def coarse_resort_hits( knn_components = signal_bundle["knn_components"] text_score = signal_bundle["text_score"] knn_score = signal_bundle["knn_score"] - text_factor, knn_factor, coarse_score = _multiply_coarse_fusion_factors( + es_factor, text_factor, knn_factor, coarse_score = _multiply_coarse_fusion_factors( + es_score=es_score, text_score=text_score, knn_score=knn_score, fusion=f, @@ -409,6 +415,7 @@ def coarse_resort_hits( "knn_primary_score": knn_components["primary_knn_score"], "knn_support_score": knn_components["support_knn_score"], "knn_score": knn_score, + "coarse_es_factor": es_factor, "coarse_text_factor": text_factor, "coarse_knn_factor": knn_factor, "coarse_score": coarse_score, @@ -435,13 +442,19 @@ def fuse_scores_and_resort( 将 ES 分数与重排分数按乘法公式融合(不修改原始 _score),并按融合分数降序重排。 融合形式(由 ``fusion`` 配置 bias / exponent):: - fused = (max(rerank,0)+b_r)^e_r * (max(text,0)+b_t)^e_t * (max(knn,0)+b_k)^e_k * sku_boost + fused = (max(es,0)+b_es)^e_es + * (max(rerank,0)+b_r)^e_r + * (max(fine,0)+b_f)^e_f + * (max(text,0)+b_t)^e_t + * (max(knn,0)+b_k)^e_k + * sku_boost 其中 sku_boost 仅在当前 hit 已选中 SKU 时生效,默认值为 1.2,可通过 ``query.style_intent.selected_sku_boost`` 配置。 对每条 hit 会写入: - _original_score: 原始 ES 分数 + - _raw_es_score: ES 原始总分(后续阶段始终复用,不依赖可能被改写的 `_score`) - _rerank_score: 重排服务返回的分数 - _fused_score: 融合分数 - _text_score: 文本相关性分数(优先取 named queries 的 base_query 分数) @@ -475,6 +488,7 @@ def fuse_scores_and_resort( sku_selected = _has_selected_sku(hit) style_boost = style_intent_selected_sku_boost if sku_selected else 1.0 fusion_result = _compute_multiplicative_fusion( + es_score=signal_bundle["es_score"], rerank_score=rerank_score, fine_score=fine_score, text_score=text_score, @@ -526,6 +540,7 @@ def fuse_scores_and_resort( ), "rerank_factor": fusion_result["factors"].get("rerank_score"), "fine_factor": fusion_result["factors"].get("fine_score"), + "es_factor": fusion_result["factors"].get("es_score"), "text_factor": fusion_result["factors"].get("text_score"), "knn_factor": fusion_result["factors"].get("knn_score"), "style_intent_selected_sku": sku_selected, @@ -654,6 +669,7 @@ def run_lightweight_rerank( sku_selected = _has_selected_sku(hit) style_boost = style_intent_selected_sku_boost if sku_selected else 1.0 fusion_result = _compute_multiplicative_fusion( + es_score=signal_bundle["es_score"], fine_score=fine_score, text_score=text_score, knn_score=knn_score, @@ -679,7 +695,9 @@ def run_lightweight_rerank( "fusion_inputs": fusion_result["inputs"], "fusion_factors": fusion_result["factors"], "fusion_summary": fusion_result["summary"], + "es_score": signal_bundle["es_score"], "fine_factor": fusion_result["factors"].get("fine_score"), + "es_factor": fusion_result["factors"].get("es_score"), "text_factor": fusion_result["factors"].get("text_score"), "knn_factor": fusion_result["factors"].get("knn_score"), "style_intent_selected_sku": sku_selected, diff --git a/search/searcher.py b/search/searcher.py index e1a1c72..6a38d4d 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -994,7 +994,7 @@ class Searcher: if decision is not None: style_intent_debug = decision.to_dict() - raw_score = hit.get("_score") + raw_score = hit.get("_raw_es_score", hit.get("_original_score", hit.get("_score"))) try: es_score = float(raw_score) if raw_score is not None else 0.0 except (TypeError, ValueError): @@ -1024,6 +1024,7 @@ class Searcher: if coarse_debug: debug_entry["coarse_score"] = coarse_debug.get("coarse_score") + debug_entry["coarse_es_factor"] = coarse_debug.get("coarse_es_factor") debug_entry["coarse_text_factor"] = coarse_debug.get("coarse_text_factor") debug_entry["coarse_knn_factor"] = coarse_debug.get("coarse_knn_factor") @@ -1033,6 +1034,7 @@ class Searcher: debug_entry["score"] = rerank_debug.get("score") debug_entry["rerank_score"] = rerank_debug.get("rerank_score") debug_entry["fine_score"] = rerank_debug.get("fine_score") + debug_entry["es_score"] = rerank_debug.get("es_score", es_score) debug_entry["text_score"] = rerank_debug.get("text_score") debug_entry["knn_score"] = rerank_debug.get("knn_score") debug_entry["fusion_inputs"] = rerank_debug.get("fusion_inputs") @@ -1040,6 +1042,7 @@ class Searcher: debug_entry["fusion_summary"] = rerank_debug.get("fusion_summary") debug_entry["rerank_factor"] = rerank_debug.get("rerank_factor") debug_entry["fine_factor"] = rerank_debug.get("fine_factor") + debug_entry["es_factor"] = rerank_debug.get("es_factor") debug_entry["text_factor"] = rerank_debug.get("text_factor") debug_entry["knn_factor"] = rerank_debug.get("knn_factor") debug_entry["fused_score"] = rerank_debug.get("fused_score") @@ -1049,11 +1052,13 @@ class Searcher: debug_entry["doc_id"] = fine_debug.get("doc_id") debug_entry["score"] = fine_debug.get("score") debug_entry["fine_score"] = fine_debug.get("fine_score") + debug_entry["es_score"] = fine_debug.get("es_score", es_score) debug_entry["text_score"] = fine_debug.get("text_score") debug_entry["knn_score"] = fine_debug.get("knn_score") debug_entry["fusion_inputs"] = fine_debug.get("fusion_inputs") debug_entry["fusion_factors"] = fine_debug.get("fusion_factors") debug_entry["fusion_summary"] = fine_debug.get("fusion_summary") + debug_entry["es_factor"] = fine_debug.get("es_factor") debug_entry["rerank_input"] = fine_debug.get("rerank_input") initial_rank = initial_ranks_by_doc.get(str(doc_id)) if doc_id is not None else None @@ -1061,6 +1066,14 @@ class Searcher: fine_rank = fine_ranks_by_doc.get(str(doc_id)) if doc_id is not None else None rerank_rank = rerank_ranks_by_doc.get(str(doc_id)) if doc_id is not None else None final_rank = final_ranks_by_doc.get(str(doc_id)) if doc_id is not None else None + rerank_previous_rank = fine_rank if fine_rank is not None else coarse_rank + final_previous_rank = rerank_rank + if final_previous_rank is None: + final_previous_rank = fine_rank + if final_previous_rank is None: + final_previous_rank = coarse_rank + if final_previous_rank is None: + final_previous_rank = initial_rank def _rank_change(previous_rank: Optional[int], current_rank: Optional[int]) -> Optional[int]: if previous_rank is None or current_rank is None: @@ -1078,8 +1091,10 @@ class Searcher: "rank": coarse_rank, "rank_change": _rank_change(initial_rank, coarse_rank), "score": coarse_debug.get("coarse_score") if coarse_debug else None, + "es_score": coarse_debug.get("es_score") if coarse_debug else es_score, "text_score": coarse_debug.get("text_score") if coarse_debug else None, "knn_score": coarse_debug.get("knn_score") if coarse_debug else None, + "es_factor": coarse_debug.get("coarse_es_factor") if coarse_debug else None, "text_factor": coarse_debug.get("coarse_text_factor") if coarse_debug else None, "knn_factor": coarse_debug.get("coarse_knn_factor") if coarse_debug else None, "signals": coarse_debug, @@ -1093,8 +1108,10 @@ class Searcher: else hit.get("_fine_fused_score", hit.get("_fine_score")) ), "fine_score": fine_debug.get("fine_score") if fine_debug else hit.get("_fine_score"), + "es_score": fine_debug.get("es_score") if fine_debug else es_score, "text_score": fine_debug.get("text_score") if fine_debug else hit.get("_text_score"), "knn_score": fine_debug.get("knn_score") if fine_debug else hit.get("_knn_score"), + "es_factor": fine_debug.get("es_factor") if fine_debug else None, "fusion_summary": fine_debug.get("fusion_summary") if fine_debug else None, "fusion_inputs": fine_debug.get("fusion_inputs") if fine_debug else None, "fusion_factors": fine_debug.get("fusion_factors") if fine_debug else None, @@ -1103,8 +1120,9 @@ class Searcher: }, "rerank": { "rank": rerank_rank, - "rank_change": _rank_change(fine_rank, rerank_rank), + "rank_change": _rank_change(rerank_previous_rank, rerank_rank), "score": rerank_debug.get("score") if rerank_debug else hit.get("_fused_score"), + "es_score": rerank_debug.get("es_score") if rerank_debug else es_score, "rerank_score": rerank_debug.get("rerank_score") if rerank_debug else hit.get("_rerank_score"), "fine_score": rerank_debug.get("fine_score") if rerank_debug else hit.get("_fine_score"), "fused_score": rerank_debug.get("fused_score") if rerank_debug else hit.get("_fused_score"), @@ -1115,13 +1133,14 @@ class Searcher: "fusion_factors": rerank_debug.get("fusion_factors") if rerank_debug else None, "rerank_factor": rerank_debug.get("rerank_factor") if rerank_debug else None, "fine_factor": rerank_debug.get("fine_factor") if rerank_debug else None, + "es_factor": rerank_debug.get("es_factor") if rerank_debug else None, "text_factor": rerank_debug.get("text_factor") if rerank_debug else None, "knn_factor": rerank_debug.get("knn_factor") if rerank_debug else None, "signals": rerank_debug, }, "final_page": { "rank": final_rank, - "rank_change": _rank_change(rerank_rank, final_rank), + "rank_change": _rank_change(final_previous_rank, final_rank), }, } diff --git a/tests/test_rerank_client.py b/tests/test_rerank_client.py index 8ef8210..658601b 100644 --- a/tests/test_rerank_client.py +++ b/tests/test_rerank_client.py @@ -258,3 +258,35 @@ def test_fuse_scores_and_resort_uses_hit_level_fine_score_when_not_passed_separa assert isclose(debug[0]["fine_factor"], (0.7 + 0.00001), rel_tol=1e-9) assert debug[0]["fusion_inputs"]["fine_score"] == 0.7 assert "fine_score=" in debug[0]["fusion_summary"] + + +def test_fuse_scores_and_resort_can_include_raw_es_score_as_factor(): + hits = [ + { + "_id": "es-strong", + "_score": 100.0, + "matched_queries": {"base_query": 1.0, "knn_query": 0.0}, + }, + { + "_id": "es-weak", + "_score": 1.0, + "matched_queries": {"base_query": 1.0, "knn_query": 0.0}, + }, + ] + fusion = RerankFusionConfig( + es_bias=0.0, + es_exponent=1.0, + rerank_bias=0.0, + rerank_exponent=1.0, + text_bias=0.0, + text_exponent=0.0, + knn_bias=1.0, + knn_exponent=0.0, + ) + + debug = fuse_scores_and_resort(hits, [1.0, 1.0], fusion=fusion, debug=True) + + assert [hit["_id"] for hit in hits] == ["es-strong", "es-weak"] + assert isclose(hits[0]["_raw_es_score"], 100.0, rel_tol=1e-9) + assert isclose(debug[0]["es_factor"], 100.0, rel_tol=1e-9) + assert debug[0]["fusion_inputs"]["es_score"] == 100.0 diff --git a/tests/test_search_rerank_window.py b/tests/test_search_rerank_window.py index 85b65bc..5002b89 100644 --- a/tests/test_search_rerank_window.py +++ b/tests/test_search_rerank_window.py @@ -10,6 +10,7 @@ import yaml from config import ( ConfigLoader, + FineRankConfig, FunctionScoreConfig, IndexConfig, QueryConfig, @@ -944,3 +945,70 @@ def test_searcher_debug_info_uses_initial_es_max_score_for_normalization(monkeyp assert result.debug_info["per_result"][0]["final_rank"] == 1 assert result.debug_info["per_result"][0]["es_score_normalized"] == 1.0 assert result.debug_info["per_result"][1]["es_score_normalized"] == 2.0 / 3.0 + + +def test_searcher_rerank_rank_change_falls_back_to_coarse_rank_when_fine_disabled(monkeypatch): + es_client = _FakeESClient(total_hits=5) + config = _build_search_config(rerank_enabled=True, rerank_window=5) + config = SearchConfig( + field_boosts=config.field_boosts, + indexes=config.indexes, + query_config=config.query_config, + function_score=config.function_score, + coarse_rank=config.coarse_rank, + fine_rank=FineRankConfig(enabled=False, input_window=5, output_window=5), + rerank=config.rerank, + spu_config=config.spu_config, + es_index_name=config.es_index_name, + es_settings=config.es_settings, + ) + searcher = _build_searcher(config, es_client) + context = create_request_context(reqid="rank-fallback", uid="u-rank-fallback") + + monkeypatch.setattr( + "search.searcher.get_tenant_config_loader", + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en"]}), + ) + + def _fake_run_rerank(**kwargs): + hits = kwargs["es_response"]["hits"]["hits"] + hits.reverse() + fused_debug = [] + for idx, hit in enumerate(hits): + hit["_fused_score"] = 100.0 - idx + hit["_rerank_score"] = 1.0 - 0.1 * idx + fused_debug.append( + { + "doc_id": hit["_id"], + "score": hit["_fused_score"], + "es_score": hit.get("_raw_es_score", hit.get("_score")), + "rerank_score": hit["_rerank_score"], + "text_score": hit.get("_text_score", hit.get("_score")), + "knn_score": hit.get("_knn_score", 0.0), + "es_factor": 1.0, + "rerank_factor": 1.0, + "text_factor": 1.0, + "knn_factor": 1.0, + "fused_score": hit["_fused_score"], + } + ) + return kwargs["es_response"], {"model": "final-reranker"}, fused_debug + + monkeypatch.setattr("search.rerank_client.run_rerank", _fake_run_rerank) + + result = searcher.search( + query="toy", + tenant_id="162", + from_=0, + size=5, + context=context, + enable_rerank=True, + debug=True, + ) + + per_result = {row["spu_id"]: row for row in result.debug_info["per_result"]} + moved = per_result["4"]["ranking_funnel"] + assert moved["fine_rank"]["rank"] is None + assert moved["rerank"]["rank"] == 1 + assert moved["rerank"]["rank_change"] == 4 + assert moved["final_page"]["rank_change"] == 0 -- libgit2 0.21.2