Commit 0536222c6d7fcf1bb9339299b67409c918bae320

Authored by tangwang
1 parent ef5baa86

query parser优化

config/config.yaml
@@ -20,7 +20,7 @@ es_settings: @@ -20,7 +20,7 @@ es_settings:
20 refresh_interval: "30s" 20 refresh_interval: "30s"
21 21
22 # 字段权重配置(用于搜索时的字段boost) 22 # 字段权重配置(用于搜索时的字段boost)
23 -# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。 23 +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。
24 # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 24 # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
25 field_boosts: 25 field_boosts:
26 title: 3.0 26 title: 3.0
@@ -74,14 +74,11 @@ query_config: @@ -74,14 +74,11 @@ query_config:
74 - "vendor" 74 - "vendor"
75 - "category_name_text" 75 - "category_name_text"
76 76
77 - # 统一文本召回策略(主查询 + 翻译查询 + 原始查询兜底) 77 + # 统一文本召回策略(主查询 + 翻译查询)
78 text_query_strategy: 78 text_query_strategy:
79 base_minimum_should_match: "75%" 79 base_minimum_should_match: "75%"
80 translation_minimum_should_match: "75%" 80 translation_minimum_should_match: "75%"
81 translation_boost: 0.4 81 translation_boost: 0.4
82 - translation_boost_when_source_missing: 1.0  
83 - source_boost_when_missing: 0.6  
84 - original_query_fallback_boost_when_translation_missing: 0.2  
85 tie_breaker_base_query: 0.9 82 tie_breaker_base_query: 0.9
86 83
87 # Embedding字段名称 84 # Embedding字段名称
@@ -284,13 +284,6 @@ class AppConfigLoader: @@ -284,13 +284,6 @@ class AppConfigLoader:
284 base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")), 284 base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")),
285 translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), 285 translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")),
286 translation_boost=float(text_strategy.get("translation_boost", 0.4)), 286 translation_boost=float(text_strategy.get("translation_boost", 0.4)),
287 - translation_boost_when_source_missing=float(  
288 - text_strategy.get("translation_boost_when_source_missing", 1.0)  
289 - ),  
290 - source_boost_when_missing=float(text_strategy.get("source_boost_when_missing", 0.6)),  
291 - original_query_fallback_boost_when_translation_missing=float(  
292 - text_strategy.get("original_query_fallback_boost_when_translation_missing", 0.2)  
293 - ),  
294 tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)), 287 tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)),
295 zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"), 288 zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"),
296 en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"), 289 en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"),
@@ -54,9 +54,6 @@ class QueryConfig: @@ -54,9 +54,6 @@ class QueryConfig:
54 base_minimum_should_match: str = "70%" 54 base_minimum_should_match: str = "70%"
55 translation_minimum_should_match: str = "70%" 55 translation_minimum_should_match: str = "70%"
56 translation_boost: float = 0.4 56 translation_boost: float = 0.4
57 - translation_boost_when_source_missing: float = 1.0  
58 - source_boost_when_missing: float = 0.6  
59 - original_query_fallback_boost_when_translation_missing: float = 0.2  
60 tie_breaker_base_query: float = 0.9 57 tie_breaker_base_query: float = 0.9
61 zh_to_en_model: str = "opus-mt-zh-en" 58 zh_to_en_model: str = "opus-mt-zh-en"
62 en_to_zh_model: str = "opus-mt-en-zh" 59 en_to_zh_model: str = "opus-mt-en-zh"
docs/DEVELOPER_GUIDE.md
@@ -147,7 +147,7 @@ docs/ # 文档(含本指南) @@ -147,7 +147,7 @@ docs/ # 文档(含本指南)
147 147
148 ### 4.4 query 148 ### 4.4 query
149 149
150 -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划) 150 +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出解析事实(如 `rewritten_query`、`detected_language`、`translations`、`query_vector`),不再承担 ES 语言计划拼装
151 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。 151 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。
152 152
153 ### 4.5 search 153 ### 4.5 search
docs/QUICKSTART.md
@@ -558,6 +558,21 @@ lsof -i :6004 @@ -558,6 +558,21 @@ lsof -i :6004
558 558
559 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。 559 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。
560 560
  561 +### 5.4 HanLP 与 `transformers` 版本(`BertTokenizer.encode_plus`)
  562 +
  563 +若日志出现 **`AttributeError: BertTokenizer has no attribute encode_plus`**,通常是 **同一 venv 里装了 `transformers` 5.x**,与 **HanLP 2.1.x** 不兼容(HanLP 仍调用已移除的 `encode_plus`)。
  564 +
  565 +**处理:** 将 `transformers` 固定到 **4.x**(例如 4.44+),然后重装/校验 HanLP:
  566 +
  567 +```bash
  568 +source activate.sh
  569 +pip install -r requirements_hanlp.txt
  570 +python -c "from transformers import BertTokenizer; import transformers as t; print(t.__version__, hasattr(BertTokenizer, 'encode_plus'))"
  571 +# 期望:4.x 且 True
  572 +```
  573 +
  574 +**说明:** 重排/TEI 等若使用 **独立 venv**(如 `.venv-reranker`),可与主 venv 的 `transformers` 版本分离;主 venv 只要装了 HanLP 做查询分词,就不要把 `transformers` 升到 5。
  575 +
561 --- 576 ---
562 577
563 ## 6. 相关文档 578 ## 6. 相关文档
@@ -32,7 +32,7 @@ @@ -32,7 +32,7 @@
32 }, 32 },
33 去掉 image_embedding_512 33 去掉 image_embedding_512
34 image_embedding改为,一个spu有多个sku向量,每个向量内部properties: 34 image_embedding改为,一个spu有多个sku向量,每个向量内部properties:
35 -除了vector url还应该包括 35 +除了vector url还应该包括,该图片是对应哪些sku
36 "image_embedding": { 36 "image_embedding": {
37 "type": "nested", 37 "type": "nested",
38 "properties": { 38 "properties": {
@@ -117,6 +117,11 @@ requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127 @@ -117,6 +117,11 @@ requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127
117 117
118 118
119 119
  120 +是否需要:
  121 +当「源语言不在 index_languages」且「某些目标语言的翻译缺失」时,在 ES 查询里额外加一层兜底子句,用「原始 query 字符串」去匹配缺失语种的字段。
  122 +
  123 +
  124 +
120 先阅读文本embedding相关的代码: 125 先阅读文本embedding相关的代码:
121 @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py 126 @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py
122 目前有TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入限制,超限返回过载状态码。 127 目前有TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入限制,超限返回过载状态码。
docs/搜索API对接指南-01-搜索接口.md
@@ -553,9 +553,6 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) @@ -553,9 +553,6 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
553 | `rewritten_query` | string | 重写后的查询 | 553 | `rewritten_query` | string | 重写后的查询 |
554 | `detected_language` | string | 检测到的语言 | 554 | `detected_language` | string | 检测到的语言 |
555 | `translations` | object | 翻译结果 | 555 | `translations` | object | 翻译结果 |
556 -| `query_text_by_lang` | object | 实际参与检索的多语言 query 文本 |  
557 -| `search_langs` | array[string] | 实际参与检索的语言列表 |  
558 -| `supplemental_search_langs` | array[string] | 因 mixed query 补入的附加语言列表 |  
559 | `has_vector` | boolean | 是否生成了向量 | 556 | `has_vector` | boolean | 是否生成了向量 |
560 557
561 `debug_info.per_result[]` 常见字段: 558 `debug_info.per_result[]` 常见字段:
@@ -565,10 +562,9 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) @@ -565,10 +562,9 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
565 | `spu_id` | string | 结果 SPU ID | 562 | `spu_id` | string | 结果 SPU ID |
566 | `es_score` | float | ES 原始 `_score` | 563 | `es_score` | float | ES 原始 `_score` |
567 | `rerank_score` | float | 重排分数 | 564 | `rerank_score` | float | 重排分数 |
568 -| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` / `fallback_original_query_*` 聚合而来) | 565 +| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` 聚合而来) |
569 | `text_source_score` | float | `base_query` 分数 | 566 | `text_source_score` | float | `base_query` 分数 |
570 | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 | 567 | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 |
571 -| `text_fallback_score` | float | `fallback_original_query_*` 里的最大分数 |  
572 | `text_primary_score` | float | 文本大分中的主证据部分 | 568 | `text_primary_score` | float | 文本大分中的主证据部分 |
573 | `text_support_score` | float | 文本大分中的辅助证据部分 | 569 | `text_support_score` | float | 文本大分中的辅助证据部分 |
574 | `knn_score` | float | `knn_query` 分数 | 570 | `knn_score` | float | `knn_query` 分数 |
docs/相关性检索优化说明.md
@@ -2,11 +2,11 @@ @@ -2,11 +2,11 @@
2 2
3 ## 1. 文档目标 3 ## 1. 文档目标
4 4
5 -本文描述当前线上代码的文本检索策略,重点覆盖: 5 +本文描述当前代码中的文本检索策略,重点覆盖:
6 6
7 - 多语言检索路由(`detector` / `translator` / `indexed` 的关系) 7 - 多语言检索路由(`detector` / `translator` / `indexed` 的关系)
8 - 统一文本召回表达式(无布尔 AST 分支) 8 - 统一文本召回表达式(无布尔 AST 分支)
9 -- 翻译缺失时的兜底策略 9 +- 解析层与检索表达式层的职责边界
10 - 重排融合打分与调试字段 10 - 重排融合打分与调试字段
11 - 典型场景下实际生成的 ES 查询结构 11 - 典型场景下实际生成的 ES 查询结构
12 12
@@ -17,9 +17,11 @@ @@ -17,9 +17,11 @@
17 查询链路(文本相关): 17 查询链路(文本相关):
18 18
19 1. `QueryParser.parse()` 19 1. `QueryParser.parse()`
20 - 输出 `detected_language`、`query_text_by_lang`、`search_langs`、`index_languages`、`source_in_index_languages`;另输出 `contains_chinese` / `contains_english`(仅服务混写辅助召回,见 §4 末)。 20 + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。
  21 +2. `Searcher.search()`
  22 + 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。
21 2. `ESQueryBuilder._build_advanced_text_query()` 23 2. `ESQueryBuilder._build_advanced_text_query()`
22 - 按 `search_langs` 动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`);若命中混写辅助条件,在同一子句内并入另一语种列(§4 末)。 24 + 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。
23 3. `build_query()` 25 3. `build_query()`
24 统一走文本策略,不再有布尔 AST 枝路。 26 统一走文本策略,不再有布尔 AST 枝路。
25 27
@@ -37,18 +39,18 @@ @@ -37,18 +39,18 @@
37 源语言字段做主召回;其他语言走翻译补召回(低权重)。 39 源语言字段做主召回;其他语言走翻译补召回(低权重)。
38 2. 若 `detected_language not in index_languages`: 40 2. 若 `detected_language not in index_languages`:
39 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。 41 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。
40 -3. 若第 2 步翻译部分失败或全部失败:  
41 - 对缺失翻译的 `index_languages` 字段,追加“原文低权重兜底”子句,避免完全丢失这些语种索引面的召回机会。 42 +3. 若翻译部分失败或全部失败:
  43 + 当前实现不会再额外生成“原文打到其他语种字段”的兜底子句;系统保留 `base_query` 并继续执行,可观测性由 `translations` / warning / 命名子句分数提供。
42 44
43 ### 3.2 翻译与向量:并发提交与共享超时 45 ### 3.2 翻译与向量:并发提交与共享超时
44 46
45 -`QueryParser.parse()` 内(Stage 4–6)对**离线调用**采用线程池提交 + **一次** `concurrent.futures.wait`: 47 +`QueryParser.parse()` 内对翻译与向量采用线程池提交 + **一次** `concurrent.futures.wait`:
46 48
47 -- **翻译**:对 `index_languages` 中除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。  
48 -- **查询向量**(若开启 `enable_text_embedding` 且域为 default):再提交一个 `text_encoder.encode` 任务。 49 +- **翻译**:对调用方传入的 `target_languages` 中、除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。
  50 +- **查询向量**:若开启 `enable_text_embedding`,再提交一个 `text_encoder.encode` 任务。
49 - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。 51 - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。
50 52
51 -**等待预算(毫秒)**由 `detected_language` 是否属于租户 `index_languages` 决定(`query_config`): 53 +**等待预算(毫秒)**由 `detected_language` 是否属于调用方传入的 `target_languages` 决定(`query_config`):
52 54
53 - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。 55 - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。
54 - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。 56 - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。
@@ -62,7 +64,7 @@ @@ -62,7 +64,7 @@
62 ```json 64 ```json
63 { 65 {
64 "multi_match": { 66 "multi_match": {
65 - "_name": "base_query|base_query_trans_xx|fallback_original_query_xx", 67 + "_name": "base_query|base_query_trans_xx",
66 "query": "<text>", 68 "query": "<text>",
67 "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."], 69 "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."],
68 "minimum_should_match": "75%", 70 "minimum_should_match": "75%",
@@ -75,7 +77,7 @@ @@ -75,7 +77,7 @@
75 最终按 `bool.should` 组合,`minimum_should_match: 1`。 77 最终按 `bool.should` 组合,`minimum_should_match: 1`。
76 78
77 > **附 — 混写辅助召回** 79 > **附 — 混写辅助召回**
78 -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.8,`ESQueryBuilder` 构造参数)**。`fallback_original_query_*` 同样适用。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 80 +> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。
79 81
80 ## 5. 关键配置项(文本策略) 82 ## 5. 关键配置项(文本策略)
81 83
@@ -88,20 +90,12 @@ @@ -88,20 +90,12 @@
88 90
89 - `base_minimum_should_match` 91 - `base_minimum_should_match`
90 - `translation_minimum_should_match` 92 - `translation_minimum_should_match`
91 -- `translation_boost`  
92 -- `translation_boost_when_source_missing`  
93 -- `source_boost_when_missing`  
94 -- `original_query_fallback_boost_when_translation_missing`(新增) 93 +- `translation_boost`(所有 `base_query_trans_*` 共用)
95 - `tie_breaker_base_query` 94 - `tie_breaker_base_query`
96 95
97 -新增项说明:  
98 -  
99 -- `original_query_fallback_boost_when_translation_missing`:  
100 - 当源语种不在索引语言且翻译缺失时,原文打到缺失目标语字段的低权重系数,默认 `0.2`。  
101 -  
102 说明: 96 说明:
103 97
104 -- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*`、`fallback_original_query_*` 三类子句组成。 98 +- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*` 两类子句组成。
105 99
106 ## 6. 典型场景与实际 DSL 100 ## 6. 典型场景与实际 DSL
107 101
@@ -111,11 +105,12 @@ @@ -111,11 +105,12 @@
111 105
112 - `detected_language=de` 106 - `detected_language=de`
113 - `index_languages=[de,en]` 107 - `index_languages=[de,en]`
114 -- `query_text_by_lang={de:"herren schuhe", en:"men shoes"}` 108 +- `rewritten_query="herren schuhe"`
  109 +- `translations={en:"men shoes"}`
115 110
116 策略结果: 111 策略结果:
117 112
118 -- `base_query`:德语字段,正常权重 113 +- `base_query`:德语字段,**不写** `multi_match.boost`
119 - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4) 114 - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4)
120 115
121 ### 场景 B:源语种不在索引语言中,部分翻译缺失 116 ### 场景 B:源语种不在索引语言中,部分翻译缺失
@@ -126,38 +121,44 @@ @@ -126,38 +121,44 @@
126 121
127 策略结果: 122 策略结果:
128 123
129 -- `base_query`(德语字段):`boost=source_boost_when_missing`(默认 0.6)  
130 -- `base_query_trans_en`(英文字段):`boost=translation_boost_when_source_missing`(默认 1.0)  
131 -- `fallback_original_query_zh`(中文字段):原文低权重兜底(默认 0.2) 124 +- `base_query`(德语字段):**不写** `multi_match.boost`(默认 1.0)
  125 +- `base_query_trans_en`(英文字段):`boost=translation_boost`(如 0.4)
  126 +- 不会生成额外中文兜底子句
132 127
133 ### 场景 C:源语种不在索引语言中,翻译全部失败 128 ### 场景 C:源语种不在索引语言中,翻译全部失败
134 129
135 - `detected_language=de` 130 - `detected_language=de`
136 - `index_languages=[en,zh]` 131 - `index_languages=[en,zh]`
137 -- `query_text_by_lang` 仅有 `de` 132 +- `translations={}`
138 133
139 策略结果: 134 策略结果:
140 135
141 -- `base_query`(德语字段,低权重)  
142 -- `fallback_original_query_en`(英文字段原文兜底)  
143 -- `fallback_original_query_zh`(中文字段原文兜底) 136 +- `base_query`(德语字段,**无** `boost` 字段)
  137 +- 不会生成 `base_query_trans_*`
144 138
145 -这能避免“只有源语种字段查询,且该语种字段在商家索引中稀疏/为空”导致的弱召回问题 139 +这意味着当前实现优先保证职责清晰与可解释性,而不是继续在 Builder 内部隐式制造“跨语种原文兜底”
146 140
147 -## 7. QueryParser 与 ESBuilder 的职责分工 141 +## 7. QueryParser 与 Searcher / ESBuilder 的职责分工
148 142
149 -- `QueryParser` 负责“语言计划”与“可用文本”:  
150 - - `search_langs`  
151 - - `query_text_by_lang`  
152 - - `source_in_index_languages`  
153 - - `index_languages` 143 +- `QueryParser` 负责“解析事实”:
  144 + - `query_normalized`
  145 + - `rewritten_query`
  146 + - `detected_language`
  147 + - `translations`
  148 + - `query_vector`
  149 + - `query_tokens`
154 - `contains_chinese` / `contains_english` 150 - `contains_chinese` / `contains_english`
  151 +- `Searcher` 负责“租户语境”:
  152 + - `index_languages`
  153 + - 将其传给 parser 作为 `target_languages`
  154 + - 将其传给 builder 作为字段展开约束
155 - `ESQueryBuilder` 负责“表达式展开”: 155 - `ESQueryBuilder` 负责“表达式展开”:
156 - 动态字段组装 156 - 动态字段组装
157 - 子句权重分配 157 - 子句权重分配
158 - - 翻译缺失兜底子句拼接 158 + - `base_query` / `base_query_trans_*` 子句拼接
  159 + - 跳过“与 base_query 文本和语言完全相同”的重复翻译子句
159 160
160 -这种分层让策略调优主要落在配置和 Builder,不破坏 Parser 的职责边界 161 +这种分层让 parser 不再返回 ES 专用的“语言计划字段”,职责边界更清晰
161 162
162 ## 8. 融合打分(Rerank + Text + KNN) 163 ## 8. 融合打分(Rerank + Text + KNN)
163 164
@@ -165,24 +166,21 @@ @@ -165,24 +166,21 @@
165 166
166 ### 8.1 文本相关性大分 167 ### 8.1 文本相关性大分
167 168
168 -文本大分由三部分组成: 169 +文本大分由两部分组成:
169 170
170 - `base_query` 171 - `base_query`
171 - `base_query_trans_*` 172 - `base_query_trans_*`
172 -- `fallback_original_query_*`  
173 173
174 聚合方式: 174 聚合方式:
175 175
176 1. `source_score = base_query` 176 1. `source_score = base_query`
177 2. `translation_score = max(base_query_trans_*)` 177 2. `translation_score = max(base_query_trans_*)`
178 -3. `fallback_score = max(fallback_original_query_*)`  
179 -4. 加权: 178 +3. 加权:
180 - `weighted_source = source_score` 179 - `weighted_source = source_score`
181 - `weighted_translation = 0.8 * translation_score` 180 - `weighted_translation = 0.8 * translation_score`
182 - - `weighted_fallback = 0.55 * fallback_score`  
183 -5. 合成:  
184 - - `primary = max(weighted_source, weighted_translation, weighted_fallback)`  
185 - - `support = weighted_source + weighted_translation + weighted_fallback - primary` 181 +4. 合成:
  182 + - `primary = max(weighted_source, weighted_translation)`
  183 + - `support = weighted_source + weighted_translation - primary`
186 - `text_score = primary + 0.25 * support` 184 - `text_score = primary + 0.25 * support`
187 185
188 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。 186 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。
@@ -212,7 +210,6 @@ fused_score = ( @@ -212,7 +210,6 @@ fused_score = (
212 - `text_score` 210 - `text_score`
213 - `text_source_score` 211 - `text_source_score`
214 - `text_translation_score` 212 - `text_translation_score`
215 -- `text_fallback_score`  
216 - `text_primary_score` 213 - `text_primary_score`
217 - `text_support_score` 214 - `text_support_score`
218 - `knn_score` 215 - `knn_score`
@@ -221,9 +218,9 @@ fused_score = ( @@ -221,9 +218,9 @@ fused_score = (
221 218
222 `debug_info.query_analysis` 还会暴露: 219 `debug_info.query_analysis` 还会暴露:
223 220
224 -- `query_text_by_lang`  
225 -- `search_langs`  
226 -- `supplemental_search_langs` 221 +- `translations`
  222 +- `detected_language`
  223 +- `rewritten_query`
227 224
228 这些字段用于检索效果评估与 bad case 归因。 225 这些字段用于检索效果评估与 bad case 归因。
229 226
@@ -231,7 +228,7 @@ fused_score = ( @@ -231,7 +228,7 @@ fused_score = (
231 228
232 1. 当前文本主链路已移除布尔 AST 分支。 229 1. 当前文本主链路已移除布尔 AST 分支。
233 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。 230 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。
234 -3. `HanLP` 为可选依赖;不可用时退化到轻量分词,不影响主链路可用性 231 +3. `HanLP` 为必需依赖;当前 parser 不再提供轻量 fallback
235 4. 若后续扩展到更多语种,请确保: 232 4. 若后续扩展到更多语种,请确保:
236 - mapping 中存在对应 `.<lang>` 字段 233 - mapping 中存在对应 `.<lang>` 字段
237 - `index_languages` 配置在支持列表内 234 - `index_languages` 配置在支持列表内
@@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py @@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py
263 建议在 `tests/` 增加文本策略用例: 260 建议在 `tests/` 增加文本策略用例:
264 261
265 1. 源语种在索引语言,翻译命中缓存 262 1. 源语种在索引语言,翻译命中缓存
266 -2. 源语种不在索引语言,翻译部分失败(验证 fallback 子句)  
267 -3. 源语种不在索引语言,翻译全部失败(验证多目标 fallback)  
268 -4. 自定义 `original_query_fallback_boost_when_translation_missing` 生效  
269 -5. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) 263 +2. 源语种不在索引语言,翻译部分失败(验证仅保留 `base_query` + 成功翻译子句)
  264 +3. 源语种不在索引语言,翻译全部失败(验证无 `base_query_trans_*` 时仍可正常执行)
  265 +4. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`)
270 266
271 267
272 268
requirements_hanlp.txt 0 → 100644
@@ -0,0 +1,13 @@ @@ -0,0 +1,13 @@
  1 +# Optional: HanLP query tokenization for the main backend venv (QueryParser).
  2 +#
  3 +# Install:
  4 +# source activate.sh
  5 +# pip install -r requirements_hanlp.txt
  6 +#
  7 +# Why pin transformers<5:
  8 +# transformers 5.x no longer exposes `encode_plus` on `BertTokenizer`, but HanLP 2.1.x
  9 +# still calls it → AttributeError during `hanlp.load(...)`.
  10 +# Use transformers 4.44+ (4.x) which remains API-compatible with HanLP.
  11 +
  12 +hanlp>=2.1.0
  13 +transformers>=4.44,<5
scripts/eval_search_quality.py
@@ -83,7 +83,6 @@ class RankedItem: @@ -83,7 +83,6 @@ class RankedItem:
83 text_score: float | None 83 text_score: float | None
84 text_source_score: float | None 84 text_source_score: float | None
85 text_translation_score: float | None 85 text_translation_score: float | None
86 - text_fallback_score: float | None  
87 text_primary_score: float | None 86 text_primary_score: float | None
88 text_support_score: float | None 87 text_support_score: float | None
89 knn_score: float | None 88 knn_score: float | None
@@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -> Dict[str, Any]: @@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -> Dict[str, Any]:
146 text_score=_to_float(debug_item.get("text_score")), 145 text_score=_to_float(debug_item.get("text_score")),
147 text_source_score=_to_float(debug_item.get("text_source_score")), 146 text_source_score=_to_float(debug_item.get("text_source_score")),
148 text_translation_score=_to_float(debug_item.get("text_translation_score")), 147 text_translation_score=_to_float(debug_item.get("text_translation_score")),
149 - text_fallback_score=_to_float(debug_item.get("text_fallback_score")),  
150 text_primary_score=_to_float(debug_item.get("text_primary_score")), 148 text_primary_score=_to_float(debug_item.get("text_primary_score")),
151 text_support_score=_to_float(debug_item.get("text_support_score")), 149 text_support_score=_to_float(debug_item.get("text_support_score")),
152 knn_score=_to_float(debug_item.get("knn_score")), 150 knn_score=_to_float(debug_item.get("knn_score")),
@@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -> str: @@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -> str:
185 f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}" 183 f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}"
186 ) 184 )
187 lines.append( 185 lines.append(
188 - f"- detected_language={qa.get('detected_language')} search_langs={qa.get('search_langs')} supplemental_search_langs={qa.get('supplemental_search_langs')}" 186 + f"- detected_language={qa.get('detected_language')} translations={qa.get('translations')}"
189 ) 187 )
190 - lines.append(f"- query_text_by_lang={qa.get('query_text_by_lang')}")  
191 lines.append("") 188 lines.append("")
192 - lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | text_fb | knn | es | matched_queries |")  
193 - lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |") 189 + lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | knn | es | matched_queries |")
  190 + lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |")
194 for item in entry.get("top20", []): 191 for item in entry.get("top20", []):
195 title = str(item.get("title", "")).replace("|", "/") 192 title = str(item.get("title", "")).replace("|", "/")
196 matched = json.dumps(item.get("matched_queries"), ensure_ascii=False) 193 matched = json.dumps(item.get("matched_queries"), ensure_ascii=False)
@@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -> str: @@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -> str:
199 f"| {item.get('rank')} | {item.get('spu_id')} | {title} | " 196 f"| {item.get('rank')} | {item.get('spu_id')} | {title} | "
200 f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | " 197 f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | "
201 f"{item.get('text_source_score')} | {item.get('text_translation_score')} | " 198 f"{item.get('text_source_score')} | {item.get('text_translation_score')} | "
202 - f"{item.get('text_fallback_score')} | {item.get('knn_score')} | {item.get('es_score')} | {matched} |" 199 + f"{item.get('knn_score')} | {item.get('es_score')} | {matched} |"
203 ) 200 )
204 lines.append("") 201 lines.append("")
205 return "\n".join(lines) 202 return "\n".join(lines)
search/es_query_builder.py
@@ -36,9 +36,6 @@ class ESQueryBuilder: @@ -36,9 +36,6 @@ class ESQueryBuilder:
36 base_minimum_should_match: str = "70%", 36 base_minimum_should_match: str = "70%",
37 translation_minimum_should_match: str = "70%", 37 translation_minimum_should_match: str = "70%",
38 translation_boost: float = 0.4, 38 translation_boost: float = 0.4,
39 - translation_boost_when_source_missing: float = 1.0,  
40 - source_boost_when_missing: float = 0.6,  
41 - original_query_fallback_boost_when_translation_missing: float = 0.2,  
42 tie_breaker_base_query: float = 0.9, 39 tie_breaker_base_query: float = 0.9,
43 mixed_script_merged_field_boost_scale: float = 0.6, 40 mixed_script_merged_field_boost_scale: float = 0.6,
44 ): 41 ):
@@ -74,11 +71,6 @@ class ESQueryBuilder: @@ -74,11 +71,6 @@ class ESQueryBuilder:
74 self.base_minimum_should_match = base_minimum_should_match 71 self.base_minimum_should_match = base_minimum_should_match
75 self.translation_minimum_should_match = translation_minimum_should_match 72 self.translation_minimum_should_match = translation_minimum_should_match
76 self.translation_boost = float(translation_boost) 73 self.translation_boost = float(translation_boost)
77 - self.translation_boost_when_source_missing = float(translation_boost_when_source_missing)  
78 - self.source_boost_when_missing = float(source_boost_when_missing)  
79 - self.original_query_fallback_boost_when_translation_missing = float(  
80 - original_query_fallback_boost_when_translation_missing  
81 - )  
82 self.tie_breaker_base_query = float(tie_breaker_base_query) 74 self.tie_breaker_base_query = float(tie_breaker_base_query)
83 self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) 75 self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale)
84 76
@@ -168,7 +160,7 @@ class ESQueryBuilder: @@ -168,7 +160,7 @@ class ESQueryBuilder:
168 结构:filters and (text_recall or embedding_recall) + post_filter 160 结构:filters and (text_recall or embedding_recall) + post_filter
169 - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合) 161 - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合)
170 - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合) 162 - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合)
171 - - text_recall: 文本相关性召回(按 search_langs 动态语言字段) 163 + - text_recall: 文本相关性召回(按实际 clause 语言动态字段)
172 - embedding_recall: 向量召回(KNN) 164 - embedding_recall: 向量召回(KNN)
173 - function_score: 包装召回部分,支持提权字段 165 - function_score: 包装召回部分,支持提权字段
174 166
@@ -484,6 +476,7 @@ class ESQueryBuilder: @@ -484,6 +476,7 @@ class ESQueryBuilder:
484 contains_chinese: bool, 476 contains_chinese: bool,
485 contains_english: bool, 477 contains_english: bool,
486 index_languages: List[str], 478 index_languages: List[str],
  479 + is_source: bool = False
487 ) -> List[MatchFieldSpec]: 480 ) -> List[MatchFieldSpec]:
488 """ 481 """
489 When the query mixes scripts, widen each clause to indexed fields for the other script 482 When the query mixes scripts, widen each clause to indexed fields for the other script
@@ -497,10 +490,11 @@ class ESQueryBuilder: @@ -497,10 +490,11 @@ class ESQueryBuilder:
497 490
498 out = list(specs) 491 out = list(specs)
499 lnorm = (lang or "").strip().lower() 492 lnorm = (lang or "").strip().lower()
500 - if contains_english and lnorm != "en" and can_use("en"):  
501 - out = self._merge_supplemental_lang_field_specs(out, "en")  
502 - if contains_chinese and lnorm != "zh" and can_use("zh"):  
503 - out = self._merge_supplemental_lang_field_specs(out, "zh") 493 + if is_source:
  494 + if contains_english and lnorm != "en" and can_use("en"):
  495 + out = self._merge_supplemental_lang_field_specs(out, "en")
  496 + if contains_chinese and lnorm != "zh" and can_use("zh"):
  497 + out = self._merge_supplemental_lang_field_specs(out, "zh")
504 return out 498 return out
505 499
506 def _get_embedding_field(self, language: str) -> str: 500 def _get_embedding_field(self, language: str) -> str:
@@ -557,10 +551,6 @@ class ESQueryBuilder: @@ -557,10 +551,6 @@ class ESQueryBuilder:
557 contains_english = bool(getattr(parsed_query, "contains_english", False)) 551 contains_english = bool(getattr(parsed_query, "contains_english", False))
558 552
559 source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language 553 source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language
560 - source_in_index_languages = (  
561 - True if not normalized_index_languages else source_lang in normalized_index_languages  
562 - )  
563 -  
564 base_query_text = ( 554 base_query_text = (
565 getattr(parsed_query, "rewritten_query", None) if parsed_query else None 555 getattr(parsed_query, "rewritten_query", None) if parsed_query else None
566 ) or query_text 556 ) or query_text
@@ -574,22 +564,14 @@ class ESQueryBuilder: @@ -574,22 +564,14 @@ class ESQueryBuilder:
574 contains_chinese, 564 contains_chinese,
575 contains_english, 565 contains_english,
576 normalized_index_languages, 566 normalized_index_languages,
  567 + is_source,
577 ) 568 )
578 match_fields = self._format_match_field_specs(expanded_specs) 569 match_fields = self._format_match_field_specs(expanded_specs)
579 if not match_fields: 570 if not match_fields:
580 return 571 return
581 - clause_boost = 1.0  
582 minimum_should_match = ( 572 minimum_should_match = (
583 self.base_minimum_should_match if is_source else self.translation_minimum_should_match 573 self.base_minimum_should_match if is_source else self.translation_minimum_should_match
584 ) 574 )
585 - if is_source and not source_in_index_languages:  
586 - clause_boost = self.source_boost_when_missing  
587 - elif not is_source:  
588 - clause_boost = (  
589 - self.translation_boost  
590 - if source_in_index_languages  
591 - else self.translation_boost_when_source_missing  
592 - )  
593 575
594 clause = { 576 clause = {
595 "multi_match": { 577 "multi_match": {
@@ -600,8 +582,11 @@ class ESQueryBuilder: @@ -600,8 +582,11 @@ class ESQueryBuilder:
600 "tie_breaker": self.tie_breaker_base_query, 582 "tie_breaker": self.tie_breaker_base_query,
601 } 583 }
602 } 584 }
603 - if abs(clause_boost - 1.0) > 1e-9:  
604 - clause["multi_match"]["boost"] = clause_boost 585 + # base_query: never set multi_match.boost (ES default 1.0).
  586 + # Translation clauses: single knob from config — translation_boost.
  587 + if not is_source:
  588 + tb = float(self.translation_boost)
  589 + clause["multi_match"]["boost"] = tb
605 should_clauses.append({ 590 should_clauses.append({
606 "multi_match": clause["multi_match"] 591 "multi_match": clause["multi_match"]
607 }) 592 })
search/rerank_client.py
@@ -116,7 +116,6 @@ def _extract_named_query_score(matched_queries: Any, name: str) -> float: @@ -116,7 +116,6 @@ def _extract_named_query_score(matched_queries: Any, name: str) -> float:
116 def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]: 116 def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]:
117 source_score = _extract_named_query_score(matched_queries, "base_query") 117 source_score = _extract_named_query_score(matched_queries, "base_query")
118 translation_score = 0.0 118 translation_score = 0.0
119 - fallback_score = 0.0  
120 119
121 if isinstance(matched_queries, dict): 120 if isinstance(matched_queries, dict):
122 for query_name, score in matched_queries.items(): 121 for query_name, score in matched_queries.items():
@@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa @@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa
125 numeric_score = _to_score(score) 124 numeric_score = _to_score(score)
126 if query_name.startswith("base_query_trans_"): 125 if query_name.startswith("base_query_trans_"):
127 translation_score = max(translation_score, numeric_score) 126 translation_score = max(translation_score, numeric_score)
128 - elif query_name.startswith("fallback_original_query_"):  
129 - fallback_score = max(fallback_score, numeric_score)  
130 elif isinstance(matched_queries, list): 127 elif isinstance(matched_queries, list):
131 for query_name in matched_queries: 128 for query_name in matched_queries:
132 if not isinstance(query_name, str): 129 if not isinstance(query_name, str):
133 continue 130 continue
134 if query_name.startswith("base_query_trans_"): 131 if query_name.startswith("base_query_trans_"):
135 translation_score = 1.0 132 translation_score = 1.0
136 - elif query_name.startswith("fallback_original_query_"):  
137 - fallback_score = 1.0  
138 133
139 weighted_source = source_score 134 weighted_source = source_score
140 weighted_translation = 0.8 * translation_score 135 weighted_translation = 0.8 * translation_score
141 - weighted_fallback = 0.55 * fallback_score  
142 - weighted_components = [weighted_source, weighted_translation, weighted_fallback] 136 + weighted_components = [weighted_source, weighted_translation]
143 primary_text_score = max(weighted_components) 137 primary_text_score = max(weighted_components)
144 support_text_score = sum(weighted_components) - primary_text_score 138 support_text_score = sum(weighted_components) - primary_text_score
145 text_score = primary_text_score + 0.25 * support_text_score 139 text_score = primary_text_score + 0.25 * support_text_score
@@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa @@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa
153 return { 147 return {
154 "source_score": source_score, 148 "source_score": source_score,
155 "translation_score": translation_score, 149 "translation_score": translation_score,
156 - "fallback_score": fallback_score,  
157 "weighted_source_score": weighted_source, 150 "weighted_source_score": weighted_source,
158 "weighted_translation_score": weighted_translation, 151 "weighted_translation_score": weighted_translation,
159 - "weighted_fallback_score": weighted_fallback,  
160 "primary_text_score": primary_text_score, 152 "primary_text_score": primary_text_score,
161 "support_text_score": support_text_score, 153 "support_text_score": support_text_score,
162 "text_score": text_score, 154 "text_score": text_score,
@@ -219,7 +211,6 @@ def fuse_scores_and_resort( @@ -219,7 +211,6 @@ def fuse_scores_and_resort(
219 hit["_knn_score"] = knn_score 211 hit["_knn_score"] = knn_score
220 hit["_text_source_score"] = text_components["source_score"] 212 hit["_text_source_score"] = text_components["source_score"]
221 hit["_text_translation_score"] = text_components["translation_score"] 213 hit["_text_translation_score"] = text_components["translation_score"]
222 - hit["_text_fallback_score"] = text_components["fallback_score"]  
223 hit["_text_primary_score"] = text_components["primary_text_score"] 214 hit["_text_primary_score"] = text_components["primary_text_score"]
224 hit["_text_support_score"] = text_components["support_text_score"] 215 hit["_text_support_score"] = text_components["support_text_score"]
225 hit["_fused_score"] = fused 216 hit["_fused_score"] = fused
@@ -231,7 +222,6 @@ def fuse_scores_and_resort( @@ -231,7 +222,6 @@ def fuse_scores_and_resort(
231 "text_score": text_score, 222 "text_score": text_score,
232 "text_source_score": text_components["source_score"], 223 "text_source_score": text_components["source_score"],
233 "text_translation_score": text_components["translation_score"], 224 "text_translation_score": text_components["translation_score"],
234 - "text_fallback_score": text_components["fallback_score"],  
235 "text_primary_score": text_components["primary_text_score"], 225 "text_primary_score": text_components["primary_text_score"],
236 "text_support_score": text_components["support_text_score"], 226 "text_support_score": text_components["support_text_score"],
237 "knn_score": knn_score, 227 "knn_score": knn_score,
search/searcher.py
@@ -132,11 +132,6 @@ class Searcher: @@ -132,11 +132,6 @@ class Searcher:
132 base_minimum_should_match=self.config.query_config.base_minimum_should_match, 132 base_minimum_should_match=self.config.query_config.base_minimum_should_match,
133 translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, 133 translation_minimum_should_match=self.config.query_config.translation_minimum_should_match,
134 translation_boost=self.config.query_config.translation_boost, 134 translation_boost=self.config.query_config.translation_boost,
135 - translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing,  
136 - source_boost_when_missing=self.config.query_config.source_boost_when_missing,  
137 - original_query_fallback_boost_when_translation_missing=(  
138 - self.config.query_config.original_query_fallback_boost_when_translation_missing  
139 - ),  
140 tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, 135 tie_breaker_base_query=self.config.query_config.tie_breaker_base_query,
141 ) 136 )
142 137
@@ -267,13 +262,6 @@ class Searcher: @@ -267,13 +262,6 @@ class Searcher:
267 if normalized: 262 if normalized:
268 candidates.append(normalized) 263 candidates.append(normalized)
269 264
270 - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", {}) or {}  
271 - if isinstance(query_text_by_lang, dict):  
272 - for text in query_text_by_lang.values():  
273 - normalized = self._normalize_sku_match_text(text)  
274 - if normalized:  
275 - candidates.append(normalized)  
276 -  
277 translations = getattr(parsed_query, "translations", {}) or {} 265 translations = getattr(parsed_query, "translations", {}) or {}
278 if isinstance(translations, dict): 266 if isinstance(translations, dict):
279 for text in translations.values(): 267 for text in translations.values():
@@ -943,7 +931,6 @@ class Searcher: @@ -943,7 +931,6 @@ class Searcher:
943 debug_entry["text_score"] = rerank_debug.get("text_score") 931 debug_entry["text_score"] = rerank_debug.get("text_score")
944 debug_entry["text_source_score"] = rerank_debug.get("text_source_score") 932 debug_entry["text_source_score"] = rerank_debug.get("text_source_score")
945 debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score") 933 debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score")
946 - debug_entry["text_fallback_score"] = rerank_debug.get("text_fallback_score")  
947 debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score") 934 debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score")
948 debug_entry["text_support_score"] = rerank_debug.get("text_support_score") 935 debug_entry["text_support_score"] = rerank_debug.get("text_support_score")
949 debug_entry["knn_score"] = rerank_debug.get("knn_score") 936 debug_entry["knn_score"] = rerank_debug.get("knn_score")
tests/test_es_query_builder_text_recall_languages.py 0 → 100644
@@ -0,0 +1,519 @@ @@ -0,0 +1,519 @@
  1 +"""
  2 +ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*.
  3 +
  4 +Covers combinations of query language vs tenant index_languages, translations,
  5 +and mixed Chinese/English queries. Asserts multi_match _name, query text, and
  6 +target language fields (title.{lang}).
  7 +"""
  8 +
  9 +from types import SimpleNamespace
  10 +from typing import Any, Dict, List
  11 +
  12 +import numpy as np
  13 +
  14 +from search.es_query_builder import ESQueryBuilder
  15 +
  16 +
  17 +def _builder_multilingual_title_only(
  18 + *,
  19 + default_language: str = "en",
  20 + mixed_script_scale: float = 0.6,
  21 +) -> ESQueryBuilder:
  22 + """Minimal builder: only title.{lang} for easy field assertions."""
  23 + return ESQueryBuilder(
  24 + match_fields=["title.en^1.0"],
  25 + multilingual_fields=["title"],
  26 + shared_fields=[],
  27 + text_embedding_field="title_embedding",
  28 + default_language=default_language,
  29 + mixed_script_merged_field_boost_scale=mixed_script_scale,
  30 + function_score_config=None,
  31 + )
  32 +
  33 +
  34 +def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]:
  35 + """Navigate bool.must / function_score wrappers to the text recall root."""
  36 + q = es_body.get("query") or {}
  37 + if "bool" in q and "must" in q["bool"] and q["bool"]["must"]:
  38 + q = q["bool"]["must"][0]
  39 + if "function_score" in q:
  40 + q = q["function_score"]["query"]
  41 + return q
  42 +
  43 +
  44 +def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]:
  45 + inner = _unwrap_inner_query(es_body)
  46 + if "multi_match" in inner:
  47 + return [inner["multi_match"]]
  48 + should = (inner.get("bool") or {}).get("should") or []
  49 + return [c["multi_match"] for c in should if "multi_match" in c]
  50 +
  51 +
  52 +def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
  53 + """Map _name -> multi_match dict."""
  54 + out: Dict[str, Dict[str, Any]] = {}
  55 + for mm in _extract_multi_match_clauses(es_body):
  56 + name = mm.get("_name")
  57 + if name:
  58 + out[str(name)] = mm
  59 + return out
  60 +
  61 +
  62 +def _title_fields(mm: Dict[str, Any]) -> List[str]:
  63 + fields = mm.get("fields") or []
  64 + return [f for f in fields if str(f).startswith("title.")]
  65 +
  66 +
  67 +def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool:
  68 + """True if any field is title.{lang} with optional ^boost suffix."""
  69 + prefix = f"title.{lang}"
  70 + for f in mm.get("fields") or []:
  71 + s = str(f)
  72 + if s == prefix or s.startswith(prefix + "^"):
  73 + return True
  74 + return False
  75 +
  76 +
  77 +def _build(
  78 + qb: ESQueryBuilder,
  79 + *,
  80 + query_text: str,
  81 + rewritten: str,
  82 + detected_language: str,
  83 + translations: Dict[str, str],
  84 + index_languages: List[str],
  85 + contains_chinese: bool = False,
  86 + contains_english: bool = False,
  87 +) -> Dict[str, Any]:
  88 + parsed = SimpleNamespace(
  89 + rewritten_query=rewritten,
  90 + detected_language=detected_language,
  91 + translations=dict(translations),
  92 + contains_chinese=contains_chinese,
  93 + contains_english=contains_english,
  94 + )
  95 + return qb.build_query(
  96 + query_text=query_text,
  97 + parsed_query=parsed,
  98 + enable_knn=False,
  99 + index_languages=index_languages,
  100 + )
  101 +
  102 +
  103 +# --- 检测语言在 index_languages 内:主召回 + 翻译补召回 ---
  104 +
  105 +
  106 +def test_zh_query_index_zh_en_includes_base_zh_and_trans_en():
  107 + qb = _builder_multilingual_title_only(default_language="en")
  108 + q = _build(
  109 + qb,
  110 + query_text="连衣裙",
  111 + rewritten="连衣裙",
  112 + detected_language="zh",
  113 + translations={"en": "dress"},
  114 + index_languages=["zh", "en"],
  115 + )
  116 + idx = _clauses_index(q)
  117 + assert set(idx) == {"base_query", "base_query_trans_en"}
  118 + assert idx["base_query"]["query"] == "连衣裙"
  119 + assert "title.zh" in _title_fields(idx["base_query"])
  120 + assert idx["base_query_trans_en"]["query"] == "dress"
  121 + assert "title.en" in _title_fields(idx["base_query_trans_en"])
  122 +
  123 +
  124 +def test_en_query_index_zh_en_includes_base_en_and_trans_zh():
  125 + qb = _builder_multilingual_title_only(default_language="en")
  126 + q = _build(
  127 + qb,
  128 + query_text="dress",
  129 + rewritten="dress",
  130 + detected_language="en",
  131 + translations={"zh": "连衣裙"},
  132 + index_languages=["en", "zh"],
  133 + )
  134 + idx = _clauses_index(q)
  135 + assert set(idx) == {"base_query", "base_query_trans_zh"}
  136 + assert idx["base_query"]["query"] == "dress"
  137 + assert "title.en" in _title_fields(idx["base_query"])
  138 + assert idx["base_query_trans_zh"]["query"] == "连衣裙"
  139 + assert "title.zh" in _title_fields(idx["base_query_trans_zh"])
  140 +
  141 +
  142 +def test_de_query_index_de_en_fr_includes_base_and_two_translations():
  143 + qb = _builder_multilingual_title_only(default_language="en")
  144 + q = _build(
  145 + qb,
  146 + query_text="kleid",
  147 + rewritten="kleid",
  148 + detected_language="de",
  149 + translations={"en": "dress", "fr": "robe"},
  150 + index_languages=["de", "en", "fr"],
  151 + )
  152 + idx = _clauses_index(q)
  153 + assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"}
  154 + assert idx["base_query"]["query"] == "kleid"
  155 + assert "title.de" in _title_fields(idx["base_query"])
  156 + assert idx["base_query_trans_en"]["query"] == "dress"
  157 + assert idx["base_query_trans_fr"]["query"] == "robe"
  158 +
  159 +
  160 +# --- 检测语言不在 index_languages:仍生成 base_query(不设 boost)+ 翻译子句(boost = translation_boost) ---
  161 +
  162 +
  163 +def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
  164 + qb = _builder_multilingual_title_only(default_language="en")
  165 + q = _build(
  166 + qb,
  167 + query_text="schuh",
  168 + rewritten="schuh",
  169 + detected_language="de",
  170 + translations={"en": "shoe", "zh": "鞋"},
  171 + index_languages=["en", "zh"],
  172 + )
  173 + idx = _clauses_index(q)
  174 + assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"}
  175 + assert idx["base_query"]["query"] == "schuh"
  176 + assert "title.de" in _title_fields(idx["base_query"])
  177 + assert "boost" not in idx["base_query"]
  178 + assert idx["base_query_trans_en"]["query"] == "shoe"
  179 + assert idx["base_query_trans_en"]["boost"] == qb.translation_boost
  180 + assert idx["base_query_trans_zh"]["query"] == "鞋"
  181 + assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost
  182 +
  183 +
  184 +# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 ---
  185 +
  186 +
  187 +def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
  188 + qb = _builder_multilingual_title_only(default_language="en")
  189 + q = _build(
  190 + qb,
  191 + query_text="红色 dress",
  192 + rewritten="红色 dress",
  193 + detected_language="zh",
  194 + translations={"en": "red dress"},
  195 + index_languages=["zh", "en"],
  196 + contains_chinese=True,
  197 + contains_english=True,
  198 + )
  199 + idx = _clauses_index(q)
  200 + assert set(idx) == {"base_query", "base_query_trans_en"}
  201 + assert idx["base_query"]["query"] == "红色 dress"
  202 + assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en")
  203 + assert idx["base_query_trans_en"]["query"] == "red dress"
  204 + assert _has_title_lang(idx["base_query_trans_en"], "en")
  205 +
  206 +
  207 +def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
  208 + qb = _builder_multilingual_title_only(default_language="en")
  209 + q = _build(
  210 + qb,
  211 + query_text="nike 运动鞋",
  212 + rewritten="nike 运动鞋",
  213 + detected_language="en",
  214 + translations={"zh": "耐克运动鞋"},
  215 + index_languages=["zh", "en"],
  216 + contains_chinese=True,
  217 + contains_english=True,
  218 + )
  219 + idx = _clauses_index(q)
  220 + assert set(idx) == {"base_query", "base_query_trans_zh"}
  221 + assert idx["base_query"]["query"] == "nike 运动鞋"
  222 + assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh")
  223 + assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋"
  224 +
  225 +
  226 +def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
  227 + qb = _builder_multilingual_title_only(default_language="en")
  228 + q = _build(
  229 + qb,
  230 + query_text="法式 dress",
  231 + rewritten="法式 dress",
  232 + detected_language="zh",
  233 + translations={},
  234 + index_languages=["zh"],
  235 + contains_chinese=True,
  236 + contains_english=True,
  237 + )
  238 + idx = _clauses_index(q)
  239 + assert set(idx) == {"base_query"}
  240 + bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])}
  241 + assert bases == {"title.zh"}
  242 +
  243 +
  244 +# --- 去重:与 base 同语言同文本的翻译项跳过 ---
  245 +
  246 +
  247 +def test_skips_translation_when_same_lang_and_same_text_as_base():
  248 + qb = _builder_multilingual_title_only(default_language="en")
  249 + q = _build(
  250 + qb,
  251 + query_text="NIKE",
  252 + rewritten="NIKE",
  253 + detected_language="en",
  254 + translations={"en": "NIKE", "zh": "耐克"},
  255 + index_languages=["en", "zh"],
  256 + )
  257 + idx = _clauses_index(q)
  258 + assert set(idx) == {"base_query", "base_query_trans_zh"}
  259 +
  260 +
  261 +def test_keeps_translation_when_same_text_but_different_lang_than_base():
  262 + qb = _builder_multilingual_title_only(default_language="en")
  263 + q = _build(
  264 + qb,
  265 + query_text="NIKE",
  266 + rewritten="NIKE",
  267 + detected_language="en",
  268 + translations={"zh": "NIKE"},
  269 + index_languages=["en", "zh"],
  270 + )
  271 + idx = _clauses_index(q)
  272 + assert set(idx) == {"base_query", "base_query_trans_zh"}
  273 + assert idx["base_query_trans_zh"]["query"] == "NIKE"
  274 +
  275 +
  276 +# --- 翻译 key 规范化、空翻译跳过 ---
  277 +
  278 +
  279 +def test_translation_language_key_is_normalized_case_insensitive():
  280 + qb = _builder_multilingual_title_only(default_language="en")
  281 + q = _build(
  282 + qb,
  283 + query_text="dress",
  284 + rewritten="dress",
  285 + detected_language="en",
  286 + translations={"ZH": "连衣裙"},
  287 + index_languages=["en", "zh"],
  288 + )
  289 + idx = _clauses_index(q)
  290 + assert "base_query_trans_zh" in idx
  291 + assert idx["base_query_trans_zh"]["query"] == "连衣裙"
  292 +
  293 +
  294 +def test_empty_translation_value_is_skipped():
  295 + qb = _builder_multilingual_title_only(default_language="en")
  296 + q = _build(
  297 + qb,
  298 + query_text="dress",
  299 + rewritten="dress",
  300 + detected_language="en",
  301 + translations={"zh": " ", "fr": "robe"},
  302 + index_languages=["en", "zh", "fr"],
  303 + )
  304 + idx = _clauses_index(q)
  305 + assert "base_query_trans_zh" not in idx
  306 + assert "base_query_trans_fr" in idx
  307 +
  308 +
  309 +# --- index_languages 为空:boost 规则不变(base_query 不设 boost,翻译子句使用 translation_boost) ---
  310 +
  311 +
  312 +def test_empty_index_languages_treats_source_as_in_index_boosts():
  313 + qb = _builder_multilingual_title_only(default_language="en")
  314 + q = _build(
  315 + qb,
  316 + query_text="x",
  317 + rewritten="x",
  318 + detected_language="de",
  319 + translations={"en": "y"},
  320 + index_languages=[],
  321 + )
  322 + idx = _clauses_index(q)
  323 + assert "boost" not in idx["base_query"]
  324 + assert idx["base_query_trans_en"]["boost"] == qb.translation_boost
  325 +
  326 +
  327 +# --- 无翻译:仅 base_query ---
  328 +
  329 +
  330 +def test_no_translations_only_base_query():
  331 + qb = _builder_multilingual_title_only(default_language="en")
  332 + q = _build(
  333 + qb,
  334 + query_text="hello",
  335 + rewritten="hello",
  336 + detected_language="en",
  337 + translations={},
  338 + index_languages=["en", "zh"],
  339 + )
  340 + idx = _clauses_index(q)
  341 + assert set(idx) == {"base_query"}
  342 +
  343 +
  344 +# --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) ---
  345 +
  346 +
  347 +def test_text_clauses_present_alongside_knn():
  348 + qb = _builder_multilingual_title_only(default_language="en")
  349 + parsed = SimpleNamespace(
  350 + rewritten_query="dress",
  351 + detected_language="en",
  352 + translations={"zh": "连衣裙"},
  353 + contains_chinese=False,
  354 + contains_english=True,
  355 + )
  356 + q = qb.build_query(
  357 + query_text="dress",
  358 + query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32),
  359 + parsed_query=parsed,
  360 + enable_knn=True,
  361 + index_languages=["en", "zh"],
  362 + )
  363 + assert "knn" in q
  364 + idx = _clauses_index(q)
  365 + assert set(idx) == {"base_query", "base_query_trans_zh"}
  366 +
  367 +
  368 +def test_detected_language_unknown_falls_back_to_default_language():
  369 + """与 LanguageDetector 失败时 QueryConfig.default_language 行为对齐。"""
  370 + qb = _builder_multilingual_title_only(default_language="en")
  371 + parsed = SimpleNamespace(
  372 + rewritten_query="shirt",
  373 + detected_language="unknown",
  374 + translations={"zh": "衬衫"},
  375 + contains_chinese=False,
  376 + contains_english=True,
  377 + )
  378 + q = qb.build_query(
  379 + query_text="shirt",
  380 + parsed_query=parsed,
  381 + enable_knn=False,
  382 + index_languages=["en", "zh"],
  383 + )
  384 + idx = _clauses_index(q)
  385 + assert set(idx) == {"base_query", "base_query_trans_zh"}
  386 + assert idx["base_query"]["query"] == "shirt"
  387 + assert _has_title_lang(idx["base_query"], "en")
  388 +
  389 +
  390 +def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
  391 + qb = _builder_multilingual_title_only(default_language="en")
  392 + q = _build(
  393 + qb,
  394 + query_text="платье",
  395 + rewritten="платье",
  396 + detected_language="ru",
  397 + translations={"en": "dress"},
  398 + index_languages=["ru", "en"],
  399 + )
  400 + idx = _clauses_index(q)
  401 + assert set(idx) == {"base_query", "base_query_trans_en"}
  402 + assert idx["base_query"]["query"] == "платье"
  403 + assert _has_title_lang(idx["base_query"], "ru")
  404 + assert idx["base_query_trans_en"]["query"] == "dress"
  405 +
  406 +
  407 +def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():
  408 + """
  409 + 当前实现:凡是 translations 里非空的条目都会生成子句;
  410 + index_languages 只约束混写扩列,不用于过滤翻译子句。
  411 + """
  412 + qb = _builder_multilingual_title_only(default_language="en")
  413 + q = _build(
  414 + qb,
  415 + query_text="dress",
  416 + rewritten="dress",
  417 + detected_language="en",
  418 + translations={"zh": "连衣裙", "de": "Kleid"},
  419 + index_languages=["en", "zh"],
  420 + )
  421 + idx = _clauses_index(q)
  422 + assert "base_query_trans_de" in idx
  423 + assert idx["base_query_trans_de"]["query"] == "Kleid"
  424 + assert _has_title_lang(idx["base_query_trans_de"], "de")
  425 +
  426 +
  427 +def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base():
  428 + """base_query 始终用 rewritten_query,而非仅 query_text。"""
  429 + qb = _builder_multilingual_title_only(default_language="en")
  430 + q = _build(
  431 + qb,
  432 + query_text=" 红色 ",
  433 + rewritten="红色连衣裙",
  434 + detected_language="zh",
  435 + translations={"en": "red dress"},
  436 + index_languages=["zh", "en"],
  437 + contains_chinese=True,
  438 + contains_english=False,
  439 + )
  440 + idx = _clauses_index(q)
  441 + assert idx["base_query"]["query"] == "红色连衣裙"
  442 + assert idx["base_query_trans_en"]["query"] == "red dress"
  443 +
  444 +
  445 +def test_detected_language_unknown_falls_back_to_default_language():
  446 + """与 LanguageDetector 失败时 QueryConfig.default_language 行为对齐。"""
  447 + qb = _builder_multilingual_title_only(default_language="en")
  448 + parsed = SimpleNamespace(
  449 + rewritten_query="shirt",
  450 + detected_language="unknown",
  451 + translations={"zh": "衬衫"},
  452 + contains_chinese=False,
  453 + contains_english=True,
  454 + )
  455 + q = qb.build_query(
  456 + query_text="shirt",
  457 + parsed_query=parsed,
  458 + enable_knn=False,
  459 + index_languages=["en", "zh"],
  460 + )
  461 + idx = _clauses_index(q)
  462 + assert set(idx) == {"base_query", "base_query_trans_zh"}
  463 + assert idx["base_query"]["query"] == "shirt"
  464 + assert _has_title_lang(idx["base_query"], "en")
  465 +
  466 +
  467 +def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
  468 + qb = _builder_multilingual_title_only(default_language="en")
  469 + q = _build(
  470 + qb,
  471 + query_text="платье",
  472 + rewritten="платье",
  473 + detected_language="ru",
  474 + translations={"en": "dress"},
  475 + index_languages=["ru", "en"],
  476 + )
  477 + idx = _clauses_index(q)
  478 + assert set(idx) == {"base_query", "base_query_trans_en"}
  479 + assert idx["base_query"]["query"] == "платье"
  480 + assert _has_title_lang(idx["base_query"], "ru")
  481 + assert idx["base_query_trans_en"]["query"] == "dress"
  482 +
  483 +
  484 +def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():
  485 + """
  486 + 当前实现:凡是 translations 里非空的条目都会生成子句;
  487 + index_languages 只约束混写扩列,不用于过滤翻译子句。
  488 + """
  489 + qb = _builder_multilingual_title_only(default_language="en")
  490 + q = _build(
  491 + qb,
  492 + query_text="dress",
  493 + rewritten="dress",
  494 + detected_language="en",
  495 + translations={"zh": "连衣裙", "de": "Kleid"},
  496 + index_languages=["en", "zh"],
  497 + )
  498 + idx = _clauses_index(q)
  499 + assert "base_query_trans_de" in idx
  500 + assert idx["base_query_trans_de"]["query"] == "Kleid"
  501 + assert _has_title_lang(idx["base_query_trans_de"], "de")
  502 +
  503 +
  504 +def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base():
  505 + """base_query 始终用 rewritten_query,而非仅 query_text。"""
  506 + qb = _builder_multilingual_title_only(default_language="en")
  507 + q = _build(
  508 + qb,
  509 + query_text=" 红色 ",
  510 + rewritten="红色连衣裙",
  511 + detected_language="zh",
  512 + translations={"en": "red dress"},
  513 + index_languages=["zh", "en"],
  514 + contains_chinese=True,
  515 + contains_english=False,
  516 + )
  517 + idx = _clauses_index(q)
  518 + assert idx["base_query"]["query"] == "红色连衣裙"
  519 + assert idx["base_query_trans_en"]["query"] == "red dress"
tests/test_rerank_client.py
@@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim @@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
11 "matched_queries": { 11 "matched_queries": {
12 "base_query": 2.4, 12 "base_query": 2.4,
13 "base_query_trans_zh": 1.8, 13 "base_query_trans_zh": 1.8,
14 - "fallback_original_query_zh": 1.2,  
15 "knn_query": 0.8, 14 "knn_query": 0.8,
16 }, 15 },
17 }, 16 },
@@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim @@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
27 26
28 debug = fuse_scores_and_resort(hits, [0.9, 0.7]) 27 debug = fuse_scores_and_resort(hits, [0.9, 0.7])
29 28
30 - expected_text_1 = 2.4 + 0.25 * ((0.8 * 1.8) + (0.55 * 1.2)) 29 + expected_text_1 = 2.4 + 0.25 * (0.8 * 1.8)
31 expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2) 30 expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2)
32 expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2) 31 expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2)
33 32
@@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim @@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
38 assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9) 37 assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9)
39 assert debug[0]["text_source_score"] == 2.4 38 assert debug[0]["text_source_score"] == 2.4
40 assert debug[0]["text_translation_score"] == 1.8 39 assert debug[0]["text_translation_score"] == 1.8
41 - assert debug[0]["text_fallback_score"] == 1.2  
42 assert debug[0]["knn_score"] == 0.8 40 assert debug[0]["knn_score"] == 0.8
43 assert [hit["_id"] for hit in hits] == ["2", "1"] 41 assert [hit["_id"] for hit in hits] == ["2", "1"]
44 42