Commit 0536222c6d7fcf1bb9339299b67409c918bae320

Authored by tangwang
1 parent ef5baa86

query parser优化

config/config.yaml
... ... @@ -20,7 +20,7 @@ es_settings:
20 20 refresh_interval: "30s"
21 21  
22 22 # 字段权重配置(用于搜索时的字段boost)
23   -# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。
  23 +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。
24 24 # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
25 25 field_boosts:
26 26 title: 3.0
... ... @@ -74,14 +74,11 @@ query_config:
74 74 - "vendor"
75 75 - "category_name_text"
76 76  
77   - # 统一文本召回策略(主查询 + 翻译查询 + 原始查询兜底
  77 + # 统一文本召回策略(主查询 + 翻译查询)
78 78 text_query_strategy:
79 79 base_minimum_should_match: "75%"
80 80 translation_minimum_should_match: "75%"
81 81 translation_boost: 0.4
82   - translation_boost_when_source_missing: 1.0
83   - source_boost_when_missing: 0.6
84   - original_query_fallback_boost_when_translation_missing: 0.2
85 82 tie_breaker_base_query: 0.9
86 83  
87 84 # Embedding字段名称
... ...
config/loader.py
... ... @@ -284,13 +284,6 @@ class AppConfigLoader:
284 284 base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")),
285 285 translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")),
286 286 translation_boost=float(text_strategy.get("translation_boost", 0.4)),
287   - translation_boost_when_source_missing=float(
288   - text_strategy.get("translation_boost_when_source_missing", 1.0)
289   - ),
290   - source_boost_when_missing=float(text_strategy.get("source_boost_when_missing", 0.6)),
291   - original_query_fallback_boost_when_translation_missing=float(
292   - text_strategy.get("original_query_fallback_boost_when_translation_missing", 0.2)
293   - ),
294 287 tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)),
295 288 zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"),
296 289 en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"),
... ...
config/schema.py
... ... @@ -54,9 +54,6 @@ class QueryConfig:
54 54 base_minimum_should_match: str = "70%"
55 55 translation_minimum_should_match: str = "70%"
56 56 translation_boost: float = 0.4
57   - translation_boost_when_source_missing: float = 1.0
58   - source_boost_when_missing: float = 0.6
59   - original_query_fallback_boost_when_translation_missing: float = 0.2
60 57 tie_breaker_base_query: float = 0.9
61 58 zh_to_en_model: str = "opus-mt-zh-en"
62 59 en_to_zh_model: str = "opus-mt-en-zh"
... ...
docs/DEVELOPER_GUIDE.md
... ... @@ -147,7 +147,7 @@ docs/ # 文档(含本指南)
147 147  
148 148 ### 4.4 query
149 149  
150   -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划)
  150 +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出解析事实(如 `rewritten_query`、`detected_language`、`translations`、`query_vector`),不再承担 ES 语言计划拼装
151 151 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。
152 152  
153 153 ### 4.5 search
... ...
docs/QUICKSTART.md
... ... @@ -558,6 +558,21 @@ lsof -i :6004
558 558  
559 559 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。
560 560  
  561 +### 5.4 HanLP 与 `transformers` 版本(`BertTokenizer.encode_plus`)
  562 +
  563 +若日志出现 **`AttributeError: BertTokenizer has no attribute encode_plus`**,通常是 **同一 venv 里装了 `transformers` 5.x**,与 **HanLP 2.1.x** 不兼容(HanLP 仍调用已移除的 `encode_plus`)。
  564 +
  565 +**处理:** 将 `transformers` 固定到 **4.x**(例如 4.44+),然后重装/校验 HanLP:
  566 +
  567 +```bash
  568 +source activate.sh
  569 +pip install -r requirements_hanlp.txt
  570 +python -c "from transformers import BertTokenizer; import transformers as t; print(t.__version__, hasattr(BertTokenizer, 'encode_plus'))"
  571 +# 期望:4.x 且 True
  572 +```
  573 +
  574 +**说明:** 重排/TEI 等若使用 **独立 venv**(如 `.venv-reranker`),可与主 venv 的 `transformers` 版本分离;主 venv 只要装了 HanLP 做查询分词,就不要把 `transformers` 升到 5。
  575 +
561 576 ---
562 577  
563 578 ## 6. 相关文档
... ...
docs/TODO.txt
... ... @@ -32,7 +32,7 @@
32 32 },
33 33 去掉 image_embedding_512
34 34 image_embedding改为,一个spu有多个sku向量,每个向量内部properties:
35   -除了vector url还应该包括
  35 +除了 vector、url,还应该包括该图片对应哪些 sku
36 36 "image_embedding": {
37 37 "type": "nested",
38 38 "properties": {
... ... @@ -117,6 +117,11 @@ requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127
117 117  
118 118  
119 119  
  120 +是否需要:
  121 +当「源语言不在 index_languages」且「某些目标语言的翻译缺失」时,ES 里会额外加一层 用「原始 query 字符串」去撞缺失语种字段
  122 +
  123 +
  124 +
120 125 先阅读文本embedding相关的代码:
121 126 @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py
122 127 目前有TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入限制,超限返回过载状态码。
... ...
docs/搜索API对接指南-01-搜索接口.md
... ... @@ -553,9 +553,6 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
553 553 | `rewritten_query` | string | 重写后的查询 |
554 554 | `detected_language` | string | 检测到的语言 |
555 555 | `translations` | object | 翻译结果 |
556   -| `query_text_by_lang` | object | 实际参与检索的多语言 query 文本 |
557   -| `search_langs` | array[string] | 实际参与检索的语言列表 |
558   -| `supplemental_search_langs` | array[string] | 因 mixed query 补入的附加语言列表 |
559 556 | `has_vector` | boolean | 是否生成了向量 |
560 557  
561 558 `debug_info.per_result[]` 常见字段:
... ... @@ -565,10 +562,9 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
565 562 | `spu_id` | string | 结果 SPU ID |
566 563 | `es_score` | float | ES 原始 `_score` |
567 564 | `rerank_score` | float | 重排分数 |
568   -| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` / `fallback_original_query_*` 聚合而来) |
  565 +| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` 聚合而来) |
569 566 | `text_source_score` | float | `base_query` 分数 |
570 567 | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 |
571   -| `text_fallback_score` | float | `fallback_original_query_*` 里的最大分数 |
572 568 | `text_primary_score` | float | 文本大分中的主证据部分 |
573 569 | `text_support_score` | float | 文本大分中的辅助证据部分 |
574 570 | `knn_score` | float | `knn_query` 分数 |
... ...
docs/相关性检索优化说明.md
... ... @@ -2,11 +2,11 @@
2 2  
3 3 ## 1. 文档目标
4 4  
5   -本文描述当前线上代码的文本检索策略,重点覆盖:
  5 +本文描述当前代码中的文本检索策略,重点覆盖:
6 6  
7 7 - 多语言检索路由(`detector` / `translator` / `indexed` 的关系)
8 8 - 统一文本召回表达式(无布尔 AST 分支)
9   -- 翻译缺失时的兜底策略
  9 +- 解析层与检索表达式层的职责边界
10 10 - 重排融合打分与调试字段
11 11 - 典型场景下实际生成的 ES 查询结构
12 12  
... ... @@ -17,9 +17,11 @@
17 17 查询链路(文本相关):
18 18  
19 19 1. `QueryParser.parse()`
20   - 输出 `detected_language`、`query_text_by_lang`、`search_langs`、`index_languages`、`source_in_index_languages`;另输出 `contains_chinese` / `contains_english`(仅服务混写辅助召回,见 §4 末)。
  20 + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。
  21 +2. `Searcher.search()`
  22 + 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。
21 23 2. `ESQueryBuilder._build_advanced_text_query()`
22   - 按 `search_langs` 动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`);若命中混写辅助条件,在同一子句内并入另一语种列(§4 末)。
  24 + 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。
23 25 3. `build_query()`
24 26 统一走文本策略,不再有布尔 AST 枝路。
25 27  
... ... @@ -37,18 +39,18 @@
37 39 源语言字段做主召回;其他语言走翻译补召回(低权重)。
38 40 2. 若 `detected_language not in index_languages`:
39 41 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。
40   -3. 若第 2 步翻译部分失败或全部失败:
41   - 对缺失翻译的 `index_languages` 字段,追加“原文低权重兜底”子句,避免完全丢失这些语种索引面的召回机会。
  42 +3. 若翻译部分失败或全部失败:
  43 + 当前实现不会再额外生成“原文打到其他语种字段”的兜底子句;系统保留 `base_query` 并继续执行,可观测性由 `translations` / warning / 命名子句分数提供。
42 44  
43 45 ### 3.2 翻译与向量:并发提交与共享超时
44 46  
45   -`QueryParser.parse()` 内(Stage 4–6)对**离线调用**采用线程池提交 + **一次** `concurrent.futures.wait`:
  47 +`QueryParser.parse()` 内对翻译与向量采用线程池提交 + **一次** `concurrent.futures.wait`:
46 48  
47   -- **翻译**:对 `index_languages` 中除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。
48   -- **查询向量**(若开启 `enable_text_embedding` 且域为 default):再提交一个 `text_encoder.encode` 任务。
  49 +- **翻译**:对调用方传入的 `target_languages` 中、除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。
  50 +- **查询向量**:若开启 `enable_text_embedding`,再提交一个 `text_encoder.encode` 任务。
49 51 - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。
50 52  
51   -**等待预算(毫秒)**由 `detected_language` 是否属于租户 `index_languages` 决定(`query_config`):
  53 +**等待预算(毫秒)**由 `detected_language` 是否属于调用方传入的 `target_languages` 决定(`query_config`):
52 54  
53 55 - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。
54 56 - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。
... ... @@ -62,7 +64,7 @@
62 64 ```json
63 65 {
64 66 "multi_match": {
65   - "_name": "base_query|base_query_trans_xx|fallback_original_query_xx",
  67 + "_name": "base_query|base_query_trans_xx",
66 68 "query": "<text>",
67 69 "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."],
68 70 "minimum_should_match": "75%",
... ... @@ -75,7 +77,7 @@
75 77 最终按 `bool.should` 组合,`minimum_should_match: 1`。
76 78  
77 79 > **附 — 混写辅助召回**
78   -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.8,`ESQueryBuilder` 构造参数)**。`fallback_original_query_*` 同样适用。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。
  80 +> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。
79 81  
80 82 ## 5. 关键配置项(文本策略)
81 83  
... ... @@ -88,20 +90,12 @@
88 90  
89 91 - `base_minimum_should_match`
90 92 - `translation_minimum_should_match`
91   -- `translation_boost`
92   -- `translation_boost_when_source_missing`
93   -- `source_boost_when_missing`
94   -- `original_query_fallback_boost_when_translation_missing`(新增)
  93 +- `translation_boost`(所有 `base_query_trans_*` 共用)
95 94 - `tie_breaker_base_query`
96 95  
97   -新增项说明:
98   -
99   -- `original_query_fallback_boost_when_translation_missing`:
100   - 当源语种不在索引语言且翻译缺失时,原文打到缺失目标语字段的低权重系数,默认 `0.2`。
101   -
102 96 说明:
103 97  
104   -- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*`、`fallback_original_query_*` 三类子句组成。
  98 +- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*` 两类子句组成。
105 99  
106 100 ## 6. 典型场景与实际 DSL
107 101  
... ... @@ -111,11 +105,12 @@
111 105  
112 106 - `detected_language=de`
113 107 - `index_languages=[de,en]`
114   -- `query_text_by_lang={de:"herren schuhe", en:"men shoes"}`
  108 +- `rewritten_query="herren schuhe"`
  109 +- `translations={en:"men shoes"}`
115 110  
116 111 策略结果:
117 112  
118   -- `base_query`:德语字段,正常权重
  113 +- `base_query`:德语字段,**不写** `multi_match.boost`
119 114 - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4)
120 115  
121 116 ### 场景 B:源语种不在索引语言中,部分翻译缺失
... ... @@ -126,38 +121,44 @@
126 121  
127 122 策略结果:
128 123  
129   -- `base_query`(德语字段):`boost=source_boost_when_missing`(默认 0.6)
130   -- `base_query_trans_en`(英文字段):`boost=translation_boost_when_source_missing`(默认 1.0)
131   -- `fallback_original_query_zh`(中文字段):原文低权重兜底(默认 0.2)
  124 +- `base_query`(德语字段):**不写** `multi_match.boost`(默认 1.0)
  125 +- `base_query_trans_en`(英文字段):`boost=translation_boost`(如 0.4)
  126 +- 不会生成额外中文兜底子句
132 127  
133 128 ### 场景 C:源语种不在索引语言中,翻译全部失败
134 129  
135 130 - `detected_language=de`
136 131 - `index_languages=[en,zh]`
137   -- `query_text_by_lang` 仅有 `de`
  132 +- `translations={}`
138 133  
139 134 策略结果:
140 135  
141   -- `base_query`(德语字段,低权重)
142   -- `fallback_original_query_en`(英文字段原文兜底)
143   -- `fallback_original_query_zh`(中文字段原文兜底)
  136 +- `base_query`(德语字段,**无** `boost` 字段)
  137 +- 不会生成 `base_query_trans_*`
144 138  
145   -这能避免“只有源语种字段查询,且该语种字段在商家索引中稀疏/为空”导致的弱召回问题
  139 +这意味着当前实现优先保证职责清晰与可解释性,而不是继续在 Builder 内部隐式制造“跨语种原文兜底”
146 140  
147   -## 7. QueryParser 与 ESBuilder 的职责分工
  141 +## 7. QueryParser 与 Searcher / ESBuilder 的职责分工
148 142  
149   -- `QueryParser` 负责“语言计划”与“可用文本”:
150   - - `search_langs`
151   - - `query_text_by_lang`
152   - - `source_in_index_languages`
153   - - `index_languages`
  143 +- `QueryParser` 负责“解析事实”:
  144 + - `query_normalized`
  145 + - `rewritten_query`
  146 + - `detected_language`
  147 + - `translations`
  148 + - `query_vector`
  149 + - `query_tokens`
154 150 - `contains_chinese` / `contains_english`
  151 +- `Searcher` 负责“租户语境”:
  152 + - `index_languages`
  153 + - 将其传给 parser 作为 `target_languages`
  154 + - 将其传给 builder 作为字段展开约束
155 155 - `ESQueryBuilder` 负责“表达式展开”:
156 156 - 动态字段组装
157 157 - 子句权重分配
158   - - 翻译缺失兜底子句拼接
  158 + - `base_query` / `base_query_trans_*` 子句拼接
  159 + - 跳过“与 base_query 文本和语言完全相同”的重复翻译子句
159 160  
160   -这种分层让策略调优主要落在配置和 Builder,不破坏 Parser 的职责边界
  161 +这种分层让 parser 不再返回 ES 专用的“语言计划字段”,职责边界更清晰
161 162  
162 163 ## 8. 融合打分(Rerank + Text + KNN)
163 164  
... ... @@ -165,24 +166,21 @@
165 166  
166 167 ### 8.1 文本相关性大分
167 168  
168   -文本大分由部分组成:
  169 +文本大分由两部分组成:
169 170  
170 171 - `base_query`
171 172 - `base_query_trans_*`
172   -- `fallback_original_query_*`
173 173  
174 174 聚合方式:
175 175  
176 176 1. `source_score = base_query`
177 177 2. `translation_score = max(base_query_trans_*)`
178   -3. `fallback_score = max(fallback_original_query_*)`
179   -4. 加权:
  178 +3. 加权:
180 179 - `weighted_source = source_score`
181 180 - `weighted_translation = 0.8 * translation_score`
182   - - `weighted_fallback = 0.55 * fallback_score`
183   -5. 合成:
184   - - `primary = max(weighted_source, weighted_translation, weighted_fallback)`
185   - - `support = weighted_source + weighted_translation + weighted_fallback - primary`
  181 +4. 合成:
  182 + - `primary = max(weighted_source, weighted_translation)`
  183 + - `support = weighted_source + weighted_translation - primary`
186 184 - `text_score = primary + 0.25 * support`
187 185  
188 186 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。
... ... @@ -212,7 +210,6 @@ fused_score = (
212 210 - `text_score`
213 211 - `text_source_score`
214 212 - `text_translation_score`
215   -- `text_fallback_score`
216 213 - `text_primary_score`
217 214 - `text_support_score`
218 215 - `knn_score`
... ... @@ -221,9 +218,9 @@ fused_score = (
221 218  
222 219 `debug_info.query_analysis` 还会暴露:
223 220  
224   -- `query_text_by_lang`
225   -- `search_langs`
226   -- `supplemental_search_langs`
  221 +- `translations`
  222 +- `detected_language`
  223 +- `rewritten_query`
227 224  
228 225 这些字段用于检索效果评估与 bad case 归因。
229 226  
... ... @@ -231,7 +228,7 @@ fused_score = (
231 228  
232 229 1. 当前文本主链路已移除布尔 AST 分支。
233 230 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。
234   -3. `HanLP` 为可选依赖;不可用时退化到轻量分词,不影响主链路可用性
  231 +3. `HanLP` 为必需依赖;当前 parser 不再提供轻量 fallback
235 232 4. 若后续扩展到更多语种,请确保:
236 233 - mapping 中存在对应 `.<lang>` 字段
237 234 - `index_languages` 配置在支持列表内
... ... @@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py
263 260 建议在 `tests/` 增加文本策略用例:
264 261  
265 262 1. 源语种在索引语言,翻译命中缓存
266   -2. 源语种不在索引语言,翻译部分失败(验证 fallback 子句)
267   -3. 源语种不在索引语言,翻译全部失败(验证多目标 fallback)
268   -4. 自定义 `original_query_fallback_boost_when_translation_missing` 生效
269   -5. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`)
  263 +2. 源语种不在索引语言,翻译部分失败(验证仅保留 `base_query` + 成功翻译子句)
  264 +3. 源语种不在索引语言,翻译全部失败(验证无 `base_query_trans_*` 时仍可正常执行)
  265 +4. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`)
270 266  
271 267  
272 268  
... ...
requirements_hanlp.txt 0 → 100644
... ... @@ -0,0 +1,13 @@
  1 +# Required: HanLP query tokenization for the main backend venv (QueryParser).
  2 +#
  3 +# Install:
  4 +# source activate.sh
  5 +# pip install -r requirements_hanlp.txt
  6 +#
  7 +# Why pin transformers<5:
  8 +# transformers 5.x no longer exposes `encode_plus` on `BertTokenizer`, but HanLP 2.1.x
  9 +# still calls it → AttributeError during `hanlp.load(...)`.
  10 +# Use transformers 4.44+ (4.x) which remains API-compatible with HanLP.
  11 +
  12 +hanlp>=2.1.0
  13 +transformers>=4.44,<5
... ...
scripts/eval_search_quality.py
... ... @@ -83,7 +83,6 @@ class RankedItem:
83 83 text_score: float | None
84 84 text_source_score: float | None
85 85 text_translation_score: float | None
86   - text_fallback_score: float | None
87 86 text_primary_score: float | None
88 87 text_support_score: float | None
89 88 knn_score: float | None
... ... @@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -> Dict[str, Any]:
146 145 text_score=_to_float(debug_item.get("text_score")),
147 146 text_source_score=_to_float(debug_item.get("text_source_score")),
148 147 text_translation_score=_to_float(debug_item.get("text_translation_score")),
149   - text_fallback_score=_to_float(debug_item.get("text_fallback_score")),
150 148 text_primary_score=_to_float(debug_item.get("text_primary_score")),
151 149 text_support_score=_to_float(debug_item.get("text_support_score")),
152 150 knn_score=_to_float(debug_item.get("knn_score")),
... ... @@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -> str:
185 183 f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}"
186 184 )
187 185 lines.append(
188   - f"- detected_language={qa.get('detected_language')} search_langs={qa.get('search_langs')} supplemental_search_langs={qa.get('supplemental_search_langs')}"
  186 + f"- detected_language={qa.get('detected_language')} translations={qa.get('translations')}"
189 187 )
190   - lines.append(f"- query_text_by_lang={qa.get('query_text_by_lang')}")
191 188 lines.append("")
192   - lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | text_fb | knn | es | matched_queries |")
193   - lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |")
  189 + lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | knn | es | matched_queries |")
  190 + lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |")
194 191 for item in entry.get("top20", []):
195 192 title = str(item.get("title", "")).replace("|", "/")
196 193 matched = json.dumps(item.get("matched_queries"), ensure_ascii=False)
... ... @@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -&gt; str:
199 196 f"| {item.get('rank')} | {item.get('spu_id')} | {title} | "
200 197 f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | "
201 198 f"{item.get('text_source_score')} | {item.get('text_translation_score')} | "
202   - f"{item.get('text_fallback_score')} | {item.get('knn_score')} | {item.get('es_score')} | {matched} |"
  199 + f"{item.get('knn_score')} | {item.get('es_score')} | {matched} |"
203 200 )
204 201 lines.append("")
205 202 return "\n".join(lines)
... ...
search/es_query_builder.py
... ... @@ -36,9 +36,6 @@ class ESQueryBuilder:
36 36 base_minimum_should_match: str = "70%",
37 37 translation_minimum_should_match: str = "70%",
38 38 translation_boost: float = 0.4,
39   - translation_boost_when_source_missing: float = 1.0,
40   - source_boost_when_missing: float = 0.6,
41   - original_query_fallback_boost_when_translation_missing: float = 0.2,
42 39 tie_breaker_base_query: float = 0.9,
43 40 mixed_script_merged_field_boost_scale: float = 0.6,
44 41 ):
... ... @@ -74,11 +71,6 @@ class ESQueryBuilder:
74 71 self.base_minimum_should_match = base_minimum_should_match
75 72 self.translation_minimum_should_match = translation_minimum_should_match
76 73 self.translation_boost = float(translation_boost)
77   - self.translation_boost_when_source_missing = float(translation_boost_when_source_missing)
78   - self.source_boost_when_missing = float(source_boost_when_missing)
79   - self.original_query_fallback_boost_when_translation_missing = float(
80   - original_query_fallback_boost_when_translation_missing
81   - )
82 74 self.tie_breaker_base_query = float(tie_breaker_base_query)
83 75 self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale)
84 76  
... ... @@ -168,7 +160,7 @@ class ESQueryBuilder:
168 160 结构:filters and (text_recall or embedding_recall) + post_filter
169 161 - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合)
170 162 - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合)
171   - - text_recall: 文本相关性召回(按 search_langs 动态语言字段)
  163 + - text_recall: 文本相关性召回(按实际 clause 语言动态字段)
172 164 - embedding_recall: 向量召回(KNN)
173 165 - function_score: 包装召回部分,支持提权字段
174 166  
... ... @@ -484,6 +476,7 @@ class ESQueryBuilder:
484 476 contains_chinese: bool,
485 477 contains_english: bool,
486 478 index_languages: List[str],
  479 + is_source: bool = False
487 480 ) -> List[MatchFieldSpec]:
488 481 """
489 482 When the query mixes scripts, widen each clause to indexed fields for the other script
... ... @@ -497,10 +490,11 @@ class ESQueryBuilder:
497 490  
498 491 out = list(specs)
499 492 lnorm = (lang or "").strip().lower()
500   - if contains_english and lnorm != "en" and can_use("en"):
501   - out = self._merge_supplemental_lang_field_specs(out, "en")
502   - if contains_chinese and lnorm != "zh" and can_use("zh"):
503   - out = self._merge_supplemental_lang_field_specs(out, "zh")
  493 + if is_source:
  494 + if contains_english and lnorm != "en" and can_use("en"):
  495 + out = self._merge_supplemental_lang_field_specs(out, "en")
  496 + if contains_chinese and lnorm != "zh" and can_use("zh"):
  497 + out = self._merge_supplemental_lang_field_specs(out, "zh")
504 498 return out
505 499  
506 500 def _get_embedding_field(self, language: str) -> str:
... ... @@ -557,10 +551,6 @@ class ESQueryBuilder:
557 551 contains_english = bool(getattr(parsed_query, "contains_english", False))
558 552  
559 553 source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language
560   - source_in_index_languages = (
561   - True if not normalized_index_languages else source_lang in normalized_index_languages
562   - )
563   -
564 554 base_query_text = (
565 555 getattr(parsed_query, "rewritten_query", None) if parsed_query else None
566 556 ) or query_text
... ... @@ -574,22 +564,14 @@ class ESQueryBuilder:
574 564 contains_chinese,
575 565 contains_english,
576 566 normalized_index_languages,
  567 + is_source,
577 568 )
578 569 match_fields = self._format_match_field_specs(expanded_specs)
579 570 if not match_fields:
580 571 return
581   - clause_boost = 1.0
582 572 minimum_should_match = (
583 573 self.base_minimum_should_match if is_source else self.translation_minimum_should_match
584 574 )
585   - if is_source and not source_in_index_languages:
586   - clause_boost = self.source_boost_when_missing
587   - elif not is_source:
588   - clause_boost = (
589   - self.translation_boost
590   - if source_in_index_languages
591   - else self.translation_boost_when_source_missing
592   - )
593 575  
594 576 clause = {
595 577 "multi_match": {
... ... @@ -600,8 +582,11 @@ class ESQueryBuilder:
600 582 "tie_breaker": self.tie_breaker_base_query,
601 583 }
602 584 }
603   - if abs(clause_boost - 1.0) > 1e-9:
604   - clause["multi_match"]["boost"] = clause_boost
  585 + # base_query: never set multi_match.boost (ES default 1.0).
  586 + # Translation clauses: single knob from config — translation_boost.
  587 + if not is_source:
  588 + tb = float(self.translation_boost)
  589 + clause["multi_match"]["boost"] = tb
605 590 should_clauses.append({
606 591 "multi_match": clause["multi_match"]
607 592 })
... ...
search/rerank_client.py
... ... @@ -116,7 +116,6 @@ def _extract_named_query_score(matched_queries: Any, name: str) -> float:
116 116 def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]:
117 117 source_score = _extract_named_query_score(matched_queries, "base_query")
118 118 translation_score = 0.0
119   - fallback_score = 0.0
120 119  
121 120 if isinstance(matched_queries, dict):
122 121 for query_name, score in matched_queries.items():
... ... @@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa
125 124 numeric_score = _to_score(score)
126 125 if query_name.startswith("base_query_trans_"):
127 126 translation_score = max(translation_score, numeric_score)
128   - elif query_name.startswith("fallback_original_query_"):
129   - fallback_score = max(fallback_score, numeric_score)
130 127 elif isinstance(matched_queries, list):
131 128 for query_name in matched_queries:
132 129 if not isinstance(query_name, str):
133 130 continue
134 131 if query_name.startswith("base_query_trans_"):
135 132 translation_score = 1.0
136   - elif query_name.startswith("fallback_original_query_"):
137   - fallback_score = 1.0
138 133  
139 134 weighted_source = source_score
140 135 weighted_translation = 0.8 * translation_score
141   - weighted_fallback = 0.55 * fallback_score
142   - weighted_components = [weighted_source, weighted_translation, weighted_fallback]
  136 + weighted_components = [weighted_source, weighted_translation]
143 137 primary_text_score = max(weighted_components)
144 138 support_text_score = sum(weighted_components) - primary_text_score
145 139 text_score = primary_text_score + 0.25 * support_text_score
... ... @@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa
153 147 return {
154 148 "source_score": source_score,
155 149 "translation_score": translation_score,
156   - "fallback_score": fallback_score,
157 150 "weighted_source_score": weighted_source,
158 151 "weighted_translation_score": weighted_translation,
159   - "weighted_fallback_score": weighted_fallback,
160 152 "primary_text_score": primary_text_score,
161 153 "support_text_score": support_text_score,
162 154 "text_score": text_score,
... ... @@ -219,7 +211,6 @@ def fuse_scores_and_resort(
219 211 hit["_knn_score"] = knn_score
220 212 hit["_text_source_score"] = text_components["source_score"]
221 213 hit["_text_translation_score"] = text_components["translation_score"]
222   - hit["_text_fallback_score"] = text_components["fallback_score"]
223 214 hit["_text_primary_score"] = text_components["primary_text_score"]
224 215 hit["_text_support_score"] = text_components["support_text_score"]
225 216 hit["_fused_score"] = fused
... ... @@ -231,7 +222,6 @@ def fuse_scores_and_resort(
231 222 "text_score": text_score,
232 223 "text_source_score": text_components["source_score"],
233 224 "text_translation_score": text_components["translation_score"],
234   - "text_fallback_score": text_components["fallback_score"],
235 225 "text_primary_score": text_components["primary_text_score"],
236 226 "text_support_score": text_components["support_text_score"],
237 227 "knn_score": knn_score,
... ...
search/searcher.py
... ... @@ -132,11 +132,6 @@ class Searcher:
132 132 base_minimum_should_match=self.config.query_config.base_minimum_should_match,
133 133 translation_minimum_should_match=self.config.query_config.translation_minimum_should_match,
134 134 translation_boost=self.config.query_config.translation_boost,
135   - translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing,
136   - source_boost_when_missing=self.config.query_config.source_boost_when_missing,
137   - original_query_fallback_boost_when_translation_missing=(
138   - self.config.query_config.original_query_fallback_boost_when_translation_missing
139   - ),
140 135 tie_breaker_base_query=self.config.query_config.tie_breaker_base_query,
141 136 )
142 137  
... ... @@ -267,13 +262,6 @@ class Searcher:
267 262 if normalized:
268 263 candidates.append(normalized)
269 264  
270   - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", {}) or {}
271   - if isinstance(query_text_by_lang, dict):
272   - for text in query_text_by_lang.values():
273   - normalized = self._normalize_sku_match_text(text)
274   - if normalized:
275   - candidates.append(normalized)
276   -
277 265 translations = getattr(parsed_query, "translations", {}) or {}
278 266 if isinstance(translations, dict):
279 267 for text in translations.values():
... ... @@ -943,7 +931,6 @@ class Searcher:
943 931 debug_entry["text_score"] = rerank_debug.get("text_score")
944 932 debug_entry["text_source_score"] = rerank_debug.get("text_source_score")
945 933 debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score")
946   - debug_entry["text_fallback_score"] = rerank_debug.get("text_fallback_score")
947 934 debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score")
948 935 debug_entry["text_support_score"] = rerank_debug.get("text_support_score")
949 936 debug_entry["knn_score"] = rerank_debug.get("knn_score")
... ...
tests/test_es_query_builder_text_recall_languages.py 0 → 100644
... ... @@ -0,0 +1,519 @@
  1 +"""
  2 +ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*.
  3 +
  4 +Covers combinations of query language vs tenant index_languages, translations,
  5 +and mixed Chinese/English queries. Asserts multi_match _name, query text, and
  6 +target language fields (title.{lang}).
  7 +"""
  8 +
  9 +from types import SimpleNamespace
  10 +from typing import Any, Dict, List
  11 +
  12 +import numpy as np
  13 +
  14 +from search.es_query_builder import ESQueryBuilder
  15 +
  16 +
  17 +def _builder_multilingual_title_only(
  18 + *,
  19 + default_language: str = "en",
  20 + mixed_script_scale: float = 0.6,
  21 +) -> ESQueryBuilder:
  22 + """Minimal builder: only title.{lang} for easy field assertions."""
  23 + return ESQueryBuilder(
  24 + match_fields=["title.en^1.0"],
  25 + multilingual_fields=["title"],
  26 + shared_fields=[],
  27 + text_embedding_field="title_embedding",
  28 + default_language=default_language,
  29 + mixed_script_merged_field_boost_scale=mixed_script_scale,
  30 + function_score_config=None,
  31 + )
  32 +
  33 +
  34 +def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]:
  35 + """Navigate bool.must / function_score wrappers to the text recall root."""
  36 + q = es_body.get("query") or {}
  37 + if "bool" in q and "must" in q["bool"] and q["bool"]["must"]:
  38 + q = q["bool"]["must"][0]
  39 + if "function_score" in q:
  40 + q = q["function_score"]["query"]
  41 + return q
  42 +
  43 +
  44 +def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]:
  45 + inner = _unwrap_inner_query(es_body)
  46 + if "multi_match" in inner:
  47 + return [inner["multi_match"]]
  48 + should = (inner.get("bool") or {}).get("should") or []
  49 + return [c["multi_match"] for c in should if "multi_match" in c]
  50 +
  51 +
  52 +def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
  53 + """Map _name -> multi_match dict."""
  54 + out: Dict[str, Dict[str, Any]] = {}
  55 + for mm in _extract_multi_match_clauses(es_body):
  56 + name = mm.get("_name")
  57 + if name:
  58 + out[str(name)] = mm
  59 + return out
  60 +
  61 +
  62 +def _title_fields(mm: Dict[str, Any]) -> List[str]:
  63 + fields = mm.get("fields") or []
  64 + return [f for f in fields if str(f).startswith("title.")]
  65 +
  66 +
  67 +def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool:
  68 + """True if any field is title.{lang} with optional ^boost suffix."""
  69 + prefix = f"title.{lang}"
  70 + for f in mm.get("fields") or []:
  71 + s = str(f)
  72 + if s == prefix or s.startswith(prefix + "^"):
  73 + return True
  74 + return False
  75 +
  76 +
def _build(
    qb: ESQueryBuilder,
    *,
    query_text: str,
    rewritten: str,
    detected_language: str,
    translations: Dict[str, str],
    index_languages: List[str],
    contains_chinese: bool = False,
    contains_english: bool = False,
) -> Dict[str, Any]:
    """Run build_query with a stubbed parsed-query object, KNN disabled."""
    # SimpleNamespace stands in for the parser result; only the attributes
    # read by build_query are provided.
    stub = SimpleNamespace(
        rewritten_query=rewritten,
        detected_language=detected_language,
        translations=dict(translations),
        contains_chinese=contains_chinese,
        contains_english=contains_english,
    )
    return qb.build_query(
        query_text=query_text,
        parsed_query=stub,
        enable_knn=False,
        index_languages=index_languages,
    )
  101 +
  102 +
  103 +# --- 检测语言在 index_languages 内:主召回 + 翻译补召回 ---
  104 +
  105 +
def test_zh_query_index_zh_en_includes_base_zh_and_trans_en():
    """zh query on a zh+en index: base clause on title.zh, translated clause on en."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="连衣裙",
        rewritten="连衣裙",
        detected_language="zh",
        translations={"en": "dress"},
        index_languages=["zh", "en"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en"}
    base, trans = clauses["base_query"], clauses["base_query_trans_en"]
    assert base["query"] == "连衣裙"
    assert "title.zh" in _title_fields(base)
    assert trans["query"] == "dress"
    assert "title.en" in _title_fields(trans)
  122 +
  123 +
def test_en_query_index_zh_en_includes_base_en_and_trans_zh():
    """en query on an en+zh index: base clause on title.en, translated clause on zh."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    base, trans = clauses["base_query"], clauses["base_query_trans_zh"]
    assert base["query"] == "dress"
    assert "title.en" in _title_fields(base)
    assert trans["query"] == "连衣裙"
    assert "title.zh" in _title_fields(trans)
  140 +
  141 +
def test_de_query_index_de_en_fr_includes_base_and_two_translations():
    """de query on a de+en+fr index: one base clause plus one clause per translation."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="kleid",
        rewritten="kleid",
        detected_language="de",
        translations={"en": "dress", "fr": "robe"},
        index_languages=["de", "en", "fr"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en", "base_query_trans_fr"}
    assert clauses["base_query"]["query"] == "kleid"
    assert "title.de" in _title_fields(clauses["base_query"])
    assert clauses["base_query_trans_en"]["query"] == "dress"
    assert clauses["base_query_trans_fr"]["query"] == "robe"
  158 +
  159 +
  160 +# --- 检测语言不在 index_languages:仍有 base(弱)+ 翻译(强) ---
  161 +
  162 +
def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
    """Source language absent from the index: the base clause stays unboosted on
    title.de while every translation clause carries translation_boost."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="schuh",
        rewritten="schuh",
        detected_language="de",
        translations={"en": "shoe", "zh": "鞋"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en", "base_query_trans_zh"}
    base = clauses["base_query"]
    assert base["query"] == "schuh"
    assert "title.de" in _title_fields(base)
    assert "boost" not in base
    for lang, text in (("en", "shoe"), ("zh", "鞋")):
        trans = clauses[f"base_query_trans_{lang}"]
        assert trans["query"] == text
        assert trans["boost"] == builder.translation_boost
  182 +
  183 +
  184 +# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 ---
  185 +
  186 +
def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
    """Mixed zh/en input: the base clause expands to both title.zh and title.en,
    while the en translation still gets its own clause."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="红色 dress",
        rewritten="红色 dress",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en"}
    base = clauses["base_query"]
    assert base["query"] == "红色 dress"
    assert _has_title_lang(base, "zh")
    assert _has_title_lang(base, "en")
    trans = clauses["base_query_trans_en"]
    assert trans["query"] == "red dress"
    assert _has_title_lang(trans, "en")
  205 +
  206 +
def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
    """Mixed input detected as en: the base clause covers both en and zh title fields."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="nike 运动鞋",
        rewritten="nike 运动鞋",
        detected_language="en",
        translations={"zh": "耐克运动鞋"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    base = clauses["base_query"]
    assert base["query"] == "nike 运动鞋"
    assert _has_title_lang(base, "en")
    assert _has_title_lang(base, "zh")
    assert clauses["base_query_trans_zh"]["query"] == "耐克运动鞋"
  224 +
  225 +
def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
    """Mixed input on a zh-only index: the base clause must not expand to title.en."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="法式 dress",
        rewritten="法式 dress",
        detected_language="zh",
        translations={},
        index_languages=["zh"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query"}
    # Strip ^boost suffixes before comparing the targeted title fields.
    stems = {field.split("^", 1)[0] for field in _title_fields(clauses["base_query"])}
    assert stems == {"title.zh"}
  242 +
  243 +
  244 +# --- 去重:与 base 同语言同文本的翻译项跳过 ---
  245 +
  246 +
def test_skips_translation_when_same_lang_and_same_text_as_base():
    """A translation identical to the base (same language, same text) is dropped."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"en": "NIKE", "zh": "耐克"},
        index_languages=["en", "zh"],
    )
    assert set(_clauses_index(body)) == {"base_query", "base_query_trans_zh"}
  259 +
  260 +
def test_keeps_translation_when_same_text_but_different_lang_than_base():
    """Same text as the base but a different language still yields a clause."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"zh": "NIKE"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    assert clauses["base_query_trans_zh"]["query"] == "NIKE"
  274 +
  275 +
  276 +# --- 翻译 key 规范化、空翻译跳过 ---
  277 +
  278 +
def test_translation_language_key_is_normalized_case_insensitive():
    """An upper-case translation key ("ZH") is normalized to lower case."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"ZH": "连衣裙"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert "base_query_trans_zh" in clauses
    assert clauses["base_query_trans_zh"]["query"] == "连衣裙"
  292 +
  293 +
def test_empty_translation_value_is_skipped():
    """A whitespace-only translation produces no clause; real ones still do."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": " ", "fr": "robe"},
        index_languages=["en", "zh", "fr"],
    )
    clauses = _clauses_index(body)
    assert "base_query_trans_zh" not in clauses
    assert "base_query_trans_fr" in clauses
  307 +
  308 +
  309 +# --- index_languages 为空:视为「未约束」,即 source_in_index 为 True ---
  310 +
  311 +
def test_empty_index_languages_treats_source_as_in_index_boosts():
    """An empty index_languages list means unconstrained: the base clause gets no
    boost, translations keep the configured translation_boost."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="x",
        rewritten="x",
        detected_language="de",
        translations={"en": "y"},
        index_languages=[],
    )
    clauses = _clauses_index(body)
    assert "boost" not in clauses["base_query"]
    assert clauses["base_query_trans_en"]["boost"] == builder.translation_boost
  325 +
  326 +
  327 +# --- 无翻译:仅 base_query ---
  328 +
  329 +
def test_no_translations_only_base_query():
    """Without translations the query contains exactly one named clause."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="hello",
        rewritten="hello",
        detected_language="en",
        translations={},
        index_languages=["en", "zh"],
    )
    assert set(_clauses_index(body)) == {"base_query"}
  342 +
  343 +
  344 +# --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) ---
  345 +
  346 +
def test_text_clauses_present_alongside_knn():
    """A top-level knn section must not disturb the text clause structure."""
    builder = _builder_multilingual_title_only(default_language="en")
    parsed = SimpleNamespace(
        rewritten_query="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        contains_chinese=False,
        contains_english=True,
    )
    body = builder.build_query(
        query_text="dress",
        query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32),
        parsed_query=parsed,
        enable_knn=True,
        index_languages=["en", "zh"],
    )
    assert "knn" in body
    assert set(_clauses_index(body)) == {"base_query", "base_query_trans_zh"}
  366 +
  367 +
def test_detected_language_unknown_falls_back_to_default_language():
    """Mirrors LanguageDetector failure: fall back to QueryConfig.default_language."""
    builder = _builder_multilingual_title_only(default_language="en")
    parsed = SimpleNamespace(
        rewritten_query="shirt",
        detected_language="unknown",
        translations={"zh": "衬衫"},
        contains_chinese=False,
        contains_english=True,
    )
    body = builder.build_query(
        query_text="shirt",
        parsed_query=parsed,
        enable_knn=False,
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    assert clauses["base_query"]["query"] == "shirt"
    assert _has_title_lang(clauses["base_query"], "en")
  388 +
  389 +
def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
    """ru query on a ru+en index: base clause on title.ru, translated clause on en."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="платье",
        rewritten="платье",
        detected_language="ru",
        translations={"en": "dress"},
        index_languages=["ru", "en"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en"}
    assert clauses["base_query"]["query"] == "платье"
    assert _has_title_lang(clauses["base_query"], "ru")
    assert clauses["base_query_trans_en"]["query"] == "dress"
  405 +
  406 +
def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():
    """Current behavior: every non-empty translation yields a clause;
    index_languages only constrains mixed-script field expansion and does not
    filter translation clauses."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙", "de": "Kleid"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert "base_query_trans_de" in clauses
    assert clauses["base_query_trans_de"]["query"] == "Kleid"
    assert _has_title_lang(clauses["base_query_trans_de"], "de")
  425 +
  426 +
def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base():
    """base_query must always use rewritten_query, never just the raw query_text."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text=" 红色 ",
        rewritten="红色连衣裙",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=False,
    )
    clauses = _clauses_index(body)
    assert clauses["base_query"]["query"] == "红色连衣裙"
    assert clauses["base_query_trans_en"]["query"] == "red dress"
  443 +
  444 +
# NOTE(review): removed four byte-identical duplicates of tests already defined
# above:
#   - test_detected_language_unknown_falls_back_to_default_language
#   - test_ru_query_index_ru_en_includes_base_ru_and_trans_en
#   - test_translation_for_lang_not_listed_in_index_languages_still_generates_clause
#   - test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base
# Re-defining a module-level function with the same name silently shadows the
# earlier definition, so pytest only ever collected one copy of each; the
# duplicates were dead code.
... ...
tests/test_rerank_client.py
... ... @@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
11 11 "matched_queries": {
12 12 "base_query": 2.4,
13 13 "base_query_trans_zh": 1.8,
14   - "fallback_original_query_zh": 1.2,
15 14 "knn_query": 0.8,
16 15 },
17 16 },
... ... @@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
27 26  
28 27 debug = fuse_scores_and_resort(hits, [0.9, 0.7])
29 28  
30   - expected_text_1 = 2.4 + 0.25 * ((0.8 * 1.8) + (0.55 * 1.2))
  29 + expected_text_1 = 2.4 + 0.25 * (0.8 * 1.8)
31 30 expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2)
32 31 expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2)
33 32  
... ... @@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
38 37 assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9)
39 38 assert debug[0]["text_source_score"] == 2.4
40 39 assert debug[0]["text_translation_score"] == 1.8
41   - assert debug[0]["text_fallback_score"] == 1.2
42 40 assert debug[0]["knn_score"] == 0.8
43 41 assert [hit["_id"] for hit in hits] == ["2", "1"]
44 42  
... ...