Commit 0536222c6d7fcf1bb9339299b67409c918bae320
1 parent
ef5baa86
query parser优化
Showing 15 changed files with 629 additions and 141 deletions
config/config.yaml
| ... | ... | @@ -20,7 +20,7 @@ es_settings: |
| 20 | 20 | refresh_interval: "30s" |
| 21 | 21 | |
| 22 | 22 | # 字段权重配置(用于搜索时的字段boost) |
| 23 | -# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。 | |
| 23 | +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。 | |
| 24 | 24 | # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 |
| 25 | 25 | field_boosts: |
| 26 | 26 | title: 3.0 |
| ... | ... | @@ -74,14 +74,11 @@ query_config: |
| 74 | 74 | - "vendor" |
| 75 | 75 | - "category_name_text" |
| 76 | 76 | |
| 77 | - # 统一文本召回策略(主查询 + 翻译查询 + 原始查询兜底) | |
| 77 | + # 统一文本召回策略(主查询 + 翻译查询) | |
| 78 | 78 | text_query_strategy: |
| 79 | 79 | base_minimum_should_match: "75%" |
| 80 | 80 | translation_minimum_should_match: "75%" |
| 81 | 81 | translation_boost: 0.4 |
| 82 | - translation_boost_when_source_missing: 1.0 | |
| 83 | - source_boost_when_missing: 0.6 | |
| 84 | - original_query_fallback_boost_when_translation_missing: 0.2 | |
| 85 | 82 | tie_breaker_base_query: 0.9 |
| 86 | 83 | |
| 87 | 84 | # Embedding字段名称 | ... | ... |
config/loader.py
| ... | ... | @@ -284,13 +284,6 @@ class AppConfigLoader: |
| 284 | 284 | base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")), |
| 285 | 285 | translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), |
| 286 | 286 | translation_boost=float(text_strategy.get("translation_boost", 0.4)), |
| 287 | - translation_boost_when_source_missing=float( | |
| 288 | - text_strategy.get("translation_boost_when_source_missing", 1.0) | |
| 289 | - ), | |
| 290 | - source_boost_when_missing=float(text_strategy.get("source_boost_when_missing", 0.6)), | |
| 291 | - original_query_fallback_boost_when_translation_missing=float( | |
| 292 | - text_strategy.get("original_query_fallback_boost_when_translation_missing", 0.2) | |
| 293 | - ), | |
| 294 | 287 | tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)), |
| 295 | 288 | zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"), |
| 296 | 289 | en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"), | ... | ... |
config/schema.py
| ... | ... | @@ -54,9 +54,6 @@ class QueryConfig: |
| 54 | 54 | base_minimum_should_match: str = "70%" |
| 55 | 55 | translation_minimum_should_match: str = "70%" |
| 56 | 56 | translation_boost: float = 0.4 |
| 57 | - translation_boost_when_source_missing: float = 1.0 | |
| 58 | - source_boost_when_missing: float = 0.6 | |
| 59 | - original_query_fallback_boost_when_translation_missing: float = 0.2 | |
| 60 | 57 | tie_breaker_base_query: float = 0.9 |
| 61 | 58 | zh_to_en_model: str = "opus-mt-zh-en" |
| 62 | 59 | en_to_zh_model: str = "opus-mt-en-zh" | ... | ... |
docs/DEVELOPER_GUIDE.md
| ... | ... | @@ -147,7 +147,7 @@ docs/ # 文档(含本指南) |
| 147 | 147 | |
| 148 | 148 | ### 4.4 query |
| 149 | 149 | |
| 150 | -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划)。 | |
| 150 | +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出解析事实(如 `rewritten_query`、`detected_language`、`translations`、`query_vector`),不再承担 ES 语言计划拼装。 | |
| 151 | 151 | - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。 |
| 152 | 152 | |
| 153 | 153 | ### 4.5 search | ... | ... |
docs/QUICKSTART.md
| ... | ... | @@ -558,6 +558,21 @@ lsof -i :6004 |
| 558 | 558 | |
| 559 | 559 | 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。 |
| 560 | 560 | |
| 561 | +### 5.4 HanLP 与 `transformers` 版本(`BertTokenizer.encode_plus`) | |
| 562 | + | |
| 563 | +若日志出现 **`AttributeError: BertTokenizer has no attribute encode_plus`**,通常是 **同一 venv 里装了 `transformers` 5.x**,与 **HanLP 2.1.x** 不兼容(HanLP 仍调用已移除的 `encode_plus`)。 | |
| 564 | + | |
| 565 | +**处理:** 将 `transformers` 固定到 **4.x**(例如 4.44+),然后重装/校验 HanLP: | |
| 566 | + | |
| 567 | +```bash | |
| 568 | +source activate.sh | |
| 569 | +pip install -r requirements_hanlp.txt | |
| 570 | +python -c "from transformers import BertTokenizer; import transformers as t; print(t.__version__, hasattr(BertTokenizer, 'encode_plus'))" | |
| 571 | +# 期望:4.x 且 True | |
| 572 | +``` | |
| 573 | + | |
| 574 | +**说明:** 重排/TEI 等若使用 **独立 venv**(如 `.venv-reranker`),可与主 venv 的 `transformers` 版本分离;主 venv 只要装了 HanLP 做查询分词,就不要把 `transformers` 升到 5。 | |
| 575 | + | |
| 561 | 576 | --- |
| 562 | 577 | |
| 563 | 578 | ## 6. 相关文档 | ... | ... |
docs/TODO.txt
| ... | ... | @@ -32,7 +32,7 @@ |
| 32 | 32 | }, |
| 33 | 33 | 去掉 image_embedding_512 |
| 34 | 34 | image_embedding改为,一个spu有多个sku向量,每个向量内部properties: |
| 35 | -除了vector url还应该包括 | |
| 35 | +除了vector url还应该包括,该图片是对应哪些sku | |
| 36 | 36 | "image_embedding": { |
| 37 | 37 | "type": "nested", |
| 38 | 38 | "properties": { |
| ... | ... | @@ -117,6 +117,11 @@ requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127 |
| 117 | 117 | |
| 118 | 118 | |
| 119 | 119 | |
| 120 | +是否需要: | |
| 121 | +当「源语言不在 index_languages」且「某些目标语言的翻译缺失」时,ES 里会额外加一层 用「原始 query 字符串」去撞缺失语种字段 | |
| 122 | + | |
| 123 | + | |
| 124 | + | |
| 120 | 125 | 先阅读文本embedding相关的代码: |
| 121 | 126 | @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py |
| 122 | 127 | 目前有TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入限制,超限返回过载状态码。 | ... | ... |
docs/搜索API对接指南-01-搜索接口.md
| ... | ... | @@ -553,9 +553,6 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) |
| 553 | 553 | | `rewritten_query` | string | 重写后的查询 | |
| 554 | 554 | | `detected_language` | string | 检测到的语言 | |
| 555 | 555 | | `translations` | object | 翻译结果 | |
| 556 | -| `query_text_by_lang` | object | 实际参与检索的多语言 query 文本 | | |
| 557 | -| `search_langs` | array[string] | 实际参与检索的语言列表 | | |
| 558 | -| `supplemental_search_langs` | array[string] | 因 mixed query 补入的附加语言列表 | | |
| 559 | 556 | | `has_vector` | boolean | 是否生成了向量 | |
| 560 | 557 | |
| 561 | 558 | `debug_info.per_result[]` 常见字段: |
| ... | ... | @@ -565,10 +562,9 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) |
| 565 | 562 | | `spu_id` | string | 结果 SPU ID | |
| 566 | 563 | | `es_score` | float | ES 原始 `_score` | |
| 567 | 564 | | `rerank_score` | float | 重排分数 | |
| 568 | -| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` / `fallback_original_query_*` 聚合而来) | | |
| 565 | +| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` 聚合而来) | | |
| 569 | 566 | | `text_source_score` | float | `base_query` 分数 | |
| 570 | 567 | | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 | |
| 571 | -| `text_fallback_score` | float | `fallback_original_query_*` 里的最大分数 | | |
| 572 | 568 | | `text_primary_score` | float | 文本大分中的主证据部分 | |
| 573 | 569 | | `text_support_score` | float | 文本大分中的辅助证据部分 | |
| 574 | 570 | | `knn_score` | float | `knn_query` 分数 | | ... | ... |
docs/相关性检索优化说明.md
| ... | ... | @@ -2,11 +2,11 @@ |
| 2 | 2 | |
| 3 | 3 | ## 1. 文档目标 |
| 4 | 4 | |
| 5 | -本文描述当前线上代码的文本检索策略,重点覆盖: | |
| 5 | +本文描述当前代码中的文本检索策略,重点覆盖: | |
| 6 | 6 | |
| 7 | 7 | - 多语言检索路由(`detector` / `translator` / `indexed` 的关系) |
| 8 | 8 | - 统一文本召回表达式(无布尔 AST 分支) |
| 9 | -- 翻译缺失时的兜底策略 | |
| 9 | +- 解析层与检索表达式层的职责边界 | |
| 10 | 10 | - 重排融合打分与调试字段 |
| 11 | 11 | - 典型场景下实际生成的 ES 查询结构 |
| 12 | 12 | |
| ... | ... | @@ -17,9 +17,11 @@ |
| 17 | 17 | 查询链路(文本相关): |
| 18 | 18 | |
| 19 | 19 | 1. `QueryParser.parse()` |
| 20 | - 输出 `detected_language`、`query_text_by_lang`、`search_langs`、`index_languages`、`source_in_index_languages`;另输出 `contains_chinese` / `contains_english`(仅服务混写辅助召回,见 §4 末)。 | |
| 20 | + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。 | |
| 21 | +2. `Searcher.search()` | |
| 22 | + 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。 | |
| 21 | 23 | 2. `ESQueryBuilder._build_advanced_text_query()` |
| 22 | - 按 `search_langs` 动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`);若命中混写辅助条件,在同一子句内并入另一语种列(§4 末)。 | |
| 24 | + 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。 | |
| 23 | 25 | 3. `build_query()` |
| 24 | 26 | 统一走文本策略,不再有布尔 AST 枝路。 |
| 25 | 27 | |
| ... | ... | @@ -37,18 +39,18 @@ |
| 37 | 39 | 源语言字段做主召回;其他语言走翻译补召回(低权重)。 |
| 38 | 40 | 2. 若 `detected_language not in index_languages`: |
| 39 | 41 | 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。 |
| 40 | -3. 若第 2 步翻译部分失败或全部失败: | |
| 41 | - 对缺失翻译的 `index_languages` 字段,追加“原文低权重兜底”子句,避免完全丢失这些语种索引面的召回机会。 | |
| 42 | +3. 若翻译部分失败或全部失败: | |
| 43 | + 当前实现不会再额外生成“原文打到其他语种字段”的兜底子句;系统保留 `base_query` 并继续执行,可观测性由 `translations` / warning / 命名子句分数提供。 | |
| 42 | 44 | |
| 43 | 45 | ### 3.2 翻译与向量:并发提交与共享超时 |
| 44 | 46 | |
| 45 | -`QueryParser.parse()` 内(Stage 4–6)对**离线调用**采用线程池提交 + **一次** `concurrent.futures.wait`: | |
| 47 | +`QueryParser.parse()` 内对翻译与向量采用线程池提交 + **一次** `concurrent.futures.wait`: | |
| 46 | 48 | |
| 47 | -- **翻译**:对 `index_languages` 中除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。 | |
| 48 | -- **查询向量**(若开启 `enable_text_embedding` 且域为 default):再提交一个 `text_encoder.encode` 任务。 | |
| 49 | +- **翻译**:对调用方传入的 `target_languages` 中、除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。 | |
| 50 | +- **查询向量**:若开启 `enable_text_embedding`,再提交一个 `text_encoder.encode` 任务。 | |
| 49 | 51 | - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。 |
| 50 | 52 | |
| 51 | -**等待预算(毫秒)**由 `detected_language` 是否属于租户 `index_languages` 决定(`query_config`): | |
| 53 | +**等待预算(毫秒)**由 `detected_language` 是否属于调用方传入的 `target_languages` 决定(`query_config`): | |
| 52 | 54 | |
| 53 | 55 | - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。 |
| 54 | 56 | - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。 |
| ... | ... | @@ -62,7 +64,7 @@ |
| 62 | 64 | ```json |
| 63 | 65 | { |
| 64 | 66 | "multi_match": { |
| 65 | - "_name": "base_query|base_query_trans_xx|fallback_original_query_xx", | |
| 67 | + "_name": "base_query|base_query_trans_xx", | |
| 66 | 68 | "query": "<text>", |
| 67 | 69 | "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."], |
| 68 | 70 | "minimum_should_match": "75%", |
| ... | ... | @@ -75,7 +77,7 @@ |
| 75 | 77 | 最终按 `bool.should` 组合,`minimum_should_match: 1`。 |
| 76 | 78 | |
| 77 | 79 | > **附 — 混写辅助召回** |
| 78 | -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.8,`ESQueryBuilder` 构造参数)**。`fallback_original_query_*` 同样适用。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 | |
| 80 | +> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 | |
| 79 | 81 | |
| 80 | 82 | ## 5. 关键配置项(文本策略) |
| 81 | 83 | |
| ... | ... | @@ -88,20 +90,12 @@ |
| 88 | 90 | |
| 89 | 91 | - `base_minimum_should_match` |
| 90 | 92 | - `translation_minimum_should_match` |
| 91 | -- `translation_boost` | |
| 92 | -- `translation_boost_when_source_missing` | |
| 93 | -- `source_boost_when_missing` | |
| 94 | -- `original_query_fallback_boost_when_translation_missing`(新增) | |
| 93 | +- `translation_boost`(所有 `base_query_trans_*` 共用) | |
| 95 | 94 | - `tie_breaker_base_query` |
| 96 | 95 | |
| 97 | -新增项说明: | |
| 98 | - | |
| 99 | -- `original_query_fallback_boost_when_translation_missing`: | |
| 100 | - 当源语种不在索引语言且翻译缺失时,原文打到缺失目标语字段的低权重系数,默认 `0.2`。 | |
| 101 | - | |
| 102 | 96 | 说明: |
| 103 | 97 | |
| 104 | -- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*`、`fallback_original_query_*` 三类子句组成。 | |
| 98 | +- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*` 两类子句组成。 | |
| 105 | 99 | |
| 106 | 100 | ## 6. 典型场景与实际 DSL |
| 107 | 101 | |
| ... | ... | @@ -111,11 +105,12 @@ |
| 111 | 105 | |
| 112 | 106 | - `detected_language=de` |
| 113 | 107 | - `index_languages=[de,en]` |
| 114 | -- `query_text_by_lang={de:"herren schuhe", en:"men shoes"}` | |
| 108 | +- `rewritten_query="herren schuhe"` | |
| 109 | +- `translations={en:"men shoes"}` | |
| 115 | 110 | |
| 116 | 111 | 策略结果: |
| 117 | 112 | |
| 118 | -- `base_query`:德语字段,正常权重 | |
| 113 | +- `base_query`:德语字段,**不写** `multi_match.boost` | |
| 119 | 114 | - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4) |
| 120 | 115 | |
| 121 | 116 | ### 场景 B:源语种不在索引语言中,部分翻译缺失 |
| ... | ... | @@ -126,38 +121,44 @@ |
| 126 | 121 | |
| 127 | 122 | 策略结果: |
| 128 | 123 | |
| 129 | -- `base_query`(德语字段):`boost=source_boost_when_missing`(默认 0.6) | |
| 130 | -- `base_query_trans_en`(英文字段):`boost=translation_boost_when_source_missing`(默认 1.0) | |
| 131 | -- `fallback_original_query_zh`(中文字段):原文低权重兜底(默认 0.2) | |
| 124 | +- `base_query`(德语字段):**不写** `multi_match.boost`(默认 1.0) | |
| 125 | +- `base_query_trans_en`(英文字段):`boost=translation_boost`(如 0.4) | |
| 126 | +- 不会生成额外中文兜底子句 | |
| 132 | 127 | |
| 133 | 128 | ### 场景 C:源语种不在索引语言中,翻译全部失败 |
| 134 | 129 | |
| 135 | 130 | - `detected_language=de` |
| 136 | 131 | - `index_languages=[en,zh]` |
| 137 | -- `query_text_by_lang` 仅有 `de` | |
| 132 | +- `translations={}` | |
| 138 | 133 | |
| 139 | 134 | 策略结果: |
| 140 | 135 | |
| 141 | -- `base_query`(德语字段,低权重) | |
| 142 | -- `fallback_original_query_en`(英文字段原文兜底) | |
| 143 | -- `fallback_original_query_zh`(中文字段原文兜底) | |
| 136 | +- `base_query`(德语字段,**无** `boost` 字段) | |
| 137 | +- 不会生成 `base_query_trans_*` | |
| 144 | 138 | |
| 145 | -这能避免“只有源语种字段查询,且该语种字段在商家索引中稀疏/为空”导致的弱召回问题。 | |
| 139 | +这意味着当前实现优先保证职责清晰与可解释性,而不是继续在 Builder 内部隐式制造“跨语种原文兜底”。 | |
| 146 | 140 | |
| 147 | -## 7. QueryParser 与 ESBuilder 的职责分工 | |
| 141 | +## 7. QueryParser 与 Searcher / ESBuilder 的职责分工 | |
| 148 | 142 | |
| 149 | -- `QueryParser` 负责“语言计划”与“可用文本”: | |
| 150 | - - `search_langs` | |
| 151 | - - `query_text_by_lang` | |
| 152 | - - `source_in_index_languages` | |
| 153 | - - `index_languages` | |
| 143 | +- `QueryParser` 负责“解析事实”: | |
| 144 | + - `query_normalized` | |
| 145 | + - `rewritten_query` | |
| 146 | + - `detected_language` | |
| 147 | + - `translations` | |
| 148 | + - `query_vector` | |
| 149 | + - `query_tokens` | |
| 154 | 150 | - `contains_chinese` / `contains_english` |
| 151 | +- `Searcher` 负责“租户语境”: | |
| 152 | + - `index_languages` | |
| 153 | + - 将其传给 parser 作为 `target_languages` | |
| 154 | + - 将其传给 builder 作为字段展开约束 | |
| 155 | 155 | - `ESQueryBuilder` 负责“表达式展开”: |
| 156 | 156 | - 动态字段组装 |
| 157 | 157 | - 子句权重分配 |
| 158 | - - 翻译缺失兜底子句拼接 | |
| 158 | + - `base_query` / `base_query_trans_*` 子句拼接 | |
| 159 | + - 跳过“与 base_query 文本和语言完全相同”的重复翻译子句 | |
| 159 | 160 | |
| 160 | -这种分层让策略调优主要落在配置和 Builder,不破坏 Parser 的职责边界。 | |
| 161 | +这种分层让 parser 不再返回 ES 专用的“语言计划字段”,职责边界更清晰。 | |
| 161 | 162 | |
| 162 | 163 | ## 8. 融合打分(Rerank + Text + KNN) |
| 163 | 164 | |
| ... | ... | @@ -165,24 +166,21 @@ |
| 165 | 166 | |
| 166 | 167 | ### 8.1 文本相关性大分 |
| 167 | 168 | |
| 168 | -文本大分由三部分组成: | |
| 169 | +文本大分由两部分组成: | |
| 169 | 170 | |
| 170 | 171 | - `base_query` |
| 171 | 172 | - `base_query_trans_*` |
| 172 | -- `fallback_original_query_*` | |
| 173 | 173 | |
| 174 | 174 | 聚合方式: |
| 175 | 175 | |
| 176 | 176 | 1. `source_score = base_query` |
| 177 | 177 | 2. `translation_score = max(base_query_trans_*)` |
| 178 | -3. `fallback_score = max(fallback_original_query_*)` | |
| 179 | -4. 加权: | |
| 178 | +3. 加权: | |
| 180 | 179 | - `weighted_source = source_score` |
| 181 | 180 | - `weighted_translation = 0.8 * translation_score` |
| 182 | - - `weighted_fallback = 0.55 * fallback_score` | |
| 183 | -5. 合成: | |
| 184 | - - `primary = max(weighted_source, weighted_translation, weighted_fallback)` | |
| 185 | - - `support = weighted_source + weighted_translation + weighted_fallback - primary` | |
| 181 | +4. 合成: | |
| 182 | + - `primary = max(weighted_source, weighted_translation)` | |
| 183 | + - `support = weighted_source + weighted_translation - primary` | |
| 186 | 184 | - `text_score = primary + 0.25 * support` |
| 187 | 185 | |
| 188 | 186 | 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。 |
| ... | ... | @@ -212,7 +210,6 @@ fused_score = ( |
| 212 | 210 | - `text_score` |
| 213 | 211 | - `text_source_score` |
| 214 | 212 | - `text_translation_score` |
| 215 | -- `text_fallback_score` | |
| 216 | 213 | - `text_primary_score` |
| 217 | 214 | - `text_support_score` |
| 218 | 215 | - `knn_score` |
| ... | ... | @@ -221,9 +218,9 @@ fused_score = ( |
| 221 | 218 | |
| 222 | 219 | `debug_info.query_analysis` 还会暴露: |
| 223 | 220 | |
| 224 | -- `query_text_by_lang` | |
| 225 | -- `search_langs` | |
| 226 | -- `supplemental_search_langs` | |
| 221 | +- `translations` | |
| 222 | +- `detected_language` | |
| 223 | +- `rewritten_query` | |
| 227 | 224 | |
| 228 | 225 | 这些字段用于检索效果评估与 bad case 归因。 |
| 229 | 226 | |
| ... | ... | @@ -231,7 +228,7 @@ fused_score = ( |
| 231 | 228 | |
| 232 | 229 | 1. 当前文本主链路已移除布尔 AST 分支。 |
| 233 | 230 | 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。 |
| 234 | -3. `HanLP` 为可选依赖;不可用时退化到轻量分词,不影响主链路可用性。 | |
| 231 | +3. `HanLP` 为必需依赖;当前 parser 不再提供轻量 fallback。 | |
| 235 | 232 | 4. 若后续扩展到更多语种,请确保: |
| 236 | 233 | - mapping 中存在对应 `.<lang>` 字段 |
| 237 | 234 | - `index_languages` 配置在支持列表内 |
| ... | ... | @@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py |
| 263 | 260 | 建议在 `tests/` 增加文本策略用例: |
| 264 | 261 | |
| 265 | 262 | 1. 源语种在索引语言,翻译命中缓存 |
| 266 | -2. 源语种不在索引语言,翻译部分失败(验证 fallback 子句) | |
| 267 | -3. 源语种不在索引语言,翻译全部失败(验证多目标 fallback) | |
| 268 | -4. 自定义 `original_query_fallback_boost_when_translation_missing` 生效 | |
| 269 | -5. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) | |
| 263 | +2. 源语种不在索引语言,翻译部分失败(验证仅保留 `base_query` + 成功翻译子句) | |
| 264 | +3. 源语种不在索引语言,翻译全部失败(验证无 `base_query_trans_*` 时仍可正常执行) | |
| 265 | +4. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) | |
| 270 | 266 | |
| 271 | 267 | |
| 272 | 268 | ... | ... |
requirements_hanlp.txt
0 → 100644
| ... | ... | @@ -0,0 +1,13 @@ |
| 1 | +# Optional: HanLP query tokenization for the main backend venv (QueryParser). | |
| 2 | +# | |
| 3 | +# Install: | |
| 4 | +# source activate.sh | |
| 5 | +# pip install -r requirements_hanlp.txt | |
| 6 | +# | |
| 7 | +# Why pin transformers<5: | |
| 8 | +# transformers 5.x no longer exposes `encode_plus` on `BertTokenizer`, but HanLP 2.1.x | |
| 9 | +# still calls it → AttributeError during `hanlp.load(...)`. | |
| 10 | +# Use transformers 4.44+ (4.x) which remains API-compatible with HanLP. | |
| 11 | + | |
| 12 | +hanlp>=2.1.0 | |
| 13 | +transformers>=4.44,<5 | ... | ... |
scripts/eval_search_quality.py
| ... | ... | @@ -83,7 +83,6 @@ class RankedItem: |
| 83 | 83 | text_score: float | None |
| 84 | 84 | text_source_score: float | None |
| 85 | 85 | text_translation_score: float | None |
| 86 | - text_fallback_score: float | None | |
| 87 | 86 | text_primary_score: float | None |
| 88 | 87 | text_support_score: float | None |
| 89 | 88 | knn_score: float | None |
| ... | ... | @@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -> Dict[str, Any]: |
| 146 | 145 | text_score=_to_float(debug_item.get("text_score")), |
| 147 | 146 | text_source_score=_to_float(debug_item.get("text_source_score")), |
| 148 | 147 | text_translation_score=_to_float(debug_item.get("text_translation_score")), |
| 149 | - text_fallback_score=_to_float(debug_item.get("text_fallback_score")), | |
| 150 | 148 | text_primary_score=_to_float(debug_item.get("text_primary_score")), |
| 151 | 149 | text_support_score=_to_float(debug_item.get("text_support_score")), |
| 152 | 150 | knn_score=_to_float(debug_item.get("knn_score")), |
| ... | ... | @@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -> str: |
| 185 | 183 | f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}" |
| 186 | 184 | ) |
| 187 | 185 | lines.append( |
| 188 | - f"- detected_language={qa.get('detected_language')} search_langs={qa.get('search_langs')} supplemental_search_langs={qa.get('supplemental_search_langs')}" | |
| 186 | + f"- detected_language={qa.get('detected_language')} translations={qa.get('translations')}" | |
| 189 | 187 | ) |
| 190 | - lines.append(f"- query_text_by_lang={qa.get('query_text_by_lang')}") | |
| 191 | 188 | lines.append("") |
| 192 | - lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | text_fb | knn | es | matched_queries |") | |
| 193 | - lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |") | |
| 189 | + lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | knn | es | matched_queries |") | |
| 190 | + lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |") | |
| 194 | 191 | for item in entry.get("top20", []): |
| 195 | 192 | title = str(item.get("title", "")).replace("|", "/") |
| 196 | 193 | matched = json.dumps(item.get("matched_queries"), ensure_ascii=False) |
| ... | ... | @@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -> str: |
| 199 | 196 | f"| {item.get('rank')} | {item.get('spu_id')} | {title} | " |
| 200 | 197 | f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | " |
| 201 | 198 | f"{item.get('text_source_score')} | {item.get('text_translation_score')} | " |
| 202 | - f"{item.get('text_fallback_score')} | {item.get('knn_score')} | {item.get('es_score')} | {matched} |" | |
| 199 | + f"{item.get('knn_score')} | {item.get('es_score')} | {matched} |" | |
| 203 | 200 | ) |
| 204 | 201 | lines.append("") |
| 205 | 202 | return "\n".join(lines) | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -36,9 +36,6 @@ class ESQueryBuilder: |
| 36 | 36 | base_minimum_should_match: str = "70%", |
| 37 | 37 | translation_minimum_should_match: str = "70%", |
| 38 | 38 | translation_boost: float = 0.4, |
| 39 | - translation_boost_when_source_missing: float = 1.0, | |
| 40 | - source_boost_when_missing: float = 0.6, | |
| 41 | - original_query_fallback_boost_when_translation_missing: float = 0.2, | |
| 42 | 39 | tie_breaker_base_query: float = 0.9, |
| 43 | 40 | mixed_script_merged_field_boost_scale: float = 0.6, |
| 44 | 41 | ): |
| ... | ... | @@ -74,11 +71,6 @@ class ESQueryBuilder: |
| 74 | 71 | self.base_minimum_should_match = base_minimum_should_match |
| 75 | 72 | self.translation_minimum_should_match = translation_minimum_should_match |
| 76 | 73 | self.translation_boost = float(translation_boost) |
| 77 | - self.translation_boost_when_source_missing = float(translation_boost_when_source_missing) | |
| 78 | - self.source_boost_when_missing = float(source_boost_when_missing) | |
| 79 | - self.original_query_fallback_boost_when_translation_missing = float( | |
| 80 | - original_query_fallback_boost_when_translation_missing | |
| 81 | - ) | |
| 82 | 74 | self.tie_breaker_base_query = float(tie_breaker_base_query) |
| 83 | 75 | self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) |
| 84 | 76 | |
| ... | ... | @@ -168,7 +160,7 @@ class ESQueryBuilder: |
| 168 | 160 | 结构:filters and (text_recall or embedding_recall) + post_filter |
| 169 | 161 | - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合) |
| 170 | 162 | - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合) |
| 171 | - - text_recall: 文本相关性召回(按 search_langs 动态语言字段) | |
| 163 | + - text_recall: 文本相关性召回(按实际 clause 语言动态字段) | |
| 172 | 164 | - embedding_recall: 向量召回(KNN) |
| 173 | 165 | - function_score: 包装召回部分,支持提权字段 |
| 174 | 166 | |
| ... | ... | @@ -484,6 +476,7 @@ class ESQueryBuilder: |
| 484 | 476 | contains_chinese: bool, |
| 485 | 477 | contains_english: bool, |
| 486 | 478 | index_languages: List[str], |
| 479 | + is_source: bool = False | |
| 487 | 480 | ) -> List[MatchFieldSpec]: |
| 488 | 481 | """ |
| 489 | 482 | When the query mixes scripts, widen each clause to indexed fields for the other script |
| ... | ... | @@ -497,10 +490,11 @@ class ESQueryBuilder: |
| 497 | 490 | |
| 498 | 491 | out = list(specs) |
| 499 | 492 | lnorm = (lang or "").strip().lower() |
| 500 | - if contains_english and lnorm != "en" and can_use("en"): | |
| 501 | - out = self._merge_supplemental_lang_field_specs(out, "en") | |
| 502 | - if contains_chinese and lnorm != "zh" and can_use("zh"): | |
| 503 | - out = self._merge_supplemental_lang_field_specs(out, "zh") | |
| 493 | + if is_source: | |
| 494 | + if contains_english and lnorm != "en" and can_use("en"): | |
| 495 | + out = self._merge_supplemental_lang_field_specs(out, "en") | |
| 496 | + if contains_chinese and lnorm != "zh" and can_use("zh"): | |
| 497 | + out = self._merge_supplemental_lang_field_specs(out, "zh") | |
| 504 | 498 | return out |
| 505 | 499 | |
| 506 | 500 | def _get_embedding_field(self, language: str) -> str: |
| ... | ... | @@ -557,10 +551,6 @@ class ESQueryBuilder: |
| 557 | 551 | contains_english = bool(getattr(parsed_query, "contains_english", False)) |
| 558 | 552 | |
| 559 | 553 | source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language |
| 560 | - source_in_index_languages = ( | |
| 561 | - True if not normalized_index_languages else source_lang in normalized_index_languages | |
| 562 | - ) | |
| 563 | - | |
| 564 | 554 | base_query_text = ( |
| 565 | 555 | getattr(parsed_query, "rewritten_query", None) if parsed_query else None |
| 566 | 556 | ) or query_text |
| ... | ... | @@ -574,22 +564,14 @@ class ESQueryBuilder: |
| 574 | 564 | contains_chinese, |
| 575 | 565 | contains_english, |
| 576 | 566 | normalized_index_languages, |
| 567 | + is_source, | |
| 577 | 568 | ) |
| 578 | 569 | match_fields = self._format_match_field_specs(expanded_specs) |
| 579 | 570 | if not match_fields: |
| 580 | 571 | return |
| 581 | - clause_boost = 1.0 | |
| 582 | 572 | minimum_should_match = ( |
| 583 | 573 | self.base_minimum_should_match if is_source else self.translation_minimum_should_match |
| 584 | 574 | ) |
| 585 | - if is_source and not source_in_index_languages: | |
| 586 | - clause_boost = self.source_boost_when_missing | |
| 587 | - elif not is_source: | |
| 588 | - clause_boost = ( | |
| 589 | - self.translation_boost | |
| 590 | - if source_in_index_languages | |
| 591 | - else self.translation_boost_when_source_missing | |
| 592 | - ) | |
| 593 | 575 | |
| 594 | 576 | clause = { |
| 595 | 577 | "multi_match": { |
| ... | ... | @@ -600,8 +582,11 @@ class ESQueryBuilder: |
| 600 | 582 | "tie_breaker": self.tie_breaker_base_query, |
| 601 | 583 | } |
| 602 | 584 | } |
| 603 | - if abs(clause_boost - 1.0) > 1e-9: | |
| 604 | - clause["multi_match"]["boost"] = clause_boost | |
| 585 | + # base_query: never set multi_match.boost (ES default 1.0). | |
| 586 | + # Translation clauses: single knob from config — translation_boost. | |
| 587 | + if not is_source: | |
| 588 | + tb = float(self.translation_boost) | |
| 589 | + clause["multi_match"]["boost"] = tb | |
| 605 | 590 | should_clauses.append({ |
| 606 | 591 | "multi_match": clause["multi_match"] |
| 607 | 592 | }) | ... | ... |
search/rerank_client.py
| ... | ... | @@ -116,7 +116,6 @@ def _extract_named_query_score(matched_queries: Any, name: str) -> float: |
| 116 | 116 | def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]: |
| 117 | 117 | source_score = _extract_named_query_score(matched_queries, "base_query") |
| 118 | 118 | translation_score = 0.0 |
| 119 | - fallback_score = 0.0 | |
| 120 | 119 | |
| 121 | 120 | if isinstance(matched_queries, dict): |
| 122 | 121 | for query_name, score in matched_queries.items(): |
| ... | ... | @@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa |
| 125 | 124 | numeric_score = _to_score(score) |
| 126 | 125 | if query_name.startswith("base_query_trans_"): |
| 127 | 126 | translation_score = max(translation_score, numeric_score) |
| 128 | - elif query_name.startswith("fallback_original_query_"): | |
| 129 | - fallback_score = max(fallback_score, numeric_score) | |
| 130 | 127 | elif isinstance(matched_queries, list): |
| 131 | 128 | for query_name in matched_queries: |
| 132 | 129 | if not isinstance(query_name, str): |
| 133 | 130 | continue |
| 134 | 131 | if query_name.startswith("base_query_trans_"): |
| 135 | 132 | translation_score = 1.0 |
| 136 | - elif query_name.startswith("fallback_original_query_"): | |
| 137 | - fallback_score = 1.0 | |
| 138 | 133 | |
| 139 | 134 | weighted_source = source_score |
| 140 | 135 | weighted_translation = 0.8 * translation_score |
| 141 | - weighted_fallback = 0.55 * fallback_score | |
| 142 | - weighted_components = [weighted_source, weighted_translation, weighted_fallback] | |
| 136 | + weighted_components = [weighted_source, weighted_translation] | |
| 143 | 137 | primary_text_score = max(weighted_components) |
| 144 | 138 | support_text_score = sum(weighted_components) - primary_text_score |
| 145 | 139 | text_score = primary_text_score + 0.25 * support_text_score |
| ... | ... | @@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa |
| 153 | 147 | return { |
| 154 | 148 | "source_score": source_score, |
| 155 | 149 | "translation_score": translation_score, |
| 156 | - "fallback_score": fallback_score, | |
| 157 | 150 | "weighted_source_score": weighted_source, |
| 158 | 151 | "weighted_translation_score": weighted_translation, |
| 159 | - "weighted_fallback_score": weighted_fallback, | |
| 160 | 152 | "primary_text_score": primary_text_score, |
| 161 | 153 | "support_text_score": support_text_score, |
| 162 | 154 | "text_score": text_score, |
| ... | ... | @@ -219,7 +211,6 @@ def fuse_scores_and_resort( |
| 219 | 211 | hit["_knn_score"] = knn_score |
| 220 | 212 | hit["_text_source_score"] = text_components["source_score"] |
| 221 | 213 | hit["_text_translation_score"] = text_components["translation_score"] |
| 222 | - hit["_text_fallback_score"] = text_components["fallback_score"] | |
| 223 | 214 | hit["_text_primary_score"] = text_components["primary_text_score"] |
| 224 | 215 | hit["_text_support_score"] = text_components["support_text_score"] |
| 225 | 216 | hit["_fused_score"] = fused |
| ... | ... | @@ -231,7 +222,6 @@ def fuse_scores_and_resort( |
| 231 | 222 | "text_score": text_score, |
| 232 | 223 | "text_source_score": text_components["source_score"], |
| 233 | 224 | "text_translation_score": text_components["translation_score"], |
| 234 | - "text_fallback_score": text_components["fallback_score"], | |
| 235 | 225 | "text_primary_score": text_components["primary_text_score"], |
| 236 | 226 | "text_support_score": text_components["support_text_score"], |
| 237 | 227 | "knn_score": knn_score, | ... | ... |
search/searcher.py
| ... | ... | @@ -132,11 +132,6 @@ class Searcher: |
| 132 | 132 | base_minimum_should_match=self.config.query_config.base_minimum_should_match, |
| 133 | 133 | translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, |
| 134 | 134 | translation_boost=self.config.query_config.translation_boost, |
| 135 | - translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing, | |
| 136 | - source_boost_when_missing=self.config.query_config.source_boost_when_missing, | |
| 137 | - original_query_fallback_boost_when_translation_missing=( | |
| 138 | - self.config.query_config.original_query_fallback_boost_when_translation_missing | |
| 139 | - ), | |
| 140 | 135 | tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, |
| 141 | 136 | ) |
| 142 | 137 | |
| ... | ... | @@ -267,13 +262,6 @@ class Searcher: |
| 267 | 262 | if normalized: |
| 268 | 263 | candidates.append(normalized) |
| 269 | 264 | |
| 270 | - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", {}) or {} | |
| 271 | - if isinstance(query_text_by_lang, dict): | |
| 272 | - for text in query_text_by_lang.values(): | |
| 273 | - normalized = self._normalize_sku_match_text(text) | |
| 274 | - if normalized: | |
| 275 | - candidates.append(normalized) | |
| 276 | - | |
| 277 | 265 | translations = getattr(parsed_query, "translations", {}) or {} |
| 278 | 266 | if isinstance(translations, dict): |
| 279 | 267 | for text in translations.values(): |
| ... | ... | @@ -943,7 +931,6 @@ class Searcher: |
| 943 | 931 | debug_entry["text_score"] = rerank_debug.get("text_score") |
| 944 | 932 | debug_entry["text_source_score"] = rerank_debug.get("text_source_score") |
| 945 | 933 | debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score") |
| 946 | - debug_entry["text_fallback_score"] = rerank_debug.get("text_fallback_score") | |
| 947 | 934 | debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score") |
| 948 | 935 | debug_entry["text_support_score"] = rerank_debug.get("text_support_score") |
| 949 | 936 | debug_entry["knn_score"] = rerank_debug.get("knn_score") | ... | ... |
tests/test_es_query_builder_text_recall_languages.py
0 → 100644
| ... | ... | @@ -0,0 +1,519 @@ |
| 1 | +""" | |
| 2 | +ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. | |
| 3 | + | |
| 4 | +Covers combinations of query language vs tenant index_languages, translations, | |
| 5 | +and mixed Chinese/English queries. Asserts multi_match _name, query text, and | |
| 6 | +target language fields (title.{lang}). | |
| 7 | +""" | |
| 8 | + | |
| 9 | +from types import SimpleNamespace | |
| 10 | +from typing import Any, Dict, List | |
| 11 | + | |
| 12 | +import numpy as np | |
| 13 | + | |
| 14 | +from search.es_query_builder import ESQueryBuilder | |
| 15 | + | |
| 16 | + | |
def _builder_multilingual_title_only(
    *,
    default_language: str = "en",
    mixed_script_scale: float = 0.6,
) -> ESQueryBuilder:
    """Construct a minimal ESQueryBuilder that exposes only title.{lang}.

    Restricting the builder to a single multilingual base field keeps the
    field-level assertions in the tests below simple.
    """
    builder_kwargs = dict(
        match_fields=["title.en^1.0"],
        multilingual_fields=["title"],
        shared_fields=[],
        text_embedding_field="title_embedding",
        default_language=default_language,
        mixed_script_merged_field_boost_scale=mixed_script_scale,
        function_score_config=None,
    )
    return ESQueryBuilder(**builder_kwargs)
| 32 | + | |
| 33 | + | |
| 34 | +def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]: | |
| 35 | + """Navigate bool.must / function_score wrappers to the text recall root.""" | |
| 36 | + q = es_body.get("query") or {} | |
| 37 | + if "bool" in q and "must" in q["bool"] and q["bool"]["must"]: | |
| 38 | + q = q["bool"]["must"][0] | |
| 39 | + if "function_score" in q: | |
| 40 | + q = q["function_score"]["query"] | |
| 41 | + return q | |
| 42 | + | |
| 43 | + | |
def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect every multi_match clause from the unwrapped text-recall query."""
    root = _unwrap_inner_query(es_body)
    if "multi_match" in root:
        return [root["multi_match"]]
    should_clauses = (root.get("bool") or {}).get("should") or []
    return [clause["multi_match"] for clause in should_clauses if "multi_match" in clause]
| 50 | + | |
| 51 | + | |
def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Index the extracted multi_match clauses by their ``_name``."""
    # Unnamed clauses are skipped; on duplicate names the later clause wins,
    # matching plain dict-assignment semantics.
    return {
        str(mm["_name"]): mm
        for mm in _extract_multi_match_clauses(es_body)
        if mm.get("_name")
    }
| 60 | + | |
| 61 | + | |
| 62 | +def _title_fields(mm: Dict[str, Any]) -> List[str]: | |
| 63 | + fields = mm.get("fields") or [] | |
| 64 | + return [f for f in fields if str(f).startswith("title.")] | |
| 65 | + | |
| 66 | + | |
| 67 | +def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool: | |
| 68 | + """True if any field is title.{lang} with optional ^boost suffix.""" | |
| 69 | + prefix = f"title.{lang}" | |
| 70 | + for f in mm.get("fields") or []: | |
| 71 | + s = str(f) | |
| 72 | + if s == prefix or s.startswith(prefix + "^"): | |
| 73 | + return True | |
| 74 | + return False | |
| 75 | + | |
| 76 | + | |
def _build(
    qb: ESQueryBuilder,
    *,
    query_text: str,
    rewritten: str,
    detected_language: str,
    translations: Dict[str, str],
    index_languages: List[str],
    contains_chinese: bool = False,
    contains_english: bool = False,
) -> Dict[str, Any]:
    """Run qb.build_query against a stubbed parsed-query object (KNN off)."""
    parsed_stub = SimpleNamespace(
        rewritten_query=rewritten,
        detected_language=detected_language,
        translations=dict(translations),
        contains_chinese=contains_chinese,
        contains_english=contains_english,
    )
    return qb.build_query(
        query_text=query_text,
        parsed_query=parsed_stub,
        enable_knn=False,
        index_languages=index_languages,
    )
| 101 | + | |
| 102 | + | |
# --- Detected language inside index_languages: primary recall + translation recall ---


def test_zh_query_index_zh_en_includes_base_zh_and_trans_en():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="连衣裙",
        rewritten="连衣裙",
        detected_language="zh",
        translations={"en": "dress"},
        index_languages=["zh", "en"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en"}
    assert clauses["base_query"]["query"] == "连衣裙"
    assert "title.zh" in _title_fields(clauses["base_query"])
    assert clauses["base_query_trans_en"]["query"] == "dress"
    assert "title.en" in _title_fields(clauses["base_query_trans_en"])


def test_en_query_index_zh_en_includes_base_en_and_trans_zh():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    assert clauses["base_query"]["query"] == "dress"
    assert "title.en" in _title_fields(clauses["base_query"])
    assert clauses["base_query_trans_zh"]["query"] == "连衣裙"
    assert "title.zh" in _title_fields(clauses["base_query_trans_zh"])


def test_de_query_index_de_en_fr_includes_base_and_two_translations():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="kleid",
        rewritten="kleid",
        detected_language="de",
        translations={"en": "dress", "fr": "robe"},
        index_languages=["de", "en", "fr"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en", "base_query_trans_fr"}
    assert clauses["base_query"]["query"] == "kleid"
    assert "title.de" in _title_fields(clauses["base_query"])
    assert clauses["base_query_trans_en"]["query"] == "dress"
    assert clauses["base_query_trans_fr"]["query"] == "robe"


# --- Detected language absent from index_languages: base kept (unboosted),
# --- translation clauses carry the configured boost ---


def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="schuh",
        rewritten="schuh",
        detected_language="de",
        translations={"en": "shoe", "zh": "鞋"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en", "base_query_trans_zh"}
    assert clauses["base_query"]["query"] == "schuh"
    assert "title.de" in _title_fields(clauses["base_query"])
    assert "boost" not in clauses["base_query"]
    assert clauses["base_query_trans_en"]["query"] == "shoe"
    assert clauses["base_query_trans_en"]["boost"] == builder.translation_boost
    assert clauses["base_query_trans_zh"]["query"] == "鞋"
    assert clauses["base_query_trans_zh"]["boost"] == builder.translation_boost
| 182 | + | |
| 183 | + | |
# --- Mixed zh/en input: raw text stays in base_query; translation clauses are
# --- separate; the base clause widens its fields when scripts are mixed ---


def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="红色 dress",
        rewritten="红色 dress",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en"}
    assert clauses["base_query"]["query"] == "红色 dress"
    assert _has_title_lang(clauses["base_query"], "zh")
    assert _has_title_lang(clauses["base_query"], "en")
    assert clauses["base_query_trans_en"]["query"] == "red dress"
    assert _has_title_lang(clauses["base_query_trans_en"], "en")


def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="nike 运动鞋",
        rewritten="nike 运动鞋",
        detected_language="en",
        translations={"zh": "耐克运动鞋"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    assert clauses["base_query"]["query"] == "nike 运动鞋"
    assert _has_title_lang(clauses["base_query"], "en")
    assert _has_title_lang(clauses["base_query"], "zh")
    assert clauses["base_query_trans_zh"]["query"] == "耐克运动鞋"


def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="法式 dress",
        rewritten="法式 dress",
        detected_language="zh",
        translations={},
        index_languages=["zh"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query"}
    field_bases = {field.split("^", 1)[0] for field in _title_fields(clauses["base_query"])}
    assert field_bases == {"title.zh"}
| 242 | + | |
| 243 | + | |
# --- De-duplication: a translation with the same language AND text as base is skipped ---


def test_skips_translation_when_same_lang_and_same_text_as_base():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"en": "NIKE", "zh": "耐克"},
        index_languages=["en", "zh"],
    )
    # The en->en duplicate is dropped; only the zh translation clause remains.
    assert set(_clauses_index(body)) == {"base_query", "base_query_trans_zh"}


def test_keeps_translation_when_same_text_but_different_lang_than_base():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"zh": "NIKE"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    assert clauses["base_query_trans_zh"]["query"] == "NIKE"


# --- Translation language keys are normalized; blank translation values are skipped ---


def test_translation_language_key_is_normalized_case_insensitive():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"ZH": "连衣裙"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert "base_query_trans_zh" in clauses
    assert clauses["base_query_trans_zh"]["query"] == "连衣裙"


def test_empty_translation_value_is_skipped():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": " ", "fr": "robe"},
        index_languages=["en", "zh", "fr"],
    )
    clauses = _clauses_index(body)
    assert "base_query_trans_zh" not in clauses
    assert "base_query_trans_fr" in clauses
| 307 | + | |
| 308 | + | |
# --- Empty index_languages: treated as unconstrained, so the source language
# --- counts as in-index (base stays unboosted) ---


def test_empty_index_languages_treats_source_as_in_index_boosts():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="x",
        rewritten="x",
        detected_language="de",
        translations={"en": "y"},
        index_languages=[],
    )
    clauses = _clauses_index(body)
    assert "boost" not in clauses["base_query"]
    assert clauses["base_query_trans_en"]["boost"] == builder.translation_boost


# --- No translations: only base_query remains ---


def test_no_translations_only_base_query():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="hello",
        rewritten="hello",
        detected_language="en",
        translations={},
        index_languages=["en", "zh"],
    )
    assert set(_clauses_index(body)) == {"base_query"}


# --- Text clauses still parse when a top-level knn section coexists ---


def test_text_clauses_present_alongside_knn():
    builder = _builder_multilingual_title_only(default_language="en")
    parsed_stub = SimpleNamespace(
        rewritten_query="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        contains_chinese=False,
        contains_english=True,
    )
    body = builder.build_query(
        query_text="dress",
        query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32),
        parsed_query=parsed_stub,
        enable_knn=True,
        index_languages=["en", "zh"],
    )
    assert "knn" in body
    assert set(_clauses_index(body)) == {"base_query", "base_query_trans_zh"}
| 366 | + | |
| 367 | + | |
def test_detected_language_unknown_falls_back_to_default_language():
    """Mirrors QueryConfig.default_language behavior when LanguageDetector fails."""
    builder = _builder_multilingual_title_only(default_language="en")
    parsed_stub = SimpleNamespace(
        rewritten_query="shirt",
        detected_language="unknown",
        translations={"zh": "衬衫"},
        contains_chinese=False,
        contains_english=True,
    )
    body = builder.build_query(
        query_text="shirt",
        parsed_query=parsed_stub,
        enable_knn=False,
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    assert clauses["base_query"]["query"] == "shirt"
    assert _has_title_lang(clauses["base_query"], "en")


def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="платье",
        rewritten="платье",
        detected_language="ru",
        translations={"en": "dress"},
        index_languages=["ru", "en"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en"}
    assert clauses["base_query"]["query"] == "платье"
    assert _has_title_lang(clauses["base_query"], "ru")
    assert clauses["base_query_trans_en"]["query"] == "dress"


def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():
    """Current behavior: every non-empty translations entry produces a clause;
    index_languages only constrains mixed-script field widening and does not
    filter translation clauses."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙", "de": "Kleid"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert "base_query_trans_de" in clauses
    assert clauses["base_query_trans_de"]["query"] == "Kleid"
    assert _has_title_lang(clauses["base_query_trans_de"], "de")


def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base():
    """base_query always carries rewritten_query, never the raw query_text."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text=" 红色 ",
        rewritten="红色连衣裙",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=False,
    )
    clauses = _clauses_index(body)
    assert clauses["base_query"]["query"] == "红色连衣裙"
    assert clauses["base_query_trans_en"]["query"] == "red dress"
| 443 | + | |
| 444 | + | |
# NOTE(review): this tail of the file previously redefined four tests that are
# byte-identical to the definitions above:
#   - test_detected_language_unknown_falls_back_to_default_language
#   - test_ru_query_index_ru_en_includes_base_ru_and_trans_en
#   - test_translation_for_lang_not_listed_in_index_languages_still_generates_clause
#   - test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base
# A duplicated ``def`` rebinds the module-level name, so pytest collected each
# test only once (the later copy shadowed the earlier one; ruff flags this as
# F811 "redefinition of unused name"). The redundant copies were removed.
tests/test_rerank_client.py
| ... | ... | @@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim |
| 11 | 11 | "matched_queries": { |
| 12 | 12 | "base_query": 2.4, |
| 13 | 13 | "base_query_trans_zh": 1.8, |
| 14 | - "fallback_original_query_zh": 1.2, | |
| 15 | 14 | "knn_query": 0.8, |
| 16 | 15 | }, |
| 17 | 16 | }, |
| ... | ... | @@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim |
| 27 | 26 | |
| 28 | 27 | debug = fuse_scores_and_resort(hits, [0.9, 0.7]) |
| 29 | 28 | |
| 30 | - expected_text_1 = 2.4 + 0.25 * ((0.8 * 1.8) + (0.55 * 1.2)) | |
| 29 | + expected_text_1 = 2.4 + 0.25 * (0.8 * 1.8) | |
| 31 | 30 | expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2) |
| 32 | 31 | expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2) |
| 33 | 32 | |
| ... | ... | @@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim |
| 38 | 37 | assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9) |
| 39 | 38 | assert debug[0]["text_source_score"] == 2.4 |
| 40 | 39 | assert debug[0]["text_translation_score"] == 1.8 |
| 41 | - assert debug[0]["text_fallback_score"] == 1.2 | |
| 42 | 40 | assert debug[0]["knn_score"] == 0.8 |
| 43 | 41 | assert [hit["_id"] for hit in hits] == ["2", "1"] |
| 44 | 42 | ... | ... |