diff --git a/config/config.yaml b/config/config.yaml index debc773..6f8233f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -20,7 +20,7 @@ es_settings: refresh_interval: "30s" # 字段权重配置(用于搜索时的字段boost) -# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。 +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。 # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 field_boosts: title: 3.0 @@ -74,14 +74,11 @@ query_config: - "vendor" - "category_name_text" - # 统一文本召回策略(主查询 + 翻译查询 + 原始查询兜底) + # 统一文本召回策略(主查询 + 翻译查询) text_query_strategy: base_minimum_should_match: "75%" translation_minimum_should_match: "75%" translation_boost: 0.4 - translation_boost_when_source_missing: 1.0 - source_boost_when_missing: 0.6 - original_query_fallback_boost_when_translation_missing: 0.2 tie_breaker_base_query: 0.9 # Embedding字段名称 diff --git a/config/loader.py b/config/loader.py index 282d471..8be9087 100644 --- a/config/loader.py +++ b/config/loader.py @@ -284,13 +284,6 @@ class AppConfigLoader: base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")), translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), translation_boost=float(text_strategy.get("translation_boost", 0.4)), - translation_boost_when_source_missing=float( - text_strategy.get("translation_boost_when_source_missing", 1.0) - ), - source_boost_when_missing=float(text_strategy.get("source_boost_when_missing", 0.6)), - original_query_fallback_boost_when_translation_missing=float( - text_strategy.get("original_query_fallback_boost_when_translation_missing", 0.2) - ), tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)), zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"), en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"), diff --git a/config/schema.py b/config/schema.py index f570f58..713d741 100644 --- a/config/schema.py +++ b/config/schema.py @@ -54,9 +54,6 @@ class QueryConfig: base_minimum_should_match: str = 
"70%" translation_minimum_should_match: str = "70%" translation_boost: float = 0.4 - translation_boost_when_source_missing: float = 1.0 - source_boost_when_missing: float = 0.6 - original_query_fallback_boost_when_translation_missing: float = 0.2 tie_breaker_base_query: float = 0.9 zh_to_en_model: str = "opus-mt-zh-en" en_to_zh_model: str = "opus-mt-en-zh" diff --git a/docs/DEVELOPER_GUIDE.md b/docs/DEVELOPER_GUIDE.md index 4b9c5ca..b804033 100644 --- a/docs/DEVELOPER_GUIDE.md +++ b/docs/DEVELOPER_GUIDE.md @@ -147,7 +147,7 @@ docs/ # 文档(含本指南) ### 4.4 query -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划)。 +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出解析事实(如 `rewritten_query`、`detected_language`、`translations`、`query_vector`),不再承担 ES 语言计划拼装。 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。 ### 4.5 search diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md index 97f4e39..bb6e80b 100644 --- a/docs/QUICKSTART.md +++ b/docs/QUICKSTART.md @@ -558,6 +558,21 @@ lsof -i :6004 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。 +### 5.4 HanLP 与 `transformers` 版本(`BertTokenizer.encode_plus`) + +若日志出现 **`AttributeError: BertTokenizer has no attribute encode_plus`**,通常是 **同一 venv 里装了 `transformers` 5.x**,与 **HanLP 2.1.x** 不兼容(HanLP 仍调用已移除的 `encode_plus`)。 + +**处理:** 将 `transformers` 固定到 **4.x**(例如 4.44+),然后重装/校验 HanLP: + +```bash +source activate.sh +pip install -r requirements_hanlp.txt +python -c "from transformers import BertTokenizer; import transformers as t; print(t.__version__, hasattr(BertTokenizer, 'encode_plus'))" +# 期望:4.x 且 True +``` + +**说明:** 重排/TEI 等若使用 **独立 venv**(如 `.venv-reranker`),可与主 venv 的 `transformers` 版本分离;主 venv 只要装了 HanLP 做查询分词,就不要把 `transformers` 升到 5。 + --- ## 6. 
相关文档 diff --git a/docs/TODO.txt b/docs/TODO.txt index ae809a1..bf977ed 100644 --- a/docs/TODO.txt +++ b/docs/TODO.txt @@ -32,7 +32,7 @@ }, 去掉 image_embedding_512 image_embedding改为,一个spu有多个sku向量,每个向量内部properties: -除了vector url还应该包括 +除了vector url还应该包括,该图片是对应哪些sku "image_embedding": { "type": "nested", "properties": { @@ -117,6 +117,11 @@ requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127 +是否需要: +当「源语言不在 index_languages」且「某些目标语言的翻译缺失」时,ES 里会额外加一层 用「原始 query 字符串」去撞缺失语种字段 + + + 先阅读文本embedding相关的代码: @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py 目前有TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入限制,超限返回过载状态码。 diff --git a/docs/搜索API对接指南-01-搜索接口.md b/docs/搜索API对接指南-01-搜索接口.md index 34fc597..e79c32f 100644 --- a/docs/搜索API对接指南-01-搜索接口.md +++ b/docs/搜索API对接指南-01-搜索接口.md @@ -553,9 +553,6 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) | `rewritten_query` | string | 重写后的查询 | | `detected_language` | string | 检测到的语言 | | `translations` | object | 翻译结果 | -| `query_text_by_lang` | object | 实际参与检索的多语言 query 文本 | -| `search_langs` | array[string] | 实际参与检索的语言列表 | -| `supplemental_search_langs` | array[string] | 因 mixed query 补入的附加语言列表 | | `has_vector` | boolean | 是否生成了向量 | `debug_info.per_result[]` 常见字段: @@ -565,10 +562,9 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) | `spu_id` | string | 结果 SPU ID | | `es_score` | float | ES 原始 `_score` | | `rerank_score` | float | 重排分数 | -| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` / `fallback_original_query_*` 聚合而来) | +| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` 聚合而来) | | `text_source_score` | float | `base_query` 分数 | | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 | -| `text_fallback_score` | float | `fallback_original_query_*` 里的最大分数 | | `text_primary_score` | float | 文本大分中的主证据部分 | | `text_support_score` | float | 
文本大分中的辅助证据部分 | | `knn_score` | float | `knn_query` 分数 | diff --git a/docs/相关性检索优化说明.md b/docs/相关性检索优化说明.md index 6cf890f..1822573 100644 --- a/docs/相关性检索优化说明.md +++ b/docs/相关性检索优化说明.md @@ -2,11 +2,11 @@ ## 1. 文档目标 -本文描述当前线上代码的文本检索策略,重点覆盖: +本文描述当前代码中的文本检索策略,重点覆盖: - 多语言检索路由(`detector` / `translator` / `indexed` 的关系) - 统一文本召回表达式(无布尔 AST 分支) -- 翻译缺失时的兜底策略 +- 解析层与检索表达式层的职责边界 - 重排融合打分与调试字段 - 典型场景下实际生成的 ES 查询结构 @@ -17,9 +17,11 @@ 查询链路(文本相关): 1. `QueryParser.parse()` - 输出 `detected_language`、`query_text_by_lang`、`search_langs`、`index_languages`、`source_in_index_languages`;另输出 `contains_chinese` / `contains_english`(仅服务混写辅助召回,见 §4 末)。 + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。 +2. `Searcher.search()` + 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。 2. `ESQueryBuilder._build_advanced_text_query()` - 按 `search_langs` 动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`);若命中混写辅助条件,在同一子句内并入另一语种列(§4 末)。 + 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。 3. `build_query()` 统一走文本策略,不再有布尔 AST 枝路。 @@ -37,18 +39,18 @@ 源语言字段做主召回;其他语言走翻译补召回(低权重)。 2. 若 `detected_language not in index_languages`: 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。 -3. 若第 2 步翻译部分失败或全部失败: - 对缺失翻译的 `index_languages` 字段,追加“原文低权重兜底”子句,避免完全丢失这些语种索引面的召回机会。 +3. 
若翻译部分失败或全部失败: + 当前实现不会再额外生成“原文打到其他语种字段”的兜底子句;系统保留 `base_query` 并继续执行,可观测性由 `translations` / warning / 命名子句分数提供。 ### 3.2 翻译与向量:并发提交与共享超时 -`QueryParser.parse()` 内(Stage 4–6)对**离线调用**采用线程池提交 + **一次** `concurrent.futures.wait`: +`QueryParser.parse()` 内对翻译与向量采用线程池提交 + **一次** `concurrent.futures.wait`: -- **翻译**:对 `index_languages` 中除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。 -- **查询向量**(若开启 `enable_text_embedding` 且域为 default):再提交一个 `text_encoder.encode` 任务。 +- **翻译**:对调用方传入的 `target_languages` 中、除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。 +- **查询向量**:若开启 `enable_text_embedding`,再提交一个 `text_encoder.encode` 任务。 - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。 -**等待预算(毫秒)**由 `detected_language` 是否属于租户 `index_languages` 决定(`query_config`): +**等待预算(毫秒)**由 `detected_language` 是否属于调用方传入的 `target_languages` 决定(`query_config`): - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。 - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。 @@ -62,7 +64,7 @@ ```json { "multi_match": { - "_name": "base_query|base_query_trans_xx|fallback_original_query_xx", + "_name": "base_query|base_query_trans_xx", "query": "", "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."], "minimum_should_match": "75%", @@ -75,7 +77,7 @@ 最终按 `bool.should` 组合,`minimum_should_match: 1`。 > **附 — 混写辅助召回** -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.8,`ESQueryBuilder` 构造参数)**。`fallback_original_query_*` 同样适用。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 +> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 
token)打标;`ESQueryBuilder` 仅在 `base_query`(源语言子句)的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制;`base_query_trans_*` 翻译子句不做并入),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 ## 5. 关键配置项(文本策略) @@ -88,20 +90,12 @@ - `base_minimum_should_match` - `translation_minimum_should_match` -- `translation_boost` -- `translation_boost_when_source_missing` -- `source_boost_when_missing` -- `original_query_fallback_boost_when_translation_missing`(新增) +- `translation_boost`(所有 `base_query_trans_*` 共用) - `tie_breaker_base_query` -新增项说明: - -- `original_query_fallback_boost_when_translation_missing`: - 当源语种不在索引语言且翻译缺失时,原文打到缺失目标语字段的低权重系数,默认 `0.2`。 - 说明: -- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*`、`fallback_original_query_*` 三类子句组成。 +- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*` 两类子句组成。 ## 6. 典型场景与实际 DSL @@ -111,11 +105,12 @@ - `detected_language=de` - `index_languages=[de,en]` -- `query_text_by_lang={de:"herren schuhe", en:"men shoes"}` +- `rewritten_query="herren schuhe"` +- `translations={en:"men shoes"}` 策略结果: -- `base_query`:德语字段,正常权重 +- `base_query`:德语字段,**不写** `multi_match.boost` - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4) ### 场景 B:源语种不在索引语言中,部分翻译缺失 @@ -126,38 +121,44 @@ 策略结果: -- `base_query`(德语字段):`boost=source_boost_when_missing`(默认 0.6) -- `base_query_trans_en`(英文字段):`boost=translation_boost_when_source_missing`(默认 1.0) -- `fallback_original_query_zh`(中文字段):原文低权重兜底(默认 0.2) +- `base_query`(德语字段):**不写** `multi_match.boost`(默认 1.0) +- `base_query_trans_en`(英文字段):`boost=translation_boost`(如 0.4) +- 不会生成额外中文兜底子句 ### 场景 C:源语种不在索引语言中,翻译全部失败 - `detected_language=de` - `index_languages=[en,zh]` -- `query_text_by_lang` 仅有 `de` +- `translations={}` 策略结果: -- `base_query`(德语字段,低权重) -- `fallback_original_query_en`(英文字段原文兜底) -- `fallback_original_query_zh`(中文字段原文兜底) +- `base_query`(德语字段,**无** `boost` 字段) +- 不会生成 
`base_query_trans_*` -这能避免“只有源语种字段查询,且该语种字段在商家索引中稀疏/为空”导致的弱召回问题。 +这意味着当前实现优先保证职责清晰与可解释性,而不是继续在 Builder 内部隐式制造“跨语种原文兜底”。 -## 7. QueryParser 与 ESBuilder 的职责分工 +## 7. QueryParser 与 Searcher / ESBuilder 的职责分工 -- `QueryParser` 负责“语言计划”与“可用文本”: - - `search_langs` - - `query_text_by_lang` - - `source_in_index_languages` - - `index_languages` +- `QueryParser` 负责“解析事实”: + - `query_normalized` + - `rewritten_query` + - `detected_language` + - `translations` + - `query_vector` + - `query_tokens` - `contains_chinese` / `contains_english` +- `Searcher` 负责“租户语境”: + - `index_languages` + - 将其传给 parser 作为 `target_languages` + - 将其传给 builder 作为字段展开约束 - `ESQueryBuilder` 负责“表达式展开”: - 动态字段组装 - 子句权重分配 - - 翻译缺失兜底子句拼接 + - `base_query` / `base_query_trans_*` 子句拼接 + - 跳过“与 base_query 文本和语言完全相同”的重复翻译子句 -这种分层让策略调优主要落在配置和 Builder,不破坏 Parser 的职责边界。 +这种分层让 parser 不再返回 ES 专用的“语言计划字段”,职责边界更清晰。 ## 8. 融合打分(Rerank + Text + KNN) @@ -165,24 +166,21 @@ ### 8.1 文本相关性大分 -文本大分由三部分组成: +文本大分由两部分组成: - `base_query` - `base_query_trans_*` -- `fallback_original_query_*` 聚合方式: 1. `source_score = base_query` 2. `translation_score = max(base_query_trans_*)` -3. `fallback_score = max(fallback_original_query_*)` -4. 加权: +3. 加权: - `weighted_source = source_score` - `weighted_translation = 0.8 * translation_score` - - `weighted_fallback = 0.55 * fallback_score` -5. 合成: - - `primary = max(weighted_source, weighted_translation, weighted_fallback)` - - `support = weighted_source + weighted_translation + weighted_fallback - primary` +4. 
合成: + - `primary = max(weighted_source, weighted_translation)` + - `support = weighted_source + weighted_translation - primary` - `text_score = primary + 0.25 * support` 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。 @@ -212,7 +210,6 @@ fused_score = ( - `text_score` - `text_source_score` - `text_translation_score` -- `text_fallback_score` - `text_primary_score` - `text_support_score` - `knn_score` @@ -221,9 +218,9 @@ fused_score = ( `debug_info.query_analysis` 还会暴露: -- `query_text_by_lang` -- `search_langs` -- `supplemental_search_langs` +- `translations` +- `detected_language` +- `rewritten_query` 这些字段用于检索效果评估与 bad case 归因。 @@ -231,7 +228,7 @@ fused_score = ( 1. 当前文本主链路已移除布尔 AST 分支。 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。 -3. `HanLP` 为可选依赖;不可用时退化到轻量分词,不影响主链路可用性。 +3. `HanLP` 为必需依赖;当前 parser 不再提供轻量 fallback。 4. 若后续扩展到更多语种,请确保: - mapping 中存在对应 `.` 字段 - `index_languages` 配置在支持列表内 @@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py 建议在 `tests/` 增加文本策略用例: 1. 源语种在索引语言,翻译命中缓存 -2. 源语种不在索引语言,翻译部分失败(验证 fallback 子句) -3. 源语种不在索引语言,翻译全部失败(验证多目标 fallback) -4. 自定义 `original_query_fallback_boost_when_translation_missing` 生效 -5. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) +2. 源语种不在索引语言,翻译部分失败(验证仅保留 `base_query` + 成功翻译子句) +3. 源语种不在索引语言,翻译全部失败(验证无 `base_query_trans_*` 时仍可正常执行) +4. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) diff --git a/requirements_hanlp.txt b/requirements_hanlp.txt new file mode 100644 index 0000000..5657cec --- /dev/null +++ b/requirements_hanlp.txt @@ -0,0 +1,13 @@ +# Optional: HanLP query tokenization for the main backend venv (QueryParser). +# +# Install: +# source activate.sh +# pip install -r requirements_hanlp.txt +# +# Why pin transformers<5: +# transformers 5.x no longer exposes `encode_plus` on `BertTokenizer`, but HanLP 2.1.x +# still calls it → AttributeError during `hanlp.load(...)`. +# Use transformers 4.44+ (4.x) which remains API-compatible with HanLP. 
+ +hanlp>=2.1.0 +transformers>=4.44,<5 diff --git a/scripts/eval_search_quality.py b/scripts/eval_search_quality.py index f14336f..217776d 100644 --- a/scripts/eval_search_quality.py +++ b/scripts/eval_search_quality.py @@ -83,7 +83,6 @@ class RankedItem: text_score: float | None text_source_score: float | None text_translation_score: float | None - text_fallback_score: float | None text_primary_score: float | None text_support_score: float | None knn_score: float | None @@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -> Dict[str, Any]: text_score=_to_float(debug_item.get("text_score")), text_source_score=_to_float(debug_item.get("text_source_score")), text_translation_score=_to_float(debug_item.get("text_translation_score")), - text_fallback_score=_to_float(debug_item.get("text_fallback_score")), text_primary_score=_to_float(debug_item.get("text_primary_score")), text_support_score=_to_float(debug_item.get("text_support_score")), knn_score=_to_float(debug_item.get("knn_score")), @@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -> str: f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}" ) lines.append( - f"- detected_language={qa.get('detected_language')} search_langs={qa.get('search_langs')} supplemental_search_langs={qa.get('supplemental_search_langs')}" + f"- detected_language={qa.get('detected_language')} translations={qa.get('translations')}" ) - lines.append(f"- query_text_by_lang={qa.get('query_text_by_lang')}") lines.append("") - lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | text_fb | knn | es | matched_queries |") - lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |") + lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | knn | es | matched_queries |") + lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |") for item in 
entry.get("top20", []): title = str(item.get("title", "")).replace("|", "/") matched = json.dumps(item.get("matched_queries"), ensure_ascii=False) @@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -> str: f"| {item.get('rank')} | {item.get('spu_id')} | {title} | " f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | " f"{item.get('text_source_score')} | {item.get('text_translation_score')} | " - f"{item.get('text_fallback_score')} | {item.get('knn_score')} | {item.get('es_score')} | {matched} |" + f"{item.get('knn_score')} | {item.get('es_score')} | {matched} |" ) lines.append("") return "\n".join(lines) diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 32ad3b9..15dd2a0 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -36,9 +36,6 @@ class ESQueryBuilder: base_minimum_should_match: str = "70%", translation_minimum_should_match: str = "70%", translation_boost: float = 0.4, - translation_boost_when_source_missing: float = 1.0, - source_boost_when_missing: float = 0.6, - original_query_fallback_boost_when_translation_missing: float = 0.2, tie_breaker_base_query: float = 0.9, mixed_script_merged_field_boost_scale: float = 0.6, ): @@ -74,11 +71,6 @@ class ESQueryBuilder: self.base_minimum_should_match = base_minimum_should_match self.translation_minimum_should_match = translation_minimum_should_match self.translation_boost = float(translation_boost) - self.translation_boost_when_source_missing = float(translation_boost_when_source_missing) - self.source_boost_when_missing = float(source_boost_when_missing) - self.original_query_fallback_boost_when_translation_missing = float( - original_query_fallback_boost_when_translation_missing - ) self.tie_breaker_base_query = float(tie_breaker_base_query) self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) @@ -168,7 +160,7 @@ class ESQueryBuilder: 结构:filters and (text_recall or embedding_recall) 
+ post_filter - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合) - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合) - - text_recall: 文本相关性召回(按 search_langs 动态语言字段) + - text_recall: 文本相关性召回(按实际 clause 语言动态字段) - embedding_recall: 向量召回(KNN) - function_score: 包装召回部分,支持提权字段 @@ -484,6 +476,7 @@ class ESQueryBuilder: contains_chinese: bool, contains_english: bool, index_languages: List[str], + is_source: bool = False ) -> List[MatchFieldSpec]: """ When the query mixes scripts, widen each clause to indexed fields for the other script @@ -497,10 +490,11 @@ class ESQueryBuilder: out = list(specs) lnorm = (lang or "").strip().lower() - if contains_english and lnorm != "en" and can_use("en"): - out = self._merge_supplemental_lang_field_specs(out, "en") - if contains_chinese and lnorm != "zh" and can_use("zh"): - out = self._merge_supplemental_lang_field_specs(out, "zh") + if is_source: + if contains_english and lnorm != "en" and can_use("en"): + out = self._merge_supplemental_lang_field_specs(out, "en") + if contains_chinese and lnorm != "zh" and can_use("zh"): + out = self._merge_supplemental_lang_field_specs(out, "zh") return out def _get_embedding_field(self, language: str) -> str: @@ -557,10 +551,6 @@ class ESQueryBuilder: contains_english = bool(getattr(parsed_query, "contains_english", False)) source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language - source_in_index_languages = ( - True if not normalized_index_languages else source_lang in normalized_index_languages - ) - base_query_text = ( getattr(parsed_query, "rewritten_query", None) if parsed_query else None ) or query_text @@ -574,22 +564,14 @@ class ESQueryBuilder: contains_chinese, contains_english, normalized_index_languages, + is_source, ) match_fields = self._format_match_field_specs(expanded_specs) if not match_fields: return - clause_boost = 1.0 minimum_should_match = ( self.base_minimum_should_match if is_source else self.translation_minimum_should_match ) - if 
is_source and not source_in_index_languages: - clause_boost = self.source_boost_when_missing - elif not is_source: - clause_boost = ( - self.translation_boost - if source_in_index_languages - else self.translation_boost_when_source_missing - ) clause = { "multi_match": { @@ -600,8 +582,11 @@ class ESQueryBuilder: "tie_breaker": self.tie_breaker_base_query, } } - if abs(clause_boost - 1.0) > 1e-9: - clause["multi_match"]["boost"] = clause_boost + # base_query: never set multi_match.boost (ES default 1.0). + # Translation clauses: single knob from config — translation_boost. + if not is_source: + tb = float(self.translation_boost) + clause["multi_match"]["boost"] = tb should_clauses.append({ "multi_match": clause["multi_match"] }) diff --git a/search/rerank_client.py b/search/rerank_client.py index fcd7607..7108b5f 100644 --- a/search/rerank_client.py +++ b/search/rerank_client.py @@ -116,7 +116,6 @@ def _extract_named_query_score(matched_queries: Any, name: str) -> float: def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]: source_score = _extract_named_query_score(matched_queries, "base_query") translation_score = 0.0 - fallback_score = 0.0 if isinstance(matched_queries, dict): for query_name, score in matched_queries.items(): @@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa numeric_score = _to_score(score) if query_name.startswith("base_query_trans_"): translation_score = max(translation_score, numeric_score) - elif query_name.startswith("fallback_original_query_"): - fallback_score = max(fallback_score, numeric_score) elif isinstance(matched_queries, list): for query_name in matched_queries: if not isinstance(query_name, str): continue if query_name.startswith("base_query_trans_"): translation_score = 1.0 - elif query_name.startswith("fallback_original_query_"): - fallback_score = 1.0 weighted_source = source_score weighted_translation = 0.8 * 
translation_score - weighted_fallback = 0.55 * fallback_score - weighted_components = [weighted_source, weighted_translation, weighted_fallback] + weighted_components = [weighted_source, weighted_translation] primary_text_score = max(weighted_components) support_text_score = sum(weighted_components) - primary_text_score text_score = primary_text_score + 0.25 * support_text_score @@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa return { "source_score": source_score, "translation_score": translation_score, - "fallback_score": fallback_score, "weighted_source_score": weighted_source, "weighted_translation_score": weighted_translation, - "weighted_fallback_score": weighted_fallback, "primary_text_score": primary_text_score, "support_text_score": support_text_score, "text_score": text_score, @@ -219,7 +211,6 @@ def fuse_scores_and_resort( hit["_knn_score"] = knn_score hit["_text_source_score"] = text_components["source_score"] hit["_text_translation_score"] = text_components["translation_score"] - hit["_text_fallback_score"] = text_components["fallback_score"] hit["_text_primary_score"] = text_components["primary_text_score"] hit["_text_support_score"] = text_components["support_text_score"] hit["_fused_score"] = fused @@ -231,7 +222,6 @@ def fuse_scores_and_resort( "text_score": text_score, "text_source_score": text_components["source_score"], "text_translation_score": text_components["translation_score"], - "text_fallback_score": text_components["fallback_score"], "text_primary_score": text_components["primary_text_score"], "text_support_score": text_components["support_text_score"], "knn_score": knn_score, diff --git a/search/searcher.py b/search/searcher.py index 29e22bb..156f996 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -132,11 +132,6 @@ class Searcher: base_minimum_should_match=self.config.query_config.base_minimum_should_match, 
translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, translation_boost=self.config.query_config.translation_boost, - translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing, - source_boost_when_missing=self.config.query_config.source_boost_when_missing, - original_query_fallback_boost_when_translation_missing=( - self.config.query_config.original_query_fallback_boost_when_translation_missing - ), tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, ) @@ -267,13 +262,6 @@ class Searcher: if normalized: candidates.append(normalized) - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", {}) or {} - if isinstance(query_text_by_lang, dict): - for text in query_text_by_lang.values(): - normalized = self._normalize_sku_match_text(text) - if normalized: - candidates.append(normalized) - translations = getattr(parsed_query, "translations", {}) or {} if isinstance(translations, dict): for text in translations.values(): @@ -943,7 +931,6 @@ class Searcher: debug_entry["text_score"] = rerank_debug.get("text_score") debug_entry["text_source_score"] = rerank_debug.get("text_source_score") debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score") - debug_entry["text_fallback_score"] = rerank_debug.get("text_fallback_score") debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score") debug_entry["text_support_score"] = rerank_debug.get("text_support_score") debug_entry["knn_score"] = rerank_debug.get("knn_score") diff --git a/tests/test_es_query_builder_text_recall_languages.py b/tests/test_es_query_builder_text_recall_languages.py new file mode 100644 index 0000000..8799256 --- /dev/null +++ b/tests/test_es_query_builder_text_recall_languages.py @@ -0,0 +1,519 @@ +""" +ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. 
+ +Covers combinations of query language vs tenant index_languages, translations, +and mixed Chinese/English queries. Asserts multi_match _name, query text, and +target language fields (title.{lang}). +""" + +from types import SimpleNamespace +from typing import Any, Dict, List + +import numpy as np + +from search.es_query_builder import ESQueryBuilder + + +def _builder_multilingual_title_only( + *, + default_language: str = "en", + mixed_script_scale: float = 0.6, +) -> ESQueryBuilder: + """Minimal builder: only title.{lang} for easy field assertions.""" + return ESQueryBuilder( + match_fields=["title.en^1.0"], + multilingual_fields=["title"], + shared_fields=[], + text_embedding_field="title_embedding", + default_language=default_language, + mixed_script_merged_field_boost_scale=mixed_script_scale, + function_score_config=None, + ) + + +def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]: + """Navigate bool.must / function_score wrappers to the text recall root.""" + q = es_body.get("query") or {} + if "bool" in q and "must" in q["bool"] and q["bool"]["must"]: + q = q["bool"]["must"][0] + if "function_score" in q: + q = q["function_score"]["query"] + return q + + +def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]: + inner = _unwrap_inner_query(es_body) + if "multi_match" in inner: + return [inner["multi_match"]] + should = (inner.get("bool") or {}).get("should") or [] + return [c["multi_match"] for c in should if "multi_match" in c] + + +def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: + """Map _name -> multi_match dict.""" + out: Dict[str, Dict[str, Any]] = {} + for mm in _extract_multi_match_clauses(es_body): + name = mm.get("_name") + if name: + out[str(name)] = mm + return out + + +def _title_fields(mm: Dict[str, Any]) -> List[str]: + fields = mm.get("fields") or [] + return [f for f in fields if str(f).startswith("title.")] + + +def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool: 
+ """True if any field is title.{lang} with optional ^boost suffix.""" + prefix = f"title.{lang}" + for f in mm.get("fields") or []: + s = str(f) + if s == prefix or s.startswith(prefix + "^"): + return True + return False + + +def _build( + qb: ESQueryBuilder, + *, + query_text: str, + rewritten: str, + detected_language: str, + translations: Dict[str, str], + index_languages: List[str], + contains_chinese: bool = False, + contains_english: bool = False, +) -> Dict[str, Any]: + parsed = SimpleNamespace( + rewritten_query=rewritten, + detected_language=detected_language, + translations=dict(translations), + contains_chinese=contains_chinese, + contains_english=contains_english, + ) + return qb.build_query( + query_text=query_text, + parsed_query=parsed, + enable_knn=False, + index_languages=index_languages, + ) + + +# --- 检测语言在 index_languages 内:主召回 + 翻译补召回 --- + + +def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="连衣裙", + rewritten="连衣裙", + detected_language="zh", + translations={"en": "dress"}, + index_languages=["zh", "en"], + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_en"} + assert idx["base_query"]["query"] == "连衣裙" + assert "title.zh" in _title_fields(idx["base_query"]) + assert idx["base_query_trans_en"]["query"] == "dress" + assert "title.en" in _title_fields(idx["base_query_trans_en"]) + + +def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="dress", + rewritten="dress", + detected_language="en", + translations={"zh": "连衣裙"}, + index_languages=["en", "zh"], + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_zh"} + assert idx["base_query"]["query"] == "dress" + assert "title.en" in _title_fields(idx["base_query"]) + assert idx["base_query_trans_zh"]["query"] == "连衣裙" + 
assert "title.zh" in _title_fields(idx["base_query_trans_zh"]) + + +def test_de_query_index_de_en_fr_includes_base_and_two_translations(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="kleid", + rewritten="kleid", + detected_language="de", + translations={"en": "dress", "fr": "robe"}, + index_languages=["de", "en", "fr"], + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} + assert idx["base_query"]["query"] == "kleid" + assert "title.de" in _title_fields(idx["base_query"]) + assert idx["base_query_trans_en"]["query"] == "dress" + assert idx["base_query_trans_fr"]["query"] == "robe" + + +# --- 检测语言不在 index_languages:base 不写 boost(默认 1.0)+ 翻译子句统一使用 translation_boost --- + + +def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="schuh", + rewritten="schuh", + detected_language="de", + translations={"en": "shoe", "zh": "鞋"}, + index_languages=["en", "zh"], + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} + assert idx["base_query"]["query"] == "schuh" + assert "title.de" in _title_fields(idx["base_query"]) + assert "boost" not in idx["base_query"] + assert idx["base_query_trans_en"]["query"] == "shoe" + assert idx["base_query_trans_en"]["boost"] == qb.translation_boost + assert idx["base_query_trans_zh"]["query"] == "鞋" + assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost + + +# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 --- + + +def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="红色 dress", + rewritten="红色 dress", + detected_language="zh", + translations={"en": "red dress"}, + index_languages=["zh", "en"], + contains_chinese=True, + contains_english=True, + ) + idx = 
_clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_en"} + assert idx["base_query"]["query"] == "红色 dress" + assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") + assert idx["base_query_trans_en"]["query"] == "red dress" + assert _has_title_lang(idx["base_query_trans_en"], "en") + + +def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="nike 运动鞋", + rewritten="nike 运动鞋", + detected_language="en", + translations={"zh": "耐克运动鞋"}, + index_languages=["zh", "en"], + contains_chinese=True, + contains_english=True, + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_zh"} + assert idx["base_query"]["query"] == "nike 运动鞋" + assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") + assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋" + + +def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="法式 dress", + rewritten="法式 dress", + detected_language="zh", + translations={}, + index_languages=["zh"], + contains_chinese=True, + contains_english=True, + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query"} + bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])} + assert bases == {"title.zh"} + + +# --- 去重:与 base 同语言同文本的翻译项跳过 --- + + +def test_skips_translation_when_same_lang_and_same_text_as_base(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="NIKE", + rewritten="NIKE", + detected_language="en", + translations={"en": "NIKE", "zh": "耐克"}, + index_languages=["en", "zh"], + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_zh"} + + +def test_keeps_translation_when_same_text_but_different_lang_than_base(): + qb = 
_builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="NIKE", + rewritten="NIKE", + detected_language="en", + translations={"zh": "NIKE"}, + index_languages=["en", "zh"], + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_zh"} + assert idx["base_query_trans_zh"]["query"] == "NIKE" + + +# --- 翻译 key 规范化、空翻译跳过 --- + + +def test_translation_language_key_is_normalized_case_insensitive(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="dress", + rewritten="dress", + detected_language="en", + translations={"ZH": "连衣裙"}, + index_languages=["en", "zh"], + ) + idx = _clauses_index(q) + assert "base_query_trans_zh" in idx + assert idx["base_query_trans_zh"]["query"] == "连衣裙" + + +def test_empty_translation_value_is_skipped(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="dress", + rewritten="dress", + detected_language="en", + translations={"zh": " ", "fr": "robe"}, + index_languages=["en", "zh", "fr"], + ) + idx = _clauses_index(q) + assert "base_query_trans_zh" not in idx + assert "base_query_trans_fr" in idx + + +# --- index_languages 为空:视为「未约束」source_in_index 为 True --- + + +def test_empty_index_languages_treats_source_as_in_index_boosts(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="x", + rewritten="x", + detected_language="de", + translations={"en": "y"}, + index_languages=[], + ) + idx = _clauses_index(q) + assert "boost" not in idx["base_query"] + assert idx["base_query_trans_en"]["boost"] == qb.translation_boost + + +# --- 无翻译:仅 base_query --- + + +def test_no_translations_only_base_query(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="hello", + rewritten="hello", + detected_language="en", + translations={}, + index_languages=["en", "zh"], + ) + idx = _clauses_index(q) + assert set(idx) == 
{"base_query"} + + +# --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- + + +def test_text_clauses_present_alongside_knn(): + qb = _builder_multilingual_title_only(default_language="en") + parsed = SimpleNamespace( + rewritten_query="dress", + detected_language="en", + translations={"zh": "连衣裙"}, + contains_chinese=False, + contains_english=True, + ) + q = qb.build_query( + query_text="dress", + query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32), + parsed_query=parsed, + enable_knn=True, + index_languages=["en", "zh"], + ) + assert "knn" in q + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_zh"} + + +def test_detected_language_unknown_falls_back_to_default_language(): + """与 LanguageDetector 失败时 QueryConfig.default_language 行为对齐。""" + qb = _builder_multilingual_title_only(default_language="en") + parsed = SimpleNamespace( + rewritten_query="shirt", + detected_language="unknown", + translations={"zh": "衬衫"}, + contains_chinese=False, + contains_english=True, + ) + q = qb.build_query( + query_text="shirt", + parsed_query=parsed, + enable_knn=False, + index_languages=["en", "zh"], + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_zh"} + assert idx["base_query"]["query"] == "shirt" + assert _has_title_lang(idx["base_query"], "en") + + +def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="платье", + rewritten="платье", + detected_language="ru", + translations={"en": "dress"}, + index_languages=["ru", "en"], + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_en"} + assert idx["base_query"]["query"] == "платье" + assert _has_title_lang(idx["base_query"], "ru") + assert idx["base_query_trans_en"]["query"] == "dress" + + +def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): + """ + 当前实现:凡是 translations 里非空的条目都会生成子句; + index_languages 
只约束混写扩列,不用于过滤翻译子句。 + """ + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="dress", + rewritten="dress", + detected_language="en", + translations={"zh": "连衣裙", "de": "Kleid"}, + index_languages=["en", "zh"], + ) + idx = _clauses_index(q) + assert "base_query_trans_de" in idx + assert idx["base_query_trans_de"]["query"] == "Kleid" + assert _has_title_lang(idx["base_query_trans_de"], "de") + + +def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base(): + """base_query 始终用 rewritten_query,而非仅 query_text。""" + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text=" 红色 ", + rewritten="红色连衣裙", + detected_language="zh", + translations={"en": "red dress"}, + index_languages=["zh", "en"], + contains_chinese=True, + contains_english=False, + ) + idx = _clauses_index(q) + assert idx["base_query"]["query"] == "红色连衣裙" + assert idx["base_query_trans_en"]["query"] == "red dress" + + +def test_detected_language_unknown_falls_back_to_default_language(): + """与 LanguageDetector 失败时 QueryConfig.default_language 行为对齐。""" + qb = _builder_multilingual_title_only(default_language="en") + parsed = SimpleNamespace( + rewritten_query="shirt", + detected_language="unknown", + translations={"zh": "衬衫"}, + contains_chinese=False, + contains_english=True, + ) + q = qb.build_query( + query_text="shirt", + parsed_query=parsed, + enable_knn=False, + index_languages=["en", "zh"], + ) + idx = _clauses_index(q) + assert set(idx) == {"base_query", "base_query_trans_zh"} + assert idx["base_query"]["query"] == "shirt" + assert _has_title_lang(idx["base_query"], "en") + + +def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="платье", + rewritten="платье", + detected_language="ru", + translations={"en": "dress"}, + index_languages=["ru", "en"], + ) + idx = _clauses_index(q) + assert 
set(idx) == {"base_query", "base_query_trans_en"} + assert idx["base_query"]["query"] == "платье" + assert _has_title_lang(idx["base_query"], "ru") + assert idx["base_query_trans_en"]["query"] == "dress" + + +def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): + """ + 当前实现:凡是 translations 里非空的条目都会生成子句; + index_languages 只约束混写扩列,不用于过滤翻译子句。 + """ + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text="dress", + rewritten="dress", + detected_language="en", + translations={"zh": "连衣裙", "de": "Kleid"}, + index_languages=["en", "zh"], + ) + idx = _clauses_index(q) + assert "base_query_trans_de" in idx + assert idx["base_query_trans_de"]["query"] == "Kleid" + assert _has_title_lang(idx["base_query_trans_de"], "de") + + +def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base(): + """base_query 始终用 rewritten_query,而非仅 query_text。""" + qb = _builder_multilingual_title_only(default_language="en") + q = _build( + qb, + query_text=" 红色 ", + rewritten="红色连衣裙", + detected_language="zh", + translations={"en": "red dress"}, + index_languages=["zh", "en"], + contains_chinese=True, + contains_english=False, + ) + idx = _clauses_index(q) + assert idx["base_query"]["query"] == "红色连衣裙" + assert idx["base_query_trans_en"]["query"] == "red dress" diff --git a/tests/test_rerank_client.py b/tests/test_rerank_client.py index 3ffa4f9..950e945 100644 --- a/tests/test_rerank_client.py +++ b/tests/test_rerank_client.py @@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim "matched_queries": { "base_query": 2.4, "base_query_trans_zh": 1.8, - "fallback_original_query_zh": 1.2, "knn_query": 0.8, }, }, @@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim debug = fuse_scores_and_resort(hits, [0.9, 0.7]) - expected_text_1 = 2.4 + 0.25 * ((0.8 * 1.8) + (0.55 * 1.2)) + expected_text_1 = 2.4 + 0.25 * (0.8 * 1.8) 
expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2) expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2) @@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9) assert debug[0]["text_source_score"] == 2.4 assert debug[0]["text_translation_score"] == 1.8 - assert debug[0]["text_fallback_score"] == 1.2 assert debug[0]["knn_score"] == 0.8 assert [hit["_id"] for hit in hits] == ["2", "1"] -- libgit2 0.21.2