Commit 0536222c6d7fcf1bb9339299b67409c918bae320
1 parent
ef5baa86
query parser优化
Showing
15 changed files
with
629 additions
and
141 deletions
Show diff stats
config/config.yaml
| @@ -20,7 +20,7 @@ es_settings: | @@ -20,7 +20,7 @@ es_settings: | ||
| 20 | refresh_interval: "30s" | 20 | refresh_interval: "30s" |
| 21 | 21 | ||
| 22 | # 字段权重配置(用于搜索时的字段boost) | 22 | # 字段权重配置(用于搜索时的字段boost) |
| 23 | -# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。 | 23 | +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。 |
| 24 | # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 | 24 | # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 |
| 25 | field_boosts: | 25 | field_boosts: |
| 26 | title: 3.0 | 26 | title: 3.0 |
| @@ -74,14 +74,11 @@ query_config: | @@ -74,14 +74,11 @@ query_config: | ||
| 74 | - "vendor" | 74 | - "vendor" |
| 75 | - "category_name_text" | 75 | - "category_name_text" |
| 76 | 76 | ||
| 77 | - # 统一文本召回策略(主查询 + 翻译查询 + 原始查询兜底) | 77 | + # 统一文本召回策略(主查询 + 翻译查询) |
| 78 | text_query_strategy: | 78 | text_query_strategy: |
| 79 | base_minimum_should_match: "75%" | 79 | base_minimum_should_match: "75%" |
| 80 | translation_minimum_should_match: "75%" | 80 | translation_minimum_should_match: "75%" |
| 81 | translation_boost: 0.4 | 81 | translation_boost: 0.4 |
| 82 | - translation_boost_when_source_missing: 1.0 | ||
| 83 | - source_boost_when_missing: 0.6 | ||
| 84 | - original_query_fallback_boost_when_translation_missing: 0.2 | ||
| 85 | tie_breaker_base_query: 0.9 | 82 | tie_breaker_base_query: 0.9 |
| 86 | 83 | ||
| 87 | # Embedding字段名称 | 84 | # Embedding字段名称 |
config/loader.py
| @@ -284,13 +284,6 @@ class AppConfigLoader: | @@ -284,13 +284,6 @@ class AppConfigLoader: | ||
| 284 | base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")), | 284 | base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")), |
| 285 | translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), | 285 | translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), |
| 286 | translation_boost=float(text_strategy.get("translation_boost", 0.4)), | 286 | translation_boost=float(text_strategy.get("translation_boost", 0.4)), |
| 287 | - translation_boost_when_source_missing=float( | ||
| 288 | - text_strategy.get("translation_boost_when_source_missing", 1.0) | ||
| 289 | - ), | ||
| 290 | - source_boost_when_missing=float(text_strategy.get("source_boost_when_missing", 0.6)), | ||
| 291 | - original_query_fallback_boost_when_translation_missing=float( | ||
| 292 | - text_strategy.get("original_query_fallback_boost_when_translation_missing", 0.2) | ||
| 293 | - ), | ||
| 294 | tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)), | 287 | tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)), |
| 295 | zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"), | 288 | zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"), |
| 296 | en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"), | 289 | en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"), |
config/schema.py
| @@ -54,9 +54,6 @@ class QueryConfig: | @@ -54,9 +54,6 @@ class QueryConfig: | ||
| 54 | base_minimum_should_match: str = "70%" | 54 | base_minimum_should_match: str = "70%" |
| 55 | translation_minimum_should_match: str = "70%" | 55 | translation_minimum_should_match: str = "70%" |
| 56 | translation_boost: float = 0.4 | 56 | translation_boost: float = 0.4 |
| 57 | - translation_boost_when_source_missing: float = 1.0 | ||
| 58 | - source_boost_when_missing: float = 0.6 | ||
| 59 | - original_query_fallback_boost_when_translation_missing: float = 0.2 | ||
| 60 | tie_breaker_base_query: float = 0.9 | 57 | tie_breaker_base_query: float = 0.9 |
| 61 | zh_to_en_model: str = "opus-mt-zh-en" | 58 | zh_to_en_model: str = "opus-mt-zh-en" |
| 62 | en_to_zh_model: str = "opus-mt-en-zh" | 59 | en_to_zh_model: str = "opus-mt-en-zh" |
docs/DEVELOPER_GUIDE.md
| @@ -147,7 +147,7 @@ docs/ # 文档(含本指南) | @@ -147,7 +147,7 @@ docs/ # 文档(含本指南) | ||
| 147 | 147 | ||
| 148 | ### 4.4 query | 148 | ### 4.4 query |
| 149 | 149 | ||
| 150 | -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划)。 | 150 | +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出解析事实(如 `rewritten_query`、`detected_language`、`translations`、`query_vector`),不再承担 ES 语言计划拼装。 |
| 151 | - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。 | 151 | - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。 |
| 152 | 152 | ||
| 153 | ### 4.5 search | 153 | ### 4.5 search |
docs/QUICKSTART.md
| @@ -558,6 +558,21 @@ lsof -i :6004 | @@ -558,6 +558,21 @@ lsof -i :6004 | ||
| 558 | 558 | ||
| 559 | 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。 | 559 | 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。 |
| 560 | 560 | ||
| 561 | +### 5.4 HanLP 与 `transformers` 版本(`BertTokenizer.encode_plus`) | ||
| 562 | + | ||
| 563 | +若日志出现 **`AttributeError: BertTokenizer has no attribute encode_plus`**,通常是 **同一 venv 里装了 `transformers` 5.x**,与 **HanLP 2.1.x** 不兼容(HanLP 仍调用已移除的 `encode_plus`)。 | ||
| 564 | + | ||
| 565 | +**处理:** 将 `transformers` 固定到 **4.x**(例如 4.44+),然后重装/校验 HanLP: | ||
| 566 | + | ||
| 567 | +```bash | ||
| 568 | +source activate.sh | ||
| 569 | +pip install -r requirements_hanlp.txt | ||
| 570 | +python -c "from transformers import BertTokenizer; import transformers as t; print(t.__version__, hasattr(BertTokenizer, 'encode_plus'))" | ||
| 571 | +# 期望:4.x 且 True | ||
| 572 | +``` | ||
| 573 | + | ||
| 574 | +**说明:** 重排/TEI 等若使用 **独立 venv**(如 `.venv-reranker`),可与主 venv 的 `transformers` 版本分离;主 venv 只要装了 HanLP 做查询分词,就不要把 `transformers` 升到 5。 | ||
| 575 | + | ||
| 561 | --- | 576 | --- |
| 562 | 577 | ||
| 563 | ## 6. 相关文档 | 578 | ## 6. 相关文档 |
docs/TODO.txt
| @@ -32,7 +32,7 @@ | @@ -32,7 +32,7 @@ | ||
| 32 | }, | 32 | }, |
| 33 | 去掉 image_embedding_512 | 33 | 去掉 image_embedding_512 |
| 34 | image_embedding改为,一个spu有多个sku向量,每个向量内部properties: | 34 | image_embedding改为,一个spu有多个sku向量,每个向量内部properties: |
| 35 | -除了vector url还应该包括 | 35 | +除了vector url还应该包括,该图片是对应哪些sku |
| 36 | "image_embedding": { | 36 | "image_embedding": { |
| 37 | "type": "nested", | 37 | "type": "nested", |
| 38 | "properties": { | 38 | "properties": { |
| @@ -117,6 +117,11 @@ requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127 | @@ -117,6 +117,11 @@ requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127 | ||
| 117 | 117 | ||
| 118 | 118 | ||
| 119 | 119 | ||
| 120 | +是否需要: | ||
| 121 | +当「源语言不在 index_languages」且「某些目标语言的翻译缺失」时,是否应在 ES 查询里额外加一层用「原始 query 字符串」去撞缺失语种字段的兜底子句 | ||
| 122 | + | ||
| 123 | + | ||
| 124 | + | ||
| 120 | 先阅读文本embedding相关的代码: | 125 | 先阅读文本embedding相关的代码: |
| 121 | @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py | 126 | @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py |
| 122 | 目前有TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入限制,超限返回过载状态码。 | 127 | 目前有TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入限制,超限返回过载状态码。 |
docs/搜索API对接指南-01-搜索接口.md
| @@ -553,9 +553,6 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) | @@ -553,9 +553,6 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) | ||
| 553 | | `rewritten_query` | string | 重写后的查询 | | 553 | | `rewritten_query` | string | 重写后的查询 | |
| 554 | | `detected_language` | string | 检测到的语言 | | 554 | | `detected_language` | string | 检测到的语言 | |
| 555 | | `translations` | object | 翻译结果 | | 555 | | `translations` | object | 翻译结果 | |
| 556 | -| `query_text_by_lang` | object | 实际参与检索的多语言 query 文本 | | ||
| 557 | -| `search_langs` | array[string] | 实际参与检索的语言列表 | | ||
| 558 | -| `supplemental_search_langs` | array[string] | 因 mixed query 补入的附加语言列表 | | ||
| 559 | | `has_vector` | boolean | 是否生成了向量 | | 556 | | `has_vector` | boolean | 是否生成了向量 | |
| 560 | 557 | ||
| 561 | `debug_info.per_result[]` 常见字段: | 558 | `debug_info.per_result[]` 常见字段: |
| @@ -565,10 +562,9 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) | @@ -565,10 +562,9 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) | ||
| 565 | | `spu_id` | string | 结果 SPU ID | | 562 | | `spu_id` | string | 结果 SPU ID | |
| 566 | | `es_score` | float | ES 原始 `_score` | | 563 | | `es_score` | float | ES 原始 `_score` | |
| 567 | | `rerank_score` | float | 重排分数 | | 564 | | `rerank_score` | float | 重排分数 | |
| 568 | -| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` / `fallback_original_query_*` 聚合而来) | | 565 | +| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` 聚合而来) | |
| 569 | | `text_source_score` | float | `base_query` 分数 | | 566 | | `text_source_score` | float | `base_query` 分数 | |
| 570 | | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 | | 567 | | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 | |
| 571 | -| `text_fallback_score` | float | `fallback_original_query_*` 里的最大分数 | | ||
| 572 | | `text_primary_score` | float | 文本大分中的主证据部分 | | 568 | | `text_primary_score` | float | 文本大分中的主证据部分 | |
| 573 | | `text_support_score` | float | 文本大分中的辅助证据部分 | | 569 | | `text_support_score` | float | 文本大分中的辅助证据部分 | |
| 574 | | `knn_score` | float | `knn_query` 分数 | | 570 | | `knn_score` | float | `knn_query` 分数 | |
docs/相关性检索优化说明.md
| @@ -2,11 +2,11 @@ | @@ -2,11 +2,11 @@ | ||
| 2 | 2 | ||
| 3 | ## 1. 文档目标 | 3 | ## 1. 文档目标 |
| 4 | 4 | ||
| 5 | -本文描述当前线上代码的文本检索策略,重点覆盖: | 5 | +本文描述当前代码中的文本检索策略,重点覆盖: |
| 6 | 6 | ||
| 7 | - 多语言检索路由(`detector` / `translator` / `indexed` 的关系) | 7 | - 多语言检索路由(`detector` / `translator` / `indexed` 的关系) |
| 8 | - 统一文本召回表达式(无布尔 AST 分支) | 8 | - 统一文本召回表达式(无布尔 AST 分支) |
| 9 | -- 翻译缺失时的兜底策略 | 9 | +- 解析层与检索表达式层的职责边界 |
| 10 | - 重排融合打分与调试字段 | 10 | - 重排融合打分与调试字段 |
| 11 | - 典型场景下实际生成的 ES 查询结构 | 11 | - 典型场景下实际生成的 ES 查询结构 |
| 12 | 12 | ||
| @@ -17,9 +17,11 @@ | @@ -17,9 +17,11 @@ | ||
| 17 | 查询链路(文本相关): | 17 | 查询链路(文本相关): |
| 18 | 18 | ||
| 19 | 1. `QueryParser.parse()` | 19 | 1. `QueryParser.parse()` |
| 20 | - 输出 `detected_language`、`query_text_by_lang`、`search_langs`、`index_languages`、`source_in_index_languages`;另输出 `contains_chinese` / `contains_english`(仅服务混写辅助召回,见 §4 末)。 | 20 | + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。 |
| 21 | +2. `Searcher.search()` | ||
| 22 | + 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。 | ||
| 21 | 2. `ESQueryBuilder._build_advanced_text_query()` | 23 | 2. `ESQueryBuilder._build_advanced_text_query()` |
| 22 | - 按 `search_langs` 动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`);若命中混写辅助条件,在同一子句内并入另一语种列(§4 末)。 | 24 | + 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。 |
| 23 | 3. `build_query()` | 25 | 3. `build_query()` |
| 24 | 统一走文本策略,不再有布尔 AST 枝路。 | 26 | 统一走文本策略,不再有布尔 AST 枝路。 |
| 25 | 27 | ||
| @@ -37,18 +39,18 @@ | @@ -37,18 +39,18 @@ | ||
| 37 | 源语言字段做主召回;其他语言走翻译补召回(低权重)。 | 39 | 源语言字段做主召回;其他语言走翻译补召回(低权重)。 |
| 38 | 2. 若 `detected_language not in index_languages`: | 40 | 2. 若 `detected_language not in index_languages`: |
| 39 | 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。 | 41 | 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。 |
| 40 | -3. 若第 2 步翻译部分失败或全部失败: | ||
| 41 | - 对缺失翻译的 `index_languages` 字段,追加“原文低权重兜底”子句,避免完全丢失这些语种索引面的召回机会。 | 42 | +3. 若翻译部分失败或全部失败: |
| 43 | + 当前实现不会再额外生成“原文打到其他语种字段”的兜底子句;系统保留 `base_query` 并继续执行,可观测性由 `translations` / warning / 命名子句分数提供。 | ||
| 42 | 44 | ||
| 43 | ### 3.2 翻译与向量:并发提交与共享超时 | 45 | ### 3.2 翻译与向量:并发提交与共享超时 |
| 44 | 46 | ||
| 45 | -`QueryParser.parse()` 内(Stage 4–6)对**离线调用**采用线程池提交 + **一次** `concurrent.futures.wait`: | 47 | +`QueryParser.parse()` 内对翻译与向量采用线程池提交 + **一次** `concurrent.futures.wait`: |
| 46 | 48 | ||
| 47 | -- **翻译**:对 `index_languages` 中除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。 | ||
| 48 | -- **查询向量**(若开启 `enable_text_embedding` 且域为 default):再提交一个 `text_encoder.encode` 任务。 | 49 | +- **翻译**:对调用方传入的 `target_languages` 中、除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。 |
| 50 | +- **查询向量**:若开启 `enable_text_embedding`,再提交一个 `text_encoder.encode` 任务。 | ||
| 49 | - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。 | 51 | - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。 |
| 50 | 52 | ||
| 51 | -**等待预算(毫秒)**由 `detected_language` 是否属于租户 `index_languages` 决定(`query_config`): | 53 | +**等待预算(毫秒)**由 `detected_language` 是否属于调用方传入的 `target_languages` 决定(`query_config`): |
| 52 | 54 | ||
| 53 | - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。 | 55 | - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。 |
| 54 | - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。 | 56 | - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。 |
| @@ -62,7 +64,7 @@ | @@ -62,7 +64,7 @@ | ||
| 62 | ```json | 64 | ```json |
| 63 | { | 65 | { |
| 64 | "multi_match": { | 66 | "multi_match": { |
| 65 | - "_name": "base_query|base_query_trans_xx|fallback_original_query_xx", | 67 | + "_name": "base_query|base_query_trans_xx", |
| 66 | "query": "<text>", | 68 | "query": "<text>", |
| 67 | "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."], | 69 | "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."], |
| 68 | "minimum_should_match": "75%", | 70 | "minimum_should_match": "75%", |
| @@ -75,7 +77,7 @@ | @@ -75,7 +77,7 @@ | ||
| 75 | 最终按 `bool.should` 组合,`minimum_should_match: 1`。 | 77 | 最终按 `bool.should` 组合,`minimum_should_match: 1`。 |
| 76 | 78 | ||
| 77 | > **附 — 混写辅助召回** | 79 | > **附 — 混写辅助召回** |
| 78 | -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.8,`ESQueryBuilder` 构造参数)**。`fallback_original_query_*` 同样适用。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 | 80 | +> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 |
| 79 | 81 | ||
| 80 | ## 5. 关键配置项(文本策略) | 82 | ## 5. 关键配置项(文本策略) |
| 81 | 83 | ||
| @@ -88,20 +90,12 @@ | @@ -88,20 +90,12 @@ | ||
| 88 | 90 | ||
| 89 | - `base_minimum_should_match` | 91 | - `base_minimum_should_match` |
| 90 | - `translation_minimum_should_match` | 92 | - `translation_minimum_should_match` |
| 91 | -- `translation_boost` | ||
| 92 | -- `translation_boost_when_source_missing` | ||
| 93 | -- `source_boost_when_missing` | ||
| 94 | -- `original_query_fallback_boost_when_translation_missing`(新增) | 93 | +- `translation_boost`(所有 `base_query_trans_*` 共用) |
| 95 | - `tie_breaker_base_query` | 94 | - `tie_breaker_base_query` |
| 96 | 95 | ||
| 97 | -新增项说明: | ||
| 98 | - | ||
| 99 | -- `original_query_fallback_boost_when_translation_missing`: | ||
| 100 | - 当源语种不在索引语言且翻译缺失时,原文打到缺失目标语字段的低权重系数,默认 `0.2`。 | ||
| 101 | - | ||
| 102 | 说明: | 96 | 说明: |
| 103 | 97 | ||
| 104 | -- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*`、`fallback_original_query_*` 三类子句组成。 | 98 | +- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*` 两类子句组成。 |
| 105 | 99 | ||
| 106 | ## 6. 典型场景与实际 DSL | 100 | ## 6. 典型场景与实际 DSL |
| 107 | 101 | ||
| @@ -111,11 +105,12 @@ | @@ -111,11 +105,12 @@ | ||
| 111 | 105 | ||
| 112 | - `detected_language=de` | 106 | - `detected_language=de` |
| 113 | - `index_languages=[de,en]` | 107 | - `index_languages=[de,en]` |
| 114 | -- `query_text_by_lang={de:"herren schuhe", en:"men shoes"}` | 108 | +- `rewritten_query="herren schuhe"` |
| 109 | +- `translations={en:"men shoes"}` | ||
| 115 | 110 | ||
| 116 | 策略结果: | 111 | 策略结果: |
| 117 | 112 | ||
| 118 | -- `base_query`:德语字段,正常权重 | 113 | +- `base_query`:德语字段,**不写** `multi_match.boost` |
| 119 | - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4) | 114 | - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4) |
| 120 | 115 | ||
| 121 | ### 场景 B:源语种不在索引语言中,部分翻译缺失 | 116 | ### 场景 B:源语种不在索引语言中,部分翻译缺失 |
| @@ -126,38 +121,44 @@ | @@ -126,38 +121,44 @@ | ||
| 126 | 121 | ||
| 127 | 策略结果: | 122 | 策略结果: |
| 128 | 123 | ||
| 129 | -- `base_query`(德语字段):`boost=source_boost_when_missing`(默认 0.6) | ||
| 130 | -- `base_query_trans_en`(英文字段):`boost=translation_boost_when_source_missing`(默认 1.0) | ||
| 131 | -- `fallback_original_query_zh`(中文字段):原文低权重兜底(默认 0.2) | 124 | +- `base_query`(德语字段):**不写** `multi_match.boost`(默认 1.0) |
| 125 | +- `base_query_trans_en`(英文字段):`boost=translation_boost`(如 0.4) | ||
| 126 | +- 不会生成额外中文兜底子句 | ||
| 132 | 127 | ||
| 133 | ### 场景 C:源语种不在索引语言中,翻译全部失败 | 128 | ### 场景 C:源语种不在索引语言中,翻译全部失败 |
| 134 | 129 | ||
| 135 | - `detected_language=de` | 130 | - `detected_language=de` |
| 136 | - `index_languages=[en,zh]` | 131 | - `index_languages=[en,zh]` |
| 137 | -- `query_text_by_lang` 仅有 `de` | 132 | +- `translations={}` |
| 138 | 133 | ||
| 139 | 策略结果: | 134 | 策略结果: |
| 140 | 135 | ||
| 141 | -- `base_query`(德语字段,低权重) | ||
| 142 | -- `fallback_original_query_en`(英文字段原文兜底) | ||
| 143 | -- `fallback_original_query_zh`(中文字段原文兜底) | 136 | +- `base_query`(德语字段,**无** `boost` 字段) |
| 137 | +- 不会生成 `base_query_trans_*` | ||
| 144 | 138 | ||
| 145 | -这能避免“只有源语种字段查询,且该语种字段在商家索引中稀疏/为空”导致的弱召回问题。 | 139 | +这意味着当前实现优先保证职责清晰与可解释性,而不是继续在 Builder 内部隐式制造“跨语种原文兜底”。 |
| 146 | 140 | ||
| 147 | -## 7. QueryParser 与 ESBuilder 的职责分工 | 141 | +## 7. QueryParser 与 Searcher / ESBuilder 的职责分工 |
| 148 | 142 | ||
| 149 | -- `QueryParser` 负责“语言计划”与“可用文本”: | ||
| 150 | - - `search_langs` | ||
| 151 | - - `query_text_by_lang` | ||
| 152 | - - `source_in_index_languages` | ||
| 153 | - - `index_languages` | 143 | +- `QueryParser` 负责“解析事实”: |
| 144 | + - `query_normalized` | ||
| 145 | + - `rewritten_query` | ||
| 146 | + - `detected_language` | ||
| 147 | + - `translations` | ||
| 148 | + - `query_vector` | ||
| 149 | + - `query_tokens` | ||
| 154 | - `contains_chinese` / `contains_english` | 150 | - `contains_chinese` / `contains_english` |
| 151 | +- `Searcher` 负责“租户语境”: | ||
| 152 | + - `index_languages` | ||
| 153 | + - 将其传给 parser 作为 `target_languages` | ||
| 154 | + - 将其传给 builder 作为字段展开约束 | ||
| 155 | - `ESQueryBuilder` 负责“表达式展开”: | 155 | - `ESQueryBuilder` 负责“表达式展开”: |
| 156 | - 动态字段组装 | 156 | - 动态字段组装 |
| 157 | - 子句权重分配 | 157 | - 子句权重分配 |
| 158 | - - 翻译缺失兜底子句拼接 | 158 | + - `base_query` / `base_query_trans_*` 子句拼接 |
| 159 | + - 跳过“与 base_query 文本和语言完全相同”的重复翻译子句 | ||
| 159 | 160 | ||
| 160 | -这种分层让策略调优主要落在配置和 Builder,不破坏 Parser 的职责边界。 | 161 | +这种分层让 parser 不再返回 ES 专用的“语言计划字段”,职责边界更清晰。 |
| 161 | 162 | ||
| 162 | ## 8. 融合打分(Rerank + Text + KNN) | 163 | ## 8. 融合打分(Rerank + Text + KNN) |
| 163 | 164 | ||
| @@ -165,24 +166,21 @@ | @@ -165,24 +166,21 @@ | ||
| 165 | 166 | ||
| 166 | ### 8.1 文本相关性大分 | 167 | ### 8.1 文本相关性大分 |
| 167 | 168 | ||
| 168 | -文本大分由三部分组成: | 169 | +文本大分由两部分组成: |
| 169 | 170 | ||
| 170 | - `base_query` | 171 | - `base_query` |
| 171 | - `base_query_trans_*` | 172 | - `base_query_trans_*` |
| 172 | -- `fallback_original_query_*` | ||
| 173 | 173 | ||
| 174 | 聚合方式: | 174 | 聚合方式: |
| 175 | 175 | ||
| 176 | 1. `source_score = base_query` | 176 | 1. `source_score = base_query` |
| 177 | 2. `translation_score = max(base_query_trans_*)` | 177 | 2. `translation_score = max(base_query_trans_*)` |
| 178 | -3. `fallback_score = max(fallback_original_query_*)` | ||
| 179 | -4. 加权: | 178 | +3. 加权: |
| 180 | - `weighted_source = source_score` | 179 | - `weighted_source = source_score` |
| 181 | - `weighted_translation = 0.8 * translation_score` | 180 | - `weighted_translation = 0.8 * translation_score` |
| 182 | - - `weighted_fallback = 0.55 * fallback_score` | ||
| 183 | -5. 合成: | ||
| 184 | - - `primary = max(weighted_source, weighted_translation, weighted_fallback)` | ||
| 185 | - - `support = weighted_source + weighted_translation + weighted_fallback - primary` | 181 | +4. 合成: |
| 182 | + - `primary = max(weighted_source, weighted_translation)` | ||
| 183 | + - `support = weighted_source + weighted_translation - primary` | ||
| 186 | - `text_score = primary + 0.25 * support` | 184 | - `text_score = primary + 0.25 * support` |
| 187 | 185 | ||
| 188 | 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。 | 186 | 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。 |
| @@ -212,7 +210,6 @@ fused_score = ( | @@ -212,7 +210,6 @@ fused_score = ( | ||
| 212 | - `text_score` | 210 | - `text_score` |
| 213 | - `text_source_score` | 211 | - `text_source_score` |
| 214 | - `text_translation_score` | 212 | - `text_translation_score` |
| 215 | -- `text_fallback_score` | ||
| 216 | - `text_primary_score` | 213 | - `text_primary_score` |
| 217 | - `text_support_score` | 214 | - `text_support_score` |
| 218 | - `knn_score` | 215 | - `knn_score` |
| @@ -221,9 +218,9 @@ fused_score = ( | @@ -221,9 +218,9 @@ fused_score = ( | ||
| 221 | 218 | ||
| 222 | `debug_info.query_analysis` 还会暴露: | 219 | `debug_info.query_analysis` 还会暴露: |
| 223 | 220 | ||
| 224 | -- `query_text_by_lang` | ||
| 225 | -- `search_langs` | ||
| 226 | -- `supplemental_search_langs` | 221 | +- `translations` |
| 222 | +- `detected_language` | ||
| 223 | +- `rewritten_query` | ||
| 227 | 224 | ||
| 228 | 这些字段用于检索效果评估与 bad case 归因。 | 225 | 这些字段用于检索效果评估与 bad case 归因。 |
| 229 | 226 | ||
| @@ -231,7 +228,7 @@ fused_score = ( | @@ -231,7 +228,7 @@ fused_score = ( | ||
| 231 | 228 | ||
| 232 | 1. 当前文本主链路已移除布尔 AST 分支。 | 229 | 1. 当前文本主链路已移除布尔 AST 分支。 |
| 233 | 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。 | 230 | 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。 |
| 234 | -3. `HanLP` 为可选依赖;不可用时退化到轻量分词,不影响主链路可用性。 | 231 | +3. `HanLP` 为必需依赖;当前 parser 不再提供轻量 fallback。 |
| 235 | 4. 若后续扩展到更多语种,请确保: | 232 | 4. 若后续扩展到更多语种,请确保: |
| 236 | - mapping 中存在对应 `.<lang>` 字段 | 233 | - mapping 中存在对应 `.<lang>` 字段 |
| 237 | - `index_languages` 配置在支持列表内 | 234 | - `index_languages` 配置在支持列表内 |
| @@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py | @@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py | ||
| 263 | 建议在 `tests/` 增加文本策略用例: | 260 | 建议在 `tests/` 增加文本策略用例: |
| 264 | 261 | ||
| 265 | 1. 源语种在索引语言,翻译命中缓存 | 262 | 1. 源语种在索引语言,翻译命中缓存 |
| 266 | -2. 源语种不在索引语言,翻译部分失败(验证 fallback 子句) | ||
| 267 | -3. 源语种不在索引语言,翻译全部失败(验证多目标 fallback) | ||
| 268 | -4. 自定义 `original_query_fallback_boost_when_translation_missing` 生效 | ||
| 269 | -5. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) | 263 | +2. 源语种不在索引语言,翻译部分失败(验证仅保留 `base_query` + 成功翻译子句) |
| 264 | +3. 源语种不在索引语言,翻译全部失败(验证无 `base_query_trans_*` 时仍可正常执行) | ||
| 265 | +4. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) | ||
| 270 | 266 | ||
| 271 | 267 | ||
| 272 | 268 |
requirements_hanlp.txt
| @@ -0,0 +1,13 @@ | @@ -0,0 +1,13 @@ | ||
| 1 | +# Optional: HanLP query tokenization for the main backend venv (QueryParser). | ||
| 2 | +# | ||
| 3 | +# Install: | ||
| 4 | +# source activate.sh | ||
| 5 | +# pip install -r requirements_hanlp.txt | ||
| 6 | +# | ||
| 7 | +# Why pin transformers<5: | ||
| 8 | +# transformers 5.x no longer exposes `encode_plus` on `BertTokenizer`, but HanLP 2.1.x | ||
| 9 | +# still calls it → AttributeError during `hanlp.load(...)`. | ||
| 10 | +# Use transformers 4.44+ (4.x) which remains API-compatible with HanLP. | ||
| 11 | + | ||
| 12 | +hanlp>=2.1.0 | ||
| 13 | +transformers>=4.44,<5 |
scripts/eval_search_quality.py
| @@ -83,7 +83,6 @@ class RankedItem: | @@ -83,7 +83,6 @@ class RankedItem: | ||
| 83 | text_score: float | None | 83 | text_score: float | None |
| 84 | text_source_score: float | None | 84 | text_source_score: float | None |
| 85 | text_translation_score: float | None | 85 | text_translation_score: float | None |
| 86 | - text_fallback_score: float | None | ||
| 87 | text_primary_score: float | None | 86 | text_primary_score: float | None |
| 88 | text_support_score: float | None | 87 | text_support_score: float | None |
| 89 | knn_score: float | None | 88 | knn_score: float | None |
| @@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -> Dict[str, Any]: | @@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -> Dict[str, Any]: | ||
| 146 | text_score=_to_float(debug_item.get("text_score")), | 145 | text_score=_to_float(debug_item.get("text_score")), |
| 147 | text_source_score=_to_float(debug_item.get("text_source_score")), | 146 | text_source_score=_to_float(debug_item.get("text_source_score")), |
| 148 | text_translation_score=_to_float(debug_item.get("text_translation_score")), | 147 | text_translation_score=_to_float(debug_item.get("text_translation_score")), |
| 149 | - text_fallback_score=_to_float(debug_item.get("text_fallback_score")), | ||
| 150 | text_primary_score=_to_float(debug_item.get("text_primary_score")), | 148 | text_primary_score=_to_float(debug_item.get("text_primary_score")), |
| 151 | text_support_score=_to_float(debug_item.get("text_support_score")), | 149 | text_support_score=_to_float(debug_item.get("text_support_score")), |
| 152 | knn_score=_to_float(debug_item.get("knn_score")), | 150 | knn_score=_to_float(debug_item.get("knn_score")), |
| @@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -> str: | @@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -> str: | ||
| 185 | f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}" | 183 | f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}" |
| 186 | ) | 184 | ) |
| 187 | lines.append( | 185 | lines.append( |
| 188 | - f"- detected_language={qa.get('detected_language')} search_langs={qa.get('search_langs')} supplemental_search_langs={qa.get('supplemental_search_langs')}" | 186 | + f"- detected_language={qa.get('detected_language')} translations={qa.get('translations')}" |
| 189 | ) | 187 | ) |
| 190 | - lines.append(f"- query_text_by_lang={qa.get('query_text_by_lang')}") | ||
| 191 | lines.append("") | 188 | lines.append("") |
| 192 | - lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | text_fb | knn | es | matched_queries |") | ||
| 193 | - lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |") | 189 | + lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | knn | es | matched_queries |") |
| 190 | + lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |") | ||
| 194 | for item in entry.get("top20", []): | 191 | for item in entry.get("top20", []): |
| 195 | title = str(item.get("title", "")).replace("|", "/") | 192 | title = str(item.get("title", "")).replace("|", "/") |
| 196 | matched = json.dumps(item.get("matched_queries"), ensure_ascii=False) | 193 | matched = json.dumps(item.get("matched_queries"), ensure_ascii=False) |
| @@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -> str: | @@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -> str: | ||
| 199 | f"| {item.get('rank')} | {item.get('spu_id')} | {title} | " | 196 | f"| {item.get('rank')} | {item.get('spu_id')} | {title} | " |
| 200 | f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | " | 197 | f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | " |
| 201 | f"{item.get('text_source_score')} | {item.get('text_translation_score')} | " | 198 | f"{item.get('text_source_score')} | {item.get('text_translation_score')} | " |
| 202 | - f"{item.get('text_fallback_score')} | {item.get('knn_score')} | {item.get('es_score')} | {matched} |" | 199 | + f"{item.get('knn_score')} | {item.get('es_score')} | {matched} |" |
| 203 | ) | 200 | ) |
| 204 | lines.append("") | 201 | lines.append("") |
| 205 | return "\n".join(lines) | 202 | return "\n".join(lines) |
search/es_query_builder.py
| @@ -36,9 +36,6 @@ class ESQueryBuilder: | @@ -36,9 +36,6 @@ class ESQueryBuilder: | ||
| 36 | base_minimum_should_match: str = "70%", | 36 | base_minimum_should_match: str = "70%", |
| 37 | translation_minimum_should_match: str = "70%", | 37 | translation_minimum_should_match: str = "70%", |
| 38 | translation_boost: float = 0.4, | 38 | translation_boost: float = 0.4, |
| 39 | - translation_boost_when_source_missing: float = 1.0, | ||
| 40 | - source_boost_when_missing: float = 0.6, | ||
| 41 | - original_query_fallback_boost_when_translation_missing: float = 0.2, | ||
| 42 | tie_breaker_base_query: float = 0.9, | 39 | tie_breaker_base_query: float = 0.9, |
| 43 | mixed_script_merged_field_boost_scale: float = 0.6, | 40 | mixed_script_merged_field_boost_scale: float = 0.6, |
| 44 | ): | 41 | ): |
| @@ -74,11 +71,6 @@ class ESQueryBuilder: | @@ -74,11 +71,6 @@ class ESQueryBuilder: | ||
| 74 | self.base_minimum_should_match = base_minimum_should_match | 71 | self.base_minimum_should_match = base_minimum_should_match |
| 75 | self.translation_minimum_should_match = translation_minimum_should_match | 72 | self.translation_minimum_should_match = translation_minimum_should_match |
| 76 | self.translation_boost = float(translation_boost) | 73 | self.translation_boost = float(translation_boost) |
| 77 | - self.translation_boost_when_source_missing = float(translation_boost_when_source_missing) | ||
| 78 | - self.source_boost_when_missing = float(source_boost_when_missing) | ||
| 79 | - self.original_query_fallback_boost_when_translation_missing = float( | ||
| 80 | - original_query_fallback_boost_when_translation_missing | ||
| 81 | - ) | ||
| 82 | self.tie_breaker_base_query = float(tie_breaker_base_query) | 74 | self.tie_breaker_base_query = float(tie_breaker_base_query) |
| 83 | self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) | 75 | self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) |
| 84 | 76 | ||
| @@ -168,7 +160,7 @@ class ESQueryBuilder: | @@ -168,7 +160,7 @@ class ESQueryBuilder: | ||
| 168 | 结构:filters and (text_recall or embedding_recall) + post_filter | 160 | 结构:filters and (text_recall or embedding_recall) + post_filter |
| 169 | - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合) | 161 | - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合) |
| 170 | - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合) | 162 | - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合) |
| 171 | - - text_recall: 文本相关性召回(按 search_langs 动态语言字段) | 163 | + - text_recall: 文本相关性召回(按实际 clause 语言动态字段) |
| 172 | - embedding_recall: 向量召回(KNN) | 164 | - embedding_recall: 向量召回(KNN) |
| 173 | - function_score: 包装召回部分,支持提权字段 | 165 | - function_score: 包装召回部分,支持提权字段 |
| 174 | 166 | ||
| @@ -484,6 +476,7 @@ class ESQueryBuilder: | @@ -484,6 +476,7 @@ class ESQueryBuilder: | ||
| 484 | contains_chinese: bool, | 476 | contains_chinese: bool, |
| 485 | contains_english: bool, | 477 | contains_english: bool, |
| 486 | index_languages: List[str], | 478 | index_languages: List[str], |
| 479 | + is_source: bool = False | ||
| 487 | ) -> List[MatchFieldSpec]: | 480 | ) -> List[MatchFieldSpec]: |
| 488 | """ | 481 | """ |
| 489 | When the query mixes scripts, widen each clause to indexed fields for the other script | 482 | When the query mixes scripts, widen each clause to indexed fields for the other script |
| @@ -497,10 +490,11 @@ class ESQueryBuilder: | @@ -497,10 +490,11 @@ class ESQueryBuilder: | ||
| 497 | 490 | ||
| 498 | out = list(specs) | 491 | out = list(specs) |
| 499 | lnorm = (lang or "").strip().lower() | 492 | lnorm = (lang or "").strip().lower() |
| 500 | - if contains_english and lnorm != "en" and can_use("en"): | ||
| 501 | - out = self._merge_supplemental_lang_field_specs(out, "en") | ||
| 502 | - if contains_chinese and lnorm != "zh" and can_use("zh"): | ||
| 503 | - out = self._merge_supplemental_lang_field_specs(out, "zh") | 493 | + if is_source: |
| 494 | + if contains_english and lnorm != "en" and can_use("en"): | ||
| 495 | + out = self._merge_supplemental_lang_field_specs(out, "en") | ||
| 496 | + if contains_chinese and lnorm != "zh" and can_use("zh"): | ||
| 497 | + out = self._merge_supplemental_lang_field_specs(out, "zh") | ||
| 504 | return out | 498 | return out |
| 505 | 499 | ||
| 506 | def _get_embedding_field(self, language: str) -> str: | 500 | def _get_embedding_field(self, language: str) -> str: |
| @@ -557,10 +551,6 @@ class ESQueryBuilder: | @@ -557,10 +551,6 @@ class ESQueryBuilder: | ||
| 557 | contains_english = bool(getattr(parsed_query, "contains_english", False)) | 551 | contains_english = bool(getattr(parsed_query, "contains_english", False)) |
| 558 | 552 | ||
| 559 | source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language | 553 | source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language |
| 560 | - source_in_index_languages = ( | ||
| 561 | - True if not normalized_index_languages else source_lang in normalized_index_languages | ||
| 562 | - ) | ||
| 563 | - | ||
| 564 | base_query_text = ( | 554 | base_query_text = ( |
| 565 | getattr(parsed_query, "rewritten_query", None) if parsed_query else None | 555 | getattr(parsed_query, "rewritten_query", None) if parsed_query else None |
| 566 | ) or query_text | 556 | ) or query_text |
| @@ -574,22 +564,14 @@ class ESQueryBuilder: | @@ -574,22 +564,14 @@ class ESQueryBuilder: | ||
| 574 | contains_chinese, | 564 | contains_chinese, |
| 575 | contains_english, | 565 | contains_english, |
| 576 | normalized_index_languages, | 566 | normalized_index_languages, |
| 567 | + is_source, | ||
| 577 | ) | 568 | ) |
| 578 | match_fields = self._format_match_field_specs(expanded_specs) | 569 | match_fields = self._format_match_field_specs(expanded_specs) |
| 579 | if not match_fields: | 570 | if not match_fields: |
| 580 | return | 571 | return |
| 581 | - clause_boost = 1.0 | ||
| 582 | minimum_should_match = ( | 572 | minimum_should_match = ( |
| 583 | self.base_minimum_should_match if is_source else self.translation_minimum_should_match | 573 | self.base_minimum_should_match if is_source else self.translation_minimum_should_match |
| 584 | ) | 574 | ) |
| 585 | - if is_source and not source_in_index_languages: | ||
| 586 | - clause_boost = self.source_boost_when_missing | ||
| 587 | - elif not is_source: | ||
| 588 | - clause_boost = ( | ||
| 589 | - self.translation_boost | ||
| 590 | - if source_in_index_languages | ||
| 591 | - else self.translation_boost_when_source_missing | ||
| 592 | - ) | ||
| 593 | 575 | ||
| 594 | clause = { | 576 | clause = { |
| 595 | "multi_match": { | 577 | "multi_match": { |
| @@ -600,8 +582,11 @@ class ESQueryBuilder: | @@ -600,8 +582,11 @@ class ESQueryBuilder: | ||
| 600 | "tie_breaker": self.tie_breaker_base_query, | 582 | "tie_breaker": self.tie_breaker_base_query, |
| 601 | } | 583 | } |
| 602 | } | 584 | } |
| 603 | - if abs(clause_boost - 1.0) > 1e-9: | ||
| 604 | - clause["multi_match"]["boost"] = clause_boost | 585 | + # base_query: never set multi_match.boost (ES default 1.0). |
| 586 | + # Translation clauses: single knob from config — translation_boost. | ||
| 587 | + if not is_source: | ||
| 588 | + tb = float(self.translation_boost) | ||
| 589 | + clause["multi_match"]["boost"] = tb | ||
| 605 | should_clauses.append({ | 590 | should_clauses.append({ |
| 606 | "multi_match": clause["multi_match"] | 591 | "multi_match": clause["multi_match"] |
| 607 | }) | 592 | }) |
search/rerank_client.py
| @@ -116,7 +116,6 @@ def _extract_named_query_score(matched_queries: Any, name: str) -> float: | @@ -116,7 +116,6 @@ def _extract_named_query_score(matched_queries: Any, name: str) -> float: | ||
| 116 | def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]: | 116 | def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]: |
| 117 | source_score = _extract_named_query_score(matched_queries, "base_query") | 117 | source_score = _extract_named_query_score(matched_queries, "base_query") |
| 118 | translation_score = 0.0 | 118 | translation_score = 0.0 |
| 119 | - fallback_score = 0.0 | ||
| 120 | 119 | ||
| 121 | if isinstance(matched_queries, dict): | 120 | if isinstance(matched_queries, dict): |
| 122 | for query_name, score in matched_queries.items(): | 121 | for query_name, score in matched_queries.items(): |
| @@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa | @@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa | ||
| 125 | numeric_score = _to_score(score) | 124 | numeric_score = _to_score(score) |
| 126 | if query_name.startswith("base_query_trans_"): | 125 | if query_name.startswith("base_query_trans_"): |
| 127 | translation_score = max(translation_score, numeric_score) | 126 | translation_score = max(translation_score, numeric_score) |
| 128 | - elif query_name.startswith("fallback_original_query_"): | ||
| 129 | - fallback_score = max(fallback_score, numeric_score) | ||
| 130 | elif isinstance(matched_queries, list): | 127 | elif isinstance(matched_queries, list): |
| 131 | for query_name in matched_queries: | 128 | for query_name in matched_queries: |
| 132 | if not isinstance(query_name, str): | 129 | if not isinstance(query_name, str): |
| 133 | continue | 130 | continue |
| 134 | if query_name.startswith("base_query_trans_"): | 131 | if query_name.startswith("base_query_trans_"): |
| 135 | translation_score = 1.0 | 132 | translation_score = 1.0 |
| 136 | - elif query_name.startswith("fallback_original_query_"): | ||
| 137 | - fallback_score = 1.0 | ||
| 138 | 133 | ||
| 139 | weighted_source = source_score | 134 | weighted_source = source_score |
| 140 | weighted_translation = 0.8 * translation_score | 135 | weighted_translation = 0.8 * translation_score |
| 141 | - weighted_fallback = 0.55 * fallback_score | ||
| 142 | - weighted_components = [weighted_source, weighted_translation, weighted_fallback] | 136 | + weighted_components = [weighted_source, weighted_translation] |
| 143 | primary_text_score = max(weighted_components) | 137 | primary_text_score = max(weighted_components) |
| 144 | support_text_score = sum(weighted_components) - primary_text_score | 138 | support_text_score = sum(weighted_components) - primary_text_score |
| 145 | text_score = primary_text_score + 0.25 * support_text_score | 139 | text_score = primary_text_score + 0.25 * support_text_score |
| @@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa | @@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa | ||
| 153 | return { | 147 | return { |
| 154 | "source_score": source_score, | 148 | "source_score": source_score, |
| 155 | "translation_score": translation_score, | 149 | "translation_score": translation_score, |
| 156 | - "fallback_score": fallback_score, | ||
| 157 | "weighted_source_score": weighted_source, | 150 | "weighted_source_score": weighted_source, |
| 158 | "weighted_translation_score": weighted_translation, | 151 | "weighted_translation_score": weighted_translation, |
| 159 | - "weighted_fallback_score": weighted_fallback, | ||
| 160 | "primary_text_score": primary_text_score, | 152 | "primary_text_score": primary_text_score, |
| 161 | "support_text_score": support_text_score, | 153 | "support_text_score": support_text_score, |
| 162 | "text_score": text_score, | 154 | "text_score": text_score, |
| @@ -219,7 +211,6 @@ def fuse_scores_and_resort( | @@ -219,7 +211,6 @@ def fuse_scores_and_resort( | ||
| 219 | hit["_knn_score"] = knn_score | 211 | hit["_knn_score"] = knn_score |
| 220 | hit["_text_source_score"] = text_components["source_score"] | 212 | hit["_text_source_score"] = text_components["source_score"] |
| 221 | hit["_text_translation_score"] = text_components["translation_score"] | 213 | hit["_text_translation_score"] = text_components["translation_score"] |
| 222 | - hit["_text_fallback_score"] = text_components["fallback_score"] | ||
| 223 | hit["_text_primary_score"] = text_components["primary_text_score"] | 214 | hit["_text_primary_score"] = text_components["primary_text_score"] |
| 224 | hit["_text_support_score"] = text_components["support_text_score"] | 215 | hit["_text_support_score"] = text_components["support_text_score"] |
| 225 | hit["_fused_score"] = fused | 216 | hit["_fused_score"] = fused |
| @@ -231,7 +222,6 @@ def fuse_scores_and_resort( | @@ -231,7 +222,6 @@ def fuse_scores_and_resort( | ||
| 231 | "text_score": text_score, | 222 | "text_score": text_score, |
| 232 | "text_source_score": text_components["source_score"], | 223 | "text_source_score": text_components["source_score"], |
| 233 | "text_translation_score": text_components["translation_score"], | 224 | "text_translation_score": text_components["translation_score"], |
| 234 | - "text_fallback_score": text_components["fallback_score"], | ||
| 235 | "text_primary_score": text_components["primary_text_score"], | 225 | "text_primary_score": text_components["primary_text_score"], |
| 236 | "text_support_score": text_components["support_text_score"], | 226 | "text_support_score": text_components["support_text_score"], |
| 237 | "knn_score": knn_score, | 227 | "knn_score": knn_score, |
search/searcher.py
| @@ -132,11 +132,6 @@ class Searcher: | @@ -132,11 +132,6 @@ class Searcher: | ||
| 132 | base_minimum_should_match=self.config.query_config.base_minimum_should_match, | 132 | base_minimum_should_match=self.config.query_config.base_minimum_should_match, |
| 133 | translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, | 133 | translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, |
| 134 | translation_boost=self.config.query_config.translation_boost, | 134 | translation_boost=self.config.query_config.translation_boost, |
| 135 | - translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing, | ||
| 136 | - source_boost_when_missing=self.config.query_config.source_boost_when_missing, | ||
| 137 | - original_query_fallback_boost_when_translation_missing=( | ||
| 138 | - self.config.query_config.original_query_fallback_boost_when_translation_missing | ||
| 139 | - ), | ||
| 140 | tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, | 135 | tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, |
| 141 | ) | 136 | ) |
| 142 | 137 | ||
| @@ -267,13 +262,6 @@ class Searcher: | @@ -267,13 +262,6 @@ class Searcher: | ||
| 267 | if normalized: | 262 | if normalized: |
| 268 | candidates.append(normalized) | 263 | candidates.append(normalized) |
| 269 | 264 | ||
| 270 | - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", {}) or {} | ||
| 271 | - if isinstance(query_text_by_lang, dict): | ||
| 272 | - for text in query_text_by_lang.values(): | ||
| 273 | - normalized = self._normalize_sku_match_text(text) | ||
| 274 | - if normalized: | ||
| 275 | - candidates.append(normalized) | ||
| 276 | - | ||
| 277 | translations = getattr(parsed_query, "translations", {}) or {} | 265 | translations = getattr(parsed_query, "translations", {}) or {} |
| 278 | if isinstance(translations, dict): | 266 | if isinstance(translations, dict): |
| 279 | for text in translations.values(): | 267 | for text in translations.values(): |
| @@ -943,7 +931,6 @@ class Searcher: | @@ -943,7 +931,6 @@ class Searcher: | ||
| 943 | debug_entry["text_score"] = rerank_debug.get("text_score") | 931 | debug_entry["text_score"] = rerank_debug.get("text_score") |
| 944 | debug_entry["text_source_score"] = rerank_debug.get("text_source_score") | 932 | debug_entry["text_source_score"] = rerank_debug.get("text_source_score") |
| 945 | debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score") | 933 | debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score") |
| 946 | - debug_entry["text_fallback_score"] = rerank_debug.get("text_fallback_score") | ||
| 947 | debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score") | 934 | debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score") |
| 948 | debug_entry["text_support_score"] = rerank_debug.get("text_support_score") | 935 | debug_entry["text_support_score"] = rerank_debug.get("text_support_score") |
| 949 | debug_entry["knn_score"] = rerank_debug.get("knn_score") | 936 | debug_entry["knn_score"] = rerank_debug.get("knn_score") |
tests/test_es_query_builder_text_recall_languages.py
0 → 100644
| @@ -0,0 +1,519 @@ | @@ -0,0 +1,519 @@ | ||
| 1 | +""" | ||
| 2 | +ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. | ||
| 3 | + | ||
| 4 | +Covers combinations of query language vs tenant index_languages, translations, | ||
| 5 | +and mixed Chinese/English queries. Asserts multi_match _name, query text, and | ||
| 6 | +target language fields (title.{lang}). | ||
| 7 | +""" | ||
| 8 | + | ||
| 9 | +from types import SimpleNamespace | ||
| 10 | +from typing import Any, Dict, List | ||
| 11 | + | ||
| 12 | +import numpy as np | ||
| 13 | + | ||
| 14 | +from search.es_query_builder import ESQueryBuilder | ||
| 15 | + | ||
| 16 | + | ||
def _builder_multilingual_title_only(
    *,
    default_language: str = "en",
    mixed_script_scale: float = 0.6,
) -> ESQueryBuilder:
    """Construct a minimal builder whose only text field is title.{lang}.

    Keeping a single multilingual field makes field-level assertions trivial.
    """
    builder_kwargs = dict(
        match_fields=["title.en^1.0"],
        multilingual_fields=["title"],
        shared_fields=[],
        text_embedding_field="title_embedding",
        default_language=default_language,
        mixed_script_merged_field_boost_scale=mixed_script_scale,
        function_score_config=None,
    )
    return ESQueryBuilder(**builder_kwargs)
| 32 | + | ||
| 33 | + | ||
| 34 | +def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]: | ||
| 35 | + """Navigate bool.must / function_score wrappers to the text recall root.""" | ||
| 36 | + q = es_body.get("query") or {} | ||
| 37 | + if "bool" in q and "must" in q["bool"] and q["bool"]["must"]: | ||
| 38 | + q = q["bool"]["must"][0] | ||
| 39 | + if "function_score" in q: | ||
| 40 | + q = q["function_score"]["query"] | ||
| 41 | + return q | ||
| 42 | + | ||
| 43 | + | ||
def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect every multi_match clause found at the text-recall root."""
    root = _unwrap_inner_query(es_body)
    if "multi_match" in root:
        return [root["multi_match"]]
    should_clauses = (root.get("bool") or {}).get("should") or []
    return [entry["multi_match"] for entry in should_clauses if "multi_match" in entry]
| 50 | + | ||
| 51 | + | ||
def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Index the multi_match clauses by their _name; unnamed clauses are dropped."""
    named: Dict[str, Dict[str, Any]] = {}
    for clause in _extract_multi_match_clauses(es_body):
        label = clause.get("_name")
        if label:
            named[str(label)] = clause
    return named
| 60 | + | ||
| 61 | + | ||
| 62 | +def _title_fields(mm: Dict[str, Any]) -> List[str]: | ||
| 63 | + fields = mm.get("fields") or [] | ||
| 64 | + return [f for f in fields if str(f).startswith("title.")] | ||
| 65 | + | ||
| 66 | + | ||
| 67 | +def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool: | ||
| 68 | + """True if any field is title.{lang} with optional ^boost suffix.""" | ||
| 69 | + prefix = f"title.{lang}" | ||
| 70 | + for f in mm.get("fields") or []: | ||
| 71 | + s = str(f) | ||
| 72 | + if s == prefix or s.startswith(prefix + "^"): | ||
| 73 | + return True | ||
| 74 | + return False | ||
| 75 | + | ||
| 76 | + | ||
def _build(
    qb: ESQueryBuilder,
    *,
    query_text: str,
    rewritten: str,
    detected_language: str,
    translations: Dict[str, str],
    index_languages: List[str],
    contains_chinese: bool = False,
    contains_english: bool = False,
) -> Dict[str, Any]:
    """Run qb.build_query against a stubbed parsed-query object, KNN disabled."""
    parsed_stub = SimpleNamespace(
        rewritten_query=rewritten,
        detected_language=detected_language,
        translations=dict(translations),
        contains_chinese=contains_chinese,
        contains_english=contains_english,
    )
    return qb.build_query(
        query_text=query_text,
        parsed_query=parsed_stub,
        enable_knn=False,
        index_languages=index_languages,
    )
| 101 | + | ||
| 102 | + | ||
| 103 | +# --- 检测语言在 index_languages 内:主召回 + 翻译补召回 --- | ||
| 104 | + | ||
| 105 | + | ||
def test_zh_query_index_zh_en_includes_base_zh_and_trans_en():
    """zh query on a zh+en index: base clause on title.zh, translation on title.en."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="连衣裙",
        rewritten="连衣裙",
        detected_language="zh",
        translations={"en": "dress"},
        index_languages=["zh", "en"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en"}
    base, trans = clauses["base_query"], clauses["base_query_trans_en"]
    assert base["query"] == "连衣裙"
    assert "title.zh" in _title_fields(base)
    assert trans["query"] == "dress"
    assert "title.en" in _title_fields(trans)
| 122 | + | ||
| 123 | + | ||
def test_en_query_index_zh_en_includes_base_en_and_trans_zh():
    """en query on an en+zh index: base clause on title.en, translation on title.zh."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    base, trans = clauses["base_query"], clauses["base_query_trans_zh"]
    assert base["query"] == "dress"
    assert "title.en" in _title_fields(base)
    assert trans["query"] == "连衣裙"
    assert "title.zh" in _title_fields(trans)
| 140 | + | ||
| 141 | + | ||
def test_de_query_index_de_en_fr_includes_base_and_two_translations():
    """de query with two translations yields base + one clause per translation."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="kleid",
        rewritten="kleid",
        detected_language="de",
        translations={"en": "dress", "fr": "robe"},
        index_languages=["de", "en", "fr"],
    )
    clauses = _clauses_index(body)
    expected_names = {"base_query", "base_query_trans_en", "base_query_trans_fr"}
    assert set(clauses) == expected_names
    assert clauses["base_query"]["query"] == "kleid"
    assert "title.de" in _title_fields(clauses["base_query"])
    assert clauses["base_query_trans_en"]["query"] == "dress"
    assert clauses["base_query_trans_fr"]["query"] == "robe"
| 158 | + | ||
| 159 | + | ||
| 160 | +# --- 检测语言不在 index_languages:仍有 base(弱)+ 翻译(强) --- | ||
| 161 | + | ||
| 162 | + | ||
def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
    """Detected language absent from index: base stays unboosted on title.de,
    translation clauses carry translation_boost on their own language fields."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="schuh",
        rewritten="schuh",
        detected_language="de",
        translations={"en": "shoe", "zh": "鞋"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en", "base_query_trans_zh"}
    base = clauses["base_query"]
    assert base["query"] == "schuh"
    assert "title.de" in _title_fields(base)
    assert "boost" not in base
    for lang, text in (("en", "shoe"), ("zh", "鞋")):
        trans = clauses[f"base_query_trans_{lang}"]
        assert trans["query"] == text
        assert trans["boost"] == builder.translation_boost
| 182 | + | ||
| 183 | + | ||
| 184 | +# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 --- | ||
| 185 | + | ||
| 186 | + | ||
def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
    """Mixed-script zh-primary query widens the base clause onto title.en as well."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="红色 dress",
        rewritten="红色 dress",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en"}
    base = clauses["base_query"]
    assert base["query"] == "红色 dress"
    assert _has_title_lang(base, "zh")
    assert _has_title_lang(base, "en")
    trans = clauses["base_query_trans_en"]
    assert trans["query"] == "red dress"
    assert _has_title_lang(trans, "en")
| 205 | + | ||
| 206 | + | ||
def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
    """Mixed-script en-primary query widens the base clause onto title.zh as well."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="nike 运动鞋",
        rewritten="nike 运动鞋",
        detected_language="en",
        translations={"zh": "耐克运动鞋"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    base = clauses["base_query"]
    assert base["query"] == "nike 运动鞋"
    assert _has_title_lang(base, "en")
    assert _has_title_lang(base, "zh")
    assert clauses["base_query_trans_zh"]["query"] == "耐克运动鞋"
| 224 | + | ||
| 225 | + | ||
def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
    """Mixed-script query on a zh-only index must not widen onto title.en."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="法式 dress",
        rewritten="法式 dress",
        detected_language="zh",
        translations={},
        index_languages=["zh"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query"}
    # Strip ^boost suffixes before comparing field base names.
    field_bases = {f.split("^", 1)[0] for f in _title_fields(clauses["base_query"])}
    assert field_bases == {"title.zh"}
| 242 | + | ||
| 243 | + | ||
| 244 | +# --- 去重:与 base 同语言同文本的翻译项跳过 --- | ||
| 245 | + | ||
| 246 | + | ||
def test_skips_translation_when_same_lang_and_same_text_as_base():
    """A translation identical to the base text in the base language is deduplicated."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"en": "NIKE", "zh": "耐克"},
        index_languages=["en", "zh"],
    )
    # The en->en duplicate is skipped; only the zh translation survives.
    assert set(_clauses_index(body)) == {"base_query", "base_query_trans_zh"}
| 259 | + | ||
| 260 | + | ||
def test_keeps_translation_when_same_text_but_different_lang_than_base():
    """Same text under a different language key still generates its own clause."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"zh": "NIKE"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    assert clauses["base_query_trans_zh"]["query"] == "NIKE"
| 274 | + | ||
| 275 | + | ||
| 276 | +# --- 翻译 key 规范化、空翻译跳过 --- | ||
| 277 | + | ||
| 278 | + | ||
def test_translation_language_key_is_normalized_case_insensitive():
    """Uppercase translation language keys are lowercased in the clause name."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"ZH": "连衣裙"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert "base_query_trans_zh" in clauses
    assert clauses["base_query_trans_zh"]["query"] == "连衣裙"
| 292 | + | ||
| 293 | + | ||
def test_empty_translation_value_is_skipped():
    """Blank translation texts produce no clause; non-blank siblings still do."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": " ", "fr": "robe"},
        index_languages=["en", "zh", "fr"],
    )
    clauses = _clauses_index(body)
    assert "base_query_trans_fr" in clauses
    assert "base_query_trans_zh" not in clauses
| 307 | + | ||
| 308 | + | ||
# --- index_languages 为空:视为「未约束」——base 子句不加 boost,翻译子句仍用 translation_boost ---
| 310 | + | ||
| 311 | + | ||
def test_empty_index_languages_treats_source_as_in_index_boosts():
    """Empty index_languages means unconstrained: base clause has no boost key,
    translation clauses keep the single translation_boost knob."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="x",
        rewritten="x",
        detected_language="de",
        translations={"en": "y"},
        index_languages=[],
    )
    clauses = _clauses_index(body)
    assert "boost" not in clauses["base_query"]
    assert clauses["base_query_trans_en"]["boost"] == builder.translation_boost
| 325 | + | ||
| 326 | + | ||
| 327 | +# --- 无翻译:仅 base_query --- | ||
| 328 | + | ||
| 329 | + | ||
def test_no_translations_only_base_query():
    """With an empty translations map the only text clause is base_query."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="hello",
        rewritten="hello",
        detected_language="en",
        translations={},
        index_languages=["en", "zh"],
    )
    assert set(_clauses_index(body)) == {"base_query"}
| 342 | + | ||
| 343 | + | ||
| 344 | +# --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- | ||
| 345 | + | ||
| 346 | + | ||
def test_text_clauses_present_alongside_knn():
    """A top-level knn section must not disturb the text clauses inside `query`."""
    builder = _builder_multilingual_title_only(default_language="en")
    stub = SimpleNamespace(
        rewritten_query="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        contains_chinese=False,
        contains_english=True,
    )
    body = builder.build_query(
        query_text="dress",
        query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32),
        parsed_query=stub,
        enable_knn=True,
        index_languages=["en", "zh"],
    )
    assert "knn" in body
    assert set(_clauses_index(body)) == {"base_query", "base_query_trans_zh"}
| 366 | + | ||
| 367 | + | ||
def test_detected_language_unknown_falls_back_to_default_language():
    """Mirrors QueryConfig.default_language fallback when LanguageDetector fails."""
    builder = _builder_multilingual_title_only(default_language="en")
    stub = SimpleNamespace(
        rewritten_query="shirt",
        detected_language="unknown",
        translations={"zh": "衬衫"},
        contains_chinese=False,
        contains_english=True,
    )
    body = builder.build_query(
        query_text="shirt",
        parsed_query=stub,
        enable_knn=False,
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}
    base = clauses["base_query"]
    assert base["query"] == "shirt"
    assert _has_title_lang(base, "en")
| 388 | + | ||
| 389 | + | ||
def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
    """ru query on a ru+en index: base clause on title.ru plus an en translation."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="платье",
        rewritten="платье",
        detected_language="ru",
        translations={"en": "dress"},
        index_languages=["ru", "en"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == {"base_query", "base_query_trans_en"}
    base = clauses["base_query"]
    assert base["query"] == "платье"
    assert _has_title_lang(base, "ru")
    assert clauses["base_query_trans_en"]["query"] == "dress"
| 405 | + | ||
| 406 | + | ||
def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():
    """Current behavior: every non-empty translation yields a clause;
    index_languages only constrains mixed-script widening, it does not
    filter translation clauses."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙", "de": "Kleid"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert "base_query_trans_de" in clauses
    trans_de = clauses["base_query_trans_de"]
    assert trans_de["query"] == "Kleid"
    assert _has_title_lang(trans_de, "de")
| 425 | + | ||
| 426 | + | ||
def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base():
    """base_query always uses rewritten_query, not the raw query_text."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text=" 红色 ",
        rewritten="红色连衣裙",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=False,
    )
    clauses = _clauses_index(body)
    assert clauses["base_query"]["query"] == "红色连衣裙"
    assert clauses["base_query_trans_en"]["query"] == "red dress"
| 443 | + | ||
| 444 | + | ||
| 445 | +def test_detected_language_unknown_falls_back_to_default_language(): | ||
| 446 | + """与 LanguageDetector 失败时 QueryConfig.default_language 行为对齐。""" | ||
| 447 | + qb = _builder_multilingual_title_only(default_language="en") | ||
| 448 | + parsed = SimpleNamespace( | ||
| 449 | + rewritten_query="shirt", | ||
| 450 | + detected_language="unknown", | ||
| 451 | + translations={"zh": "衬衫"}, | ||
| 452 | + contains_chinese=False, | ||
| 453 | + contains_english=True, | ||
| 454 | + ) | ||
| 455 | + q = qb.build_query( | ||
| 456 | + query_text="shirt", | ||
| 457 | + parsed_query=parsed, | ||
| 458 | + enable_knn=False, | ||
| 459 | + index_languages=["en", "zh"], | ||
| 460 | + ) | ||
| 461 | + idx = _clauses_index(q) | ||
| 462 | + assert set(idx) == {"base_query", "base_query_trans_zh"} | ||
| 463 | + assert idx["base_query"]["query"] == "shirt" | ||
| 464 | + assert _has_title_lang(idx["base_query"], "en") | ||
| 465 | + | ||
| 466 | + | ||
| 467 | +def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | ||
| 468 | + qb = _builder_multilingual_title_only(default_language="en") | ||
| 469 | + q = _build( | ||
| 470 | + qb, | ||
| 471 | + query_text="платье", | ||
| 472 | + rewritten="платье", | ||
| 473 | + detected_language="ru", | ||
| 474 | + translations={"en": "dress"}, | ||
| 475 | + index_languages=["ru", "en"], | ||
| 476 | + ) | ||
| 477 | + idx = _clauses_index(q) | ||
| 478 | + assert set(idx) == {"base_query", "base_query_trans_en"} | ||
| 479 | + assert idx["base_query"]["query"] == "платье" | ||
| 480 | + assert _has_title_lang(idx["base_query"], "ru") | ||
| 481 | + assert idx["base_query_trans_en"]["query"] == "dress" | ||
| 482 | + | ||
| 483 | + | ||
| 484 | +def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): | ||
| 485 | + """ | ||
| 486 | + 当前实现:凡是 translations 里非空的条目都会生成子句; | ||
| 487 | + index_languages 只约束混写扩列,不用于过滤翻译子句。 | ||
| 488 | + """ | ||
| 489 | + qb = _builder_multilingual_title_only(default_language="en") | ||
| 490 | + q = _build( | ||
| 491 | + qb, | ||
| 492 | + query_text="dress", | ||
| 493 | + rewritten="dress", | ||
| 494 | + detected_language="en", | ||
| 495 | + translations={"zh": "连衣裙", "de": "Kleid"}, | ||
| 496 | + index_languages=["en", "zh"], | ||
| 497 | + ) | ||
| 498 | + idx = _clauses_index(q) | ||
| 499 | + assert "base_query_trans_de" in idx | ||
| 500 | + assert idx["base_query_trans_de"]["query"] == "Kleid" | ||
| 501 | + assert _has_title_lang(idx["base_query_trans_de"], "de") | ||
| 502 | + | ||
| 503 | + | ||
| 504 | +def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base(): | ||
| 505 | + """base_query 始终用 rewritten_query,而非仅 query_text。""" | ||
| 506 | + qb = _builder_multilingual_title_only(default_language="en") | ||
| 507 | + q = _build( | ||
| 508 | + qb, | ||
| 509 | + query_text=" 红色 ", | ||
| 510 | + rewritten="红色连衣裙", | ||
| 511 | + detected_language="zh", | ||
| 512 | + translations={"en": "red dress"}, | ||
| 513 | + index_languages=["zh", "en"], | ||
| 514 | + contains_chinese=True, | ||
| 515 | + contains_english=False, | ||
| 516 | + ) | ||
| 517 | + idx = _clauses_index(q) | ||
| 518 | + assert idx["base_query"]["query"] == "红色连衣裙" | ||
| 519 | + assert idx["base_query_trans_en"]["query"] == "red dress" |
tests/test_rerank_client.py
| @@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim | @@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim | ||
| 11 | "matched_queries": { | 11 | "matched_queries": { |
| 12 | "base_query": 2.4, | 12 | "base_query": 2.4, |
| 13 | "base_query_trans_zh": 1.8, | 13 | "base_query_trans_zh": 1.8, |
| 14 | - "fallback_original_query_zh": 1.2, | ||
| 15 | "knn_query": 0.8, | 14 | "knn_query": 0.8, |
| 16 | }, | 15 | }, |
| 17 | }, | 16 | }, |
| @@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim | @@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim | ||
| 27 | 26 | ||
| 28 | debug = fuse_scores_and_resort(hits, [0.9, 0.7]) | 27 | debug = fuse_scores_and_resort(hits, [0.9, 0.7]) |
| 29 | 28 | ||
| 30 | - expected_text_1 = 2.4 + 0.25 * ((0.8 * 1.8) + (0.55 * 1.2)) | 29 | + expected_text_1 = 2.4 + 0.25 * (0.8 * 1.8) |
| 31 | expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2) | 30 | expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2) |
| 32 | expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2) | 31 | expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2) |
| 33 | 32 | ||
| @@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim | @@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim | ||
| 38 | assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9) | 37 | assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9) |
| 39 | assert debug[0]["text_source_score"] == 2.4 | 38 | assert debug[0]["text_source_score"] == 2.4 |
| 40 | assert debug[0]["text_translation_score"] == 1.8 | 39 | assert debug[0]["text_translation_score"] == 1.8 |
| 41 | - assert debug[0]["text_fallback_score"] == 1.2 | ||
| 42 | assert debug[0]["knn_score"] == 0.8 | 40 | assert debug[0]["knn_score"] == 0.8 |
| 43 | assert [hit["_id"] for hit in hits] == ["2", "1"] | 41 | assert [hit["_id"] for hit in hits] == ["2", "1"] |
| 44 | 42 |