Commit e756b18eebb8bf04453e1383d003f941ac88db16
1 parent a3d3fb11
重构了文本召回构建器,现在每个 base_query / base_query_trans_*
子句都变成了一个带有以下结构的命名布尔查询: must:combined_fields should:加权后的 best_fields 和 phrase 子句 主要改动位于 search/es_query_builder.py,但此次调整沿用了现有语言路由设计,并未引入一次性分支。额外的 should 子句权重现在通过 config/schema.py、config/loader.py、search/searcher.py 以及 config/config.yaml 进行配置驱动,从而保持结构的集中管理。
Showing 9 changed files with 883 additions and 222 deletions — Show diff stats
config/config.yaml
| ... | ... | @@ -81,6 +81,7 @@ field_boosts: |
| 81 | 81 | category_path: 2.0 |
| 82 | 82 | brief: 1.5 |
| 83 | 83 | description: 1.5 |
| 84 | + vendor: 1.5 | |
| 84 | 85 | option1_values: 1.5 |
| 85 | 86 | option2_values: 1.5 |
| 86 | 87 | option3_values: 1.5 |
| ... | ... | @@ -126,6 +127,7 @@ query_config: |
| 126 | 127 | - "category_name_text" |
| 127 | 128 | - "brief" |
| 128 | 129 | - "description" |
| 130 | + - "vendor" | |
| 129 | 131 | shared_fields: |
| 130 | 132 | - "tags" |
| 131 | 133 | - "option1_values" |
| ... | ... | @@ -133,7 +135,7 @@ query_config: |
| 133 | 135 | - "option3_values" |
| 134 | 136 | core_multilingual_fields: |
| 135 | 137 | - "title" |
| 136 | - - "brief" | |
| 138 | + - "qanchors" | |
| 137 | 139 | - "category_name_text" |
| 138 | 140 | |
| 139 | 141 | # 统一文本召回策略(主查询 + 翻译查询) |
| ... | ... | @@ -142,6 +144,15 @@ query_config: |
| 142 | 144 | translation_minimum_should_match: "75%" |
| 143 | 145 | translation_boost: 0.75 |
| 144 | 146 | tie_breaker_base_query: 0.5 |
| 147 | + best_fields_boost: 2.0 | |
| 148 | + best_fields: | |
| 149 | + title: 4.0 | |
| 150 | + qanchors: 3.0 | |
| 151 | + category_name_text: 2.0 | |
| 152 | + phrase_fields: | |
| 153 | + title: 5.0 | |
| 154 | + qanchors: 4.0 | |
| 155 | + phrase_match_boost: 3.0 | |
| 145 | 156 | |
| 146 | 157 | # Embedding字段名称 |
| 147 | 158 | text_embedding_field: "title_embedding" | ... | ... |
config/loader.py
| ... | ... | @@ -285,6 +285,16 @@ class AppConfigLoader: |
| 285 | 285 | translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), |
| 286 | 286 | translation_boost=float(text_strategy.get("translation_boost", 0.4)), |
| 287 | 287 | tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)), |
| 288 | + best_fields={ | |
| 289 | + str(field): float(boost) | |
| 290 | + for field, boost in dict(text_strategy.get("best_fields") or {}).items() | |
| 291 | + }, | |
| 292 | + best_fields_boost=float(text_strategy.get("best_fields_boost", 2.0)), | |
| 293 | + phrase_fields={ | |
| 294 | + str(field): float(boost) | |
| 295 | + for field, boost in dict(text_strategy.get("phrase_fields") or {}).items() | |
| 296 | + }, | |
| 297 | + phrase_match_boost=float(text_strategy.get("phrase_match_boost", 3.0)), | |
| 288 | 298 | zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"), |
| 289 | 299 | en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"), |
| 290 | 300 | default_translation_model=str( | ... | ... |
config/schema.py
| ... | ... | @@ -55,6 +55,10 @@ class QueryConfig: |
| 55 | 55 | translation_minimum_should_match: str = "70%" |
| 56 | 56 | translation_boost: float = 0.4 |
| 57 | 57 | tie_breaker_base_query: float = 0.9 |
| 58 | + best_fields: Dict[str, float] = field(default_factory=dict) | |
| 59 | + best_fields_boost: float = 2.0 | |
| 60 | + phrase_fields: Dict[str, float] = field(default_factory=dict) | |
| 61 | + phrase_match_boost: float = 3.0 | |
| 58 | 62 | zh_to_en_model: str = "opus-mt-zh-en" |
| 59 | 63 | en_to_zh_model: str = "opus-mt-en-zh" |
| 60 | 64 | default_translation_model: str = "nllb-200-distilled-600m" | ... | ... |
| ... | ... | @@ -0,0 +1,596 @@ |
| 1 | +项目 TODO 清单 | |
| 2 | + | |
| 3 | +2. 核心搜索功能优化 | |
| 4 | + | |
| 5 | +2.1 意图识别模块 | |
| 6 | + | |
| 7 | +- 增加款式意图识别模块 | |
| 8 | + | |
| 9 | +- 意图类型: 颜色,尺码(目前只需要支持这两种) | |
| 10 | + | |
| 11 | +- 意图召回层: | |
| 12 | +每种意图,有一个召回词集合 | |
| 13 | +对query(包括原始query、各种翻译query 都做匹配) | |
| 14 | + | |
| 15 | +- 以颜色意图为例: | |
| 16 | +有一个词表,每一行 都逗号分割,互为同义词,行内第一个为标准化词 | |
| 17 | +query匹配了其中任何一个词,都认为,具有颜色意图 | |
| 18 | +匹配规则: 用细粒度、粗粒度分词,看是否有在词表中的。原始query分词、和每种翻译的分词,都要用。 | |
| 19 | + | |
| 20 | +- 意图判断: | |
| 21 | + 暂时留空,直接返回true。目前没有模型,即只要召回了(词表匹配了),即认为有该维度款式需求。 | |
| 22 | + (以后考虑建设fasttext/bert系列多分类模型) | |
| 23 | + | |
| 24 | +- 意图使用: | |
| 25 | +我们第一阶段,使用 参与ES提权。 | |
| 26 | + | |
| 27 | +- 一、参与ES提权 | |
| 28 | + | |
| 29 | +- 二、参与reranker | |
| 30 | + | |
| 31 | +- 如果有: 先做sku筛选,然后把最优的拼接到名称中,参与reranker。 | |
| 32 | + | |
| 33 | + | |
| 34 | + | |
| 35 | +- 现在在reranker、分页之后、做填充的时候,已经有做sku的筛选。 | |
| 36 | +需要优化: | |
| 37 | +现在是,先做包含的判断,找到第一个 option_value被query包含的,则直接认为匹配。改为 | |
| 38 | + 1. 第一轮:遍历完,如果有且仅有一个被query包含,那么认为匹配。 | |
| 39 | + 2. 第二轮:如果有多个符合(被query包含),跳到3。如果没有,对每个词都走泛化词表进行匹配。 | |
| 40 | + 3. 第三轮:如果有多个,那么对这多个,走embedding相关性取最高的。如果一个也没有,则对所有的走embedding相关性取最高的 | |
| 41 | + 这个sku筛选也需要提取为一个独立的模块。 | |
| 42 | + | |
| 43 | +- 另外:现在是reranker、分页之后做sku筛选,要改为: | |
| 44 | + 1. 有款式意图的时候,才做sku筛选 | |
| 45 | + 2. sku筛选的时机,改为在reranker之前,对所有内容做sku筛选,然后 | |
| 46 | + 3. 从仅 option1 扩展到多个维度,识别的意图,包含意图的维度名(color)和维度名的泛化词list(color、颜色、colour、colors 等),遍历option1_name,option2_name,option3_name,看哪个能匹配上意图的维度名list,哪个匹配上了,则在这个维度筛选。 | |
| 47 | + 4. Rerank doc (有款式意图的时候)要带上属性后缀,拼接到title后面。在调用 run_rerank 前,对每条 hit 生成「用于重排的 doc 文本」(标题 + 可选后缀) | |
| 48 | + 5. TODO : 还有一个问题。 目前,sku只返回一个维度(店铺主维度。默认应该是option1,不是所有维度的sku信息都返回的。所以,如果有款式意图,但是主维度是颜色,那么拿不到完整的款式sku) | |
| 49 | + | |
| 50 | +- 筛选SKU: 先只筛选第一个维度,但考虑到用户搜索词可能带了尺码,所以第二、三个维度也要考虑 | |
| 51 | + | |
| 52 | + 当前项目功能已经较多,但是有清晰的框架,请务必基于现有框架进行改造,不要进行补丁式的修改,避免代码逻辑分叉。 | |
| 53 | + 请一步一步来,先设计意图识别模块,仔细思考需求,意图识别模块需要提供哪些内容,用于返回数据接口的定义,深度思考,定义一个合理的接口后,再给出合理的模块设计。 | |
| 54 | + | |
| 55 | +2.3 向量检索与融合 | |
| 56 | + | |
| 57 | +- 把knn跟文本相关性的融合方式修改为 "rank": {"rrf": {} }需要licence,可以帮我修改源码支持吗? | |
| 58 | + | |
| 59 | + knn_boost: 2.0 | |
| 60 | + | |
| 61 | + { | |
| 62 | + "query": { ...全文检索... }, | |
| 63 | + "knn": { ...向量检索... }, | |
| 64 | + "rank": { | |
| 65 | + "rrf": {} | |
| 66 | + } | |
| 67 | + } | |
| 68 | + | |
| 69 | +- 融合打分(已完成,2026-03) | |
| 70 | + | |
| 71 | + 以下已经完成: | |
| 72 | + 1. fuse_scores_and_resort 已改为乘法融合,并通过 matched_queries 提取: | |
| 73 | + - base_query | |
| 74 | + - base_query_trans_* | |
| 75 | + - fallback_original_query_* | |
| 76 | + - knn_query | |
| 77 | + 2. 文本相关性大分不再依赖 phrase_query / keywords_query,这两类查询已清理。 | |
| 78 | + 3. 当前融合策略: | |
| 79 | + - text_score = primary(weighted_source, weighted_translation, weighted_fallback) + 0.25 * support | |
| 80 | + - fused_score = (rerank_score + 0.00001) * (text_score + 0.1) ** 0.35 * (knn_score + 0.6) ** 0.2 | |
| 81 | + 4. track_scores 与 include_named_queries_score 已接入,调试字段与评估方法已同步到: | |
| 82 | + - docs/相关性检索优化说明.md | |
| 83 | + - docs/搜索API对接指南.md | |
| 84 | + - docs/Usage-Guide.md | |
| 85 | + | |
| 86 | + 未完成的: | |
| 87 | + (归一化、次序融合?还乘法公式?) | |
| 88 | + RRF:先把多路召回稳妥融合 | |
| 89 | + linear + minmax:让你能精调 knn 和文本的权重 | |
| 90 | + reranker:对前面召回出来的 top-k 再做“最后一刀” | |
| 91 | + | |
| 92 | +2.4 文本相关性优化 | |
| 93 | + | |
| 94 | +- 调研: | |
| 95 | +Princeton WordNet — 英文同义词底库 | |
| 96 | +Shopify Product Taxonomy — 电商品类标准 | |
| 97 | +Querqy — 电商搜索规则框架 | |
| 98 | +gensimpson/elasticsearch-synonyms — ES 同义词规则落地 | |
| 99 | + | |
| 100 | +- tags字段使用的优化: | |
| 101 | +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。 | |
| 102 | +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样) | |
| 103 | + | |
| 104 | +- 是否需要: | |
| 105 | +当「源语言不在 index_languages」且「某些目标语言的翻译缺失」时,ES 里会额外加一层 用「原始 query 字符串」去撞缺失语种字段 | |
| 106 | + | |
| 107 | +- 检索相关性优化: | |
| 108 | +原始搜索词和翻译的词,都需要有对应的主干分析 | |
| 109 | +这个主干可以根据词性简单提取名词即可 | |
| 110 | +在搜索时,原始词和主干都成对地出现,原始词和trunk_keywords一起组成一个或查询。 | |
| 111 | +有一种方案是把原始词和主干词拼接起来。但是bm25要调tf系数。 | |
| 112 | + | |
| 113 | +2.5 图片相关性与向量字段调整 | |
| 114 | + | |
| 115 | +- "image_embedding": { | |
| 116 | + "type": "nested", | |
| 117 | + "properties": { | |
| 118 | + "vector": { | |
| 119 | + "type": "dense_vector", | |
| 120 | + "dims": 1024, | |
| 121 | + "index": true, | |
| 122 | + "similarity": "dot_product", | |
| 123 | + "element_type": "bfloat16" | |
| 124 | + }, | |
| 125 | + "url": { | |
| 126 | + "type": "text" | |
| 127 | + } | |
| 128 | + } | |
| 129 | +}, | |
| 130 | +去掉 image_embedding_512 | |
| 131 | +image_embedding改为,一个spu有多个sku向量,每个向量内部properties: | |
| 132 | +除了vector url还应该包括,该图片是对应哪些sku | |
| 133 | +"image_embedding": { | |
| 134 | + "type": "nested", | |
| 135 | + "properties": { | |
| 136 | + "vector": { | |
| 137 | + "type": "dense_vector", | |
| 138 | + "dims": 1024, | |
| 139 | + "index": true, | |
| 140 | + "similarity": "dot_product", | |
| 141 | + "element_type": "bfloat16" | |
| 142 | + }, | |
| 143 | + "url": { | |
| 144 | + "type": "text" | |
| 145 | + } | |
| 146 | + } | |
| 147 | +}, | |
| 148 | + | |
| 149 | +- 引入图片的相关性: | |
| 150 | +图片的向量最好做SKU维度,用 SPU 维度还是 SKU 维度? | |
| 151 | + 1. SKU维度(主款式,option1维度),如果用户搜索“蓝色 T恤”,这种图片相关性会比较有价值。 | |
| 152 | + 2. 我不考虑颜色的差异,其余的款式一般是大小之类的。这些图片,embedding细分到 SKU 维度,可能价值不大,性价比偏低 | |
| 153 | + | |
| 154 | +- 属性的筛选: | |
| 155 | +训练一个bert/transformer多分类模型,分类: 颜色、尺寸、材质 等等。但是要注意一些属性的值不规范、非常多,要考虑 是不是做规范化,如何规范化。 | |
| 156 | + | |
| 157 | +2.6 无结果重查与翻译缺失处理 | |
| 158 | + | |
| 159 | +- 无结果重查 | |
| 160 | +稀有语言,翻译可能超时(因为zh-en互译之外的翻译耗时更长) | |
| 161 | + | |
| 162 | + | |
| 163 | +--- | |
| 164 | + | |
| 165 | +3. 模型与推理服务优化 | |
| 166 | + | |
| 167 | +3.1 大模型API与本地部署 | |
| 168 | + | |
| 169 | +- 外部需求: | |
| 170 | + 1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 | |
| 171 | 2. ES支持reranker pipeline? | |
| 172 | + | |
| 173 | +- 本地部署一个7b Q4量化的大模型 | |
| 174 | + | |
| 175 | +3.2 Embedding服务优化 | |
| 176 | + | |
| 177 | +- 先阅读文本embedding相关的代码: | |
| 178 | +@embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py | |
| 179 | +目前有TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入限制,超限返回过载状态码。 | |
| 180 | + | |
| 181 | +- 文本embedding服务,要支持 priority 查询参数,priority > 0:不计入上述 inflight、不会因准入被拒绝(图片embedding不需要支持,因为只有离线需要用到图片embedding) | |
| 182 | +priority == 0(默认,适合做索引之类的离线任务):仍走原有 TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入;超限返回过载状态码。 | |
| 183 | +priority > 0(或者==1)(适合在线请求):不会因准入被拒绝,但是仍然需要占用inflight,这样保证在线请求不被限制,并且在线请求很多的时候可以拒绝掉离线的请求。 | |
| 184 | + | |
| 185 | +- 除了限制规则的修改,更进一步的,也需要保证这种请求是优先处理的(priority=1的相比=0的更优先被处理)。 | |
| 186 | +关于技术方案,有Worker + 双队列、PriorityMutex等等,除此之外,也请你思考合适的方案。 | |
| 187 | +成熟稳定、不带来复杂度、性能、稳定性方面的副作用,是最重要的。请先了解代码、需求,深度思考解决方案 | |
| 188 | + | |
| 189 | +- 向量的缓存 | |
| 190 | + | |
| 191 | +3.3 Reranker优化 | |
| 192 | + | |
| 193 | +- 多reranker: | |
| 194 | +改 reranker 服务,一次请求返回多路分 | |
| 195 | +服务启动时 加载多个 backend(或按请求懒加载),/rerank 响应扩展为例如 | |
| 196 | +scores: [...](兼容主后端)+ scores_by_backend: { "bge": [...], "qwen3_vllm": [...] }。 | |
| 197 | +搜索侧解析多路分,再融合或只透传 debug。 | |
| 198 | +优点:搜索侧仍只调一个 URL。缺点:单进程多大模型 显存压力很大; | |
| 199 | + | |
| 200 | +- 融合层要注意的一点 | |
| 201 | +fuse_scores_and_resort 目前只消费 一条 rerank_scores 序列,并写入 _rerank_score | |
| 202 | +多 backend 之后需要rerank_scores 都参与融合 | |
| 203 | + | |
| 204 | +- 必要性: | |
| 205 | +见 qwen3-reranker和bge-m3的严重badcase | |
| 206 | +不一定是要多reranker的方式,但是一定会需要解决方案。 | |
| 207 | + | |
| 208 | +- reranker 补充:nvidia/llama-nemotron-rerank-1b-v2 | |
| 209 | +https://huggingface.co/nvidia/llama-nemotron-rerank-1b-v2 | |
| 210 | +后端推理也建议使用vLLM | |
| 211 | +注意搜索相关资料,挖掘我的特斯拉 T4 GPU 的性能,充分挖掘性能 | |
| 212 | +你有充足的自由度进行实验 | |
| 213 | +encoder架构。 | |
| 214 | +比较新。 | |
| 215 | +性能更好。 | |
| 216 | +亚马逊 电商搜索数据集比qwen-reranker-4b更好。 | |
| 217 | +支持vLLM。 | |
| 218 | + | |
| 219 | +- Qwen3-Reranker-4B-GGUF | |
| 220 | +https://modelscope.cn/models/dengcao/Qwen3-Reranker-4B-GGUF/summary | |
| 221 | + 1. 要确定选择哪种量化方式 | |
| 222 | + 2. 确定提示词 | |
| 223 | + | |
| 224 | +- qwen3-embedding、qwen3-reranker (done) | |
| 225 | +选一个推理引擎,相比于我自己直接调 sentence-transformers,主要是多进程和负载均衡、连续批处理,比较有用 | |
| 226 | +当前结论:embedding 场景优先 TEI;vLLM 更偏向生成式与 rerank 场景。 | |
| 227 | + | |
| 228 | +- rerank 性能优化 | |
| 229 | + | |
| 230 | +3.4 翻译模型优化 | |
| 231 | + | |
| 232 | +- 翻译,增加facebook/nllb-200-distilled-600M | |
| 233 | +https://blog.csdn.net/qq_42746084/article/details/154947534 | |
| 234 | +https://huggingface.co/facebook/nllb-200-distilled-600M | |
| 235 | + | |
| 236 | +- 店铺的语言:英语能占到80%,所以专门增加一个en-zh的 | |
| 237 | +https://huggingface.co/Helsinki-NLP/opus-mt-zh-en | |
| 238 | +https://huggingface.co/Helsinki-NLP/opus-mt-en-zh | |
| 239 | + | |
| 240 | +- opus-mt-zh-en | |
| 241 | + | |
| 242 | + from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
| 243 | + model_name = "./models/opus-mt-en-zh" | |
| 244 | + tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| 245 | + model = AutoModelForSeq2SeqLM.from_pretrained(model_name) | |
| 246 | + data = 'test' | |
| 247 | + encoded = tokenizer([data], return_tensors="pt") | |
| 248 | + translation = model.generate(**encoded) | |
| 249 | + result = tokenizer.batch_decode(translation, skip_special_tokens=True)[0] | |
| 250 | + print(result) | |
| 251 | + | |
| 252 | +- nllb-200-distilled-600M性能优化 | |
| 253 | +已完成(2026-03) | |
| 254 | + - CTranslate2 迁移 + float16 转换 | |
| 255 | + - 扩展压测报告:perf_reports/20260318/translation_local_models_ct2/README.md | |
| 256 | + - T4 聚焦调优报告:perf_reports/20260318/translation_local_models_ct2_focus/README.md | |
| 257 | + - NLLB T4 商品标题专项报告:perf_reports/20260318/nllb_t4_product_names_ct2/README.md | |
| 258 | + - 当前结论: | |
| 259 | + - NLLB 在线默认推荐:ct2_inter_threads=4 + ct2_max_queued_batches=32 + ct2_batch_type=examples + ct2_decoding_length_mode=source(+8,min=32) | |
| 260 | + - opus-mt-zh-en 维持保守默认更稳 | |
| 261 | + - opus-mt-en-zh 如追求离线吞吐可继续做单独 profile | |
| 262 | + | |
| 263 | +- 请搜索nllb-200-distilled-600M这类seq2seq、transformer架构的模型,有哪些性能优化方案,提高线上翻译服务的吞吐量、降低耗时,搜索相关的在线推理服务方案,找到高性能的服务化方法 | |
| 264 | + | |
| 265 | +- 查看翻译的缓存情况 | |
| 266 | + | |
| 267 | +3.5 其他模型优化 | |
| 268 | + | |
| 269 | +- cnclip的性能优化 | |
| 270 | + | |
| 271 | + | |
| 272 | +--- | |
| 273 | + | |
| 274 | +4. 性能优化与超时配置 | |
| 275 | + | |
| 276 | +4.1 超时配置 | |
| 277 | + | |
| 278 | +- Query 分析阶段等待翻译/embedding 的硬超时 | |
| 279 | +配置文件位置:config/config.yaml | |
| 280 | +配置项:query_config.async_wait_timeout_ms: 80 | |
| 281 | +代码生效点:query/query_parser.py 使用该值换算成秒传给 wait(...) | |
| 282 | + | |
| 283 | +2. Embedding HTTP 调用超时(Text/Image) | |
| 284 | +不再使用任何环境变量覆盖(之前提到的 EMBEDDING_HTTP_TIMEOUT_SEC 已不采用) | |
| 285 | +配置文件位置:config/config.yaml | |
| 286 | +配置项:services.embedding.providers.http.timeout_sec(已在 YAML 里补了示例默认 60) | |
| 287 | +代码生效点: | |
| 288 | +embeddings/text_encoder.py:requests.post(..., timeout=self.timeout_sec) | |
| 289 | +embeddings/image_encoder.py:requests.post(..., timeout=self.timeout_sec) | |
| 290 | + | |
| 291 | +4.2 生成式服务优化(Partial Mode) | |
| 292 | + | |
| 293 | +- product_enrich : Partial Mode : done | |
| 294 | +https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-menu-2400256.d_0_3_0_7.74a630119Ct6zR | |
| 295 | +需在messages 数组中将最后一条消息的 role 设置为 assistant,并在其 content 中提供前缀,在此消息中设置参数 "partial": true。messages格式如下: | |
| 296 | +[ | |
| 297 | + { | |
| 298 | + "role": "user", | |
| 299 | + "content": "请补全这个斐波那契函数,勿添加其它内容" | |
| 300 | + }, | |
| 301 | + { | |
| 302 | + "role": "assistant", | |
| 303 | + "content": "def calculate_fibonacci(n):\n if n <= 1:\n return n\n else:\n", | |
| 304 | + "partial": true | |
| 305 | + } | |
| 306 | +] | |
| 307 | +模型会以前缀内容为起点开始生成。 | |
| 308 | +支持 非思考模式。 | |
| 309 | + | |
| 310 | + | |
| 311 | +--- | |
| 312 | + | |
| 313 | +5. Elasticsearch相关 | |
| 314 | + | |
| 315 | +- es需要licence的两个功能,如果费用低,开通下licence,或者改es源码定制开发下,支持 rank.rrf,reranker | |
| 316 | + | |
| 317 | + { | |
| 318 | + "query": { ...全文检索... }, | |
| 319 | + "knn": { ...向量检索... }, | |
| 320 | + "rank": { | |
| 321 | + "rrf": {} | |
| 322 | + } | |
| 323 | + } | |
| 324 | + | |
| 325 | + | |
| 326 | +--- | |
| 327 | + | |
| 328 | +1. 配置体系重构 | |
| 329 | +Referring to @docs/config-system-review-and-redesign.md , most of the modifications have been completed. Could you conduct a review to check what else needs improvement in the configuration documentation system? Are there any outstanding issues? | |
| 330 | + | |
| 331 | +一、仍然存在大量通过环境变量获取配置的地方 | |
| 332 | +_SERVICE_KIND = (os.getenv("EMBEDDING_SERVICE_KIND", "all") or "all").strip().lower() | |
| 333 | +if _SERVICE_KIND not in {"all", "text", "image"}: | |
| 334 | + raise RuntimeError( | |
| 335 | + f"Invalid EMBEDDING_SERVICE_KIND={_SERVICE_KIND!r}; expected all, text, or image" | |
| 336 | + ) | |
| 337 | +_TEXT_ENABLED_BY_ENV = os.getenv("EMBEDDING_ENABLE_TEXT_MODEL", "true").lower() in ("1", "true", "yes") | |
| 338 | +_IMAGE_ENABLED_BY_ENV = os.getenv("EMBEDDING_ENABLE_IMAGE_MODEL", "true").lower() in ("1", "true", "yes") | |
| 339 | +open_text_model = _TEXT_ENABLED_BY_ENV and _SERVICE_KIND in {"all", "text"} | |
| 340 | +open_image_model = _IMAGE_ENABLED_BY_ENV and _SERVICE_KIND in {"all", "image"} | |
| 341 | + | |
| 342 | +_text_encode_lock = threading.Lock() | |
| 343 | +_image_encode_lock = threading.Lock() | |
| 344 | + | |
| 345 | +_TEXT_MICROBATCH_WINDOW_SEC = max( | |
| 346 | + 0.0, float(os.getenv("TEXT_MICROBATCH_WINDOW_MS", "4")) / 1000.0 | |
| 347 | +) | |
| 348 | +_TEXT_REQUEST_TIMEOUT_SEC = max( | |
| 349 | + 1.0, float(os.getenv("TEXT_REQUEST_TIMEOUT_SEC", "30")) | |
| 350 | +) | |
| 351 | +_TEXT_MAX_INFLIGHT = max(1, int(os.getenv("TEXT_MAX_INFLIGHT", "32"))) | |
| 352 | +_IMAGE_MAX_INFLIGHT = max(1, int(os.getenv("IMAGE_MAX_INFLIGHT", "1"))) | |
| 353 | +_OVERLOAD_STATUS_CODE = int(os.getenv("EMBEDDING_OVERLOAD_STATUS_CODE", "503")) | |
| 354 | +_LOG_PREVIEW_COUNT = max(1, int(os.getenv("EMBEDDING_LOG_PREVIEW_COUNT", "3"))) | |
| 355 | +_LOG_TEXT_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_TEXT_PREVIEW_CHARS", "120"))) | |
| 356 | +_LOG_IMAGE_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_IMAGE_PREVIEW_CHARS", "180"))) | |
| 357 | +_VECTOR_PREVIEW_DIMS = max(1, int(os.getenv("EMBEDDING_VECTOR_PREVIEW_DIMS", "6"))) | |
| 358 | +_CACHE_PREFIX = str(REDIS_CONFIG.get("embedding_cache_prefix", "embedding")).strip() or "embedding" | |
| 359 | + | |
| 360 | + | |
| 361 | + | |
| 362 | + | |
| 363 | + | |
| 364 | +还有这些写死的地址 @embedding/config.py | |
| 365 | + | |
| 366 | +self.TEI_BASE_URL = str(text_backend.get("base_url") or "http://127.0.0.1:8080") | |
| 367 | +self.TEI_TIMEOUT_SEC = int(text_backend.get("timeout_sec", 60)) | |
| 368 | + | |
| 369 | +self.USE_CLIP_AS_SERVICE = services.image_backend == "clip_as_service" | |
| 370 | +self.CLIP_AS_SERVICE_SERVER = str(image_backend.get("server") or "grpc://127.0.0.1:51000") | |
| 371 | + | |
| 372 | + | |
| 373 | + | |
| 374 | + | |
| 375 | +看起来似乎并没有完全遵循这些原则? | |
| 376 | +4. 重新设计的设计原则 | |
| 377 | +重新设计应遵循以下规则。 | |
| 378 | + | |
| 379 | +4.1 单一逻辑配置系统 | |
| 380 | +可以有多个文件,但不能有多个职责重叠的加载器。 | |
| 381 | +必须有一个加载器管道,能够生成一个类型化的 AppConfig 对象。 | |
| 382 | + | |
| 383 | +4.2 配置文件负责声明,解析代码负责解释,环境变量负责运行时注入 | |
| 384 | +职责应明确如下: | |
| 385 | +配置文件 | |
| 386 | +声明非敏感的目标行为和可部署的非敏感设置 | |
| 387 | +解析逻辑 | |
| 388 | +加载、合并、验证、规范化并暴露类型化的配置 | |
| 389 | +绝不发明隐藏的业务行为 | |
| 390 | +环境变量 | |
| 391 | +承载密钥和少量运行时/进程相关的值 | |
| 392 | +不随意地重新定义业务行为 | |
| 393 | + | |
| 394 | +4.3 整个系统采用单一的优先级规则 | |
| 395 | +除非明确豁免,否则每个配置类别都应遵循相同的合并模型。 | |
| 396 | + | |
| 397 | +4.4 业务行为不得有静默的隐式后备 | |
| 398 | +在启动时,如果必需的配置缺失或无效,应快速失败。 | |
| 399 | +不要静默地回退到诸如硬编码语言列表之类的遗留行为。 | |
| 400 | + | |
| 401 | +4.5 有效配置必须可观测 | |
| 402 | +每个服务都应能够展示: | |
| 403 | +配置版本或哈希值 | |
| 404 | +加载的源文件 | |
| 405 | +环境名称 | |
| 406 | +经过清理的有效配置 | |
| 407 | + | |
| 408 | +5. 推荐的目标设计 | |
| 409 | + | |
| 410 | +5.1 边界模型 | |
| 411 | +使用三个清晰的层级。 | |
| 412 | +层级 1:代码仓库管理的静态配置 | |
| 413 | +目的: | |
| 414 | +搜索行为 | |
| 415 | +租户行为 | |
| 416 | +提供商/后端注册表 | |
| 417 | +非敏感的服务拓扑默认值 | |
| 418 | +功能开关 | |
| 419 | +示例: | |
| 420 | +字段权重 | |
| 421 | +查询策略 | |
| 422 | +重排序融合参数 | |
| 423 | +租户语言方案 | |
| 424 | +翻译能力注册表 | |
| 425 | +嵌入后端选择默认值 | |
| 426 | + | |
| 427 | +层级 2:特定环境的层叠配置 | |
| 428 | +目的: | |
| 429 | +按环境区分的非敏感差异 | |
| 430 | +按环境区分的服务端点 | |
| 431 | +按环境区分的资源大小默认值 | |
| 432 | +开发/测试/生产环境的运维差异 | |
| 433 | +示例: | |
| 434 | +本地嵌入 URL 与生产环境嵌入 URL | |
| 435 | +开发环境重排序后端与生产环境重排序后端 | |
| 436 | +本地开发环境中较低的并发度 | |
| 437 | + | |
| 438 | +层级 3:环境变量 | |
| 439 | +目的: | |
| 440 | +密钥 | |
| 441 | +绑定主机/端口 | |
| 442 | +外部基础设施凭证 | |
| 443 | +容器编排器的最后一步注入 | |
| 444 | +示例: | |
| 445 | +ES_HOST, ES_USERNAME, ES_PASSWORD | |
| 446 | +DB_HOST, DB_USERNAME, DB_PASSWORD | |
| 447 | +REDIS_HOST, REDIS_PASSWORD | |
| 448 | +DASHSCOPE_API_KEY, DEEPL_AUTH_KEY | |
| 449 | +API_HOST, API_PORT, INDEXER_PORT, TRANSLATION_PORT | |
| 450 | +规则: | |
| 451 | +环境变量不应成为选择业务行为(如翻译模型、嵌入后端或租户语言策略)的常规途径 | |
| 452 | +如果允许对非敏感字段进行环境变量覆盖,则必须将其明确列出并记录为运维覆盖手段,而非隐藏的约定 | |
| 453 | + | |
| 454 | +5.2 统一的优先级 | |
| 455 | +推荐的优先级顺序: | |
| 456 | +代码中的模式(schema)默认值 | |
| 457 | +config/base.yaml | |
| 458 | +config/environments/<env>.yaml | |
| 459 | +来自 config/tenants/ 的租户层叠配置 | |
| 460 | +针对明确允许的运行时键的环境变量 | |
| 461 | +仅适用于当前进程的命令行标志 | |
| 462 | +重要规则: | |
| 463 | +只能有一个模块实现此合并逻辑 | |
| 464 | +任何业务模块不得直接调用 os.getenv() 来获取配置 | |
| 465 | + | |
| 466 | + | |
| 467 | + | |
| 468 | +二、cursor(composer2)帮我审查的 | |
| 469 | + | |
| 470 | +## 配置文档系统审查 | |
| 471 | + | |
| 472 | +这是对配置文档系统的简要审查,对照 [`docs/config-system-review-and-redesign.md`](docs/config-system-review-and-redesign.md) 和当前代码库进行。 | |
| 473 | + | |
| 474 | +### 实现已匹配的部分 | |
| 475 | + | |
| 476 | +代码库已朝着重新设计的目标迈进了很大一步: | |
| 477 | + | |
| 478 | +- 单一类型化根节点:`AppConfig` 位于 [`config/schema.py`](config/schema.py),通过 [`config/loader.py`](config/loader.py) 和 [`get_app_config()`](config/loader.py) 加载。 | |
| 479 | +- 精简的 `services_config`:[`config/services_config.py`](config/services_config.py) 委托给 `get_app_config()`(解决了服务层的发现 A)。 | |
| 480 | +- 租户目录:[`config/tenant_config_loader.py`](config/tenant_config_loader.py) 使用 `get_app_config().tenants`。 | |
| 481 | +- 重写资产路径:`config/dictionaries/query_rewrite.dict` 已存在(解决了发现 E 中的文件名不匹配问题)。 | |
| 482 | +- 可观测性:[`GET /admin/config`](api/routes/admin.py) 返回经过脱敏处理的有效配置树;[`GET /admin/config/meta`](api/routes/admin.py) 暴露环境信息、`config_hash`、`loaded_files`、`deprecated_keys`(涵盖了 §5.10 的意图;文档中提议的是 `/admin/config/effective`,但实际实现位于 `/admin/config`)。 | |
| 483 | + | |
| 484 | +因此,"单一加载器 + 有效配置可见性" 的故事在代码中已基本实现;文档尚未完全跟上。 | |
| 485 | + | |
| 486 | +--- | |
| 487 | + | |
| 488 | +## 文档问题(影响最大) | |
| 489 | + | |
| 490 | +### 1. 管理 API 文档中关于 `/admin/config` 的描述错误 | |
| 491 | + | |
| 492 | +[`docs/搜索API对接指南.md`](docs/搜索API对接指南.md)(管理部分附近)和 [`docs/搜索API对接指南-06-管理接口(Admin).md`](docs/搜索API对接指南-06-管理接口(Admin).md) 仍将 `/admin/config` 描述为按租户的 JSON(包含 `tenant_id`、`es_index_name`、`supported_languages` 等字段)。实际实现返回的是 `AppConfig.sanitized_dict()`(完整的应用配置,敏感信息已脱敏),而不是租户摘要字段。 | |
| 493 | + | |
| 494 | +这些指南中还缺少: `GET /admin/config/meta`。 | |
| 495 | + | |
| 496 | +健康检查: 拆分指南中的示例包含了 [`HealthResponse`](api/models.py) 中不存在的字段(只有 `status` 和 `elasticsearch`)。 | |
| 497 | + | |
| 498 | +对于任何仅根据文档进行 API 集成的人来说,这是最明显的"未解决问题"。 | |
| 499 | + | |
| 500 | +### 2. 面向开发者的指南仍将 `services_config` 作为"配置解析器"的核心 | |
| 501 | + | |
| 502 | +[`docs/DEVELOPER_GUIDE.md`](docs/DEVELOPER_GUIDE.md) §5.2 仍说搜索配置由 `ConfigLoader` 加载,服务由 `config/services_config` "解析"。§6.2 仍将 `config/services_config.py` 列为主要的"解析入口"。[`docs/QUICKSTART.md`](docs/QUICKSTART.md) §3.1 仍说"配置解析:`config/services_config.py`"。 | |
| 503 | + | |
| 504 | +文档中准确的说法应该是:规范入口是 `config/loader.py` + `get_app_config()`;[`config/config_loader.py`](config/config_loader.py) 中的 `ConfigLoader` 包装了统一加载器;`services_config` 是现有调用点的兼容性外观。 | |
| 505 | + | |
| 506 | +### 3. 重新设计文档本身不是"活的"状态文档 | |
| 507 | + | |
| 508 | +[`docs/config-system-review-and-redesign.md`](docs/config-system-review-and-redesign.md) 读起来仍是纯粹的问题陈述 + 目标,没有简短的**"已实现 vs 剩余"**部分。这很容易让人假设什么都没做,或者重复工作。添加一个小附录(或一页 `config/README.md` —— 见下文)可以解决这个问题。 | |
| 509 | + | |
| 510 | +### 4. 缺少 `config/README.md`(§5.3 中推荐) | |
| 511 | + | |
| 512 | +仍然没有专门的 `config/README.md` 来描述:加载器入口点、高级优先级、字典存放位置、指向 `/admin/config` + `/admin/config/meta` 的链接,以及重新设计文档的链接。这是重新设计中明确的交付物,可以锚定"文档系统"。 | |
| 513 | + | |
| 514 | +### 5. 轻微的文档整洁问题 | |
| 515 | + | |
| 516 | +- [`docs/QUICKSTART.md`](docs/QUICKSTART.md) §1.9 环境变量项目后的行有一个多余字符:`---·`(可能是打字错误)。 | |
| 517 | +- [`docs/DEVELOPER_GUIDE.md`](docs/DEVELOPER_GUIDE.md) §10 文档索引没有列出 `config-system-review-and-redesign.md` 或未来的 `config/README.md`。 | |
| 518 | + | |
| 519 | +--- | |
| 520 | + | |
| 521 | +## 重新设计目标与当前代码之间的差距(文档不应声称"已完成") | |
| 522 | + | |
| 523 | +这些影响文档的诚实度: | |
| 524 | + | |
| 525 | +| 主题 | 状态 | | |
| 526 | +|--------|--------| | |
| 527 | +| `config dump` CLI(§5.10) | `main.py` 中不存在;运维人员依赖 HTTP 或临时脚本。 | | |
| 528 | +| 隐藏的 `["en", "zh"]` 回退(阶段 3 / 发现 D) | 仍在 [`indexer/document_transformer.py`](indexer/document_transformer.py)、[`suggestion/builder.py`](suggestion/builder.py) 等中使用。 | | |
| 529 | +| 加载器外的 `os.getenv`(规则 1–2) | 仍在例如 [`embeddings/server.py`](embeddings/server.py)、[`reranker/server.py`](reranker/server.py)、[`api/app.py`](api/app.py) 中使用 —— 文档声称"仅加载器"将是夸大其词。 | | |
| 530 | +| 拆分 `base.yaml` / `environments/` / `tenants/*.yaml`(阶段 5) | 未采用;仍是单一的 [`config/config.yaml`](config/config.yaml)。 | | |
| 531 | +| 遗留租户标志(阶段 6 / 发现 H) | [`indexer/README.md`](indexer/README.md) 仍描述上游 MySQL 的 `translate_to_en` / `translate_to_zh`(这可能作为上游模式文档保留;需与 Python `tenant_config` 模型区分开来)。 | | |
| 532 | + | |
| 533 | +--- | |
| 534 | + | |
| 535 | +## 推荐的后续步骤(仅文档,按优先级排序) | |
| 536 | + | |
| 537 | +1. 修复管理 API 文档(合并指南 + `-06-` 拆分):`/admin/config` 的响应格式,添加 `/admin/config/meta`,使健康检查示例与 [`HealthResponse`](api/models.py) 一致。 | |
| 538 | +2. 更新 DEVELOPER_GUIDE §5–§6 和 QUICKSTART §1.9 / §3.1,将 `get_app_config()` / `loader.py` 描述为主要入口,将 `services_config` 描述为适配器。 | |
| 539 | +3. 添加 `config/README.md`(简短的操作 + 开发者入口)。 | |
| 540 | +4. 在 `config-system-review-and-redesign.md` 中添加带日期的实现状态表(已交付 vs 推迟的内容),使审查文档不与现实矛盾。 | |
| 541 | +5. DEVELOPER_GUIDE §9 检查清单:将"配置来自 `services_config`"替换为允许 `get_app_config()` 或精简适配器的语言,与 §6 保持一致。 | |
| 542 | + | |
| 543 | +如果需要,我可以在后续处理中为项目 1–3 和重新设计文档中的简短状态块应用补丁。 | |
| 544 | + | |
| 545 | +其他云API | |
| 546 | +1 | |
| 547 | +1)提供两个rerank云API_KEY给我:(优先级:高) | |
| 548 | +AWS Bedrock / Azure 两家云有提供的Cohere Rerank 3.5/4模型API,开通APIKEY | |
| 549 | +google云 Vertex AI Ranking API | |
| 550 | + | |
| 551 | +已经调研: | |
| 552 | +阿里云在美国地区没有提供任意reranker API | |
| 553 | +AWS Bedrock / Azure 两家云有提供Cohere Rerank 3.5 | |
| 554 | +google云Vertex AI Ranking API性能更好 | |
| 555 | + | |
| 556 | +以上两个APIKEY给我,我来测试性能和效果。 | |
| 557 | + | |
| 558 | + | |
| 559 | +2)寻找美国地区reranker API最佳实践(优先级:高) | |
| 560 | +效果要求:qwen3-reranker-4b(或者同等能力。可对比huggingface公开的评测指标)的API | |
| 561 | +性能要求:在我们的服务器上,一个请求内排序400条结果、耗时低于300ms | |
| 562 | +测试评估:基于电商领域商品搜索场景评估效果(我可以提供数据) | |
| 563 | +据我了解,Cohere Rerank可能达不到这个性能要求;可以考虑拆分为4个请求、每个100条,或许能做到300ms以内。 | |
| 564 | +参考Cohere Rerank 3.5 benchmark: | |
| 565 | +https://docs.oracle.com/en-us/iaas/Content/generative-ai/benchmark-cohere-rerank-3-5.htm | |
| 566 | + | |
| 567 | + | |
| 568 | +3)提供谷歌翻译API的apikey (优先级:低) | |
| 569 | +给我apikey,我看下耗时,希望耗时P95低于80ms满足在线请求使用 | |
| 570 | +在线翻译的问题已经基本解决,这一块需求不是特别大。 | |
| 571 | + | |
| 572 | +2 | |
| 573 | +混用 大模型 使用:hunyuan-turbos-latest | |
| 574 | +混元 OpenAI 兼容接口相关调用示例:https://cloud.tencent.com/document/product/1729/111007 | |
| 575 | + | |
| 576 | +腾讯云 混元大模型 API_KEY:<已脱敏 —— 明文密钥不应写入文档,原密钥已泄露需立即轮换,改从密钥管理/环境变量获取> | |
| 577 | + | |
| 578 | +hunyuan翻译:使用模型 hunyuan-translation | |
| 579 | +https://cloud.tencent.com/document/product/1729/113395#4.-.E7.A4.BA.E4.BE.8B | |
| 580 | + | |
| 581 | +谷歌翻译 基础版:https://docs.cloud.google.com/translate/docs/reference/rest/v2/translate | |
| 582 | + | |
| 583 | +阿里云 百炼模型 现在使用的apikey是国内的。 | |
| 584 | +各地域的 Base URL 和对应的 API Key 是绑定的。 | |
| 585 | + | |
| 586 | +现在使用了美国的服务器,使用了美国的地址,需要在 美国地域控制台页面(https://modelstudio.console.aliyun.com/us-east-1 )中创建或获取API_KEY: | |
| 587 | + | |
| 588 | +登录 百炼美国地域控制台:https://modelstudio.console.aliyun.com/us-east-1?spm=5176.2020520104.0.0.6b383a98WjpXff | |
| 589 | +在 API Key 管理 中创建或复制一个适用于美国地域的 Key | |
| 590 | + | |
| 591 | +搜索效果反馈: | |
| 592 | +做完一些短期优化后,需要做一些case驱动的优化。 | |
| 593 | +给到100条测试用例,每个搜索词,要记录请求ID、以及 希望排序靠前但是没有靠前的(比如希望出现在第一页但是没出现在第一页的)、以及未召回的商品ID(希望出现在前几页但是没翻到的) | |
| 594 | +6. 其他任务 | |
| 595 | + | |
| 596 | +- suggest 索引,现在是全量脚本,要交给金伟 | |
| 0 | 597 | \ No newline at end of file | ... | ... |
docs/TODO.txt
| ... | ... | @@ -55,14 +55,6 @@ image_embedding改为,一个spu有多个sku向量,每个向量内部properti |
| 55 | 55 | }, |
| 56 | 56 | |
| 57 | 57 | |
| 58 | - | |
| 59 | - | |
| 60 | -tags字段使用的优化: | |
| 61 | -现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。 | |
| 62 | -可以考虑也拆分多语言,配合analyzer使用(和qanchors一样) | |
| 63 | - | |
| 64 | - | |
| 65 | - | |
| 66 | 58 | 外部需求: |
| 67 | 59 | 1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 |
| 68 | 60 | 2. ES支持reranker pipeline? |
| ... | ... | @@ -86,7 +78,7 @@ query匹配了其中任何一个词,都认为,具有颜色意图 |
| 86 | 78 | 匹配规则: 用细粒度、粗粒度分词,看是否有在词表中的。原始query分词、和每种翻译的分词,都要用。 |
| 87 | 79 | |
| 88 | 80 | 意图判断: 暂时留空,直接返回true。目前没有模型,即只要召回了(词表匹配了),即认为有该维度款式需求。 |
| 89 | - | |
| 81 | +(以后考虑建设fasttext/bert系列多分类模型) | |
| 90 | 82 | |
| 91 | 83 | |
| 92 | 84 | 意图使用: |
| ... | ... | @@ -119,17 +111,23 @@ query匹配了其中任何一个词,都认为,具有颜色意图 |
| 119 | 111 | 5. TODO : 还有一个问题。 目前,sku只返回一个维度(店铺主维度。默认应该是option1,不是所有维度的sku信息都返回的。所以,如果有款式意图,但是主维度是颜色,那么拿不到全的款式sku) |
| 120 | 112 | |
| 121 | 113 | |
| 122 | - | |
| 123 | - | |
| 124 | 114 | 当前项目功能已经较多,但是有清晰的框架,请务必基于现有框架进行改造,不要进行补丁式的修改,避免代码逻辑分叉。 |
| 125 | - | |
| 126 | 115 | 请一步一步来,先设计意图识别模块,仔细思考需求,意图识别模块需要提供哪些内容,用于返回数据接口的定义,深度思考,定义一个合理的接口后,再给出合理的模块设计。 |
| 127 | 116 | |
| 128 | 117 | |
| 129 | 118 | |
| 130 | 119 | |
| 120 | +文本相关性: | |
| 121 | +调研: | |
| 122 | +Princeton WordNet — 英文同义词底库 | |
| 123 | +Shopify Product Taxonomy — 电商品类标准 | |
| 124 | +Querqy — 电商搜索规则框架 | |
| 125 | +gensimpson/elasticsearch-synonyms — ES 同义词规则落地 | |
| 131 | 126 | |
| 132 | 127 | |
| 128 | +tags字段使用的优化: | |
| 129 | +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。 | |
| 130 | +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样) | |
| 133 | 131 | |
| 134 | 132 | |
| 135 | 133 | |
| ... | ... | @@ -463,8 +461,9 @@ scores: [...](兼容主后端)+ scores_by_backend: { "bge": [...], "qwen3_vl |
| 463 | 461 | fuse_scores_and_resort 目前只消费 一条 rerank_scores 序列,并写入 _rerank_score |
| 464 | 462 | 多 backend 之后需要rerank_scores 都参与融合 |
| 465 | 463 | |
| 466 | - | |
| 467 | - | |
| 464 | +必要性: | |
| 465 | +见 qwen3-reranker和bge-m3的严重badcase | |
| 466 | +不一定是要多reranker的方式,但是一定会需要解决方案。 | |
| 468 | 467 | |
| 469 | 468 | |
| 470 | 469 | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -37,10 +37,13 @@ class ESQueryBuilder: |
| 37 | 37 | translation_minimum_should_match: str = "70%", |
| 38 | 38 | translation_boost: float = 0.4, |
| 39 | 39 | tie_breaker_base_query: float = 0.9, |
| 40 | + best_fields_boosts: Optional[Dict[str, float]] = None, | |
| 41 | + best_fields_clause_boost: float = 2.0, | |
| 40 | 42 | mixed_script_merged_field_boost_scale: float = 0.6, |
| 43 | + phrase_field_boosts: Optional[Dict[str, float]] = None, | |
| 41 | 44 | phrase_match_base_fields: Optional[Tuple[str, ...]] = None, |
| 42 | - phrase_match_slop: int = 2, | |
| 43 | - phrase_match_tie_breaker: float = 0.4, | |
| 45 | + phrase_match_slop: int = 0, | |
| 46 | + phrase_match_tie_breaker: float = 0.0, | |
| 44 | 47 | phrase_match_boost: float = 3.0, |
| 45 | 48 | ): |
| 46 | 49 | """ |
| ... | ... | @@ -77,7 +80,26 @@ class ESQueryBuilder: |
| 77 | 80 | self.translation_boost = float(translation_boost) |
| 78 | 81 | self.tie_breaker_base_query = float(tie_breaker_base_query) |
| 79 | 82 | self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) |
| 80 | - self.phrase_match_base_fields = tuple(phrase_match_base_fields or ("title", "qanchors")) | |
| 83 | + default_best_fields = { | |
| 84 | + base: self._get_field_boost(base) | |
| 85 | + for base in self.core_multilingual_fields | |
| 86 | + if base in self.multilingual_fields | |
| 87 | + } | |
| 88 | + self.best_fields_boosts = { | |
| 89 | + str(base): float(boost) | |
| 90 | + for base, boost in (best_fields_boosts or default_best_fields).items() | |
| 91 | + } | |
| 92 | + self.best_fields_clause_boost = float(best_fields_clause_boost) | |
| 93 | + default_phrase_base_fields = tuple(phrase_match_base_fields or ("title", "qanchors")) | |
| 94 | + default_phrase_fields = { | |
| 95 | + base: self._get_field_boost(base) | |
| 96 | + for base in default_phrase_base_fields | |
| 97 | + if base in self.multilingual_fields | |
| 98 | + } | |
| 99 | + self.phrase_field_boosts = { | |
| 100 | + str(base): float(boost) | |
| 101 | + for base, boost in (phrase_field_boosts or default_phrase_fields).items() | |
| 102 | + } | |
| 81 | 103 | self.phrase_match_slop = int(phrase_match_slop) |
| 82 | 104 | self.phrase_match_tie_breaker = float(phrase_match_tie_breaker) |
| 83 | 105 | self.phrase_match_boost = float(phrase_match_boost) |
| ... | ... | @@ -399,27 +421,6 @@ class ESQueryBuilder: |
| 399 | 421 | |
| 400 | 422 | return functions |
| 401 | 423 | |
| 402 | - def _build_text_query(self, query_text: str) -> Dict[str, Any]: | |
| 403 | - """ | |
| 404 | - Build simple text matching query (BM25). | |
| 405 | - | |
| 406 | - Args: | |
| 407 | - query_text: Query text | |
| 408 | - | |
| 409 | - Returns: | |
| 410 | - ES query clause | |
| 411 | - """ | |
| 412 | - return { | |
| 413 | - "multi_match": { | |
| 414 | - "query": query_text, | |
| 415 | - "fields": self.match_fields, | |
| 416 | - "minimum_should_match": "67%", | |
| 417 | - "tie_breaker": 0.9, | |
| 418 | - "boost": 1.0, | |
| 419 | - "_name": "base_query" | |
| 420 | - } | |
| 421 | - } | |
| 422 | - | |
| 423 | 424 | def _format_field_with_boost(self, field_name: str, boost: float) -> str: |
| 424 | 425 | if abs(float(boost) - 1.0) < 1e-9: |
| 425 | 426 | return field_name |
| ... | ... | @@ -435,70 +436,38 @@ class ESQueryBuilder: |
| 435 | 436 | return float(self.field_boosts[base_field]) |
| 436 | 437 | return 1.0 |
| 437 | 438 | |
| 438 | - def _build_match_field_specs(self, language: str) -> Tuple[List[MatchFieldSpec], List[MatchFieldSpec]]: | |
| 439 | + def _build_match_field_specs( | |
| 440 | + self, | |
| 441 | + language: str, | |
| 442 | + *, | |
| 443 | + multilingual_fields: Optional[List[str]] = None, | |
| 444 | + shared_fields: Optional[List[str]] = None, | |
| 445 | + boost_overrides: Optional[Dict[str, float]] = None, | |
| 446 | + ) -> List[MatchFieldSpec]: | |
| 439 | 447 | """ |
| 440 | - Per-language match targets as (field_path, boost). Single source of truth before string formatting. | |
| 441 | - Returns (all_fields, core_fields); core_fields are for phrase/keyword strategies elsewhere. | |
| 448 | + Per-language match targets as (field_path, boost). Single source of truth before | |
| 449 | + formatting as Elasticsearch ``fields`` strings. | |
| 442 | 450 | """ |
| 443 | 451 | lang = (language or "").strip().lower() |
| 444 | - all_specs: List[MatchFieldSpec] = [] | |
| 445 | - core_specs: List[MatchFieldSpec] = [] | |
| 446 | - | |
| 447 | - for base in self.multilingual_fields: | |
| 448 | - field = f"{base}.{lang}" | |
| 449 | - all_specs.append((field, self._get_field_boost(base, lang))) | |
| 450 | - | |
| 451 | - for shared in self.shared_fields: | |
| 452 | - all_specs.append((shared, self._get_field_boost(shared, None))) | |
| 452 | + specs: List[MatchFieldSpec] = [] | |
| 453 | + text_fields = multilingual_fields if multilingual_fields is not None else self.multilingual_fields | |
| 454 | + term_fields = shared_fields if shared_fields is not None else self.shared_fields | |
| 455 | + overrides = boost_overrides or {} | |
| 453 | 456 | |
| 454 | - for base in self.core_multilingual_fields: | |
| 457 | + for base in text_fields: | |
| 455 | 458 | field = f"{base}.{lang}" |
| 456 | - core_specs.append((field, self._get_field_boost(base, lang))) | |
| 459 | + boost = float(overrides.get(base, self._get_field_boost(base, lang))) | |
| 460 | + specs.append((field, boost)) | |
| 457 | 461 | |
| 458 | - return all_specs, core_specs | |
| 462 | + for shared in term_fields: | |
| 463 | + boost = float(overrides.get(shared, self._get_field_boost(shared, None))) | |
| 464 | + specs.append((shared, boost)) | |
| 465 | + return specs | |
| 459 | 466 | |
| 460 | 467 | def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]: |
| 461 | 468 | """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``.""" |
| 462 | 469 | return [self._format_field_with_boost(path, boost) for path, boost in specs] |
| 463 | 470 | |
| 464 | - def _build_phrase_match_fields(self, language: str) -> List[str]: | |
| 465 | - """Fields for phrase multi_match: base names × ``.{lang}`` with ``field_boosts``.""" | |
| 466 | - lang = (language or "").strip().lower() | |
| 467 | - if not lang: | |
| 468 | - return [] | |
| 469 | - out: List[str] = [] | |
| 470 | - for base in self.phrase_match_base_fields: | |
| 471 | - path = f"{base}.{lang}" | |
| 472 | - boost = self._get_field_boost(base, lang) | |
| 473 | - out.append(self._format_field_with_boost(path, boost)) | |
| 474 | - return out | |
| 475 | - | |
| 476 | - def _append_phrase_should_clause( | |
| 477 | - self, | |
| 478 | - should_clauses: List[Dict[str, Any]], | |
| 479 | - lang: str, | |
| 480 | - lang_query: str, | |
| 481 | - clause_name: str | |
| 482 | - ) -> None: | |
| 483 | - text = (lang_query or "").strip() | |
| 484 | - if not text: | |
| 485 | - return | |
| 486 | - phrase_fields = self._build_phrase_match_fields(lang) | |
| 487 | - if not phrase_fields: | |
| 488 | - return | |
| 489 | - boost = self.phrase_match_boost | |
| 490 | - should_clauses.append({ | |
| 491 | - "multi_match": { | |
| 492 | - "_name": f"{clause_name}_phrase", | |
| 493 | - "query": lang_query, | |
| 494 | - "type": "phrase", | |
| 495 | - "fields": phrase_fields, | |
| 496 | - "slop": self.phrase_match_slop, | |
| 497 | - "tie_breaker": self.phrase_match_tie_breaker, | |
| 498 | - "boost": boost, | |
| 499 | - } | |
| 500 | - }) | |
| 501 | - | |
| 502 | 471 | def _merge_supplemental_lang_field_specs( |
| 503 | 472 | self, |
| 504 | 473 | specs: List[MatchFieldSpec], |
| ... | ... | @@ -506,7 +475,7 @@ class ESQueryBuilder: |
| 506 | 475 | ) -> List[MatchFieldSpec]: |
| 507 | 476 | """Append supplemental-language columns; boosts multiplied by mixed_script scale.""" |
| 508 | 477 | scale = float(self.mixed_script_merged_field_boost_scale) |
| 509 | - extra_all, _ = self._build_match_field_specs(supplemental_lang) | |
| 478 | + extra_all = self._build_match_field_specs(supplemental_lang) | |
| 510 | 479 | seen = {path for path, _ in specs} |
| 511 | 480 | out = list(specs) |
| 512 | 481 | for path, boost in extra_all: |
| ... | ... | @@ -543,6 +512,103 @@ class ESQueryBuilder: |
| 543 | 512 | out = self._merge_supplemental_lang_field_specs(out, "zh") |
| 544 | 513 | return out |
| 545 | 514 | |
| 515 | + def _build_best_fields_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: | |
| 516 | + specs = self._build_match_field_specs( | |
| 517 | + language, | |
| 518 | + multilingual_fields=list(self.best_fields_boosts), | |
| 519 | + shared_fields=[], | |
| 520 | + boost_overrides=self.best_fields_boosts, | |
| 521 | + ) | |
| 522 | + fields = self._format_match_field_specs(specs) | |
| 523 | + if not fields: | |
| 524 | + return None | |
| 525 | + return { | |
| 526 | + "multi_match": { | |
| 527 | + "query": query_text, | |
| 528 | + "type": "best_fields", | |
| 529 | + "fields": fields, | |
| 530 | + "boost": self.best_fields_clause_boost, | |
| 531 | + } | |
| 532 | + } | |
| 533 | + | |
| 534 | + def _build_phrase_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: | |
| 535 | + specs = self._build_match_field_specs( | |
| 536 | + language, | |
| 537 | + multilingual_fields=list(self.phrase_field_boosts), | |
| 538 | + shared_fields=[], | |
| 539 | + boost_overrides=self.phrase_field_boosts, | |
| 540 | + ) | |
| 541 | + fields = self._format_match_field_specs(specs) | |
| 542 | + if not fields: | |
| 543 | + return None | |
| 544 | + clause: Dict[str, Any] = { | |
| 545 | + "multi_match": { | |
| 546 | + "query": query_text, | |
| 547 | + "type": "phrase", | |
| 548 | + "fields": fields, | |
| 549 | + "boost": self.phrase_match_boost, | |
| 550 | + } | |
| 551 | + } | |
| 552 | + if self.phrase_match_slop > 0: | |
| 553 | + clause["multi_match"]["slop"] = self.phrase_match_slop | |
| 554 | + if self.phrase_match_tie_breaker > 0: | |
| 555 | + clause["multi_match"]["tie_breaker"] = self.phrase_match_tie_breaker | |
| 556 | + return clause | |
| 557 | + | |
| 558 | + def _build_lexical_language_clause( | |
| 559 | + self, | |
| 560 | + lang: str, | |
| 561 | + lang_query: str, | |
| 562 | + clause_name: str, | |
| 563 | + *, | |
| 564 | + is_source: bool, | |
| 565 | + contains_chinese: bool, | |
| 566 | + contains_english: bool, | |
| 567 | + index_languages: List[str], | |
| 568 | + ) -> Optional[Dict[str, Any]]: | |
| 569 | + all_specs = self._build_match_field_specs(lang) | |
| 570 | + expanded_specs = self._expand_match_field_specs_for_mixed_script( | |
| 571 | + lang, | |
| 572 | + all_specs, | |
| 573 | + contains_chinese, | |
| 574 | + contains_english, | |
| 575 | + index_languages, | |
| 576 | + is_source, | |
| 577 | + ) | |
| 578 | + combined_fields = self._format_match_field_specs(expanded_specs) | |
| 579 | + if not combined_fields: | |
| 580 | + return None | |
| 581 | + minimum_should_match = ( | |
| 582 | + self.base_minimum_should_match if is_source else self.translation_minimum_should_match | |
| 583 | + ) | |
| 584 | + should_clauses = [ | |
| 585 | + clause | |
| 586 | + for clause in ( | |
| 587 | + self._build_best_fields_clause(lang, lang_query), | |
| 588 | + self._build_phrase_clause(lang, lang_query), | |
| 589 | + ) | |
| 590 | + if clause | |
| 591 | + ] | |
| 592 | + clause: Dict[str, Any] = { | |
| 593 | + "bool": { | |
| 594 | + "_name": clause_name, | |
| 595 | + "must": [ | |
| 596 | + { | |
| 597 | + "combined_fields": { | |
| 598 | + "query": lang_query, | |
| 599 | + "fields": combined_fields, | |
| 600 | + "minimum_should_match": minimum_should_match, | |
| 601 | + } | |
| 602 | + } | |
| 603 | + ], | |
| 604 | + } | |
| 605 | + } | |
| 606 | + if should_clauses: | |
| 607 | + clause["bool"]["should"] = should_clauses | |
| 608 | + if not is_source: | |
| 609 | + clause["bool"]["boost"] = float(self.translation_boost) | |
| 610 | + return clause | |
| 611 | + | |
| 546 | 612 | def _get_embedding_field(self, language: str) -> str: |
| 547 | 613 | """Get embedding field name for a language.""" |
| 548 | 614 | # Currently using unified embedding field |
| ... | ... | @@ -603,42 +669,18 @@ class ESQueryBuilder: |
| 603 | 669 | |
| 604 | 670 | def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None: |
| 605 | 671 | nonlocal should_clauses |
| 606 | - all_specs, _ = self._build_match_field_specs(lang) | |
| 607 | - expanded_specs = self._expand_match_field_specs_for_mixed_script( | |
| 672 | + clause = self._build_lexical_language_clause( | |
| 608 | 673 | lang, |
| 609 | - all_specs, | |
| 610 | - contains_chinese, | |
| 611 | - contains_english, | |
| 612 | - normalized_index_languages, | |
| 613 | - is_source, | |
| 674 | + lang_query, | |
| 675 | + clause_name, | |
| 676 | + is_source=is_source, | |
| 677 | + contains_chinese=contains_chinese, | |
| 678 | + contains_english=contains_english, | |
| 679 | + index_languages=normalized_index_languages, | |
| 614 | 680 | ) |
| 615 | - match_fields = self._format_match_field_specs(expanded_specs) | |
| 616 | - if not match_fields: | |
| 681 | + if not clause: | |
| 617 | 682 | return |
| 618 | - minimum_should_match = ( | |
| 619 | - self.base_minimum_should_match if is_source else self.translation_minimum_should_match | |
| 620 | - ) | |
| 621 | - | |
| 622 | - clause = { | |
| 623 | - "multi_match": { | |
| 624 | - "_name": clause_name, | |
| 625 | - "fields": match_fields, | |
| 626 | - "minimum_should_match": minimum_should_match, | |
| 627 | - "query": lang_query, | |
| 628 | - "tie_breaker": self.tie_breaker_base_query, | |
| 629 | - } | |
| 630 | - } | |
| 631 | - # base_query: never set multi_match.boost (ES default 1.0). | |
| 632 | - # Translation clauses: single knob from config — translation_boost. | |
| 633 | - if not is_source: | |
| 634 | - tb = float(self.translation_boost) | |
| 635 | - clause["multi_match"]["boost"] = tb | |
| 636 | - should_clauses.append({ | |
| 637 | - "multi_match": clause["multi_match"] | |
| 638 | - }) | |
| 639 | - self._append_phrase_should_clause( | |
| 640 | - should_clauses, lang, lang_query, clause_name | |
| 641 | - ) | |
| 683 | + should_clauses.append(clause) | |
| 642 | 684 | |
| 643 | 685 | if base_query_text: |
| 644 | 686 | append_clause(source_lang, base_query_text, "base_query", True) |
| ... | ... | @@ -661,24 +703,9 @@ class ESQueryBuilder: |
| 661 | 703 | "query": query_text, |
| 662 | 704 | "fields": fallback_fields, |
| 663 | 705 | "minimum_should_match": self.base_minimum_should_match, |
| 664 | - "tie_breaker": self.tie_breaker_base_query, | |
| 665 | - } | |
| 666 | - } | |
| 667 | - fb_should: List[Dict[str, Any]] = [fallback_lexical] | |
| 668 | - self._append_phrase_should_clause( | |
| 669 | - fb_should, | |
| 670 | - self.default_language, | |
| 671 | - query_text, | |
| 672 | - "base_query_fallback" | |
| 673 | - ) | |
| 674 | - if len(fb_should) == 1: | |
| 675 | - return fallback_lexical | |
| 676 | - return { | |
| 677 | - "bool": { | |
| 678 | - "should": fb_should, | |
| 679 | - "minimum_should_match": 1, | |
| 680 | 706 | } |
| 681 | 707 | } |
| 708 | + return fallback_lexical | |
| 682 | 709 | |
| 683 | 710 | # Return bool query with should clauses |
| 684 | 711 | if len(should_clauses) == 1: | ... | ... |
search/searcher.py
| ... | ... | @@ -133,6 +133,10 @@ class Searcher: |
| 133 | 133 | translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, |
| 134 | 134 | translation_boost=self.config.query_config.translation_boost, |
| 135 | 135 | tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, |
| 136 | + best_fields_boosts=self.config.query_config.best_fields, | |
| 137 | + best_fields_clause_boost=self.config.query_config.best_fields_boost, | |
| 138 | + phrase_field_boosts=self.config.query_config.phrase_fields, | |
| 139 | + phrase_match_boost=self.config.query_config.phrase_match_boost, | |
| 136 | 140 | ) |
| 137 | 141 | |
| 138 | 142 | def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: | ... | ... |
tests/test_es_query_builder.py
| ... | ... | @@ -14,19 +14,19 @@ def _builder() -> ESQueryBuilder: |
| 14 | 14 | ) |
| 15 | 15 | |
| 16 | 16 | |
| 17 | -def _lexical_multi_match_fields(query_root: Dict[str, Any]) -> list: | |
| 18 | - """Fields from the non-phrase multi_match (bool.should or single clause).""" | |
| 19 | - if "multi_match" in query_root: | |
| 20 | - mm = query_root["multi_match"] | |
| 21 | - if mm.get("type") == "phrase": | |
| 22 | - raise AssertionError("root multi_match is phrase-only") | |
| 23 | - return mm["fields"] | |
| 17 | +def _lexical_clause(query_root: Dict[str, Any]) -> Dict[str, Any]: | |
| 18 | + """Return the first named lexical bool clause from query_root.""" | |
| 19 | + if "bool" in query_root and query_root["bool"].get("_name"): | |
| 20 | + return query_root["bool"] | |
| 24 | 21 | for clause in query_root.get("bool", {}).get("should", []): |
| 25 | - mm = clause.get("multi_match") or {} | |
| 26 | - if mm.get("type") == "phrase": | |
| 27 | - continue | |
| 28 | - return mm["fields"] | |
| 29 | - raise AssertionError("no lexical multi_match in query_root") | |
| 22 | + clause_bool = clause.get("bool") or {} | |
| 23 | + if clause_bool.get("_name"): | |
| 24 | + return clause_bool | |
| 25 | + raise AssertionError("no lexical bool clause in query_root") | |
| 26 | + | |
| 27 | + | |
| 28 | +def _lexical_combined_fields(query_root: Dict[str, Any]) -> list: | |
| 29 | + return _lexical_clause(query_root)["must"][0]["combined_fields"]["fields"] | |
| 30 | 30 | |
| 31 | 31 | |
| 32 | 32 | def test_knn_prefilter_includes_range_filters(): |
| ... | ... | @@ -96,14 +96,11 @@ def test_text_query_contains_only_base_and_translation_named_queries(): |
| 96 | 96 | index_languages=["en", "zh", "fr"], |
| 97 | 97 | ) |
| 98 | 98 | should = q["query"]["bool"]["should"] |
| 99 | - names = [clause["multi_match"]["_name"] for clause in should] | |
| 99 | + names = [clause["bool"]["_name"] for clause in should] | |
| 100 | 100 | |
| 101 | - assert names == [ | |
| 102 | - "base_query", | |
| 103 | - "base_query_phrase", | |
| 104 | - "base_query_trans_zh", | |
| 105 | - "base_query_trans_zh_phrase", | |
| 106 | - ] | |
| 101 | + assert names == ["base_query", "base_query_trans_zh"] | |
| 102 | + base_should = q["query"]["bool"]["should"][0]["bool"]["should"] | |
| 103 | + assert [clause["multi_match"]["type"] for clause in base_should] == ["best_fields", "phrase"] | |
| 107 | 104 | |
| 108 | 105 | |
| 109 | 106 | def test_text_query_skips_duplicate_translation_same_as_base(): |
| ... | ... | @@ -122,8 +119,8 @@ def test_text_query_skips_duplicate_translation_same_as_base(): |
| 122 | 119 | ) |
| 123 | 120 | |
| 124 | 121 | root = q["query"] |
| 125 | - assert root["bool"]["should"][0]["multi_match"]["_name"] == "base_query" | |
| 126 | - assert root["bool"]["should"][1]["multi_match"]["_name"] == "base_query_phrase" | |
| 122 | + assert root["bool"]["_name"] == "base_query" | |
| 123 | + assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] | |
| 127 | 124 | |
| 128 | 125 | |
| 129 | 126 | def test_mixed_script_merges_en_fields_into_zh_clause(): |
| ... | ... | @@ -147,7 +144,7 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): |
| 147 | 144 | enable_knn=False, |
| 148 | 145 | index_languages=["zh", "en"], |
| 149 | 146 | ) |
| 150 | - fields = _lexical_multi_match_fields(q["query"]) | |
| 147 | + fields = _lexical_combined_fields(q["query"]) | |
| 151 | 148 | bases = {f.split("^", 1)[0] for f in fields} |
| 152 | 149 | assert "title.zh" in bases and "title.en" in bases |
| 153 | 150 | assert "brief.zh" in bases and "brief.en" in bases |
| ... | ... | @@ -177,7 +174,7 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): |
| 177 | 174 | enable_knn=False, |
| 178 | 175 | index_languages=["zh", "en"], |
| 179 | 176 | ) |
| 180 | - fields = _lexical_multi_match_fields(q["query"]) | |
| 177 | + fields = _lexical_combined_fields(q["query"]) | |
| 181 | 178 | bases = {f.split("^", 1)[0] for f in fields} |
| 182 | 179 | assert "title.en" in bases and "title.zh" in bases |
| 183 | 180 | assert "title.zh^0.6" in fields |
| ... | ... | @@ -205,7 +202,7 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): |
| 205 | 202 | enable_knn=False, |
| 206 | 203 | index_languages=["zh", "en"], |
| 207 | 204 | ) |
| 208 | - fields = _lexical_multi_match_fields(q["query"]) | |
| 205 | + fields = _lexical_combined_fields(q["query"]) | |
| 209 | 206 | assert "title.zh^5.0" in fields |
| 210 | 207 | assert "title.en^6.0" in fields # 10.0 * 0.6 |
| 211 | 208 | |
| ... | ... | @@ -231,7 +228,7 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): |
| 231 | 228 | enable_knn=False, |
| 232 | 229 | index_languages=["zh"], |
| 233 | 230 | ) |
| 234 | - fields = _lexical_multi_match_fields(q["query"]) | |
| 231 | + fields = _lexical_combined_fields(q["query"]) | |
| 235 | 232 | bases = {f.split("^", 1)[0] for f in fields} |
| 236 | 233 | assert "title.zh" in bases |
| 237 | 234 | assert "title.en" not in bases | ... | ... |
tests/test_es_query_builder_text_recall_languages.py
| ... | ... | @@ -2,8 +2,8 @@ |
| 2 | 2 | ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. |
| 3 | 3 | |
| 4 | 4 | Covers combinations of query language vs tenant index_languages, translations, |
| 5 | -and mixed Chinese/English queries. Asserts multi_match _name, query text, and | |
| 6 | -target language fields (title.{lang}). | |
| 5 | +and mixed Chinese/English queries. Asserts named lexical clause boundaries, | |
| 6 | +combined_fields payloads, and per-language target fields (title.{lang}). | |
| 7 | 7 | """ |
| 8 | 8 | |
| 9 | 9 | from types import SimpleNamespace |
| ... | ... | @@ -34,7 +34,7 @@ def _builder_multilingual_title_only( |
| 34 | 34 | def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]: |
| 35 | 35 | """Navigate bool.must / function_score wrappers to the text recall root.""" |
| 36 | 36 | q = es_body.get("query") or {} |
| 37 | - if "bool" in q and "must" in q["bool"] and q["bool"]["must"]: | |
| 37 | + if "bool" in q and not q["bool"].get("_name") and "must" in q["bool"] and q["bool"]["must"]: | |
| 38 | 38 | q = q["bool"]["must"][0] |
| 39 | 39 | if "function_score" in q: |
| 40 | 40 | q = q["function_score"]["query"] |
| ... | ... | @@ -49,30 +49,45 @@ def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any] |
| 49 | 49 | return [c["multi_match"] for c in should if "multi_match" in c] |
| 50 | 50 | |
| 51 | 51 | |
| 52 | +def _extract_named_lexical_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]: | |
| 53 | + inner = _unwrap_inner_query(es_body) | |
| 54 | + if "bool" in inner and inner["bool"].get("_name"): | |
| 55 | + return [inner["bool"]] | |
| 56 | + should = (inner.get("bool") or {}).get("should") or [] | |
| 57 | + return [c["bool"] for c in should if "bool" in c and c["bool"].get("_name")] | |
| 58 | + | |
| 59 | + | |
| 52 | 60 | def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: |
| 53 | - """Map _name -> multi_match dict.""" | |
| 61 | + """Map lexical clause _name -> bool query body.""" | |
| 54 | 62 | out: Dict[str, Dict[str, Any]] = {} |
| 55 | - for mm in _extract_multi_match_clauses(es_body): | |
| 56 | - name = mm.get("_name") | |
| 63 | + for clause in _extract_named_lexical_clauses(es_body): | |
| 64 | + name = clause.get("_name") | |
| 57 | 65 | if name: |
| 58 | - out[str(name)] = mm | |
| 66 | + out[str(name)] = clause | |
| 59 | 67 | return out |
| 60 | 68 | |
| 61 | 69 | |
| 62 | -def _with_phrase(lexical_names: set[str]) -> set[str]: | |
| 63 | - """Each lexical recall clause has a companion ``*_phrase`` multi_match.""" | |
| 64 | - return lexical_names | {f"{n}_phrase" for n in lexical_names} | |
| 70 | +def _combined_fields_clause(clause: Dict[str, Any]) -> Dict[str, Any]: | |
| 71 | + return clause["must"][0]["combined_fields"] | |
| 72 | + | |
| 65 | 73 | |
| 74 | +def _should_multi_matches(clause: Dict[str, Any]) -> List[Dict[str, Any]]: | |
| 75 | + return [item["multi_match"] for item in clause.get("should") or [] if "multi_match" in item] | |
| 66 | 76 | |
| 67 | -def _title_fields(mm: Dict[str, Any]) -> List[str]: | |
| 68 | - fields = mm.get("fields") or [] | |
| 77 | + | |
| 78 | +def _should_multi_matches_by_type(clause: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: | |
| 79 | + return {str(mm.get("type") or "best_fields"): mm for mm in _should_multi_matches(clause)} | |
| 80 | + | |
| 81 | + | |
| 82 | +def _title_fields(clause: Dict[str, Any]) -> List[str]: | |
| 83 | + fields = _combined_fields_clause(clause).get("fields") or [] | |
| 69 | 84 | return [f for f in fields if str(f).startswith("title.")] |
| 70 | 85 | |
| 71 | 86 | |
| 72 | -def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool: | |
| 87 | +def _has_title_lang(clause: Dict[str, Any], lang: str) -> bool: | |
| 73 | 88 | """True if any field is title.{lang} with optional ^boost suffix.""" |
| 74 | 89 | prefix = f"title.{lang}" |
| 75 | - for f in mm.get("fields") or []: | |
| 90 | + for f in _combined_fields_clause(clause).get("fields") or []: | |
| 76 | 91 | s = str(f) |
| 77 | 92 | if s == prefix or s.startswith(prefix + "^"): |
| 78 | 93 | return True |
| ... | ... | @@ -119,10 +134,10 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): |
| 119 | 134 | index_languages=["zh", "en"], |
| 120 | 135 | ) |
| 121 | 136 | idx = _clauses_index(q) |
| 122 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | |
| 123 | - assert idx["base_query"]["query"] == "连衣裙" | |
| 137 | + assert set(idx) == {"base_query", "base_query_trans_en"} | |
| 138 | + assert _combined_fields_clause(idx["base_query"])["query"] == "连衣裙" | |
| 124 | 139 | assert "title.zh" in _title_fields(idx["base_query"]) |
| 125 | - assert idx["base_query_trans_en"]["query"] == "dress" | |
| 140 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" | |
| 126 | 141 | assert "title.en" in _title_fields(idx["base_query_trans_en"]) |
| 127 | 142 | |
| 128 | 143 | |
| ... | ... | @@ -137,10 +152,10 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): |
| 137 | 152 | index_languages=["en", "zh"], |
| 138 | 153 | ) |
| 139 | 154 | idx = _clauses_index(q) |
| 140 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 141 | - assert idx["base_query"]["query"] == "dress" | |
| 155 | + assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 156 | + assert _combined_fields_clause(idx["base_query"])["query"] == "dress" | |
| 142 | 157 | assert "title.en" in _title_fields(idx["base_query"]) |
| 143 | - assert idx["base_query_trans_zh"]["query"] == "连衣裙" | |
| 158 | + assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "连衣裙" | |
| 144 | 159 | assert "title.zh" in _title_fields(idx["base_query_trans_zh"]) |
| 145 | 160 | |
| 146 | 161 | |
| ... | ... | @@ -155,13 +170,11 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): |
| 155 | 170 | index_languages=["de", "en", "fr"], |
| 156 | 171 | ) |
| 157 | 172 | idx = _clauses_index(q) |
| 158 | - assert set(idx) == _with_phrase( | |
| 159 | - {"base_query", "base_query_trans_en", "base_query_trans_fr"} | |
| 160 | - ) | |
| 161 | - assert idx["base_query"]["query"] == "kleid" | |
| 173 | + assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} | |
| 174 | + assert _combined_fields_clause(idx["base_query"])["query"] == "kleid" | |
| 162 | 175 | assert "title.de" in _title_fields(idx["base_query"]) |
| 163 | - assert idx["base_query_trans_en"]["query"] == "dress" | |
| 164 | - assert idx["base_query_trans_fr"]["query"] == "robe" | |
| 176 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" | |
| 177 | + assert _combined_fields_clause(idx["base_query_trans_fr"])["query"] == "robe" | |
| 165 | 178 | |
| 166 | 179 | |
| 167 | 180 | # --- 检测语言不在 index_languages:仍有 base(弱)+ 翻译(强) --- |
| ... | ... | @@ -178,15 +191,13 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): |
| 178 | 191 | index_languages=["en", "zh"], |
| 179 | 192 | ) |
| 180 | 193 | idx = _clauses_index(q) |
| 181 | - assert set(idx) == _with_phrase( | |
| 182 | - {"base_query", "base_query_trans_en", "base_query_trans_zh"} | |
| 183 | - ) | |
| 184 | - assert idx["base_query"]["query"] == "schuh" | |
| 194 | + assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} | |
| 195 | + assert _combined_fields_clause(idx["base_query"])["query"] == "schuh" | |
| 185 | 196 | assert "title.de" in _title_fields(idx["base_query"]) |
| 186 | 197 | assert "boost" not in idx["base_query"] |
| 187 | - assert idx["base_query_trans_en"]["query"] == "shoe" | |
| 198 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "shoe" | |
| 188 | 199 | assert idx["base_query_trans_en"]["boost"] == qb.translation_boost |
| 189 | - assert idx["base_query_trans_zh"]["query"] == "鞋" | |
| 200 | + assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "鞋" | |
| 190 | 201 | assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost |
| 191 | 202 | |
| 192 | 203 | |
| ... | ... | @@ -206,10 +217,10 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): |
| 206 | 217 | contains_english=True, |
| 207 | 218 | ) |
| 208 | 219 | idx = _clauses_index(q) |
| 209 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | |
| 210 | - assert idx["base_query"]["query"] == "红色 dress" | |
| 220 | + assert set(idx) == {"base_query", "base_query_trans_en"} | |
| 221 | + assert _combined_fields_clause(idx["base_query"])["query"] == "红色 dress" | |
| 211 | 222 | assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") |
| 212 | - assert idx["base_query_trans_en"]["query"] == "red dress" | |
| 223 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress" | |
| 213 | 224 | assert _has_title_lang(idx["base_query_trans_en"], "en") |
| 214 | 225 | |
| 215 | 226 | |
| ... | ... | @@ -226,10 +237,10 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): |
| 226 | 237 | contains_english=True, |
| 227 | 238 | ) |
| 228 | 239 | idx = _clauses_index(q) |
| 229 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 230 | - assert idx["base_query"]["query"] == "nike 运动鞋" | |
| 240 | + assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 241 | + assert _combined_fields_clause(idx["base_query"])["query"] == "nike 运动鞋" | |
| 231 | 242 | assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") |
| 232 | - assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋" | |
| 243 | + assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "耐克运动鞋" | |
| 233 | 244 | |
| 234 | 245 | |
| 235 | 246 | def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): |
| ... | ... | @@ -245,7 +256,7 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): |
| 245 | 256 | contains_english=True, |
| 246 | 257 | ) |
| 247 | 258 | idx = _clauses_index(q) |
| 248 | - assert set(idx) == _with_phrase({"base_query"}) | |
| 259 | + assert set(idx) == {"base_query"} | |
| 249 | 260 | bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])} |
| 250 | 261 | assert bases == {"title.zh"} |
| 251 | 262 | |
| ... | ... | @@ -264,7 +275,7 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): |
| 264 | 275 | index_languages=["en", "zh"], |
| 265 | 276 | ) |
| 266 | 277 | idx = _clauses_index(q) |
| 267 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 278 | + assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 268 | 279 | |
| 269 | 280 | |
| 270 | 281 | def test_keeps_translation_when_same_text_but_different_lang_than_base(): |
| ... | ... | @@ -278,8 +289,8 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): |
| 278 | 289 | index_languages=["en", "zh"], |
| 279 | 290 | ) |
| 280 | 291 | idx = _clauses_index(q) |
| 281 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 282 | - assert idx["base_query_trans_zh"]["query"] == "NIKE" | |
| 292 | + assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 293 | + assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "NIKE" | |
| 283 | 294 | |
| 284 | 295 | |
| 285 | 296 | # --- 翻译 key 规范化、空翻译跳过 --- |
| ... | ... | @@ -297,7 +308,7 @@ def test_translation_language_key_is_normalized_case_insensitive(): |
| 297 | 308 | ) |
| 298 | 309 | idx = _clauses_index(q) |
| 299 | 310 | assert "base_query_trans_zh" in idx |
| 300 | - assert idx["base_query_trans_zh"]["query"] == "连衣裙" | |
| 311 | + assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "连衣裙" | |
| 301 | 312 | |
| 302 | 313 | |
| 303 | 314 | def test_empty_translation_value_is_skipped(): |
| ... | ... | @@ -331,8 +342,10 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): |
| 331 | 342 | idx = _clauses_index(q) |
| 332 | 343 | assert "boost" not in idx["base_query"] |
| 333 | 344 | assert idx["base_query_trans_en"]["boost"] == qb.translation_boost |
| 334 | - assert idx["base_query_phrase"]["boost"] == qb.phrase_match_boost | |
| 335 | - assert idx["base_query_trans_en_phrase"]["boost"] == qb.phrase_match_boost | |
| 345 | + base_should = _should_multi_matches_by_type(idx["base_query"]) | |
| 346 | + trans_should = _should_multi_matches_by_type(idx["base_query_trans_en"]) | |
| 347 | + assert base_should["phrase"]["boost"] == qb.phrase_match_boost | |
| 348 | + assert trans_should["phrase"]["boost"] == qb.phrase_match_boost | |
| 336 | 349 | |
| 337 | 350 | |
| 338 | 351 | # --- 无翻译:仅 base_query --- |
| ... | ... | @@ -349,7 +362,7 @@ def test_no_translations_only_base_query(): |
| 349 | 362 | index_languages=["en", "zh"], |
| 350 | 363 | ) |
| 351 | 364 | idx = _clauses_index(q) |
| 352 | - assert set(idx) == _with_phrase({"base_query"}) | |
| 365 | + assert set(idx) == {"base_query"} | |
| 353 | 366 | |
| 354 | 367 | |
| 355 | 368 | # --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- |
| ... | ... | @@ -373,7 +386,7 @@ def test_text_clauses_present_alongside_knn(): |
| 373 | 386 | ) |
| 374 | 387 | assert "knn" in q |
| 375 | 388 | idx = _clauses_index(q) |
| 376 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 389 | + assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 377 | 390 | |
| 378 | 391 | |
| 379 | 392 | def test_detected_language_unknown_falls_back_to_default_language(): |
| ... | ... | @@ -393,8 +406,8 @@ def test_detected_language_unknown_falls_back_to_default_language(): |
| 393 | 406 | index_languages=["en", "zh"], |
| 394 | 407 | ) |
| 395 | 408 | idx = _clauses_index(q) |
| 396 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 397 | - assert idx["base_query"]["query"] == "shirt" | |
| 409 | + assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 410 | + assert _combined_fields_clause(idx["base_query"])["query"] == "shirt" | |
| 398 | 411 | assert _has_title_lang(idx["base_query"], "en") |
| 399 | 412 | |
| 400 | 413 | |
| ... | ... | @@ -409,10 +422,10 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): |
| 409 | 422 | index_languages=["ru", "en"], |
| 410 | 423 | ) |
| 411 | 424 | idx = _clauses_index(q) |
| 412 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | |
| 413 | - assert idx["base_query"]["query"] == "платье" | |
| 425 | + assert set(idx) == {"base_query", "base_query_trans_en"} | |
| 426 | + assert _combined_fields_clause(idx["base_query"])["query"] == "платье" | |
| 414 | 427 | assert _has_title_lang(idx["base_query"], "ru") |
| 415 | - assert idx["base_query_trans_en"]["query"] == "dress" | |
| 428 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" | |
| 416 | 429 | |
| 417 | 430 | |
| 418 | 431 | def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): |
| ... | ... | @@ -431,7 +444,7 @@ def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause |
| 431 | 444 | ) |
| 432 | 445 | idx = _clauses_index(q) |
| 433 | 446 | assert "base_query_trans_de" in idx |
| 434 | - assert idx["base_query_trans_de"]["query"] == "Kleid" | |
| 447 | + assert _combined_fields_clause(idx["base_query_trans_de"])["query"] == "Kleid" | |
| 435 | 448 | assert _has_title_lang(idx["base_query_trans_de"], "de") |
| 436 | 449 | |
| 437 | 450 | |
| ... | ... | @@ -449,5 +462,5 @@ def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_bas |
| 449 | 462 | contains_english=False, |
| 450 | 463 | ) |
| 451 | 464 | idx = _clauses_index(q) |
| 452 | - assert idx["base_query"]["query"] == "红色连衣裙" | |
| 453 | - assert idx["base_query_trans_en"]["query"] == "red dress" | |
| 465 | + assert _combined_fields_clause(idx["base_query"])["query"] == "红色连衣裙" | |
| 466 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress" | ... | ... |