Commit e756b18eebb8bf04453e1383d003f941ac88db16
1 parent
a3d3fb11
重构了文本召回构建器,现在每个 base_query / base_query_trans_*
子句都变成了一个带有以下结构的命名布尔查询: must:combined_fields should:加权后的 best_fields 和 phrase 子句 主要改动位于 search/es_query_builder.py,但此次调整沿用了现有语言路由设计,并未引入一次性分支。额外的 should 子句权重现在通过 config/schema.py、config/loader.py、search/searcher.py 以及 config/config.yaml 进行配置驱动,从而保持结构的集中管理。
Showing
9 changed files
with
883 additions
and
222 deletions
Show diff stats
config/config.yaml
| @@ -81,6 +81,7 @@ field_boosts: | @@ -81,6 +81,7 @@ field_boosts: | ||
| 81 | category_path: 2.0 | 81 | category_path: 2.0 |
| 82 | brief: 1.5 | 82 | brief: 1.5 |
| 83 | description: 1.5 | 83 | description: 1.5 |
| 84 | + vendor: 1.5 | ||
| 84 | option1_values: 1.5 | 85 | option1_values: 1.5 |
| 85 | option2_values: 1.5 | 86 | option2_values: 1.5 |
| 86 | option3_values: 1.5 | 87 | option3_values: 1.5 |
| @@ -126,6 +127,7 @@ query_config: | @@ -126,6 +127,7 @@ query_config: | ||
| 126 | - "category_name_text" | 127 | - "category_name_text" |
| 127 | - "brief" | 128 | - "brief" |
| 128 | - "description" | 129 | - "description" |
| 130 | + - "vendor" | ||
| 129 | shared_fields: | 131 | shared_fields: |
| 130 | - "tags" | 132 | - "tags" |
| 131 | - "option1_values" | 133 | - "option1_values" |
| @@ -133,7 +135,7 @@ query_config: | @@ -133,7 +135,7 @@ query_config: | ||
| 133 | - "option3_values" | 135 | - "option3_values" |
| 134 | core_multilingual_fields: | 136 | core_multilingual_fields: |
| 135 | - "title" | 137 | - "title" |
| 136 | - - "brief" | 138 | + - "qanchors" |
| 137 | - "category_name_text" | 139 | - "category_name_text" |
| 138 | 140 | ||
| 139 | # 统一文本召回策略(主查询 + 翻译查询) | 141 | # 统一文本召回策略(主查询 + 翻译查询) |
| @@ -142,6 +144,15 @@ query_config: | @@ -142,6 +144,15 @@ query_config: | ||
| 142 | translation_minimum_should_match: "75%" | 144 | translation_minimum_should_match: "75%" |
| 143 | translation_boost: 0.75 | 145 | translation_boost: 0.75 |
| 144 | tie_breaker_base_query: 0.5 | 146 | tie_breaker_base_query: 0.5 |
| 147 | + best_fields_boost: 2.0 | ||
| 148 | + best_fields: | ||
| 149 | + title: 4.0 | ||
| 150 | + qanchors: 3.0 | ||
| 151 | + category_name_text: 2.0 | ||
| 152 | + phrase_fields: | ||
| 153 | + title: 5.0 | ||
| 154 | + qanchors: 4.0 | ||
| 155 | + phrase_match_boost: 3.0 | ||
| 145 | 156 | ||
| 146 | # Embedding字段名称 | 157 | # Embedding字段名称 |
| 147 | text_embedding_field: "title_embedding" | 158 | text_embedding_field: "title_embedding" |
config/loader.py
| @@ -285,6 +285,16 @@ class AppConfigLoader: | @@ -285,6 +285,16 @@ class AppConfigLoader: | ||
| 285 | translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), | 285 | translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), |
| 286 | translation_boost=float(text_strategy.get("translation_boost", 0.4)), | 286 | translation_boost=float(text_strategy.get("translation_boost", 0.4)), |
| 287 | tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)), | 287 | tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)), |
| 288 | + best_fields={ | ||
| 289 | + str(field): float(boost) | ||
| 290 | + for field, boost in dict(text_strategy.get("best_fields") or {}).items() | ||
| 291 | + }, | ||
| 292 | + best_fields_boost=float(text_strategy.get("best_fields_boost", 2.0)), | ||
| 293 | + phrase_fields={ | ||
| 294 | + str(field): float(boost) | ||
| 295 | + for field, boost in dict(text_strategy.get("phrase_fields") or {}).items() | ||
| 296 | + }, | ||
| 297 | + phrase_match_boost=float(text_strategy.get("phrase_match_boost", 3.0)), | ||
| 288 | zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"), | 298 | zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"), |
| 289 | en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"), | 299 | en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"), |
| 290 | default_translation_model=str( | 300 | default_translation_model=str( |
config/schema.py
| @@ -55,6 +55,10 @@ class QueryConfig: | @@ -55,6 +55,10 @@ class QueryConfig: | ||
| 55 | translation_minimum_should_match: str = "70%" | 55 | translation_minimum_should_match: str = "70%" |
| 56 | translation_boost: float = 0.4 | 56 | translation_boost: float = 0.4 |
| 57 | tie_breaker_base_query: float = 0.9 | 57 | tie_breaker_base_query: float = 0.9 |
| 58 | + best_fields: Dict[str, float] = field(default_factory=dict) | ||
| 59 | + best_fields_boost: float = 2.0 | ||
| 60 | + phrase_fields: Dict[str, float] = field(default_factory=dict) | ||
| 61 | + phrase_match_boost: float = 3.0 | ||
| 58 | zh_to_en_model: str = "opus-mt-zh-en" | 62 | zh_to_en_model: str = "opus-mt-zh-en" |
| 59 | en_to_zh_model: str = "opus-mt-en-zh" | 63 | en_to_zh_model: str = "opus-mt-en-zh" |
| 60 | default_translation_model: str = "nllb-200-distilled-600m" | 64 | default_translation_model: str = "nllb-200-distilled-600m" |
| @@ -0,0 +1,596 @@ | @@ -0,0 +1,596 @@ | ||
| 1 | +项目 TODO 清单 | ||
| 2 | + | ||
| 3 | +2. 核心搜索功能优化 | ||
| 4 | + | ||
| 5 | +2.1 意图识别模块 | ||
| 6 | + | ||
| 7 | +- 增加款式意图识别模块 | ||
| 8 | + | ||
| 9 | +- 意图类型: 颜色,尺码(目前只需要支持这两种) | ||
| 10 | + | ||
| 11 | +- 意图召回层: | ||
| 12 | +每种意图,有一个召回词集合 | ||
| 13 | +对query(包括原始query、各种翻译query 都做匹配) | ||
| 14 | + | ||
| 15 | +- 以颜色意图为例: | ||
| 16 | +有一个词表,每一行 都逗号分割,互为同义词,行内第一个为标准化词 | ||
| 17 | +query匹配了其中任何一个词,都认为,具有颜色意图 | ||
| 18 | +匹配规则: 用细粒度、粗粒度分词,看是否有在词表中的。原始query分词、和每种翻译的分词,都要用。 | ||
| 19 | + | ||
| 20 | +- 意图判断: | ||
| 21 | + 暂时留空,直接返回true。目前没有模型,即只要召回了(词表匹配了),即认为有该维度款式需求。 | ||
| 22 | + (以后考虑建设fasttext/bert系列多分类模型) | ||
| 23 | + | ||
| 24 | +- 意图使用: | ||
| 25 | +我们第一阶段,使用 参与ES提权。 | ||
| 26 | + | ||
| 27 | +- 一、参与ES提权 | ||
| 28 | + | ||
| 29 | +- 二、参与reranker | ||
| 30 | + | ||
| 31 | +- 如果有: 先做sku筛选,然后把最优的拼接到名称中,参与reranker。 | ||
| 32 | + | ||
| 33 | + | ||
| 34 | + | ||
| 35 | +- 现在在reranker、分页之后、做填充的时候,已经有做sku的筛选。 | ||
| 36 | +需要优化: | ||
| 37 | +现在是,先做包含的判断,找到第一个 option_value被query包含的,则直接认为匹配。改为 | ||
| 38 | + 1. 第一轮:遍历完,如果有且仅有一个被query包含,那么认为匹配。 | ||
| 39 | + 2. 第二轮:如果有多个符合(被query包含),跳到3。如果没有,对每个词都走泛化词表进行匹配。 | ||
| 40 | + 3. 第三轮:如果有多个,那么对这多个,走embedding相关性取最高的。如果一个也没有,则对所有的走embedding相关性取最高的 | ||
| 41 | + 这个sku筛选也需要提取为一个独立的模块。 | ||
| 42 | + | ||
| 43 | +- 另外:现在是reranker、分页之后做sku筛选,要改为: | ||
| 44 | + 1. 有款式意图的时候,才做sku筛选 | ||
| 45 | + 2. sku筛选的时机,改为在reranker之前,对所有内容做sku筛选,然后 | ||
|  | 46 |  | + 3. 从仅 option1 扩展到多个维度,识别的意图,包含意图的维度名(color)和维度名的泛化词list(color、颜色、colour、colors……),遍历option1_name,option2_name,option3_name,看哪个能匹配上意图的维度名list,哪个匹配上了,则在这个维度筛选。 |  |  |  |
| 47 | + 4. Rerank doc (有款式意图的时候)要带上属性后缀,拼接到title后面。在调用 run_rerank 前,对每条 hit 生成「用于重排的 doc 文本」(标题 + 可选后缀) | ||
| 48 | + 5. TODO : 还有一个问题。 目前,sku只返回一个维度(店铺主维度。默认应该是option1,不是所有维度的sku信息都返回的。所以,如果有款式意图,但是主维度是颜色,那么拿不到全的款式sku) | ||
| 49 | + | ||
| 50 | +- 筛选SKU: 先只筛选第一个维度,但考虑到用户搜索词可能带了尺码,所以第二、三个维度也要考虑 | ||
| 51 | + | ||
| 52 | + 当前项目功能已经较多,但是有清晰的框架,请务必基于现有框架进行改造,不要进行补丁式的修改,避免代码逻辑分叉。 | ||
| 53 | + 请一步一步来,先设计意图识别模块,仔细思考需求,意图识别模块需要提供哪些内容,用于返回数据接口的定义,深度思考,定义一个合理的接口后,再给出合理的模块设计。 | ||
| 54 | + | ||
| 55 | +2.3 向量检索与融合 | ||
| 56 | + | ||
| 57 | +- 把knn跟文本相关性的融合方式修改为 "rank": {"rrf": {} }需要licence,可以帮我修改源码支持吗? | ||
| 58 | + | ||
| 59 | + knn_boost: 2.0 | ||
| 60 | + | ||
| 61 | + { | ||
| 62 | + "query": { ...全文检索... }, | ||
| 63 | + "knn": { ...向量检索... }, | ||
| 64 | + "rank": { | ||
| 65 | + "rrf": {} | ||
| 66 | + } | ||
| 67 | + } | ||
| 68 | + | ||
| 69 | +- 融合打分(已完成,2026-03) | ||
| 70 | + | ||
| 71 | + 以下已经完成: | ||
| 72 | + 1. fuse_scores_and_resort 已改为乘法融合,并通过 matched_queries 提取: | ||
| 73 | + - base_query | ||
| 74 | + - base_query_trans_* | ||
| 75 | + - fallback_original_query_* | ||
| 76 | + - knn_query | ||
| 77 | + 2. 文本相关性大分不再依赖 phrase_query / keywords_query,这两类查询已清理。 | ||
| 78 | + 3. 当前融合策略: | ||
| 79 | + - text_score = primary(weighted_source, weighted_translation, weighted_fallback) + 0.25 * support | ||
| 80 | + - fused_score = (rerank_score + 0.00001) * (text_score + 0.1) ** 0.35 * (knn_score + 0.6) ** 0.2 | ||
| 81 | + 4. track_scores 与 include_named_queries_score 已接入,调试字段与评估方法已同步到: | ||
| 82 | + - docs/相关性检索优化说明.md | ||
| 83 | + - docs/搜索API对接指南.md | ||
| 84 | + - docs/Usage-Guide.md | ||
| 85 | + | ||
| 86 | + 未完成的: | ||
|  | 87 |  | + (归一化、次序融合?还是乘法公式?) |  |  |  |
| 88 | + RRF:先把多路召回稳妥融合 | ||
| 89 | + linear + minmax:让你能精调 knn 和文本的权重 | ||
| 90 | + reranker:对前面召回出来的 top-k 再做“最后一刀” | ||
| 91 | + | ||
| 92 | +2.4 文本相关性优化 | ||
| 93 | + | ||
| 94 | +- 调研: | ||
| 95 | +Princeton WordNet — 英文同义词底库 | ||
| 96 | +Shopify Product Taxonomy — 电商品类标准 | ||
| 97 | +Querqy — 电商搜索规则框架 | ||
| 98 | +gensimpson/elasticsearch-synonyms — ES 同义词规则落地 | ||
| 99 | + | ||
| 100 | +- tags字段使用的优化: | ||
| 101 | +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。 | ||
| 102 | +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样) | ||
| 103 | + | ||
| 104 | +- 是否需要: | ||
| 105 | +当「源语言不在 index_languages」且「某些目标语言的翻译缺失」时,ES 里会额外加一层 用「原始 query 字符串」去撞缺失语种字段 | ||
| 106 | + | ||
| 107 | +- 检索相关性优化: | ||
| 108 | +原始搜索词和翻译的词,都需要有对应的主干分析 | ||
| 109 | +这个主干可以根据词性简单提取名词即可 | ||
| 110 | +在搜索时,原始词和主干都成对地出现,原始词和trunk_keywords一起组成一个或查询。 | ||
| 111 | +有一种方案是把原始词和主干词拼接起来。但是bm25要调tf系数。 | ||
| 112 | + | ||
| 113 | +2.5 图片相关性与向量字段调整 | ||
| 114 | + | ||
| 115 | +- "image_embedding": { | ||
| 116 | + "type": "nested", | ||
| 117 | + "properties": { | ||
| 118 | + "vector": { | ||
| 119 | + "type": "dense_vector", | ||
| 120 | + "dims": 1024, | ||
| 121 | + "index": true, | ||
| 122 | + "similarity": "dot_product", | ||
| 123 | + "element_type": "bfloat16" | ||
| 124 | + }, | ||
| 125 | + "url": { | ||
| 126 | + "type": "text" | ||
| 127 | + } | ||
| 128 | + } | ||
| 129 | +}, | ||
| 130 | +去掉 image_embedding_512 | ||
| 131 | +image_embedding改为,一个spu有多个sku向量,每个向量内部properties: | ||
| 132 | +除了vector url还应该包括,该图片是对应哪些sku | ||
| 133 | +"image_embedding": { | ||
| 134 | + "type": "nested", | ||
| 135 | + "properties": { | ||
| 136 | + "vector": { | ||
| 137 | + "type": "dense_vector", | ||
| 138 | + "dims": 1024, | ||
| 139 | + "index": true, | ||
| 140 | + "similarity": "dot_product", | ||
| 141 | + "element_type": "bfloat16" | ||
| 142 | + }, | ||
| 143 | + "url": { | ||
| 144 | + "type": "text" | ||
| 145 | + } | ||
| 146 | + } | ||
| 147 | +}, | ||
| 148 | + | ||
| 149 | +- 引入图片的相关性: | ||
| 150 | +图片的向量最好做SKU维度,用 SPU 维度还是 SKU 维度? | ||
| 151 | + 1. SKU维度(主款式,option1维度),如果用户搜索“蓝色 T恤”,这种图片相关性会比较有价值。 | ||
| 152 | + 2. 我不考虑颜色的差异,其余的款式一般是大小之类的。这些图片,embedding细分到 SKU 维度,可能价值不大,性价比偏低 | ||
| 153 | + | ||
| 154 | +- 属性的筛选: | ||
| 155 | +训练一个bert/transformer多分类模型,分类: 颜色、尺寸、材质 等等。但是要注意一些属性的值不规范、非常多,要考虑 是不是做规范化,如何规范化。 | ||
| 156 | + | ||
| 157 | +2.6 无结果重查与翻译缺失处理 | ||
| 158 | + | ||
| 159 | +- 无结果重查 | ||
| 160 | +稀有语言,翻译可能超时(因为zh-en互译之外的翻译耗时更长) | ||
| 161 | + | ||
| 162 | + | ||
| 163 | +--- | ||
| 164 | + | ||
| 165 | +3. 模型与推理服务优化 | ||
| 166 | + | ||
| 167 | +3.1 大模型API与本地部署 | ||
| 168 | + | ||
| 169 | +- 外部需求: | ||
| 170 | + 1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 | ||
|  | 171 |  | + 2. ES支持reranker pipeline? |  |  |  |
| 172 | + | ||
| 173 | +- 本地部署一个7b Q4量化的大模型 | ||
| 174 | + | ||
| 175 | +3.2 Embedding服务优化 | ||
| 176 | + | ||
| 177 | +- 先阅读文本embedding相关的代码: | ||
| 178 | +@embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py | ||
| 179 | +目前有TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入限制,超限返回过载状态码。 | ||
| 180 | + | ||
| 181 | +- 文本embedding服务,要支持 priority 查询参数,priority > 0:不计入上述 inflight、不会因准入被拒绝(图片embedding不需要支持,因为只有离线需要用到图片embedding) | ||
| 182 | +priority == 0(默认,适合做索引之类的离线任务):仍走原有 TEXT_MAX_INFLIGHT / IMAGE_MAX_INFLIGHT 准入;超限返回过载状态码。 | ||
| 183 | +priority > 0(或者==1)(适合在线请求):不会因准入被拒绝,但是仍然需要占用inflight,这样保证在线请求不被限制,并且在线请求很多的时候可以拒绝掉离线的请求。 | ||
| 184 | + | ||
| 185 | +- 除了限制规则的修改,更进一步的,也需要保证这种请求是优先处理的(priority=1的相比=0的更优先被处理)。 | ||
| 186 | +关于技术方案,有Worker + 双队列、PriorityMutex等等,除此之外,也请你思考合适的方案。 | ||
| 187 | +成熟稳定、不带来复杂度、性能、稳定性方面的副作用,是最重要的。请先了解代码、需求,深度思考解决方案 | ||
| 188 | + | ||
| 189 | +- 向量的缓存 | ||
| 190 | + | ||
| 191 | +3.3 Reranker优化 | ||
| 192 | + | ||
| 193 | +- 多reranker: | ||
| 194 | +改 reranker 服务,一次请求返回多路分 | ||
| 195 | +服务启动时 加载多个 backend(或按请求懒加载),/rerank 响应扩展为例如 | ||
| 196 | +scores: [...](兼容主后端)+ scores_by_backend: { "bge": [...], "qwen3_vllm": [...] }。 | ||
| 197 | +搜索侧解析多路分,再融合或只透传 debug。 | ||
| 198 | +优点:搜索侧仍只调一个 URL。缺点:单进程多大模型 显存压力很大; | ||
| 199 | + | ||
| 200 | +- 融合层要注意的一点 | ||
| 201 | +fuse_scores_and_resort 目前只消费 一条 rerank_scores 序列,并写入 _rerank_score | ||
| 202 | +多 backend 之后需要rerank_scores 都参与融合 | ||
| 203 | + | ||
| 204 | +- 必要性: | ||
| 205 | +见 qwen3-reranker和bge-m3的严重badcase | ||
| 206 | +不一定是要多reranker的方式,但是一定会需要解决方案。 | ||
| 207 | + | ||
| 208 | +- reranker 补充:nvidia/llama-nemotron-rerank-1b-v2 | ||
| 209 | +https://huggingface.co/nvidia/llama-nemotron-rerank-1b-v2 | ||
| 210 | +后端推理也建议使用vLLM | ||
| 211 | +注意搜索相关资料,挖掘我的特斯拉 T4 GPU 的性能,充分挖掘性能 | ||
| 212 | +你有充足的自由度进行实验 | ||
| 213 | +encoder架构。 | ||
| 214 | +比较新。 | ||
| 215 | +性能更好。 | ||
| 216 | +亚马逊 电商搜索数据集比qwen-reranker-4b更好。 | ||
| 217 | +支持vLLM。 | ||
| 218 | + | ||
| 219 | +- Qwen3-Reranker-4B-GGUF | ||
| 220 | +https://modelscope.cn/models/dengcao/Qwen3-Reranker-4B-GGUF/summary | ||
| 221 | + 1. 要确定选择哪种量化方式 | ||
| 222 | + 2. 确定提示词 | ||
| 223 | + | ||
| 224 | +- qwen3-embedding、qwen3-reranker (done) | ||
| 225 | +选一个推理引擎,相比于我自己直接调 sentence-transformers,主要是多进程和负载均衡、连续批处理,比较有用 | ||
| 226 | +当前结论:embedding 场景优先 TEI;vLLM 更偏向生成式与 rerank 场景。 | ||
| 227 | + | ||
| 228 | +- rerank 性能优化 | ||
| 229 | + | ||
| 230 | +3.4 翻译模型优化 | ||
| 231 | + | ||
| 232 | +- 翻译,增加facebook/nllb-200-distilled-600M | ||
| 233 | +https://blog.csdn.net/qq_42746084/article/details/154947534 | ||
| 234 | +https://huggingface.co/facebook/nllb-200-distilled-600M | ||
| 235 | + | ||
| 236 | +- 店铺的语言:英语能占到80%,所以专门增加一个en-zh的 | ||
| 237 | +https://huggingface.co/Helsinki-NLP/opus-mt-zh-en | ||
| 238 | +https://huggingface.co/Helsinki-NLP/opus-mt-en-zh | ||
| 239 | + | ||
| 240 | +- opus-mt-zh-en | ||
| 241 | + | ||
| 242 | + from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | ||
| 243 | + model_name = "./models/opus-mt-en-zh" | ||
| 244 | + tokenizer = AutoTokenizer.from_pretrained(model_name) | ||
| 245 | + model = AutoModelForSeq2SeqLM.from_pretrained(model_name) | ||
| 246 | + data = 'test' | ||
| 247 | + encoded = tokenizer([data], return_tensors="pt") | ||
| 248 | + translation = model.generate(**encoded) | ||
| 249 | + result = tokenizer.batch_decode(translation, skip_special_tokens=True)[0] | ||
| 250 | + print(result) | ||
| 251 | + | ||
| 252 | +- nllb-200-distilled-600M性能优化 | ||
| 253 | +已完成(2026-03) | ||
| 254 | + - CTranslate2 迁移 + float16 转换 | ||
| 255 | + - 扩展压测报告:perf_reports/20260318/translation_local_models_ct2/README.md | ||
| 256 | + - T4 聚焦调优报告:perf_reports/20260318/translation_local_models_ct2_focus/README.md | ||
| 257 | + - NLLB T4 商品标题专项报告:perf_reports/20260318/nllb_t4_product_names_ct2/README.md | ||
| 258 | + - 当前结论: | ||
| 259 | + - NLLB 在线默认推荐:ct2_inter_threads=4 + ct2_max_queued_batches=32 + ct2_batch_type=examples + ct2_decoding_length_mode=source(+8,min=32) | ||
| 260 | + - opus-mt-zh-en 维持保守默认更稳 | ||
| 261 | + - opus-mt-en-zh 如追求离线吞吐可继续做单独 profile | ||
| 262 | + | ||
| 263 | +- 请搜索nllb-200-distilled-600M这类seq2seq、transformer架构的模型,有哪些性能优化方案,提高线上翻译服务的吞吐量、降低耗时,搜索相关的在线推理服务方案,找到高性能的服务化方法 | ||
| 264 | + | ||
| 265 | +- 查看翻译的缓存情况 | ||
| 266 | + | ||
| 267 | +3.5 其他模型优化 | ||
| 268 | + | ||
| 269 | +- cnclip的性能优化 | ||
| 270 | + | ||
| 271 | + | ||
| 272 | +--- | ||
| 273 | + | ||
| 274 | +4. 性能优化与超时配置 | ||
| 275 | + | ||
| 276 | +4.1 超时配置 | ||
| 277 | + | ||
| 278 | +- Query 分析阶段等待翻译/embedding 的硬超时 | ||
| 279 | +配置文件位置:config/config.yaml | ||
| 280 | +配置项:query_config.async_wait_timeout_ms: 80 | ||
| 281 | +代码生效点:query/query_parser.py 使用该值换算成秒传给 wait(...) | ||
| 282 | + | ||
| 283 | +2. Embedding HTTP 调用超时(Text/Image) | ||
| 284 | +不再使用任何环境变量覆盖(之前提到的 EMBEDDING_HTTP_TIMEOUT_SEC 已不采用) | ||
| 285 | +配置文件位置:config/config.yaml | ||
| 286 | +配置项:services.embedding.providers.http.timeout_sec(已在 YAML 里补了示例默认 60) | ||
| 287 | +代码生效点: | ||
| 288 | +embeddings/text_encoder.py:requests.post(..., timeout=self.timeout_sec) | ||
| 289 | +embeddings/image_encoder.py:requests.post(..., timeout=self.timeout_sec) | ||
| 290 | + | ||
| 291 | +4.2 生成式服务优化(Partial Mode) | ||
| 292 | + | ||
| 293 | +- product_enrich : Partial Mode : done | ||
| 294 | +https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-menu-2400256.d_0_3_0_7.74a630119Ct6zR | ||
| 295 | +需在messages 数组中将最后一条消息的 role 设置为 assistant,并在其 content 中提供前缀,在此消息中设置参数 "partial": true。messages格式如下: | ||
| 296 | +[ | ||
| 297 | + { | ||
| 298 | + "role": "user", | ||
| 299 | + "content": "请补全这个斐波那契函数,勿添加其它内容" | ||
| 300 | + }, | ||
| 301 | + { | ||
| 302 | + "role": "assistant", | ||
| 303 | + "content": "def calculate_fibonacci(n):\n if n <= 1:\n return n\n else:\n", | ||
| 304 | + "partial": true | ||
| 305 | + } | ||
| 306 | +] | ||
| 307 | +模型会以前缀内容为起点开始生成。 | ||
| 308 | +支持 非思考模式。 | ||
| 309 | + | ||
| 310 | + | ||
| 311 | +--- | ||
| 312 | + | ||
| 313 | +5. Elasticsearch相关 | ||
| 314 | + | ||
| 315 | +- es需要licence的两个功能,如果费用低,开通下licence,或者改es源码定制开发下,支持 rank.rrf,reranker | ||
| 316 | + | ||
| 317 | + { | ||
| 318 | + "query": { ...全文检索... }, | ||
| 319 | + "knn": { ...向量检索... }, | ||
| 320 | + "rank": { | ||
| 321 | + "rrf": {} | ||
| 322 | + } | ||
| 323 | + } | ||
| 324 | + | ||
| 325 | + | ||
| 326 | +--- | ||
| 327 | + | ||
| 328 | +1. 配置体系重构 | ||
| 329 | +Referring to @docs/config-system-review-and-redesign.md , most of the modifications have been completed. Could you conduct a review to check what else needs improvement in the configuration documentation system? Are there any outstanding issues? | ||
| 330 | + | ||
| 331 | +一、仍然存在大量通过环境变量获取配置的地方 | ||
| 332 | +_SERVICE_KIND = (os.getenv("EMBEDDING_SERVICE_KIND", "all") or "all").strip().lower() | ||
| 333 | +if _SERVICE_KIND not in {"all", "text", "image"}: | ||
| 334 | + raise RuntimeError( | ||
| 335 | + f"Invalid EMBEDDING_SERVICE_KIND={_SERVICE_KIND!r}; expected all, text, or image" | ||
| 336 | + ) | ||
| 337 | +_TEXT_ENABLED_BY_ENV = os.getenv("EMBEDDING_ENABLE_TEXT_MODEL", "true").lower() in ("1", "true", "yes") | ||
| 338 | +_IMAGE_ENABLED_BY_ENV = os.getenv("EMBEDDING_ENABLE_IMAGE_MODEL", "true").lower() in ("1", "true", "yes") | ||
| 339 | +open_text_model = _TEXT_ENABLED_BY_ENV and _SERVICE_KIND in {"all", "text"} | ||
| 340 | +open_image_model = _IMAGE_ENABLED_BY_ENV and _SERVICE_KIND in {"all", "image"} | ||
| 341 | + | ||
| 342 | +_text_encode_lock = threading.Lock() | ||
| 343 | +_image_encode_lock = threading.Lock() | ||
| 344 | + | ||
| 345 | +_TEXT_MICROBATCH_WINDOW_SEC = max( | ||
| 346 | + 0.0, float(os.getenv("TEXT_MICROBATCH_WINDOW_MS", "4")) / 1000.0 | ||
| 347 | +) | ||
| 348 | +_TEXT_REQUEST_TIMEOUT_SEC = max( | ||
| 349 | + 1.0, float(os.getenv("TEXT_REQUEST_TIMEOUT_SEC", "30")) | ||
| 350 | +) | ||
| 351 | +_TEXT_MAX_INFLIGHT = max(1, int(os.getenv("TEXT_MAX_INFLIGHT", "32"))) | ||
| 352 | +_IMAGE_MAX_INFLIGHT = max(1, int(os.getenv("IMAGE_MAX_INFLIGHT", "1"))) | ||
| 353 | +_OVERLOAD_STATUS_CODE = int(os.getenv("EMBEDDING_OVERLOAD_STATUS_CODE", "503")) | ||
| 354 | +_LOG_PREVIEW_COUNT = max(1, int(os.getenv("EMBEDDING_LOG_PREVIEW_COUNT", "3"))) | ||
| 355 | +_LOG_TEXT_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_TEXT_PREVIEW_CHARS", "120"))) | ||
| 356 | +_LOG_IMAGE_PREVIEW_CHARS = max(32, int(os.getenv("EMBEDDING_LOG_IMAGE_PREVIEW_CHARS", "180"))) | ||
| 357 | +_VECTOR_PREVIEW_DIMS = max(1, int(os.getenv("EMBEDDING_VECTOR_PREVIEW_DIMS", "6"))) | ||
| 358 | +_CACHE_PREFIX = str(REDIS_CONFIG.get("embedding_cache_prefix", "embedding")).strip() or "embedding" | ||
| 359 | + | ||
| 360 | + | ||
| 361 | + | ||
| 362 | + | ||
| 363 | + | ||
| 364 | +还有这些写死的地址 @embedding/config.py | ||
| 365 | + | ||
| 366 | +self.TEI_BASE_URL = str(text_backend.get("base_url") or "http://127.0.0.1:8080") | ||
| 367 | +self.TEI_TIMEOUT_SEC = int(text_backend.get("timeout_sec", 60)) | ||
| 368 | + | ||
| 369 | +self.USE_CLIP_AS_SERVICE = services.image_backend == "clip_as_service" | ||
| 370 | +self.CLIP_AS_SERVICE_SERVER = str(image_backend.get("server") or "grpc://127.0.0.1:51000") | ||
| 371 | + | ||
| 372 | + | ||
| 373 | + | ||
| 374 | + | ||
| 375 | +看起来似乎并没有完全遵循这些原则? | ||
| 376 | +4. 重新设计的设计原则 | ||
| 377 | +重新设计应遵循以下规则。 | ||
| 378 | + | ||
| 379 | +4.1 单一逻辑配置系统 | ||
| 380 | +可以有多个文件,但不能有多个职责重叠的加载器。 | ||
| 381 | +必须有一个加载器管道,能够生成一个类型化的 AppConfig 对象。 | ||
| 382 | + | ||
| 383 | +4.2 配置文件负责声明,解析代码负责解释,环境变量负责运行时注入 | ||
| 384 | +职责应明确如下: | ||
| 385 | +配置文件 | ||
| 386 | +声明非敏感的目标行为和可部署的非敏感设置 | ||
| 387 | +解析逻辑 | ||
| 388 | +加载、合并、验证、规范化并暴露类型化的配置 | ||
| 389 | +绝不发明隐藏的业务行为 | ||
| 390 | +环境变量 | ||
| 391 | +承载密钥和少量运行时/进程相关的值 | ||
| 392 | +不随意地重新定义业务行为 | ||
| 393 | + | ||
| 394 | +4.3 整个系统采用单一的优先级规则 | ||
| 395 | +除非明确豁免,否则每个配置类别都应遵循相同的合并模型。 | ||
| 396 | + | ||
| 397 | +4.4 业务行为不得有静默的隐式后备 | ||
| 398 | +在启动时,如果必需的配置缺失或无效,应快速失败。 | ||
| 399 | +不要静默地回退到诸如硬编码语言列表之类的遗留行为。 | ||
| 400 | + | ||
| 401 | +4.5 有效配置必须可观测 | ||
| 402 | +每个服务都应能够展示: | ||
| 403 | +配置版本或哈希值 | ||
| 404 | +加载的源文件 | ||
| 405 | +环境名称 | ||
| 406 | +经过清理的有效配置 | ||
| 407 | + | ||
| 408 | +5. 推荐的目标设计 | ||
| 409 | + | ||
| 410 | +5.1 边界模型 | ||
| 411 | +使用三个清晰的层级。 | ||
| 412 | +层级 1:代码仓库管理的静态配置 | ||
| 413 | +目的: | ||
| 414 | +搜索行为 | ||
| 415 | +租户行为 | ||
| 416 | +提供商/后端注册表 | ||
| 417 | +非敏感的服务拓扑默认值 | ||
| 418 | +功能开关 | ||
| 419 | +示例: | ||
| 420 | +字段权重 | ||
| 421 | +查询策略 | ||
| 422 | +重排序融合参数 | ||
| 423 | +租户语言方案 | ||
| 424 | +翻译能力注册表 | ||
| 425 | +嵌入后端选择默认值 | ||
| 426 | + | ||
| 427 | +层级 2:特定环境的层叠配置 | ||
| 428 | +目的: | ||
| 429 | +按环境区分的非敏感差异 | ||
| 430 | +按环境区分的服务端点 | ||
| 431 | +按环境区分的资源大小默认值 | ||
| 432 | +开发/测试/生产环境的运维差异 | ||
| 433 | +示例: | ||
| 434 | +本地嵌入 URL 与生产环境嵌入 URL | ||
| 435 | +开发环境重排序后端与生产环境重排序后端 | ||
| 436 | +本地开发环境中较低的并发度 | ||
| 437 | + | ||
| 438 | +层级 3:环境变量 | ||
| 439 | +目的: | ||
| 440 | +密钥 | ||
| 441 | +绑定主机/端口 | ||
| 442 | +外部基础设施凭证 | ||
| 443 | +容器编排器的最后一步注入 | ||
| 444 | +示例: | ||
| 445 | +ES_HOST, ES_USERNAME, ES_PASSWORD | ||
| 446 | +DB_HOST, DB_USERNAME, DB_PASSWORD | ||
| 447 | +REDIS_HOST, REDIS_PASSWORD | ||
| 448 | +DASHSCOPE_API_KEY, DEEPL_AUTH_KEY | ||
| 449 | +API_HOST, API_PORT, INDEXER_PORT, TRANSLATION_PORT | ||
| 450 | +规则: | ||
| 451 | +环境变量不应成为选择业务行为(如翻译模型、嵌入后端或租户语言策略)的常规途径 | ||
| 452 | +如果允许对非敏感字段进行环境变量覆盖,则必须将其明确列出并记录为运维覆盖手段,而非隐藏的约定 | ||
| 453 | + | ||
| 454 | +5.2 统一的优先级 | ||
| 455 | +推荐的优先级顺序: | ||
| 456 | +代码中的模式(schema)默认值 | ||
| 457 | +config/base.yaml | ||
| 458 | +config/environments/<env>.yaml | ||
| 459 | +来自 config/tenants/ 的租户层叠配置 | ||
| 460 | +针对明确允许的运行时键的环境变量 | ||
| 461 | +仅适用于当前进程的命令行标志 | ||
| 462 | +重要规则: | ||
| 463 | +只能有一个模块实现此合并逻辑 | ||
| 464 | +任何业务模块不得直接调用 os.getenv() 来获取配置 | ||
| 465 | + | ||
| 466 | + | ||
| 467 | + | ||
| 468 | +二、cursor(composer2)帮我审查的 | ||
| 469 | + | ||
| 470 | +## 配置文档系统审查 | ||
| 471 | + | ||
| 472 | +这是对配置文档系统的简要审查,对照 [`docs/config-system-review-and-redesign.md`](docs/config-system-review-and-redesign.md) 和当前代码库进行。 | ||
| 473 | + | ||
| 474 | +### 实现已匹配的部分 | ||
| 475 | + | ||
| 476 | +代码库已朝着重新设计的目标迈进了很大一步: | ||
| 477 | + | ||
| 478 | +- 单一类型化根节点:`AppConfig` 位于 [`config/schema.py`](config/schema.py),通过 [`config/loader.py`](config/loader.py) 和 [`get_app_config()`](config/loader.py) 加载。 | ||
| 479 | +- 精简的 `services_config`:[`config/services_config.py`](config/services_config.py) 委托给 `get_app_config()`(解决了服务层的发现 A)。 | ||
| 480 | +- 租户目录:[`config/tenant_config_loader.py`](config/tenant_config_loader.py) 使用 `get_app_config().tenants`。 | ||
| 481 | +- 重写资产路径:`config/dictionaries/query_rewrite.dict` 已存在(解决了发现 E 中的文件名不匹配问题)。 | ||
| 482 | +- 可观测性:[`GET /admin/config`](api/routes/admin.py) 返回经过脱敏处理的有效配置树;[`GET /admin/config/meta`](api/routes/admin.py) 暴露环境信息、`config_hash`、`loaded_files`、`deprecated_keys`(涵盖了 §5.10 的意图;文档中提议的是 `/admin/config/effective`,但实际实现位于 `/admin/config`)。 | ||
| 483 | + | ||
| 484 | +因此,"单一加载器 + 有效配置可见性" 的故事在代码中已基本实现;文档尚未完全跟上。 | ||
| 485 | + | ||
| 486 | +--- | ||
| 487 | + | ||
| 488 | +## 文档问题(影响最大) | ||
| 489 | + | ||
| 490 | +### 1. 管理 API 文档中关于 `/admin/config` 的描述错误 | ||
| 491 | + | ||
| 492 | +[`docs/搜索API对接指南.md`](docs/搜索API对接指南.md)(管理部分附近)和 [`docs/搜索API对接指南-06-管理接口(Admin).md`](docs/搜索API对接指南-06-管理接口(Admin).md) 仍将 `/admin/config` 描述为按租户的 JSON(包含 `tenant_id`、`es_index_name`、`supported_languages` 等字段)。实际实现返回的是 `AppConfig.sanitized_dict()`(完整的应用配置,敏感信息已脱敏),而不是租户摘要字段。 | ||
| 493 | + | ||
| 494 | +这些指南中还缺少: `GET /admin/config/meta`。 | ||
| 495 | + | ||
| 496 | +健康检查: 拆分指南中的示例包含了 [`HealthResponse`](api/models.py) 中不存在的字段(只有 `status` 和 `elasticsearch`)。 | ||
| 497 | + | ||
| 498 | +对于任何仅根据文档进行 API 集成的人来说,这是最明显的"未解决问题"。 | ||
| 499 | + | ||
| 500 | +### 2. 面向开发者的指南仍将 `services_config` 作为"配置解析器"的核心 | ||
| 501 | + | ||
| 502 | +[`docs/DEVELOPER_GUIDE.md`](docs/DEVELOPER_GUIDE.md) §5.2 仍说搜索配置由 `ConfigLoader` 加载,服务由 `config/services_config` "解析"。§6.2 仍将 `config/services_config.py` 列为主要的"解析入口"。[`docs/QUICKSTART.md`](docs/QUICKSTART.md) §3.1 仍说"配置解析:`config/services_config.py`"。 | ||
| 503 | + | ||
| 504 | +文档中准确的说法应该是:规范入口是 `config/loader.py` + `get_app_config()`;[`config/config_loader.py`](config/config_loader.py) 中的 `ConfigLoader` 包装了统一加载器;`services_config` 是现有调用点的兼容性外观。 | ||
| 505 | + | ||
| 506 | +### 3. 重新设计文档本身不是"活的"状态文档 | ||
| 507 | + | ||
| 508 | +[`docs/config-system-review-and-redesign.md`](docs/config-system-review-and-redesign.md) 读起来仍是纯粹的问题陈述 + 目标,没有简短的**"已实现 vs 剩余"**部分。这很容易让人假设什么都没做,或者重复工作。添加一个小附录(或一页 `config/README.md` —— 见下文)可以解决这个问题。 | ||
| 509 | + | ||
| 510 | +### 4. 缺少 `config/README.md`(§5.3 中推荐) | ||
| 511 | + | ||
| 512 | +仍然没有专门的 `config/README.md` 来描述:加载器入口点、高级优先级、字典存放位置、指向 `/admin/config` + `/admin/config/meta` 的链接,以及重新设计文档的链接。这是重新设计中明确的交付物,可以锚定"文档系统"。 | ||
| 513 | + | ||
| 514 | +### 5. 轻微的文档整洁问题 | ||
| 515 | + | ||
| 516 | +- [`docs/QUICKSTART.md`](docs/QUICKSTART.md) §1.9 环境变量项目后的行有一个多余字符:`---·`(可能是打字错误)。 | ||
| 517 | +- [`docs/DEVELOPER_GUIDE.md`](docs/DEVELOPER_GUIDE.md) §10 文档索引没有列出 `config-system-review-and-redesign.md` 或未来的 `config/README.md`。 | ||
| 518 | + | ||
| 519 | +--- | ||
| 520 | + | ||
| 521 | +## 重新设计目标与当前代码之间的差距(文档不应声称"已完成") | ||
| 522 | + | ||
| 523 | +这些影响文档的诚实度: | ||
| 524 | + | ||
| 525 | +| 主题 | 状态 | | ||
| 526 | +|--------|--------| | ||
| 527 | +| `config dump` CLI(§5.10) | `main.py` 中不存在;运维人员依赖 HTTP 或临时脚本。 | | ||
| 528 | +| 隐藏的 `["en", "zh"]` 回退(阶段 3 / 发现 D) | 仍在 [`indexer/document_transformer.py`](indexer/document_transformer.py)、[`suggestion/builder.py`](suggestion/builder.py) 等中使用。 | | ||
| 529 | +| 加载器外的 `os.getenv`(规则 1–2) | 仍在例如 [`embeddings/server.py`](embeddings/server.py)、[`reranker/server.py`](reranker/server.py)、[`api/app.py`](api/app.py) 中使用 —— 文档声称"仅加载器"将是夸大其词。 | | ||
| 530 | +| 拆分 `base.yaml` / `environments/` / `tenants/*.yaml`(阶段 5) | 未采用;仍是单一的 [`config/config.yaml`](config/config.yaml)。 | | ||
| 531 | +| 遗留租户标志(阶段 6 / 发现 H) | [`indexer/README.md`](indexer/README.md) 仍描述上游 MySQL 的 `translate_to_en` / `translate_to_zh`(这可能作为上游模式文档保留;需与 Python `tenant_config` 模型区分开来)。 | | ||
| 532 | + | ||
| 533 | +--- | ||
| 534 | + | ||
| 535 | +## 推荐的后续步骤(仅文档,按优先级排序) | ||
| 536 | + | ||
| 537 | +1. 修复管理 API 文档(合并指南 + `-06-` 拆分):`/admin/config` 的响应格式,添加 `/admin/config/meta`,使健康检查示例与 [`HealthResponse`](api/models.py) 一致。 | ||
| 538 | +2. 更新 DEVELOPER_GUIDE §5–§6 和 QUICKSTART §1.9 / §3.1,将 `get_app_config()` / `loader.py` 描述为主要入口,将 `services_config` 描述为适配器。 | ||
| 539 | +3. 添加 `config/README.md`(简短的操作 + 开发者入口)。 | ||
| 540 | +4. 在 `config-system-review-and-redesign.md` 中添加带日期的实现状态表(已交付 vs 推迟的内容),使审查文档不与现实矛盾。 | ||
| 541 | +5. DEVELOPER_GUIDE §9 检查清单:将"配置来自 `services_config`"替换为允许 `get_app_config()` 或精简适配器的语言,与 §6 保持一致。 | ||
| 542 | + | ||
| 543 | +如果需要,我可以在后续处理中为项目 1–3 和重新设计文档中的简短状态块应用补丁。 | ||
| 544 | + | ||
| 545 | +其他云API | ||
| 546 | +1 | ||
| 547 | +1)提供两个rerank云API_KEY给我:(优先级:高) | ||
| 548 | +AWS Bedrock / Azure 两家云有提供的Cohere Rerank 3.5/4模型API,开通APIKEY | ||
| 549 | +google云 Vertex AI Ranking API | ||
| 550 | + | ||
| 551 | +已经调研: | ||
| 552 | +阿里云在美国地区没有提供任意reranker API | ||
| 553 | +AWS Bedrock / Azure 两家云有提供Cohere Rerank 3.5 | ||
| 554 | +google云Vertex AI Ranking API性能更好 | ||
| 555 | + | ||
| 556 | +以上两个APIKEY给我,我来测试性能和效果。 | ||
| 557 | + | ||
| 558 | + | ||
| 559 | +2)寻找美国地区reranker API最佳实践(优先级:高) | ||
| 560 | +效果要求:qwen3-reranker-4b(或者同等能力。可对比huggingface公开的评测指标)的API | ||
| 561 | +性能要求:在我们的服务器上,一个请求内排序400条结果、耗时低于300ms | ||
| 562 | +测试评估:基于电商领域商品搜索场景评估效果(我可以提供数据) | ||
| 563 | +据我了解的Cohere Rerank可能达不到这个性能要求,可能可以考虑拆分为4个请求、每个100条,做到300ms以内可能可以。 | ||
| 564 | +参考Cohere Rerank 3.5 benchmark: | ||
| 565 | +https://docs.oracle.com/en-us/iaas/Content/generative-ai/benchmark-cohere-rerank-3-5.htm | ||
| 566 | + | ||
| 567 | + | ||
| 568 | +3)提供谷歌翻译API的apikey (优先级:低) | ||
| 569 | +给我apikey,我看下耗时,希望耗时P95低于80ms满足在线请求使用 | ||
| 570 | +在线翻译的问题已经基本解决,这一块需求不是特别大。 | ||
| 571 | + | ||
| 572 | +2 | ||
| 573 | +混用 大模型 使用:hunyuan-turbos-latest | ||
| 574 | +混元 OpenAI 兼容接口相关调用示例:https://cloud.tencent.com/document/product/1729/111007 | ||
| 575 | + | ||
|  | 576 |  | + 腾讯云 混元大模型 API_KEY:<已脱敏——密钥不得明文写入文档/仓库;该泄露密钥需立即吊销,改用环境变量注入> |  |  |  |
| 577 | + | ||
| 578 | +hunyuan翻译:使用模型 hunyuan-translation | ||
| 579 | +https://cloud.tencent.com/document/product/1729/113395#4.-.E7.A4.BA.E4.BE.8B | ||
| 580 | + | ||
| 581 | +谷歌翻译 基础版:https://docs.cloud.google.com/translate/docs/reference/rest/v2/translate | ||
| 582 | + | ||
| 583 | +阿里云 百炼模型 现在使用的apikey是国内的。 | ||
| 584 | +各地域的 Base URL 和对应的 API Key 是绑定的。 | ||
| 585 | + | ||
| 586 | +现在使用了美国的服务器,使用了美国的地址,需要在 美国地域控制台页面(https://modelstudio.console.aliyun.com/us-east-1 )中创建或获取API_KEY: | ||
| 587 | + | ||
| 588 | +登录 百炼美国地域控制台:https://modelstudio.console.aliyun.com/us-east-1?spm=5176.2020520104.0.0.6b383a98WjpXff | ||
| 589 | +在 API Key 管理 中创建或复制一个适用于美国地域的 Key | ||
| 590 | + | ||
| 591 | +搜索效果反馈: | ||
| 592 | +做完一些短期优化后,需要做一些case驱动的优化。 | ||
| 593 | +给到100条测试用例,每个搜索词,要记录请求ID、以及 希望排序靠前但是没有靠前的(比如希望出现在第一页但是没出现在第一页的)、以及未召回的商品ID(希望出现在前几页但是没翻到的) | ||
| 594 | +6. 其他任务 | ||
| 595 | + | ||
| 596 | +- suggest 索引,现在是全量脚本,要交给金伟 | ||
| 0 | \ No newline at end of file | 597 | \ No newline at end of file |
docs/TODO.txt
| @@ -55,14 +55,6 @@ image_embedding改为,一个spu有多个sku向量,每个向量内部properti | @@ -55,14 +55,6 @@ image_embedding改为,一个spu有多个sku向量,每个向量内部properti | ||
| 55 | }, | 55 | }, |
| 56 | 56 | ||
| 57 | 57 | ||
| 58 | - | ||
| 59 | - | ||
| 60 | -tags字段使用的优化: | ||
| 61 | -现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。 | ||
| 62 | -可以考虑也拆分多语言,配合analyzer使用(和qanchors一样) | ||
| 63 | - | ||
| 64 | - | ||
| 65 | - | ||
| 66 | 外部需求: | 58 | 外部需求: |
| 67 | 1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 | 59 | 1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 |
| 68 | 2. ES支持reranker pipline? | 60 | 2. ES支持reranker pipline? |
| @@ -86,7 +78,7 @@ query匹配了其中任何一个词,都认为,具有颜色意图 | @@ -86,7 +78,7 @@ query匹配了其中任何一个词,都认为,具有颜色意图 | ||
| 86 | 匹配规则: 用细粒度、粗粒度分词,看是否有在词表中的。原始query分词、和每种翻译的分词,都要用。 | 78 | 匹配规则: 用细粒度、粗粒度分词,看是否有在词表中的。原始query分词、和每种翻译的分词,都要用。 |
| 87 | 79 | ||
| 88 | 意图判断: 暂时留空,直接返回true。目前没有模型,即只要召回了(词表匹配了),即认为有该维度款式需求。 | 80 | 意图判断: 暂时留空,直接返回true。目前没有模型,即只要召回了(词表匹配了),即认为有该维度款式需求。 |
| 89 | - | 81 | +(以后考虑建设fasttext/bert系列多分类模型) |
| 90 | 82 | ||
| 91 | 83 | ||
| 92 | 意图使用: | 84 | 意图使用: |
| @@ -119,17 +111,23 @@ query匹配了其中任何一个词,都认为,具有颜色意图 | @@ -119,17 +111,23 @@ query匹配了其中任何一个词,都认为,具有颜色意图 | ||
| 119 | 5. TODO : 还有一个问题。 目前,sku只返回一个维度(店铺主维度。默认应该是option1,不是所有维度的sku信息都返回的。所以,如果有款式意图,但是主维度是颜色,那么拿不到全的款式sku) | 111 | 5. TODO : 还有一个问题。 目前,sku只返回一个维度(店铺主维度。默认应该是option1,不是所有维度的sku信息都返回的。所以,如果有款式意图,但是主维度是颜色,那么拿不到全的款式sku) |
| 120 | 112 | ||
| 121 | 113 | ||
| 122 | - | ||
| 123 | - | ||
| 124 | 当前项目功能已经较多,但是有清晰的框架,请务必基于现有框架进行改造,不要进行补丁式的修改,避免代码逻辑分叉。 | 114 | 当前项目功能已经较多,但是有清晰的框架,请务必基于现有框架进行改造,不要进行补丁式的修改,避免代码逻辑分叉。 |
| 125 | - | ||
| 126 | 请一步一步来,先设计意图识别模块,仔细思考需求,意图识别模块需要提供哪些内容,用于返回数据接口的定义,深度思考,定义一个合理的接口后,再给出合理的模块设计。 | 115 | 请一步一步来,先设计意图识别模块,仔细思考需求,意图识别模块需要提供哪些内容,用于返回数据接口的定义,深度思考,定义一个合理的接口后,再给出合理的模块设计。 |
| 127 | 116 | ||
| 128 | 117 | ||
| 129 | 118 | ||
| 130 | 119 | ||
| 120 | +文本相关性: | ||
| 121 | +调研: | ||
| 122 | +Princeton WordNet — 英文同义词底库 | ||
| 123 | +Shopify Product Taxonomy — 电商品类标准 | ||
| 124 | +Querqy — 电商搜索规则框架 | ||
| 125 | +gensimpson/elasticsearch-synonyms — ES 同义词规则落地 | ||
| 131 | 126 | ||
| 132 | 127 | ||
| 128 | +tags字段使用的优化: | ||
| 129 | +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。 | ||
| 130 | +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样) | ||
| 133 | 131 | ||
| 134 | 132 | ||
| 135 | 133 | ||
| @@ -463,8 +461,9 @@ scores: [...](兼容主后端)+ scores_by_backend: { "bge": [...], "qwen3_vl | @@ -463,8 +461,9 @@ scores: [...](兼容主后端)+ scores_by_backend: { "bge": [...], "qwen3_vl | ||
| 463 | fuse_scores_and_resort 目前只消费 一条 rerank_scores 序列,并写入 _rerank_score | 461 | fuse_scores_and_resort 目前只消费 一条 rerank_scores 序列,并写入 _rerank_score |
| 464 | 多 backend 之后需要rerank_scores 都参与融合 | 462 | 多 backend 之后需要rerank_scores 都参与融合 |
| 465 | 463 | ||
| 466 | - | ||
| 467 | - | 464 | +必要性: |
| 465 | +见 qwen3-reranker和bge-m3的严重badcase | ||
| 466 | +不一定是要多reranker的方式,但是一定会需要解决方案。 | ||
| 468 | 467 | ||
| 469 | 468 | ||
| 470 | 469 |
search/es_query_builder.py
| @@ -37,10 +37,13 @@ class ESQueryBuilder: | @@ -37,10 +37,13 @@ class ESQueryBuilder: | ||
| 37 | translation_minimum_should_match: str = "70%", | 37 | translation_minimum_should_match: str = "70%", |
| 38 | translation_boost: float = 0.4, | 38 | translation_boost: float = 0.4, |
| 39 | tie_breaker_base_query: float = 0.9, | 39 | tie_breaker_base_query: float = 0.9, |
| 40 | + best_fields_boosts: Optional[Dict[str, float]] = None, | ||
| 41 | + best_fields_clause_boost: float = 2.0, | ||
| 40 | mixed_script_merged_field_boost_scale: float = 0.6, | 42 | mixed_script_merged_field_boost_scale: float = 0.6, |
| 43 | + phrase_field_boosts: Optional[Dict[str, float]] = None, | ||
| 41 | phrase_match_base_fields: Optional[Tuple[str, ...]] = None, | 44 | phrase_match_base_fields: Optional[Tuple[str, ...]] = None, |
| 42 | - phrase_match_slop: int = 2, | ||
| 43 | - phrase_match_tie_breaker: float = 0.4, | 45 | + phrase_match_slop: int = 0, |
| 46 | + phrase_match_tie_breaker: float = 0.0, | ||
| 44 | phrase_match_boost: float = 3.0, | 47 | phrase_match_boost: float = 3.0, |
| 45 | ): | 48 | ): |
| 46 | """ | 49 | """ |
| @@ -77,7 +80,26 @@ class ESQueryBuilder: | @@ -77,7 +80,26 @@ class ESQueryBuilder: | ||
| 77 | self.translation_boost = float(translation_boost) | 80 | self.translation_boost = float(translation_boost) |
| 78 | self.tie_breaker_base_query = float(tie_breaker_base_query) | 81 | self.tie_breaker_base_query = float(tie_breaker_base_query) |
| 79 | self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) | 82 | self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) |
| 80 | - self.phrase_match_base_fields = tuple(phrase_match_base_fields or ("title", "qanchors")) | 83 | + default_best_fields = { |
| 84 | + base: self._get_field_boost(base) | ||
| 85 | + for base in self.core_multilingual_fields | ||
| 86 | + if base in self.multilingual_fields | ||
| 87 | + } | ||
| 88 | + self.best_fields_boosts = { | ||
| 89 | + str(base): float(boost) | ||
| 90 | + for base, boost in (best_fields_boosts or default_best_fields).items() | ||
| 91 | + } | ||
| 92 | + self.best_fields_clause_boost = float(best_fields_clause_boost) | ||
| 93 | + default_phrase_base_fields = tuple(phrase_match_base_fields or ("title", "qanchors")) | ||
| 94 | + default_phrase_fields = { | ||
| 95 | + base: self._get_field_boost(base) | ||
| 96 | + for base in default_phrase_base_fields | ||
| 97 | + if base in self.multilingual_fields | ||
| 98 | + } | ||
| 99 | + self.phrase_field_boosts = { | ||
| 100 | + str(base): float(boost) | ||
| 101 | + for base, boost in (phrase_field_boosts or default_phrase_fields).items() | ||
| 102 | + } | ||
| 81 | self.phrase_match_slop = int(phrase_match_slop) | 103 | self.phrase_match_slop = int(phrase_match_slop) |
| 82 | self.phrase_match_tie_breaker = float(phrase_match_tie_breaker) | 104 | self.phrase_match_tie_breaker = float(phrase_match_tie_breaker) |
| 83 | self.phrase_match_boost = float(phrase_match_boost) | 105 | self.phrase_match_boost = float(phrase_match_boost) |
| @@ -399,27 +421,6 @@ class ESQueryBuilder: | @@ -399,27 +421,6 @@ class ESQueryBuilder: | ||
| 399 | 421 | ||
| 400 | return functions | 422 | return functions |
| 401 | 423 | ||
| 402 | - def _build_text_query(self, query_text: str) -> Dict[str, Any]: | ||
| 403 | - """ | ||
| 404 | - Build simple text matching query (BM25). | ||
| 405 | - | ||
| 406 | - Args: | ||
| 407 | - query_text: Query text | ||
| 408 | - | ||
| 409 | - Returns: | ||
| 410 | - ES query clause | ||
| 411 | - """ | ||
| 412 | - return { | ||
| 413 | - "multi_match": { | ||
| 414 | - "query": query_text, | ||
| 415 | - "fields": self.match_fields, | ||
| 416 | - "minimum_should_match": "67%", | ||
| 417 | - "tie_breaker": 0.9, | ||
| 418 | - "boost": 1.0, | ||
| 419 | - "_name": "base_query" | ||
| 420 | - } | ||
| 421 | - } | ||
| 422 | - | ||
| 423 | def _format_field_with_boost(self, field_name: str, boost: float) -> str: | 424 | def _format_field_with_boost(self, field_name: str, boost: float) -> str: |
| 424 | if abs(float(boost) - 1.0) < 1e-9: | 425 | if abs(float(boost) - 1.0) < 1e-9: |
| 425 | return field_name | 426 | return field_name |
| @@ -435,70 +436,38 @@ class ESQueryBuilder: | @@ -435,70 +436,38 @@ class ESQueryBuilder: | ||
| 435 | return float(self.field_boosts[base_field]) | 436 | return float(self.field_boosts[base_field]) |
| 436 | return 1.0 | 437 | return 1.0 |
| 437 | 438 | ||
| 438 | - def _build_match_field_specs(self, language: str) -> Tuple[List[MatchFieldSpec], List[MatchFieldSpec]]: | 439 | + def _build_match_field_specs( |
| 440 | + self, | ||
| 441 | + language: str, | ||
| 442 | + *, | ||
| 443 | + multilingual_fields: Optional[List[str]] = None, | ||
| 444 | + shared_fields: Optional[List[str]] = None, | ||
| 445 | + boost_overrides: Optional[Dict[str, float]] = None, | ||
| 446 | + ) -> List[MatchFieldSpec]: | ||
| 439 | """ | 447 | """ |
| 440 | - Per-language match targets as (field_path, boost). Single source of truth before string formatting. | ||
| 441 | - Returns (all_fields, core_fields); core_fields are for phrase/keyword strategies elsewhere. | 448 | + Per-language match targets as (field_path, boost). Single source of truth before |
| 449 | + formatting as Elasticsearch ``fields`` strings. | ||
| 442 | """ | 450 | """ |
| 443 | lang = (language or "").strip().lower() | 451 | lang = (language or "").strip().lower() |
| 444 | - all_specs: List[MatchFieldSpec] = [] | ||
| 445 | - core_specs: List[MatchFieldSpec] = [] | ||
| 446 | - | ||
| 447 | - for base in self.multilingual_fields: | ||
| 448 | - field = f"{base}.{lang}" | ||
| 449 | - all_specs.append((field, self._get_field_boost(base, lang))) | ||
| 450 | - | ||
| 451 | - for shared in self.shared_fields: | ||
| 452 | - all_specs.append((shared, self._get_field_boost(shared, None))) | 452 | + specs: List[MatchFieldSpec] = [] |
| 453 | + text_fields = multilingual_fields if multilingual_fields is not None else self.multilingual_fields | ||
| 454 | + term_fields = shared_fields if shared_fields is not None else self.shared_fields | ||
| 455 | + overrides = boost_overrides or {} | ||
| 453 | 456 | ||
| 454 | - for base in self.core_multilingual_fields: | 457 | + for base in text_fields: |
| 455 | field = f"{base}.{lang}" | 458 | field = f"{base}.{lang}" |
| 456 | - core_specs.append((field, self._get_field_boost(base, lang))) | 459 | + boost = float(overrides.get(base, self._get_field_boost(base, lang))) |
| 460 | + specs.append((field, boost)) | ||
| 457 | 461 | ||
| 458 | - return all_specs, core_specs | 462 | + for shared in term_fields: |
| 463 | + boost = float(overrides.get(shared, self._get_field_boost(shared, None))) | ||
| 464 | + specs.append((shared, boost)) | ||
| 465 | + return specs | ||
| 459 | 466 | ||
| 460 | def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]: | 467 | def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]: |
| 461 | """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``.""" | 468 | """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``.""" |
| 462 | return [self._format_field_with_boost(path, boost) for path, boost in specs] | 469 | return [self._format_field_with_boost(path, boost) for path, boost in specs] |
| 463 | 470 | ||
| 464 | - def _build_phrase_match_fields(self, language: str) -> List[str]: | ||
| 465 | - """Fields for phrase multi_match: base names × ``.{lang}`` with ``field_boosts``.""" | ||
| 466 | - lang = (language or "").strip().lower() | ||
| 467 | - if not lang: | ||
| 468 | - return [] | ||
| 469 | - out: List[str] = [] | ||
| 470 | - for base in self.phrase_match_base_fields: | ||
| 471 | - path = f"{base}.{lang}" | ||
| 472 | - boost = self._get_field_boost(base, lang) | ||
| 473 | - out.append(self._format_field_with_boost(path, boost)) | ||
| 474 | - return out | ||
| 475 | - | ||
| 476 | - def _append_phrase_should_clause( | ||
| 477 | - self, | ||
| 478 | - should_clauses: List[Dict[str, Any]], | ||
| 479 | - lang: str, | ||
| 480 | - lang_query: str, | ||
| 481 | - clause_name: str | ||
| 482 | - ) -> None: | ||
| 483 | - text = (lang_query or "").strip() | ||
| 484 | - if not text: | ||
| 485 | - return | ||
| 486 | - phrase_fields = self._build_phrase_match_fields(lang) | ||
| 487 | - if not phrase_fields: | ||
| 488 | - return | ||
| 489 | - boost = self.phrase_match_boost | ||
| 490 | - should_clauses.append({ | ||
| 491 | - "multi_match": { | ||
| 492 | - "_name": f"{clause_name}_phrase", | ||
| 493 | - "query": lang_query, | ||
| 494 | - "type": "phrase", | ||
| 495 | - "fields": phrase_fields, | ||
| 496 | - "slop": self.phrase_match_slop, | ||
| 497 | - "tie_breaker": self.phrase_match_tie_breaker, | ||
| 498 | - "boost": boost, | ||
| 499 | - } | ||
| 500 | - }) | ||
| 501 | - | ||
| 502 | def _merge_supplemental_lang_field_specs( | 471 | def _merge_supplemental_lang_field_specs( |
| 503 | self, | 472 | self, |
| 504 | specs: List[MatchFieldSpec], | 473 | specs: List[MatchFieldSpec], |
| @@ -506,7 +475,7 @@ class ESQueryBuilder: | @@ -506,7 +475,7 @@ class ESQueryBuilder: | ||
| 506 | ) -> List[MatchFieldSpec]: | 475 | ) -> List[MatchFieldSpec]: |
| 507 | """Append supplemental-language columns; boosts multiplied by mixed_script scale.""" | 476 | """Append supplemental-language columns; boosts multiplied by mixed_script scale.""" |
| 508 | scale = float(self.mixed_script_merged_field_boost_scale) | 477 | scale = float(self.mixed_script_merged_field_boost_scale) |
| 509 | - extra_all, _ = self._build_match_field_specs(supplemental_lang) | 478 | + extra_all = self._build_match_field_specs(supplemental_lang) |
| 510 | seen = {path for path, _ in specs} | 479 | seen = {path for path, _ in specs} |
| 511 | out = list(specs) | 480 | out = list(specs) |
| 512 | for path, boost in extra_all: | 481 | for path, boost in extra_all: |
| @@ -543,6 +512,103 @@ class ESQueryBuilder: | @@ -543,6 +512,103 @@ class ESQueryBuilder: | ||
| 543 | out = self._merge_supplemental_lang_field_specs(out, "zh") | 512 | out = self._merge_supplemental_lang_field_specs(out, "zh") |
| 544 | return out | 513 | return out |
| 545 | 514 | ||
| 515 | + def _build_best_fields_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: | ||
| 516 | + specs = self._build_match_field_specs( | ||
| 517 | + language, | ||
| 518 | + multilingual_fields=list(self.best_fields_boosts), | ||
| 519 | + shared_fields=[], | ||
| 520 | + boost_overrides=self.best_fields_boosts, | ||
| 521 | + ) | ||
| 522 | + fields = self._format_match_field_specs(specs) | ||
| 523 | + if not fields: | ||
| 524 | + return None | ||
| 525 | + return { | ||
| 526 | + "multi_match": { | ||
| 527 | + "query": query_text, | ||
| 528 | + "type": "best_fields", | ||
| 529 | + "fields": fields, | ||
| 530 | + "boost": self.best_fields_clause_boost, | ||
| 531 | + } | ||
| 532 | + } | ||
| 533 | + | ||
| 534 | + def _build_phrase_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: | ||
| 535 | + specs = self._build_match_field_specs( | ||
| 536 | + language, | ||
| 537 | + multilingual_fields=list(self.phrase_field_boosts), | ||
| 538 | + shared_fields=[], | ||
| 539 | + boost_overrides=self.phrase_field_boosts, | ||
| 540 | + ) | ||
| 541 | + fields = self._format_match_field_specs(specs) | ||
| 542 | + if not fields: | ||
| 543 | + return None | ||
| 544 | + clause: Dict[str, Any] = { | ||
| 545 | + "multi_match": { | ||
| 546 | + "query": query_text, | ||
| 547 | + "type": "phrase", | ||
| 548 | + "fields": fields, | ||
| 549 | + "boost": self.phrase_match_boost, | ||
| 550 | + } | ||
| 551 | + } | ||
| 552 | + if self.phrase_match_slop > 0: | ||
| 553 | + clause["multi_match"]["slop"] = self.phrase_match_slop | ||
| 554 | + if self.phrase_match_tie_breaker > 0: | ||
| 555 | + clause["multi_match"]["tie_breaker"] = self.phrase_match_tie_breaker | ||
| 556 | + return clause | ||
| 557 | + | ||
| 558 | + def _build_lexical_language_clause( | ||
| 559 | + self, | ||
| 560 | + lang: str, | ||
| 561 | + lang_query: str, | ||
| 562 | + clause_name: str, | ||
| 563 | + *, | ||
| 564 | + is_source: bool, | ||
| 565 | + contains_chinese: bool, | ||
| 566 | + contains_english: bool, | ||
| 567 | + index_languages: List[str], | ||
| 568 | + ) -> Optional[Dict[str, Any]]: | ||
| 569 | + all_specs = self._build_match_field_specs(lang) | ||
| 570 | + expanded_specs = self._expand_match_field_specs_for_mixed_script( | ||
| 571 | + lang, | ||
| 572 | + all_specs, | ||
| 573 | + contains_chinese, | ||
| 574 | + contains_english, | ||
| 575 | + index_languages, | ||
| 576 | + is_source, | ||
| 577 | + ) | ||
| 578 | + combined_fields = self._format_match_field_specs(expanded_specs) | ||
| 579 | + if not combined_fields: | ||
| 580 | + return None | ||
| 581 | + minimum_should_match = ( | ||
| 582 | + self.base_minimum_should_match if is_source else self.translation_minimum_should_match | ||
| 583 | + ) | ||
| 584 | + should_clauses = [ | ||
| 585 | + clause | ||
| 586 | + for clause in ( | ||
| 587 | + self._build_best_fields_clause(lang, lang_query), | ||
| 588 | + self._build_phrase_clause(lang, lang_query), | ||
| 589 | + ) | ||
| 590 | + if clause | ||
| 591 | + ] | ||
| 592 | + clause: Dict[str, Any] = { | ||
| 593 | + "bool": { | ||
| 594 | + "_name": clause_name, | ||
| 595 | + "must": [ | ||
| 596 | + { | ||
| 597 | + "combined_fields": { | ||
| 598 | + "query": lang_query, | ||
| 599 | + "fields": combined_fields, | ||
| 600 | + "minimum_should_match": minimum_should_match, | ||
| 601 | + } | ||
| 602 | + } | ||
| 603 | + ], | ||
| 604 | + } | ||
| 605 | + } | ||
| 606 | + if should_clauses: | ||
| 607 | + clause["bool"]["should"] = should_clauses | ||
| 608 | + if not is_source: | ||
| 609 | + clause["bool"]["boost"] = float(self.translation_boost) | ||
| 610 | + return clause | ||
| 611 | + | ||
| 546 | def _get_embedding_field(self, language: str) -> str: | 612 | def _get_embedding_field(self, language: str) -> str: |
| 547 | """Get embedding field name for a language.""" | 613 | """Get embedding field name for a language.""" |
| 548 | # Currently using unified embedding field | 614 | # Currently using unified embedding field |
| @@ -603,42 +669,18 @@ class ESQueryBuilder: | @@ -603,42 +669,18 @@ class ESQueryBuilder: | ||
| 603 | 669 | ||
| 604 | def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None: | 670 | def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None: |
| 605 | nonlocal should_clauses | 671 | nonlocal should_clauses |
| 606 | - all_specs, _ = self._build_match_field_specs(lang) | ||
| 607 | - expanded_specs = self._expand_match_field_specs_for_mixed_script( | 672 | + clause = self._build_lexical_language_clause( |
| 608 | lang, | 673 | lang, |
| 609 | - all_specs, | ||
| 610 | - contains_chinese, | ||
| 611 | - contains_english, | ||
| 612 | - normalized_index_languages, | ||
| 613 | - is_source, | 674 | + lang_query, |
| 675 | + clause_name, | ||
| 676 | + is_source=is_source, | ||
| 677 | + contains_chinese=contains_chinese, | ||
| 678 | + contains_english=contains_english, | ||
| 679 | + index_languages=normalized_index_languages, | ||
| 614 | ) | 680 | ) |
| 615 | - match_fields = self._format_match_field_specs(expanded_specs) | ||
| 616 | - if not match_fields: | 681 | + if not clause: |
| 617 | return | 682 | return |
| 618 | - minimum_should_match = ( | ||
| 619 | - self.base_minimum_should_match if is_source else self.translation_minimum_should_match | ||
| 620 | - ) | ||
| 621 | - | ||
| 622 | - clause = { | ||
| 623 | - "multi_match": { | ||
| 624 | - "_name": clause_name, | ||
| 625 | - "fields": match_fields, | ||
| 626 | - "minimum_should_match": minimum_should_match, | ||
| 627 | - "query": lang_query, | ||
| 628 | - "tie_breaker": self.tie_breaker_base_query, | ||
| 629 | - } | ||
| 630 | - } | ||
| 631 | - # base_query: never set multi_match.boost (ES default 1.0). | ||
| 632 | - # Translation clauses: single knob from config — translation_boost. | ||
| 633 | - if not is_source: | ||
| 634 | - tb = float(self.translation_boost) | ||
| 635 | - clause["multi_match"]["boost"] = tb | ||
| 636 | - should_clauses.append({ | ||
| 637 | - "multi_match": clause["multi_match"] | ||
| 638 | - }) | ||
| 639 | - self._append_phrase_should_clause( | ||
| 640 | - should_clauses, lang, lang_query, clause_name | ||
| 641 | - ) | 683 | + should_clauses.append(clause) |
| 642 | 684 | ||
| 643 | if base_query_text: | 685 | if base_query_text: |
| 644 | append_clause(source_lang, base_query_text, "base_query", True) | 686 | append_clause(source_lang, base_query_text, "base_query", True) |
| @@ -661,24 +703,9 @@ class ESQueryBuilder: | @@ -661,24 +703,9 @@ class ESQueryBuilder: | ||
| 661 | "query": query_text, | 703 | "query": query_text, |
| 662 | "fields": fallback_fields, | 704 | "fields": fallback_fields, |
| 663 | "minimum_should_match": self.base_minimum_should_match, | 705 | "minimum_should_match": self.base_minimum_should_match, |
| 664 | - "tie_breaker": self.tie_breaker_base_query, | ||
| 665 | - } | ||
| 666 | - } | ||
| 667 | - fb_should: List[Dict[str, Any]] = [fallback_lexical] | ||
| 668 | - self._append_phrase_should_clause( | ||
| 669 | - fb_should, | ||
| 670 | - self.default_language, | ||
| 671 | - query_text, | ||
| 672 | - "base_query_fallback" | ||
| 673 | - ) | ||
| 674 | - if len(fb_should) == 1: | ||
| 675 | - return fallback_lexical | ||
| 676 | - return { | ||
| 677 | - "bool": { | ||
| 678 | - "should": fb_should, | ||
| 679 | - "minimum_should_match": 1, | ||
| 680 | } | 706 | } |
| 681 | } | 707 | } |
| 708 | + return fallback_lexical | ||
| 682 | 709 | ||
| 683 | # Return bool query with should clauses | 710 | # Return bool query with should clauses |
| 684 | if len(should_clauses) == 1: | 711 | if len(should_clauses) == 1: |
search/searcher.py
| @@ -133,6 +133,10 @@ class Searcher: | @@ -133,6 +133,10 @@ class Searcher: | ||
| 133 | translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, | 133 | translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, |
| 134 | translation_boost=self.config.query_config.translation_boost, | 134 | translation_boost=self.config.query_config.translation_boost, |
| 135 | tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, | 135 | tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, |
| 136 | + best_fields_boosts=self.config.query_config.best_fields, | ||
| 137 | + best_fields_clause_boost=self.config.query_config.best_fields_boost, | ||
| 138 | + phrase_field_boosts=self.config.query_config.phrase_fields, | ||
| 139 | + phrase_match_boost=self.config.query_config.phrase_match_boost, | ||
| 136 | ) | 140 | ) |
| 137 | 141 | ||
| 138 | def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: | 142 | def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: |
tests/test_es_query_builder.py
| @@ -14,19 +14,19 @@ def _builder() -> ESQueryBuilder: | @@ -14,19 +14,19 @@ def _builder() -> ESQueryBuilder: | ||
| 14 | ) | 14 | ) |
| 15 | 15 | ||
| 16 | 16 | ||
| 17 | -def _lexical_multi_match_fields(query_root: Dict[str, Any]) -> list: | ||
| 18 | - """Fields from the non-phrase multi_match (bool.should or single clause).""" | ||
| 19 | - if "multi_match" in query_root: | ||
| 20 | - mm = query_root["multi_match"] | ||
| 21 | - if mm.get("type") == "phrase": | ||
| 22 | - raise AssertionError("root multi_match is phrase-only") | ||
| 23 | - return mm["fields"] | 17 | +def _lexical_clause(query_root: Dict[str, Any]) -> Dict[str, Any]: |
| 18 | + """Return the first named lexical bool clause from query_root.""" | ||
| 19 | + if "bool" in query_root and query_root["bool"].get("_name"): | ||
| 20 | + return query_root["bool"] | ||
| 24 | for clause in query_root.get("bool", {}).get("should", []): | 21 | for clause in query_root.get("bool", {}).get("should", []): |
| 25 | - mm = clause.get("multi_match") or {} | ||
| 26 | - if mm.get("type") == "phrase": | ||
| 27 | - continue | ||
| 28 | - return mm["fields"] | ||
| 29 | - raise AssertionError("no lexical multi_match in query_root") | 22 | + clause_bool = clause.get("bool") or {} |
| 23 | + if clause_bool.get("_name"): | ||
| 24 | + return clause_bool | ||
| 25 | + raise AssertionError("no lexical bool clause in query_root") | ||
| 26 | + | ||
| 27 | + | ||
| 28 | +def _lexical_combined_fields(query_root: Dict[str, Any]) -> list: | ||
| 29 | + return _lexical_clause(query_root)["must"][0]["combined_fields"]["fields"] | ||
| 30 | 30 | ||
| 31 | 31 | ||
| 32 | def test_knn_prefilter_includes_range_filters(): | 32 | def test_knn_prefilter_includes_range_filters(): |
| @@ -96,14 +96,11 @@ def test_text_query_contains_only_base_and_translation_named_queries(): | @@ -96,14 +96,11 @@ def test_text_query_contains_only_base_and_translation_named_queries(): | ||
| 96 | index_languages=["en", "zh", "fr"], | 96 | index_languages=["en", "zh", "fr"], |
| 97 | ) | 97 | ) |
| 98 | should = q["query"]["bool"]["should"] | 98 | should = q["query"]["bool"]["should"] |
| 99 | - names = [clause["multi_match"]["_name"] for clause in should] | 99 | + names = [clause["bool"]["_name"] for clause in should] |
| 100 | 100 | ||
| 101 | - assert names == [ | ||
| 102 | - "base_query", | ||
| 103 | - "base_query_phrase", | ||
| 104 | - "base_query_trans_zh", | ||
| 105 | - "base_query_trans_zh_phrase", | ||
| 106 | - ] | 101 | + assert names == ["base_query", "base_query_trans_zh"] |
| 102 | + base_should = q["query"]["bool"]["should"][0]["bool"]["should"] | ||
| 103 | + assert [clause["multi_match"]["type"] for clause in base_should] == ["best_fields", "phrase"] | ||
| 107 | 104 | ||
| 108 | 105 | ||
| 109 | def test_text_query_skips_duplicate_translation_same_as_base(): | 106 | def test_text_query_skips_duplicate_translation_same_as_base(): |
| @@ -122,8 +119,8 @@ def test_text_query_skips_duplicate_translation_same_as_base(): | @@ -122,8 +119,8 @@ def test_text_query_skips_duplicate_translation_same_as_base(): | ||
| 122 | ) | 119 | ) |
| 123 | 120 | ||
| 124 | root = q["query"] | 121 | root = q["query"] |
| 125 | - assert root["bool"]["should"][0]["multi_match"]["_name"] == "base_query" | ||
| 126 | - assert root["bool"]["should"][1]["multi_match"]["_name"] == "base_query_phrase" | 122 | + assert root["bool"]["_name"] == "base_query" |
| 123 | + assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] | ||
| 127 | 124 | ||
| 128 | 125 | ||
| 129 | def test_mixed_script_merges_en_fields_into_zh_clause(): | 126 | def test_mixed_script_merges_en_fields_into_zh_clause(): |
| @@ -147,7 +144,7 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): | @@ -147,7 +144,7 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): | ||
| 147 | enable_knn=False, | 144 | enable_knn=False, |
| 148 | index_languages=["zh", "en"], | 145 | index_languages=["zh", "en"], |
| 149 | ) | 146 | ) |
| 150 | - fields = _lexical_multi_match_fields(q["query"]) | 147 | + fields = _lexical_combined_fields(q["query"]) |
| 151 | bases = {f.split("^", 1)[0] for f in fields} | 148 | bases = {f.split("^", 1)[0] for f in fields} |
| 152 | assert "title.zh" in bases and "title.en" in bases | 149 | assert "title.zh" in bases and "title.en" in bases |
| 153 | assert "brief.zh" in bases and "brief.en" in bases | 150 | assert "brief.zh" in bases and "brief.en" in bases |
| @@ -177,7 +174,7 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): | @@ -177,7 +174,7 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): | ||
| 177 | enable_knn=False, | 174 | enable_knn=False, |
| 178 | index_languages=["zh", "en"], | 175 | index_languages=["zh", "en"], |
| 179 | ) | 176 | ) |
| 180 | - fields = _lexical_multi_match_fields(q["query"]) | 177 | + fields = _lexical_combined_fields(q["query"]) |
| 181 | bases = {f.split("^", 1)[0] for f in fields} | 178 | bases = {f.split("^", 1)[0] for f in fields} |
| 182 | assert "title.en" in bases and "title.zh" in bases | 179 | assert "title.en" in bases and "title.zh" in bases |
| 183 | assert "title.zh^0.6" in fields | 180 | assert "title.zh^0.6" in fields |
| @@ -205,7 +202,7 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): | @@ -205,7 +202,7 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): | ||
| 205 | enable_knn=False, | 202 | enable_knn=False, |
| 206 | index_languages=["zh", "en"], | 203 | index_languages=["zh", "en"], |
| 207 | ) | 204 | ) |
| 208 | - fields = _lexical_multi_match_fields(q["query"]) | 205 | + fields = _lexical_combined_fields(q["query"]) |
| 209 | assert "title.zh^5.0" in fields | 206 | assert "title.zh^5.0" in fields |
| 210 | assert "title.en^6.0" in fields # 10.0 * 0.6 | 207 | assert "title.en^6.0" in fields # 10.0 * 0.6 |
| 211 | 208 | ||
| @@ -231,7 +228,7 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): | @@ -231,7 +228,7 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): | ||
| 231 | enable_knn=False, | 228 | enable_knn=False, |
| 232 | index_languages=["zh"], | 229 | index_languages=["zh"], |
| 233 | ) | 230 | ) |
| 234 | - fields = _lexical_multi_match_fields(q["query"]) | 231 | + fields = _lexical_combined_fields(q["query"]) |
| 235 | bases = {f.split("^", 1)[0] for f in fields} | 232 | bases = {f.split("^", 1)[0] for f in fields} |
| 236 | assert "title.zh" in bases | 233 | assert "title.zh" in bases |
| 237 | assert "title.en" not in bases | 234 | assert "title.en" not in bases |
tests/test_es_query_builder_text_recall_languages.py
| @@ -2,8 +2,8 @@ | @@ -2,8 +2,8 @@ | ||
| 2 | ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. | 2 | ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. |
| 3 | 3 | ||
| 4 | Covers combinations of query language vs tenant index_languages, translations, | 4 | Covers combinations of query language vs tenant index_languages, translations, |
| 5 | -and mixed Chinese/English queries. Asserts multi_match _name, query text, and | ||
| 6 | -target language fields (title.{lang}). | 5 | +and mixed Chinese/English queries. Asserts named lexical clause boundaries, |
| 6 | +combined_fields payloads, and per-language target fields (title.{lang}). | ||
| 7 | """ | 7 | """ |
| 8 | 8 | ||
| 9 | from types import SimpleNamespace | 9 | from types import SimpleNamespace |
| @@ -34,7 +34,7 @@ def _builder_multilingual_title_only( | @@ -34,7 +34,7 @@ def _builder_multilingual_title_only( | ||
| 34 | def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]: | 34 | def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]: |
| 35 | """Navigate bool.must / function_score wrappers to the text recall root.""" | 35 | """Navigate bool.must / function_score wrappers to the text recall root.""" |
| 36 | q = es_body.get("query") or {} | 36 | q = es_body.get("query") or {} |
| 37 | - if "bool" in q and "must" in q["bool"] and q["bool"]["must"]: | 37 | + if "bool" in q and not q["bool"].get("_name") and "must" in q["bool"] and q["bool"]["must"]: |
| 38 | q = q["bool"]["must"][0] | 38 | q = q["bool"]["must"][0] |
| 39 | if "function_score" in q: | 39 | if "function_score" in q: |
| 40 | q = q["function_score"]["query"] | 40 | q = q["function_score"]["query"] |
| @@ -49,30 +49,45 @@ def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any] | @@ -49,30 +49,45 @@ def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any] | ||
| 49 | return [c["multi_match"] for c in should if "multi_match" in c] | 49 | return [c["multi_match"] for c in should if "multi_match" in c] |
| 50 | 50 | ||
| 51 | 51 | ||
| 52 | +def _extract_named_lexical_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]: | ||
| 53 | + inner = _unwrap_inner_query(es_body) | ||
| 54 | + if "bool" in inner and inner["bool"].get("_name"): | ||
| 55 | + return [inner["bool"]] | ||
| 56 | + should = (inner.get("bool") or {}).get("should") or [] | ||
| 57 | + return [c["bool"] for c in should if "bool" in c and c["bool"].get("_name")] | ||
| 58 | + | ||
| 59 | + | ||
| 52 | def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: | 60 | def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: |
| 53 | - """Map _name -> multi_match dict.""" | 61 | + """Map lexical clause _name -> bool query body.""" |
| 54 | out: Dict[str, Dict[str, Any]] = {} | 62 | out: Dict[str, Dict[str, Any]] = {} |
| 55 | - for mm in _extract_multi_match_clauses(es_body): | ||
| 56 | - name = mm.get("_name") | 63 | + for clause in _extract_named_lexical_clauses(es_body): |
| 64 | + name = clause.get("_name") | ||
| 57 | if name: | 65 | if name: |
| 58 | - out[str(name)] = mm | 66 | + out[str(name)] = clause |
| 59 | return out | 67 | return out |
| 60 | 68 | ||
| 61 | 69 | ||
| 62 | -def _with_phrase(lexical_names: set[str]) -> set[str]: | ||
| 63 | - """Each lexical recall clause has a companion ``*_phrase`` multi_match.""" | ||
| 64 | - return lexical_names | {f"{n}_phrase" for n in lexical_names} | 70 | +def _combined_fields_clause(clause: Dict[str, Any]) -> Dict[str, Any]: |
| 71 | + return clause["must"][0]["combined_fields"] | ||
| 72 | + | ||
| 65 | 73 | ||
| 74 | +def _should_multi_matches(clause: Dict[str, Any]) -> List[Dict[str, Any]]: | ||
| 75 | + return [item["multi_match"] for item in clause.get("should") or [] if "multi_match" in item] | ||
| 66 | 76 | ||
| 67 | -def _title_fields(mm: Dict[str, Any]) -> List[str]: | ||
| 68 | - fields = mm.get("fields") or [] | 77 | + |
| 78 | +def _should_multi_matches_by_type(clause: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: | ||
| 79 | + return {str(mm.get("type") or "best_fields"): mm for mm in _should_multi_matches(clause)} | ||
| 80 | + | ||
| 81 | + | ||
| 82 | +def _title_fields(clause: Dict[str, Any]) -> List[str]: | ||
| 83 | + fields = _combined_fields_clause(clause).get("fields") or [] | ||
| 69 | return [f for f in fields if str(f).startswith("title.")] | 84 | return [f for f in fields if str(f).startswith("title.")] |
| 70 | 85 | ||
| 71 | 86 | ||
| 72 | -def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool: | 87 | +def _has_title_lang(clause: Dict[str, Any], lang: str) -> bool: |
| 73 | """True if any field is title.{lang} with optional ^boost suffix.""" | 88 | """True if any field is title.{lang} with optional ^boost suffix.""" |
| 74 | prefix = f"title.{lang}" | 89 | prefix = f"title.{lang}" |
| 75 | - for f in mm.get("fields") or []: | 90 | + for f in _combined_fields_clause(clause).get("fields") or []: |
| 76 | s = str(f) | 91 | s = str(f) |
| 77 | if s == prefix or s.startswith(prefix + "^"): | 92 | if s == prefix or s.startswith(prefix + "^"): |
| 78 | return True | 93 | return True |
| @@ -119,10 +134,10 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): | @@ -119,10 +134,10 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): | ||
| 119 | index_languages=["zh", "en"], | 134 | index_languages=["zh", "en"], |
| 120 | ) | 135 | ) |
| 121 | idx = _clauses_index(q) | 136 | idx = _clauses_index(q) |
| 122 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | ||
| 123 | - assert idx["base_query"]["query"] == "连衣裙" | 137 | + assert set(idx) == {"base_query", "base_query_trans_en"} |
| 138 | + assert _combined_fields_clause(idx["base_query"])["query"] == "连衣裙" | ||
| 124 | assert "title.zh" in _title_fields(idx["base_query"]) | 139 | assert "title.zh" in _title_fields(idx["base_query"]) |
| 125 | - assert idx["base_query_trans_en"]["query"] == "dress" | 140 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" |
| 126 | assert "title.en" in _title_fields(idx["base_query_trans_en"]) | 141 | assert "title.en" in _title_fields(idx["base_query_trans_en"]) |
| 127 | 142 | ||
| 128 | 143 | ||
| @@ -137,10 +152,10 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): | @@ -137,10 +152,10 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): | ||
| 137 | index_languages=["en", "zh"], | 152 | index_languages=["en", "zh"], |
| 138 | ) | 153 | ) |
| 139 | idx = _clauses_index(q) | 154 | idx = _clauses_index(q) |
| 140 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | ||
| 141 | - assert idx["base_query"]["query"] == "dress" | 155 | + assert set(idx) == {"base_query", "base_query_trans_zh"} |
| 156 | + assert _combined_fields_clause(idx["base_query"])["query"] == "dress" | ||
| 142 | assert "title.en" in _title_fields(idx["base_query"]) | 157 | assert "title.en" in _title_fields(idx["base_query"]) |
| 143 | - assert idx["base_query_trans_zh"]["query"] == "连衣裙" | 158 | + assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "连衣裙" |
| 144 | assert "title.zh" in _title_fields(idx["base_query_trans_zh"]) | 159 | assert "title.zh" in _title_fields(idx["base_query_trans_zh"]) |
| 145 | 160 | ||
| 146 | 161 | ||
| @@ -155,13 +170,11 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): | @@ -155,13 +170,11 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): | ||
| 155 | index_languages=["de", "en", "fr"], | 170 | index_languages=["de", "en", "fr"], |
| 156 | ) | 171 | ) |
| 157 | idx = _clauses_index(q) | 172 | idx = _clauses_index(q) |
| 158 | - assert set(idx) == _with_phrase( | ||
| 159 | - {"base_query", "base_query_trans_en", "base_query_trans_fr"} | ||
| 160 | - ) | ||
| 161 | - assert idx["base_query"]["query"] == "kleid" | 173 | + assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} |
| 174 | + assert _combined_fields_clause(idx["base_query"])["query"] == "kleid" | ||
| 162 | assert "title.de" in _title_fields(idx["base_query"]) | 175 | assert "title.de" in _title_fields(idx["base_query"]) |
| 163 | - assert idx["base_query_trans_en"]["query"] == "dress" | ||
| 164 | - assert idx["base_query_trans_fr"]["query"] == "robe" | 176 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" |
| 177 | + assert _combined_fields_clause(idx["base_query_trans_fr"])["query"] == "robe" | ||
| 165 | 178 | ||
| 166 | 179 | ||
| 167 | # --- 检测语言不在 index_languages:仍有 base(弱)+ 翻译(强) --- | 180 | # --- 检测语言不在 index_languages:仍有 base(弱)+ 翻译(强) --- |
| @@ -178,15 +191,13 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): | @@ -178,15 +191,13 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): | ||
| 178 | index_languages=["en", "zh"], | 191 | index_languages=["en", "zh"], |
| 179 | ) | 192 | ) |
| 180 | idx = _clauses_index(q) | 193 | idx = _clauses_index(q) |
| 181 | - assert set(idx) == _with_phrase( | ||
| 182 | - {"base_query", "base_query_trans_en", "base_query_trans_zh"} | ||
| 183 | - ) | ||
| 184 | - assert idx["base_query"]["query"] == "schuh" | 194 | + assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} |
| 195 | + assert _combined_fields_clause(idx["base_query"])["query"] == "schuh" | ||
| 185 | assert "title.de" in _title_fields(idx["base_query"]) | 196 | assert "title.de" in _title_fields(idx["base_query"]) |
| 186 | assert "boost" not in idx["base_query"] | 197 | assert "boost" not in idx["base_query"] |
| 187 | - assert idx["base_query_trans_en"]["query"] == "shoe" | 198 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "shoe" |
| 188 | assert idx["base_query_trans_en"]["boost"] == qb.translation_boost | 199 | assert idx["base_query_trans_en"]["boost"] == qb.translation_boost |
| 189 | - assert idx["base_query_trans_zh"]["query"] == "鞋" | 200 | + assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "鞋" |
| 190 | assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost | 201 | assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost |
| 191 | 202 | ||
| 192 | 203 | ||
| @@ -206,10 +217,10 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): | @@ -206,10 +217,10 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): | ||
| 206 | contains_english=True, | 217 | contains_english=True, |
| 207 | ) | 218 | ) |
| 208 | idx = _clauses_index(q) | 219 | idx = _clauses_index(q) |
| 209 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | ||
| 210 | - assert idx["base_query"]["query"] == "红色 dress" | 220 | + assert set(idx) == {"base_query", "base_query_trans_en"} |
| 221 | + assert _combined_fields_clause(idx["base_query"])["query"] == "红色 dress" | ||
| 211 | assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") | 222 | assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") |
| 212 | - assert idx["base_query_trans_en"]["query"] == "red dress" | 223 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress" |
| 213 | assert _has_title_lang(idx["base_query_trans_en"], "en") | 224 | assert _has_title_lang(idx["base_query_trans_en"], "en") |
| 214 | 225 | ||
| 215 | 226 | ||
| @@ -226,10 +237,10 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): | @@ -226,10 +237,10 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): | ||
| 226 | contains_english=True, | 237 | contains_english=True, |
| 227 | ) | 238 | ) |
| 228 | idx = _clauses_index(q) | 239 | idx = _clauses_index(q) |
| 229 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | ||
| 230 | - assert idx["base_query"]["query"] == "nike 运动鞋" | 240 | + assert set(idx) == {"base_query", "base_query_trans_zh"} |
| 241 | + assert _combined_fields_clause(idx["base_query"])["query"] == "nike 运动鞋" | ||
| 231 | assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") | 242 | assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") |
| 232 | - assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋" | 243 | + assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "耐克运动鞋" |
| 233 | 244 | ||
| 234 | 245 | ||
| 235 | def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): | 246 | def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): |
| @@ -245,7 +256,7 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): | @@ -245,7 +256,7 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): | ||
| 245 | contains_english=True, | 256 | contains_english=True, |
| 246 | ) | 257 | ) |
| 247 | idx = _clauses_index(q) | 258 | idx = _clauses_index(q) |
| 248 | - assert set(idx) == _with_phrase({"base_query"}) | 259 | + assert set(idx) == {"base_query"} |
| 249 | bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])} | 260 | bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])} |
| 250 | assert bases == {"title.zh"} | 261 | assert bases == {"title.zh"} |
| 251 | 262 | ||
| @@ -264,7 +275,7 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): | @@ -264,7 +275,7 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): | ||
| 264 | index_languages=["en", "zh"], | 275 | index_languages=["en", "zh"], |
| 265 | ) | 276 | ) |
| 266 | idx = _clauses_index(q) | 277 | idx = _clauses_index(q) |
| 267 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | 278 | + assert set(idx) == {"base_query", "base_query_trans_zh"} |
| 268 | 279 | ||
| 269 | 280 | ||
| 270 | def test_keeps_translation_when_same_text_but_different_lang_than_base(): | 281 | def test_keeps_translation_when_same_text_but_different_lang_than_base(): |
| @@ -278,8 +289,8 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): | @@ -278,8 +289,8 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): | ||
| 278 | index_languages=["en", "zh"], | 289 | index_languages=["en", "zh"], |
| 279 | ) | 290 | ) |
| 280 | idx = _clauses_index(q) | 291 | idx = _clauses_index(q) |
| 281 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | ||
| 282 | - assert idx["base_query_trans_zh"]["query"] == "NIKE" | 292 | + assert set(idx) == {"base_query", "base_query_trans_zh"} |
| 293 | + assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "NIKE" | ||
| 283 | 294 | ||
| 284 | 295 | ||
| 285 | # --- 翻译 key 规范化、空翻译跳过 --- | 296 | # --- 翻译 key 规范化、空翻译跳过 --- |
| @@ -297,7 +308,7 @@ def test_translation_language_key_is_normalized_case_insensitive(): | @@ -297,7 +308,7 @@ def test_translation_language_key_is_normalized_case_insensitive(): | ||
| 297 | ) | 308 | ) |
| 298 | idx = _clauses_index(q) | 309 | idx = _clauses_index(q) |
| 299 | assert "base_query_trans_zh" in idx | 310 | assert "base_query_trans_zh" in idx |
| 300 | - assert idx["base_query_trans_zh"]["query"] == "连衣裙" | 311 | + assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "连衣裙" |
| 301 | 312 | ||
| 302 | 313 | ||
| 303 | def test_empty_translation_value_is_skipped(): | 314 | def test_empty_translation_value_is_skipped(): |
| @@ -331,8 +342,10 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): | @@ -331,8 +342,10 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): | ||
| 331 | idx = _clauses_index(q) | 342 | idx = _clauses_index(q) |
| 332 | assert "boost" not in idx["base_query"] | 343 | assert "boost" not in idx["base_query"] |
| 333 | assert idx["base_query_trans_en"]["boost"] == qb.translation_boost | 344 | assert idx["base_query_trans_en"]["boost"] == qb.translation_boost |
| 334 | - assert idx["base_query_phrase"]["boost"] == qb.phrase_match_boost | ||
| 335 | - assert idx["base_query_trans_en_phrase"]["boost"] == qb.phrase_match_boost | 345 | + base_should = _should_multi_matches_by_type(idx["base_query"]) |
| 346 | + trans_should = _should_multi_matches_by_type(idx["base_query_trans_en"]) | ||
| 347 | + assert base_should["phrase"]["boost"] == qb.phrase_match_boost | ||
| 348 | + assert trans_should["phrase"]["boost"] == qb.phrase_match_boost | ||
| 336 | 349 | ||
| 337 | 350 | ||
| 338 | # --- 无翻译:仅 base_query --- | 351 | # --- 无翻译:仅 base_query --- |
| @@ -349,7 +362,7 @@ def test_no_translations_only_base_query(): | @@ -349,7 +362,7 @@ def test_no_translations_only_base_query(): | ||
| 349 | index_languages=["en", "zh"], | 362 | index_languages=["en", "zh"], |
| 350 | ) | 363 | ) |
| 351 | idx = _clauses_index(q) | 364 | idx = _clauses_index(q) |
| 352 | - assert set(idx) == _with_phrase({"base_query"}) | 365 | + assert set(idx) == {"base_query"} |
| 353 | 366 | ||
| 354 | 367 | ||
| 355 | # --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- | 368 | # --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- |
| @@ -373,7 +386,7 @@ def test_text_clauses_present_alongside_knn(): | @@ -373,7 +386,7 @@ def test_text_clauses_present_alongside_knn(): | ||
| 373 | ) | 386 | ) |
| 374 | assert "knn" in q | 387 | assert "knn" in q |
| 375 | idx = _clauses_index(q) | 388 | idx = _clauses_index(q) |
| 376 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | 389 | + assert set(idx) == {"base_query", "base_query_trans_zh"} |
| 377 | 390 | ||
| 378 | 391 | ||
| 379 | def test_detected_language_unknown_falls_back_to_default_language(): | 392 | def test_detected_language_unknown_falls_back_to_default_language(): |
| @@ -393,8 +406,8 @@ def test_detected_language_unknown_falls_back_to_default_language(): | @@ -393,8 +406,8 @@ def test_detected_language_unknown_falls_back_to_default_language(): | ||
| 393 | index_languages=["en", "zh"], | 406 | index_languages=["en", "zh"], |
| 394 | ) | 407 | ) |
| 395 | idx = _clauses_index(q) | 408 | idx = _clauses_index(q) |
| 396 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | ||
| 397 | - assert idx["base_query"]["query"] == "shirt" | 409 | + assert set(idx) == {"base_query", "base_query_trans_zh"} |
| 410 | + assert _combined_fields_clause(idx["base_query"])["query"] == "shirt" | ||
| 398 | assert _has_title_lang(idx["base_query"], "en") | 411 | assert _has_title_lang(idx["base_query"], "en") |
| 399 | 412 | ||
| 400 | 413 | ||
| @@ -409,10 +422,10 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | @@ -409,10 +422,10 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | ||
| 409 | index_languages=["ru", "en"], | 422 | index_languages=["ru", "en"], |
| 410 | ) | 423 | ) |
| 411 | idx = _clauses_index(q) | 424 | idx = _clauses_index(q) |
| 412 | - assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | ||
| 413 | - assert idx["base_query"]["query"] == "платье" | 425 | + assert set(idx) == {"base_query", "base_query_trans_en"} |
| 426 | + assert _combined_fields_clause(idx["base_query"])["query"] == "платье" | ||
| 414 | assert _has_title_lang(idx["base_query"], "ru") | 427 | assert _has_title_lang(idx["base_query"], "ru") |
| 415 | - assert idx["base_query_trans_en"]["query"] == "dress" | 428 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" |
| 416 | 429 | ||
| 417 | 430 | ||
| 418 | def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): | 431 | def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): |
| @@ -431,7 +444,7 @@ def test_translation_for_lang_not_listed_in_index_languages_still_generates_clau | @@ -431,7 +444,7 @@ def test_translation_for_lang_not_listed_in_index_languages_still_generates_clau | ||
| 431 | ) | 444 | ) |
| 432 | idx = _clauses_index(q) | 445 | idx = _clauses_index(q) |
| 433 | assert "base_query_trans_de" in idx | 446 | assert "base_query_trans_de" in idx |
| 434 | - assert idx["base_query_trans_de"]["query"] == "Kleid" | 447 | + assert _combined_fields_clause(idx["base_query_trans_de"])["query"] == "Kleid" |
| 435 | assert _has_title_lang(idx["base_query_trans_de"], "de") | 448 | assert _has_title_lang(idx["base_query_trans_de"], "de") |
| 436 | 449 | ||
| 437 | 450 | ||
| @@ -449,5 +462,5 @@ def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_bas | @@ -449,5 +462,5 @@ def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_bas | ||
| 449 | contains_english=False, | 462 | contains_english=False, |
| 450 | ) | 463 | ) |
| 451 | idx = _clauses_index(q) | 464 | idx = _clauses_index(q) |
| 452 | - assert idx["base_query"]["query"] == "红色连衣裙" | ||
| 453 | - assert idx["base_query_trans_en"]["query"] == "red dress" | 465 | + assert _combined_fields_clause(idx["base_query"])["query"] == "红色连衣裙" |
| 466 | + assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress" |