Compare View

switch
from
...
to
 
Commits (6)
api/routes/indexer.py
@@ -449,7 +449,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: @@ -449,7 +449,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages:
449 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM, 449 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM,
450 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。 450 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。
451 """ 451 """
452 - from indexer.product_enrich import analyze_products 452 + from indexer.product_enrich import analyze_products, split_multi_value_field
453 453
454 llm_langs = list(dict.fromkeys(languages)) or ["en"] 454 llm_langs = list(dict.fromkeys(languages)) or ["en"]
455 455
@@ -510,10 +510,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: @@ -510,10 +510,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages:
510 raw = row.get(name) 510 raw = row.get(name)
511 if not raw: 511 if not raw:
512 continue 512 continue
513 - for part in re.split(r"[,;|/\n\t]+", str(raw)):  
514 - value = part.strip()  
515 - if not value:  
516 - continue 513 + for value in split_multi_value_field(str(raw)):
517 rec["semantic_attributes"].append({"lang": lang, "name": name, "value": value}) 514 rec["semantic_attributes"].append({"lang": lang, "name": name, "value": value})
518 if name == "tags": 515 if name == "tags":
519 rec["tags"].append(value) 516 rec["tags"].append(value)
config/config.yaml
1 # Unified Configuration for Multi-Tenant Search Engine 1 # Unified Configuration for Multi-Tenant Search Engine
2 # 统一配置文件,所有租户共用一套配置 2 # 统一配置文件,所有租户共用一套配置
3 # 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为 3 # 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为
  4 +#
  5 +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项
  6 +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。
  7 +
  8 +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义)
  9 +runtime:
  10 + environment: "prod"
  11 + index_namespace: ""
  12 + api_host: "0.0.0.0"
  13 + api_port: 6002
  14 + indexer_host: "0.0.0.0"
  15 + indexer_port: 6004
  16 + embedding_host: "0.0.0.0"
  17 + embedding_port: 6005
  18 + embedding_text_port: 6005
  19 + embedding_image_port: 6008
  20 + translator_host: "127.0.0.1"
  21 + translator_port: 6006
  22 + reranker_host: "127.0.0.1"
  23 + reranker_port: 6007
  24 +
  25 +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
  26 +infrastructure:
  27 + elasticsearch:
  28 + host: "http://localhost:9200"
  29 + username: null
  30 + password: null
  31 + redis:
  32 + host: "localhost"
  33 + port: 6479
  34 + snapshot_db: 0
  35 + password: null
  36 + socket_timeout: 1
  37 + socket_connect_timeout: 1
  38 + retry_on_timeout: false
  39 + cache_expire_days: 720
  40 + embedding_cache_prefix: "embedding"
  41 + anchor_cache_prefix: "product_anchors"
  42 + anchor_cache_expire_days: 30
  43 + database:
  44 + host: null
  45 + port: 3306
  46 + database: null
  47 + username: null
  48 + password: null
  49 + secrets:
  50 + dashscope_api_key: null
  51 + deepl_auth_key: null
4 52
5 # Elasticsearch Index 53 # Elasticsearch Index
6 es_index_name: "search_products" 54 es_index_name: "search_products"
7 55
  56 +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出)
  57 +indexes: []
  58 +
8 # Config assets 59 # Config assets
9 assets: 60 assets:
10 query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict" 61 query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict"
@@ -20,20 +71,19 @@ es_settings: @@ -20,20 +71,19 @@ es_settings:
20 refresh_interval: "30s" 71 refresh_interval: "30s"
21 72
22 # 字段权重配置(用于搜索时的字段boost) 73 # 字段权重配置(用于搜索时的字段boost)
23 -# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。 74 +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。
24 # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 75 # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
25 field_boosts: 76 field_boosts:
26 title: 3.0 77 title: 3.0
  78 + qanchors: 2.5
  79 + tags: 2.0
  80 + category_name_text: 2.0
  81 + category_path: 2.0
27 brief: 1.5 82 brief: 1.5
28 - description: 1.0  
29 - qanchors: 1.5  
30 - vendor: 1.5  
31 - category_path: 1.5  
32 - category_name_text: 1.5  
33 - tags: 1.0  
34 - option1_values: 0.6  
35 - option2_values: 0.4  
36 - option3_values: 0.4 83 + description: 1.5
  84 + option1_values: 1.5
  85 + option2_values: 1.5
  86 + option3_values: 1.5
37 87
38 # Query Configuration(查询配置) 88 # Query Configuration(查询配置)
39 query_config: 89 query_config:
@@ -47,10 +97,23 @@ query_config: @@ -47,10 +97,23 @@ query_config:
47 enable_text_embedding: true 97 enable_text_embedding: true
48 enable_query_rewrite: true 98 enable_query_rewrite: true
49 99
  100 + # 查询翻译模型(须与 services.translation.capabilities 中某项一致)
  101 + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。
  102 + # zh_to_en_model: "opus-mt-zh-en"
  103 + # en_to_zh_model: "opus-mt-en-zh"
  104 + # default_translation_model: "nllb-200-distilled-600m"
  105 + zh_to_en_model: "deepl"
  106 + en_to_zh_model: "deepl"
  107 + default_translation_model: "deepl"
  108 + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同)
  109 + zh_to_en_model__source_not_in_index: "deepl"
  110 + en_to_zh_model__source_not_in_index: "deepl"
  111 + default_translation_model__source_not_in_index: "deepl"
  112 +
50 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 113 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
51 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 114 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
52 - translation_embedding_wait_budget_ms_source_in_index: 80  
53 - translation_embedding_wait_budget_ms_source_not_in_index: 200 115 + translation_embedding_wait_budget_ms_source_in_index: 500 # 80
  116 + translation_embedding_wait_budget_ms_source_not_in_index: 500 #200
54 117
55 # 动态多语言检索字段配置 118 # 动态多语言检索字段配置
56 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; 119 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
@@ -58,11 +121,11 @@ query_config: @@ -58,11 +121,11 @@ query_config:
58 search_fields: 121 search_fields:
59 multilingual_fields: 122 multilingual_fields:
60 - "title" 123 - "title"
61 - - "brief"  
62 - - "description"  
63 - - "vendor" 124 + - "qanchors"
64 - "category_path" 125 - "category_path"
65 - "category_name_text" 126 - "category_name_text"
  127 + - "brief"
  128 + - "description"
66 shared_fields: 129 shared_fields:
67 - "tags" 130 - "tags"
68 - "option1_values" 131 - "option1_values"
@@ -71,18 +134,14 @@ query_config: @@ -71,18 +134,14 @@ query_config:
71 core_multilingual_fields: 134 core_multilingual_fields:
72 - "title" 135 - "title"
73 - "brief" 136 - "brief"
74 - - "vendor"  
75 - "category_name_text" 137 - "category_name_text"
76 138
77 - # 统一文本召回策略(主查询 + 翻译查询 + 原始查询兜底 139 + # 统一文本召回策略(主查询 + 翻译查询)
78 text_query_strategy: 140 text_query_strategy:
79 base_minimum_should_match: "75%" 141 base_minimum_should_match: "75%"
80 translation_minimum_should_match: "75%" 142 translation_minimum_should_match: "75%"
81 - translation_boost: 0.4  
82 - translation_boost_when_source_missing: 1.0  
83 - source_boost_when_missing: 0.6  
84 - original_query_fallback_boost_when_translation_missing: 0.2  
85 - tie_breaker_base_query: 0.9 143 + translation_boost: 0.75
  144 + tie_breaker_base_query: 0.5
86 145
87 # Embedding字段名称 146 # Embedding字段名称
88 text_embedding_field: "title_embedding" 147 text_embedding_field: "title_embedding"
@@ -120,7 +179,7 @@ query_config: @@ -120,7 +179,7 @@ query_config:
120 - skus 179 - skus
121 180
122 # KNN boost配置(向量召回的boost值) 181 # KNN boost配置(向量召回的boost值)
123 - knn_boost: 0.25 # Lower boost for embedding recall 182 + knn_boost: 2.0 # Boost for embedding recall (raised from 0.25)
124 183
125 # Function Score配置(ES层打分规则) 184 # Function Score配置(ES层打分规则)
126 function_score: 185 function_score:
@@ -148,6 +207,17 @@ services: @@ -148,6 +207,17 @@ services:
148 cache: 207 cache:
149 ttl_seconds: 62208000 208 ttl_seconds: 62208000
150 sliding_expiration: true 209 sliding_expiration: true
  210 + # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups).
  211 + enable_model_quality_tier_cache: true
  212 + # Higher tier = better quality. Multiple models may share one tier (同级).
  213 + # A request may reuse Redis keys from models with tier > A or tier == A (not from lower tiers).
  214 + model_quality_tiers:
  215 + deepl: 30
  216 + qwen-mt: 30
  217 + llm: 30
  218 + nllb-200-distilled-600m: 20
  219 + opus-mt-zh-en: 10
  220 + opus-mt-en-zh: 10
151 capabilities: 221 capabilities:
152 qwen-mt: 222 qwen-mt:
153 enabled: true 223 enabled: true
@@ -290,7 +360,7 @@ services: @@ -290,7 +360,7 @@ services:
290 engine: "vllm" 360 engine: "vllm"
291 max_model_len: 160 361 max_model_len: 160
292 tensor_parallel_size: 1 362 tensor_parallel_size: 1
293 - gpu_memory_utilization: 0.36 363 + gpu_memory_utilization: 0.20
294 dtype: "float16" 364 dtype: "float16"
295 enable_prefix_caching: true 365 enable_prefix_caching: true
296 enforce_eager: false 366 enforce_eager: false
@@ -284,19 +284,30 @@ class AppConfigLoader: @@ -284,19 +284,30 @@ class AppConfigLoader:
284 base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")), 284 base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")),
285 translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), 285 translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")),
286 translation_boost=float(text_strategy.get("translation_boost", 0.4)), 286 translation_boost=float(text_strategy.get("translation_boost", 0.4)),
287 - translation_boost_when_source_missing=float(  
288 - text_strategy.get("translation_boost_when_source_missing", 1.0)  
289 - ),  
290 - source_boost_when_missing=float(text_strategy.get("source_boost_when_missing", 0.6)),  
291 - original_query_fallback_boost_when_translation_missing=float(  
292 - text_strategy.get("original_query_fallback_boost_when_translation_missing", 0.2)  
293 - ),  
294 tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)), 287 tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)),
295 zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"), 288 zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"),
296 en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"), 289 en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"),
297 default_translation_model=str( 290 default_translation_model=str(
298 query_cfg.get("default_translation_model") or "nllb-200-distilled-600m" 291 query_cfg.get("default_translation_model") or "nllb-200-distilled-600m"
299 ), 292 ),
  293 + zh_to_en_model_source_not_in_index=(
  294 + str(v)
  295 + if (v := query_cfg.get("zh_to_en_model__source_not_in_index"))
  296 + not in (None, "")
  297 + else None
  298 + ),
  299 + en_to_zh_model_source_not_in_index=(
  300 + str(v)
  301 + if (v := query_cfg.get("en_to_zh_model__source_not_in_index"))
  302 + not in (None, "")
  303 + else None
  304 + ),
  305 + default_translation_model_source_not_in_index=(
  306 + str(v)
  307 + if (v := query_cfg.get("default_translation_model__source_not_in_index"))
  308 + not in (None, "")
  309 + else None
  310 + ),
300 translation_embedding_wait_budget_ms_source_in_index=int( 311 translation_embedding_wait_budget_ms_source_in_index=int(
301 query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80) 312 query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80)
302 ), 313 ),
@@ -54,13 +54,14 @@ class QueryConfig: @@ -54,13 +54,14 @@ class QueryConfig:
54 base_minimum_should_match: str = "70%" 54 base_minimum_should_match: str = "70%"
55 translation_minimum_should_match: str = "70%" 55 translation_minimum_should_match: str = "70%"
56 translation_boost: float = 0.4 56 translation_boost: float = 0.4
57 - translation_boost_when_source_missing: float = 1.0  
58 - source_boost_when_missing: float = 0.6  
59 - original_query_fallback_boost_when_translation_missing: float = 0.2  
60 tie_breaker_base_query: float = 0.9 57 tie_breaker_base_query: float = 0.9
61 zh_to_en_model: str = "opus-mt-zh-en" 58 zh_to_en_model: str = "opus-mt-zh-en"
62 en_to_zh_model: str = "opus-mt-en-zh" 59 en_to_zh_model: str = "opus-mt-en-zh"
63 default_translation_model: str = "nllb-200-distilled-600m" 60 default_translation_model: str = "nllb-200-distilled-600m"
  61 + # 检测语种不在租户 index_languages(无可直接命中的多语字段)时使用;None 表示与上一组同模型。
  62 + zh_to_en_model_source_not_in_index: Optional[str] = None
  63 + en_to_zh_model_source_not_in_index: Optional[str] = None
  64 + default_translation_model_source_not_in_index: Optional[str] = None
64 # 查询阶段:翻译与向量生成并发提交后,共用同一等待预算(毫秒)。 65 # 查询阶段:翻译与向量生成并发提交后,共用同一等待预算(毫秒)。
65 # 检测语言已在租户 index_languages 内:偏快返回,预算较短。 66 # 检测语言已在租户 index_languages 内:偏快返回,预算较短。
66 # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。 67 # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。
docs/DEVELOPER_GUIDE.md
@@ -147,7 +147,7 @@ docs/ # 文档(含本指南) @@ -147,7 +147,7 @@ docs/ # 文档(含本指南)
147 147
148 ### 4.4 query 148 ### 4.4 query
149 149
150 -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划) 150 +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出解析事实(如 `rewritten_query`、`detected_language`、`translations`、`query_vector`),不再承担 ES 语言计划拼装
151 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。 151 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。
152 152
153 ### 4.5 search 153 ### 4.5 search
docs/QUICKSTART.md
@@ -558,6 +558,21 @@ lsof -i :6004 @@ -558,6 +558,21 @@ lsof -i :6004
558 558
559 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。 559 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。
560 560
  561 +### 5.4 HanLP 与 `transformers` 版本(`BertTokenizer.encode_plus`)
  562 +
  563 +若日志出现 **`AttributeError: BertTokenizer has no attribute encode_plus`**,通常是 **同一 venv 里装了 `transformers` 5.x**,与 **HanLP 2.1.x** 不兼容(HanLP 仍调用已移除的 `encode_plus`)。
  564 +
  565 +**处理:** 将 `transformers` 固定到 **4.x**(例如 4.44+),然后重装/校验 HanLP:
  566 +
  567 +```bash
  568 +source activate.sh
  569 +pip install -r requirements_hanlp.txt
  570 +python -c "from transformers import BertTokenizer; import transformers as t; print(t.__version__, hasattr(BertTokenizer, 'encode_plus'))"
  571 +# 期望:4.x 且 True
  572 +```
  573 +
  574 +**说明:** 重排/TEI 等若使用 **独立 venv**(如 `.venv-reranker`),可与主 venv 的 `transformers` 版本分离;主 venv 只要装了 HanLP 做查询分词,就不要把 `transformers` 升到 5。
  575 +
561 --- 576 ---
562 577
563 ## 6. 相关文档 578 ## 6. 相关文档
docs/TODO-ES能力提升.md 0 → 100644
@@ -0,0 +1,69 @@ @@ -0,0 +1,69 @@
  1 +ES 付费版本 or 定制开发(建议先看下付费版本价格)
  2 +ES定制开发:
  3 +RRF / retrievers
  4 +
  5 +Elastic 的订阅矩阵里明确列了这些相关能力:Retrievers: linear, rule, RRF, text similarity re-ranker,以及 Reciprocal Rank Fusion (RRF) for hybrid search。
  6 +
  7 +这类能力最有价值的点是:
  8 +它们把混合检索从“自己拼 DSL 和手搓打分”变成了官方支持的多阶段检索框架。重排:text similarity re-ranker / Elastic Rerank. text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。
  9 +
  10 +{
  11 + "retriever": {
  12 + "rrf": {
  13 + "retrievers": [
  14 + { "standard": { "query": { ... } } },
  15 + { "knn": { ... } }
  16 + ]
  17 + }
  18 + }
  19 +}
  20 +
  21 +
  22 +加reranker:
  23 +text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。
  24 +
  25 +{
  26 + "retriever": {
  27 + "text_similarity_reranker": {
  28 + "retriever": {
  29 + "rrf": { ... }
  30 + },
  31 + ...
  32 + }
  33 + }
  34 +}
  35 +
  36 +{
  37 + "retriever": {
  38 + "text_similarity_reranker": {
  39 + "retriever": {
  40 + "rrf": {
  41 + "retrievers": [
  42 + {
  43 + "standard": {
  44 + "query": {
  45 + "...": "..."
  46 + }
  47 + }
  48 + },
  49 + {
  50 + "knn": {
  51 + "...": "..."
  52 + }
  53 + }
  54 + ],
  55 + "rank_window_size": 100,
  56 + "rank_constant": 20
  57 + }
  58 + },
  59 + "field": "your_rerank_text_field",
  60 + "inference_text": "白色 oversized T-shirt",
  61 + "inference_id": ".rerank-v1-elasticsearch",
  62 + "rank_window_size": 50
  63 + }
  64 + },
  65 + "size": 20
  66 +}
  67 +
  68 +
  69 +
1 1
2 2
3 -@reranker/backends/qwen3_vllm.py 单次 generate 前有进程内锁,同一进程里不会并行多路 vLLM 推理,这个锁有必要吗?是否会影响性能?是否能够打开,使得性能更好?比如这个场景,我一次请求 400 条,分成每64个一个batch,基于我现在的gpu配置,可以再提高并发度吗?  
4 -测试了,让每个批次都并发地进行,耗时没有变化 3 +
  4 +本地部署一个7b Q4量化的大模型
  5 +es需要licence的两个功能,如果费用低,开通下licence,或者改es源码定制开发下,支持 rank.rrf,reranker
  6 +
  7 +
  8 +
  9 +把knn跟文本相关性的融合方式修改为 "rank": {"rrf": {} }需要licence,可以帮我修改源码支持吗?
  10 +
  11 + knn_boost: 2.0
  12 +
  13 +
  14 +{
  15 + "query": { ...全文检索... },
  16 + "knn": { ...向量检索... },
  17 + "rank": {
  18 + "rrf": {}
  19 + }
  20 +}
  21 +
  22 +
  23 +"image_embedding": {
  24 + "type": "nested",
  25 + "properties": {
  26 + "vector": {
  27 + "type": "dense_vector",
  28 + "dims": 1024,
  29 + "index": true,
  30 + "similarity": "dot_product",
  31 + "element_type": "bfloat16"
  32 + },
  33 + "url": {
  34 + "type": "text"
  35 + }
  36 + }
  37 +},
  38 +去掉 image_embedding_512
  39 +image_embedding改为,一个spu有多个sku向量,每个向量内部properties:
  40 +除了vector url还应该包括,该图片是对应哪些sku
  41 +"image_embedding": {
  42 + "type": "nested",
  43 + "properties": {
  44 + "vector": {
  45 + "type": "dense_vector",
  46 + "dims": 1024,
  47 + "index": true,
  48 + "similarity": "dot_product",
  49 + "element_type": "bfloat16"
  50 + },
  51 + "url": {
  52 + "type": "text"
  53 + }
  54 + }
  55 +},
  56 +
  57 +
  58 +
  59 +
  60 +tags字段使用的优化:
  61 +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。
  62 +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样)
  63 +
  64 +
  65 +
  66 +外部需求:
  67 +1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内
  68 +2. ES支持reranker pipline?
  69 +
  70 +
  71 +
  72 +
  73 +
5 74
6 增加款式意图识别模块 75 增加款式意图识别模块
7 76
8 -意图类型: 颜色,尺(目前只需要支持这两种) 77 +意图类型: 颜色,尺码(目前只需要支持这两种)
9 78
10 意图召回层: 79 意图召回层:
11 每种意图,有一个召回词集合 80 每种意图,有一个召回词集合
12 对query(包括原始query、各种翻译query 都做匹配) 81 对query(包括原始query、各种翻译query 都做匹配)
13 82
14 -意图识别层:  
15 -如果召回 判断有款式需求, 83 +以颜色意图为例:
  84 +有一个词表,每一行 都逗号分割,互为同义词,行内第一个为标准化词
  85 +query匹配了其中任何一个词,都认为,具有颜色意图
  86 +匹配规则: 用细粒度、粗粒度分词,看是否有在词表中的。原始query分词、和每种翻译的分词,都要用。
  87 +
  88 +意图判断: 暂时留空,直接返回true。目前没有模型,即只要召回了(词表匹配了),即认为有该维度款式需求。
  89 +
  90 +
  91 +
  92 +意图使用:
  93 +
  94 +我们第一阶段,使用 参与ES提权。
  95 +
  96 +一、参与ES提权
  97 +
  98 +
  99 +二、参与reranker
16 100
17 101
18 -是否有:  
19 -颜色需求  
20 -尺码需求  
21 如果有: 先做sku筛选,然后把最优的拼接到名称中,参与reranker。 102 如果有: 先做sku筛选,然后把最优的拼接到名称中,参与reranker。
22 103
23 104
24 现在在reranker、分页之后、做填充的时候,已经有做sku的筛选。 105 现在在reranker、分页之后、做填充的时候,已经有做sku的筛选。
25 需要优化: 106 需要优化:
26 现在是,先做包含的判断,找到第一个 option_value被query包含的,则直接认为匹配。改为 107 现在是,先做包含的判断,找到第一个 option_value被query包含的,则直接认为匹配。改为
27 -1. 第一轮:遍历完,如果有且仅有一个才这样。  
28 -2. 第二轮:如果有多个,跳到3。如果没有,对每个词都走泛化词表进行匹配。 108 +1. 第一轮:遍历完,如果有且仅有一个被query包含,那么认为匹配。
  109 +2. 第二轮:如果有多个符合(被query包含),跳到3。如果没有,对每个词都走泛化词表进行匹配。
29 3. 第三轮:如果有多个,那么对这多个,走embedding相关性取最高的。如果一个也没有,则对所有的走embedding相关性取最高的 110 3. 第三轮:如果有多个,那么对这多个,走embedding相关性取最高的。如果一个也没有,则对所有的走embedding相关性取最高的
30 -这个sku筛选也需要提取为一个独立的模块  
31 -  
32 -  
33 -  
34 -2026-03-21 10:29:23,698 - elastic_transport.transport - INFO - POST http://localhost:9200/search_products_tenant_163/_search?include_named_queries_score=false [status:200 duration:0.009s]  
35 -2026-03-21 10:29:23,700 - request_context - INFO - 分页详情回填 | ids=20 | filled=20 | took=7ms  
36 -2026-03-21 10:29:23,700 - request_context - INFO - 重排分页切片 | from=20, size=20, 返回=20条  
37 -2026-03-21 10:29:23,720 - embeddings.text_encoder - ERROR - TextEmbeddingEncoder service request failed: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1  
38 -Traceback (most recent call last):  
39 - File "/data/saas-search/embeddings/text_encoder.py", line 63, in _call_service  
40 - response.raise_for_status()  
41 - File "/data/saas-search/.venv/lib/python3.12/site-packages/requests/models.py", line 1026, in raise_for_status  
42 - raise HTTPError(http_error_msg, response=self)  
43 -requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1  
44 -2026-03-21 10:29:23,720 - search.searcher - WARNING - Failed to encode SKU option1 values for final-page sorting: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1  
45 -Traceback (most recent call last):  
46 - File "/data/saas-search/search/searcher.py", line 448, in _apply_sku_sorting_for_page_hits  
47 - encoded_option_vectors = text_encoder.encode(option1_values_to_encode, priority=1)  
48 - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^  
49 - File "/data/saas-search/embeddings/text_encoder.py", line 112, in encode  
50 - response_data = self._call_service(  
51 - ^^^^^^^^^^^^^^^^^^^  
52 - File "/data/saas-search/embeddings/text_encoder.py", line 63, in _call_service  
53 - response.raise_for_status()  
54 - File "/data/saas-search/.venv/lib/python3.12/site-packages/requests/models.py", line 1026, in raise_for_status  
55 - raise HTTPError(http_error_msg, response=self)  
56 -requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1  
57 -2026-03-21 10:29:23,721 - request_context - WARNING - SKU option embedding failed: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1 111 +这个sku筛选也需要提取为一个独立的模块。
  112 +
  113 +
  114 +另外:现在是reranker、分页之后做sku筛选,要改为:
  115 +1. 有款式意图的时候,才做sku筛选
  116 +2. sku筛选的时机,改为在reranker之前,对所有内容做sku筛选,然后
  117 +3. 从仅 option1 扩展到多个维度,识别的意图,包含意图的维度名(color)和维度名的泛化词list(color、颜色、colour、colors、、、、),遍历option1_name,option2_name,option3_name,看哪个能匹配上意图的维度名list,哪个匹配上了,则在这个维度筛选。
  118 +4. Rerank doc (有款式意图的时候)要带上属性后缀,拼接到title后面。在调用 run_rerank 前,对每条 hit 生成「用于重排的 doc 文本」(标题 + 可选后缀)
  119 +5. TODO : 还有一个问题。 目前,sku只返回一个维度(店铺主维度。默认应该是option1,不是所有维度的sku信息都返回的。所以,如果有款式意图,但是主维度是颜色,那么拿不到全的款式sku)
  120 +
58 121
59 122
60 123
  124 +当前项目功能已经较多,但是有清晰的框架,请务必基于现有框架进行改造,不要进行补丁式的修改,避免代码逻辑分叉。
  125 +
  126 +请一步一步来,先设计意图识别模块,仔细思考需求,意图识别模块需要提供哪些内容,用于返回数据接口的定义,深度思考,定义一个合理的接口后,再给出合理的模块设计。
  127 +
  128 +
  129 +
  130 +
  131 +
  132 +
  133 +
  134 +
  135 +
  136 +
  137 +
  138 +
  139 +
  140 +是否需要:
  141 +当「源语言不在 index_languages」且「某些目标语言的翻译缺失」时,ES 里会额外加一层 用「原始 query 字符串」去撞缺失语种字段
  142 +
  143 +
61 144
62 先阅读文本embedding相关的代码: 145 先阅读文本embedding相关的代码:
63 @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py 146 @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py
@@ -361,6 +444,31 @@ embeddings/image_encoder.py:requests.post(..., timeout=self.timeout_sec) @@ -361,6 +444,31 @@ embeddings/image_encoder.py:requests.post(..., timeout=self.timeout_sec)
361 444
362 445
363 446
  447 +
  448 +
  449 +
  450 +
  451 +
  452 +
  453 +
  454 +多reranker:
  455 +
  456 +改 reranker 服务,一次请求返回多路分
  457 +服务启动时 加载多个 backend(或按请求懒加载),/rerank 响应扩展为例如
  458 +scores: [...](兼容主后端)+ scores_by_backend: { "bge": [...], "qwen3_vllm": [...] }。
  459 +搜索侧解析多路分,再融合或只透传 debug。
  460 +优点:搜索侧仍只调一个 URL。缺点:单进程多大模型 显存压力很大;
  461 +
  462 +融合层要注意的一点
  463 +fuse_scores_and_resort 目前只消费 一条 rerank_scores 序列,并写入 _rerank_score
  464 +多 backend 之后需要rerank_scores 都参与融合
  465 +
  466 +
  467 +
  468 +
  469 +
  470 +
  471 +
364 product_enrich : Partial Mode : done 472 product_enrich : Partial Mode : done
365 https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-menu-2400256.d_0_3_0_7.74a630119Ct6zR 473 https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-menu-2400256.d_0_3_0_7.74a630119Ct6zR
366 需在messages 数组中将最后一条消息的 role 设置为 assistant,并在其 content 中提供前缀,在此消息中设置参数 "partial": true。messages格式如下: 474 需在messages 数组中将最后一条消息的 role 设置为 assistant,并在其 content 中提供前缀,在此消息中设置参数 "partial": true。messages格式如下:
@@ -383,6 +491,8 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men @@ -383,6 +491,8 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men
383 491
384 492
385 融合打分(已完成,2026-03) 493 融合打分(已完成,2026-03)
  494 +
  495 +以下已经完成:
386 1. `fuse_scores_and_resort` 已改为乘法融合,并通过 `matched_queries` 提取: 496 1. `fuse_scores_and_resort` 已改为乘法融合,并通过 `matched_queries` 提取:
387 - `base_query` 497 - `base_query`
388 - `base_query_trans_*` 498 - `base_query_trans_*`
@@ -397,7 +507,11 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men @@ -397,7 +507,11 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men
397 - `docs/搜索API对接指南.md` 507 - `docs/搜索API对接指南.md`
398 - `docs/Usage-Guide.md` 508 - `docs/Usage-Guide.md`
399 509
400 - 510 +未完成的:
  511 +(归一化、次序融合?还是乘法公式?)
  512 +RRF:先把多路召回稳妥融合
  513 +linear + minmax:让你能精调 knn 和文本的权重
  514 +reranker:对前面召回出来的 top-k 再做“最后一刀”
401 515
402 516
403 517
docs/搜索API对接指南-01-搜索接口.md
@@ -66,9 +66,11 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) @@ -66,9 +66,11 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
66 | `min_score` | float | N | null | 最小相关性分数阈值 | 66 | `min_score` | float | N | null | 最小相关性分数阈值 |
67 | `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) | 67 | `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) |
68 | `debug` | boolean | N | false | 是否返回调试信息 | 68 | `debug` | boolean | N | false | 是否返回调试信息 |
69 -| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`(默认开启)。开启后会先对 ES TopN(`rerank_window`)重排,再按分页截取;若 `from+size>1000`,则不重排,直接按分页从 ES 返回 |  
70 -| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端配置 |  
71 -| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}`;不传则使用服务端配置 | 69 +| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`。当有效开启且 `from + size <= rerank_window` 时:ES 先取前 `rerank_window` 条,重排后再按 `from`/`size` 截取当前页;若 `from + size > rerank_window`,则**不进行**窗口内重排,直接按请求的 `from`/`size` 查询 ES(`rerank_window` 见 `config.yaml` 的 `rerank.rerank_window`,仓库示例默认 400) |
  70 +| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端 `rerank.rerank_query_template` |
  71 +| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}` 等占位符(由 `search/rerank_client.py` 按语言字段拼装);不传则使用服务端 `rerank.rerank_doc_template` |
  72 +
  73 +**与后端代码的对应关系**(便于联调):HTTP `POST /search/` 请求体由 `api/models.py` 的 `SearchRequest` 校验;路由 `api/routes/search.py` 将字段原样传入 `Searcher.search(...)`(含上述三个重排相关字段)。CLI `python main.py search` 目前未暴露这些参数,走配置默认值。
72 | `user_id` | string | N | null | 用户ID(用于个性化,预留) | 74 | `user_id` | string | N | null | 用户ID(用于个性化,预留) |
73 | `session_id` | string | N | null | 会话ID(用于分析,预留) | 75 | `session_id` | string | N | null | 会话ID(用于分析,预留) |
74 76
@@ -551,9 +553,6 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;}) @@ -551,9 +553,6 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;})
551 | `rewritten_query` | string | 重写后的查询 | 553 | `rewritten_query` | string | 重写后的查询 |
552 | `detected_language` | string | 检测到的语言 | 554 | `detected_language` | string | 检测到的语言 |
553 | `translations` | object | 翻译结果 | 555 | `translations` | object | 翻译结果 |
554 -| `query_text_by_lang` | object | 实际参与检索的多语言 query 文本 |  
555 -| `search_langs` | array[string] | 实际参与检索的语言列表 |  
556 -| `supplemental_search_langs` | array[string] | 因 mixed query 补入的附加语言列表 |  
557 | `has_vector` | boolean | 是否生成了向量 | 556 | `has_vector` | boolean | 是否生成了向量 |
558 557
559 `debug_info.per_result[]` 常见字段: 558 `debug_info.per_result[]` 常见字段:
@@ -563,10 +562,9 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;}) @@ -563,10 +562,9 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;})
563 | `spu_id` | string | 结果 SPU ID | 562 | `spu_id` | string | 结果 SPU ID |
564 | `es_score` | float | ES 原始 `_score` | 563 | `es_score` | float | ES 原始 `_score` |
565 | `rerank_score` | float | 重排分数 | 564 | `rerank_score` | float | 重排分数 |
566 -| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` / `fallback_original_query_*` 聚合而来) | 565 +| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` 聚合而来) |
567 | `text_source_score` | float | `base_query` 分数 | 566 | `text_source_score` | float | `base_query` 分数 |
568 | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 | 567 | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 |
569 -| `text_fallback_score` | float | `fallback_original_query_*` 里的最大分数 |  
570 | `text_primary_score` | float | 文本大分中的主证据部分 | 568 | `text_primary_score` | float | 文本大分中的主证据部分 |
571 | `text_support_score` | float | 文本大分中的辅助证据部分 | 569 | `text_support_score` | float | 文本大分中的辅助证据部分 |
572 | `knn_score` | float | `knn_query` 分数 | 570 | `knn_score` | float | `knn_query` 分数 |
docs/相关性检索优化说明.md
@@ -2,11 +2,11 @@ @@ -2,11 +2,11 @@
2 2
3 ## 1. 文档目标 3 ## 1. 文档目标
4 4
5 -本文描述当前线上代码的文本检索策略,重点覆盖: 5 +本文描述当前代码中的文本检索策略,重点覆盖:
6 6
7 - 多语言检索路由(`detector` / `translator` / `indexed` 的关系) 7 - 多语言检索路由(`detector` / `translator` / `indexed` 的关系)
8 - 统一文本召回表达式(无布尔 AST 分支) 8 - 统一文本召回表达式(无布尔 AST 分支)
9 -- 翻译缺失时的兜底策略 9 +- 解析层与检索表达式层的职责边界
10 - 重排融合打分与调试字段 10 - 重排融合打分与调试字段
11 - 典型场景下实际生成的 ES 查询结构 11 - 典型场景下实际生成的 ES 查询结构
12 12
@@ -17,9 +17,11 @@ @@ -17,9 +17,11 @@
17 查询链路(文本相关): 17 查询链路(文本相关):
18 18
19 1. `QueryParser.parse()` 19 1. `QueryParser.parse()`
20 - 输出 `detected_language`、`query_text_by_lang`、`search_langs`、`index_languages`、`source_in_index_languages`;另输出 `contains_chinese` / `contains_english`(仅服务混写辅助召回,见 §4 末)。 20 + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。
  21 +2. `Searcher.search()`
  22 + 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。
21 2. `ESQueryBuilder._build_advanced_text_query()` 23 2. `ESQueryBuilder._build_advanced_text_query()`
22 - 按 `search_langs` 动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`);若命中混写辅助条件,在同一子句内并入另一语种列(§4 末)。 24 + 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。
23 3. `build_query()` 25 3. `build_query()`
24 统一走文本策略,不再有布尔 AST 枝路。 26 统一走文本策略,不再有布尔 AST 枝路。
25 27
@@ -37,18 +39,18 @@ @@ -37,18 +39,18 @@
37 源语言字段做主召回;其他语言走翻译补召回(低权重)。 39 源语言字段做主召回;其他语言走翻译补召回(低权重)。
38 2. 若 `detected_language not in index_languages`: 40 2. 若 `detected_language not in index_languages`:
39 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。 41 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。
40 -3. 若第 2 步翻译部分失败或全部失败:  
41 - 对缺失翻译的 `index_languages` 字段,追加“原文低权重兜底”子句,避免完全丢失这些语种索引面的召回机会。 42 +3. 若翻译部分失败或全部失败:
  43 + 当前实现不会再额外生成“原文打到其他语种字段”的兜底子句;系统保留 `base_query` 并继续执行,可观测性由 `translations` / warning / 命名子句分数提供。
42 44
43 ### 3.2 翻译与向量:并发提交与共享超时 45 ### 3.2 翻译与向量:并发提交与共享超时
44 46
45 -`QueryParser.parse()` 内(Stage 4–6)对**离线调用**采用线程池提交 + **一次** `concurrent.futures.wait`: 47 +`QueryParser.parse()` 内对翻译与向量采用线程池提交 + **一次** `concurrent.futures.wait`:
46 48
47 -- **翻译**:对 `index_languages` 中除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。  
48 -- **查询向量**(若开启 `enable_text_embedding` 且域为 default):再提交一个 `text_encoder.encode` 任务。 49 +- **翻译**:对调用方传入的 `target_languages` 中、除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。
  50 +- **查询向量**:若开启 `enable_text_embedding`,再提交一个 `text_encoder.encode` 任务。
49 - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。 51 - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。
50 52
51 -**等待预算(毫秒)**由 `detected_language` 是否属于租户 `index_languages` 决定(`query_config`): 53 +**等待预算(毫秒)**由 `detected_language` 是否属于调用方传入的 `target_languages` 决定(`query_config`):
52 54
53 - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。 55 - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。
54 - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。 56 - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。
@@ -62,7 +64,7 @@ @@ -62,7 +64,7 @@
62 ```json 64 ```json
63 { 65 {
64 "multi_match": { 66 "multi_match": {
65 - "_name": "base_query|base_query_trans_xx|fallback_original_query_xx", 67 + "_name": "base_query|base_query_trans_xx",
66 "query": "<text>", 68 "query": "<text>",
67 "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."], 69 "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."],
68 "minimum_should_match": "75%", 70 "minimum_should_match": "75%",
@@ -75,7 +77,7 @@ @@ -75,7 +77,7 @@
75 最终按 `bool.should` 组合,`minimum_should_match: 1`。 77 最终按 `bool.should` 组合,`minimum_should_match: 1`。
76 78
77 > **附 — 混写辅助召回** 79 > **附 — 混写辅助召回**
78 -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.8,`ESQueryBuilder` 构造参数)**。`fallback_original_query_*` 同样适用。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 80 +> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。
79 81
80 ## 5. 关键配置项(文本策略) 82 ## 5. 关键配置项(文本策略)
81 83
@@ -88,20 +90,12 @@ @@ -88,20 +90,12 @@
88 90
89 - `base_minimum_should_match` 91 - `base_minimum_should_match`
90 - `translation_minimum_should_match` 92 - `translation_minimum_should_match`
91 -- `translation_boost`  
92 -- `translation_boost_when_source_missing`  
93 -- `source_boost_when_missing`  
94 -- `original_query_fallback_boost_when_translation_missing`(新增) 93 +- `translation_boost`(所有 `base_query_trans_*` 共用)
95 - `tie_breaker_base_query` 94 - `tie_breaker_base_query`
96 95
97 -新增项说明:  
98 -  
99 -- `original_query_fallback_boost_when_translation_missing`:  
100 - 当源语种不在索引语言且翻译缺失时,原文打到缺失目标语字段的低权重系数,默认 `0.2`。  
101 -  
102 说明: 96 说明:
103 97
104 -- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*`、`fallback_original_query_*` 三类子句组成。 98 +- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*` 两类子句组成。
105 99
106 ## 6. 典型场景与实际 DSL 100 ## 6. 典型场景与实际 DSL
107 101
@@ -111,11 +105,12 @@ @@ -111,11 +105,12 @@
111 105
112 - `detected_language=de` 106 - `detected_language=de`
113 - `index_languages=[de,en]` 107 - `index_languages=[de,en]`
114 -- `query_text_by_lang={de:"herren schuhe", en:"men shoes"}` 108 +- `rewritten_query="herren schuhe"`
  109 +- `translations={en:"men shoes"}`
115 110
116 策略结果: 111 策略结果:
117 112
118 -- `base_query`:德语字段,正常权重 113 +- `base_query`:德语字段,**不写** `multi_match.boost`
119 - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4) 114 - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4)
120 115
121 ### 场景 B:源语种不在索引语言中,部分翻译缺失 116 ### 场景 B:源语种不在索引语言中,部分翻译缺失
@@ -126,38 +121,44 @@ @@ -126,38 +121,44 @@
126 121
127 策略结果: 122 策略结果:
128 123
129 -- `base_query`(德语字段):`boost=source_boost_when_missing`(默认 0.6)  
130 -- `base_query_trans_en`(英文字段):`boost=translation_boost_when_source_missing`(默认 1.0)  
131 -- `fallback_original_query_zh`(中文字段):原文低权重兜底(默认 0.2) 124 +- `base_query`(德语字段):**不写** `multi_match.boost`(默认 1.0)
  125 +- `base_query_trans_en`(英文字段):`boost=translation_boost`(如 0.4)
  126 +- 不会生成额外中文兜底子句
132 127
133 ### 场景 C:源语种不在索引语言中,翻译全部失败 128 ### 场景 C:源语种不在索引语言中,翻译全部失败
134 129
135 - `detected_language=de` 130 - `detected_language=de`
136 - `index_languages=[en,zh]` 131 - `index_languages=[en,zh]`
137 -- `query_text_by_lang` 仅有 `de` 132 +- `translations={}`
138 133
139 策略结果: 134 策略结果:
140 135
141 -- `base_query`(德语字段,低权重)  
142 -- `fallback_original_query_en`(英文字段原文兜底)  
143 -- `fallback_original_query_zh`(中文字段原文兜底) 136 +- `base_query`(德语字段,**无** `boost` 字段)
  137 +- 不会生成 `base_query_trans_*`
144 138
145 -这能避免“只有源语种字段查询,且该语种字段在商家索引中稀疏/为空”导致的弱召回问题 139 +这意味着当前实现优先保证职责清晰与可解释性,而不是继续在 Builder 内部隐式制造“跨语种原文兜底”。
146 140
147 -## 7. QueryParser 与 ESBuilder 的职责分工 141 +## 7. QueryParser 与 Searcher / ESBuilder 的职责分工
148 142
149 -- `QueryParser` 负责“语言计划”与“可用文本”:  
150 - - `search_langs`  
151 - - `query_text_by_lang`  
152 - - `source_in_index_languages`  
153 - - `index_languages` 143 +- `QueryParser` 负责“解析事实”:
  144 + - `query_normalized`
  145 + - `rewritten_query`
  146 + - `detected_language`
  147 + - `translations`
  148 + - `query_vector`
  149 + - `query_tokens`
154 - `contains_chinese` / `contains_english` 150 - `contains_chinese` / `contains_english`
  151 +- `Searcher` 负责“租户语境”:
  152 + - `index_languages`
  153 + - 将其传给 parser 作为 `target_languages`
  154 + - 将其传给 builder 作为字段展开约束
155 - `ESQueryBuilder` 负责“表达式展开”: 155 - `ESQueryBuilder` 负责“表达式展开”:
156 - 动态字段组装 156 - 动态字段组装
157 - 子句权重分配 157 - 子句权重分配
158 - - 翻译缺失兜底子句拼接 158 + - `base_query` / `base_query_trans_*` 子句拼接
  159 + - 跳过“与 base_query 文本和语言完全相同”的重复翻译子句
159 160
160 -这种分层让策略调优主要落在配置和 Builder,不破坏 Parser 的职责边界 161 +这种分层让 parser 不再返回 ES 专用的“语言计划字段”,职责边界更清晰
161 162
162 ## 8. 融合打分(Rerank + Text + KNN) 163 ## 8. 融合打分(Rerank + Text + KNN)
163 164
@@ -165,24 +166,21 @@ @@ -165,24 +166,21 @@
165 166
166 ### 8.1 文本相关性大分 167 ### 8.1 文本相关性大分
167 168
168 -文本大分由三部分组成: 169 +文本大分由两部分组成:
169 170
170 - `base_query` 171 - `base_query`
171 - `base_query_trans_*` 172 - `base_query_trans_*`
172 -- `fallback_original_query_*`  
173 173
174 聚合方式: 174 聚合方式:
175 175
176 1. `source_score = base_query` 176 1. `source_score = base_query`
177 2. `translation_score = max(base_query_trans_*)` 177 2. `translation_score = max(base_query_trans_*)`
178 -3. `fallback_score = max(fallback_original_query_*)`  
179 -4. 加权: 178 +3. 加权:
180 - `weighted_source = source_score` 179 - `weighted_source = source_score`
181 - `weighted_translation = 0.8 * translation_score` 180 - `weighted_translation = 0.8 * translation_score`
182 - - `weighted_fallback = 0.55 * fallback_score`  
183 -5. 合成:  
184 - - `primary = max(weighted_source, weighted_translation, weighted_fallback)`  
185 - - `support = weighted_source + weighted_translation + weighted_fallback - primary` 181 +4. 合成:
  182 + - `primary = max(weighted_source, weighted_translation)`
  183 + - `support = weighted_source + weighted_translation - primary`
186 - `text_score = primary + 0.25 * support` 184 - `text_score = primary + 0.25 * support`
187 185
188 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。 186 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。
@@ -212,7 +210,6 @@ fused_score = ( @@ -212,7 +210,6 @@ fused_score = (
212 - `text_score` 210 - `text_score`
213 - `text_source_score` 211 - `text_source_score`
214 - `text_translation_score` 212 - `text_translation_score`
215 -- `text_fallback_score`  
216 - `text_primary_score` 213 - `text_primary_score`
217 - `text_support_score` 214 - `text_support_score`
218 - `knn_score` 215 - `knn_score`
@@ -221,9 +218,9 @@ fused_score = ( @@ -221,9 +218,9 @@ fused_score = (
221 218
222 `debug_info.query_analysis` 还会暴露: 219 `debug_info.query_analysis` 还会暴露:
223 220
224 -- `query_text_by_lang`  
225 -- `search_langs`  
226 -- `supplemental_search_langs` 221 +- `translations`
  222 +- `detected_language`
  223 +- `rewritten_query`
227 224
228 这些字段用于检索效果评估与 bad case 归因。 225 这些字段用于检索效果评估与 bad case 归因。
229 226
@@ -231,7 +228,7 @@ fused_score = ( @@ -231,7 +228,7 @@ fused_score = (
231 228
232 1. 当前文本主链路已移除布尔 AST 分支。 229 1. 当前文本主链路已移除布尔 AST 分支。
233 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。 230 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。
234 -3. `HanLP` 为可选依赖;不可用时退化到轻量分词,不影响主链路可用性 231 +3. `HanLP` 为必需依赖;当前 parser 不再提供轻量 fallback
235 4. 若后续扩展到更多语种,请确保: 232 4. 若后续扩展到更多语种,请确保:
236 - mapping 中存在对应 `.<lang>` 字段 233 - mapping 中存在对应 `.<lang>` 字段
237 - `index_languages` 配置在支持列表内 234 - `index_languages` 配置在支持列表内
@@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py @@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py
263 建议在 `tests/` 增加文本策略用例: 260 建议在 `tests/` 增加文本策略用例:
264 261
265 1. 源语种在索引语言,翻译命中缓存 262 1. 源语种在索引语言,翻译命中缓存
266 -2. 源语种不在索引语言,翻译部分失败(验证 fallback 子句)  
267 -3. 源语种不在索引语言,翻译全部失败(验证多目标 fallback)  
268 -4. 自定义 `original_query_fallback_boost_when_translation_missing` 生效  
269 -5. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`) 263 +2. 源语种不在索引语言,翻译部分失败(验证仅保留 `base_query` + 成功翻译子句)
  264 +3. 源语种不在索引语言,翻译全部失败(验证无 `base_query_trans_*` 时仍可正常执行)
  265 +4. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`)
270 266
271 267
272 268
@@ -281,3 +277,24 @@ title.en: 2026 Korean-style High-waisted Slimming Corduroy Skirt with Slit, Mid- @@ -281,3 +277,24 @@ title.en: 2026 Korean-style High-waisted Slimming Corduroy Skirt with Slit, Mid-
281 Rerank score: 0.9643 277 Rerank score: 0.9643
282 title.en: Black Half-high Collar Base Shirt Women's Autumn and Winter fleece-lined Contrast Color Pure Desire Design Sense Horn Sleeve Ruffled Inner Top 278 title.en: Black Half-high Collar Base Shirt Women's Autumn and Winter fleece-lined Contrast Color Pure Desire Design Sense Horn Sleeve Ruffled Inner Top
283 title.zh: 黑色高领半高领女士秋冬内搭加绒拼色纯欲设计荷叶边袖内衬上衣 279 title.zh: 黑色高领半高领女士秋冬内搭加绒拼色纯欲设计荷叶边袖内衬上衣
  280 +
  281 +
  282 +
  283 +qwen3-0.6b的严重badcase:
  284 +q=牛仔裤
  285 +
  286 +Rerank score: 0.0002
  287 +title.en: Wrangler Womens Cowboy Cut Slim Fit Jean Bleach
  288 +title.zh: Wrangler 女士牛仔裤 牛仔剪裁 紧身版型 漂白色
  289 +
  290 +Rerank score: 0.0168
  291 +title.en: Fleece Lined Tights Sheer Women - Fake Translucent Warm Pantyhose Leggings Sheer Thick Tights for Winter
  292 +title.zh: 加绒透肤女士连裤袜 - 仿透视保暖长筒袜 冬季厚款透肤连裤袜
  293 +
  294 +Rerank score: 0.1366
  295 +title.en: Dockers Men's Classic Fit Workday Khaki Smart 360 FLEX Pants (Standard and Big & Tall)
  296 +title.zh: Dockers 男士经典版型工作日卡其色智能360度弹力裤(标准码与加大码)
  297 +
  298 +Rerank score: 0.0981
  299 +title.en: Lazy One Pajama Shorts for Men, Men's Pajama Bottoms, Sleepwear
  300 +title.zh: 懒人男士睡裤,男式家居裤,睡眠服饰
indexer/document_transformer.py
@@ -13,7 +13,7 @@ import numpy as np @@ -13,7 +13,7 @@ import numpy as np
13 import logging 13 import logging
14 import re 14 import re
15 from typing import Dict, Any, Optional, List 15 from typing import Dict, Any, Optional, List
16 -from indexer.product_enrich import analyze_products 16 +from indexer.product_enrich import analyze_products, split_multi_value_field
17 17
18 logger = logging.getLogger(__name__) 18 logger = logging.getLogger(__name__)
19 19
@@ -121,7 +121,7 @@ class SPUDocumentTransformer: @@ -121,7 +121,7 @@ class SPUDocumentTransformer:
121 # Tags 121 # Tags
122 if pd.notna(spu_row.get('tags')): 122 if pd.notna(spu_row.get('tags')):
123 tags_str = str(spu_row['tags']) 123 tags_str = str(spu_row['tags'])
124 - doc['tags'] = [tag.strip() for tag in tags_str.split(',') if tag.strip()] 124 + doc['tags'] = split_multi_value_field(tags_str)
125 125
126 # Category相关字段 126 # Category相关字段
127 self._fill_category_fields(doc, spu_row) 127 self._fill_category_fields(doc, spu_row)
@@ -282,11 +282,7 @@ class SPUDocumentTransformer: @@ -282,11 +282,7 @@ class SPUDocumentTransformer:
282 raw = row.get(name) 282 raw = row.get(name)
283 if not raw: 283 if not raw:
284 continue 284 continue
285 - parts = re.split(r"[,;|/\n\t]+", str(raw))  
286 - for part in parts:  
287 - value = part.strip()  
288 - if not value:  
289 - continue 285 + for value in split_multi_value_field(str(raw)):
290 semantic_list.append({"lang": lang, "name": name, "value": value}) 286 semantic_list.append({"lang": lang, "name": name, "value": value})
291 287
292 if qanchors_obj: 288 if qanchors_obj:
@@ -703,11 +699,7 @@ class SPUDocumentTransformer: @@ -703,11 +699,7 @@ class SPUDocumentTransformer:
703 raw = row.get(name) 699 raw = row.get(name)
704 if not raw: 700 if not raw:
705 continue 701 continue
706 - parts = re.split(r"[,;|/\n\t]+", str(raw))  
707 - for part in parts:  
708 - value = part.strip()  
709 - if not value:  
710 - continue 702 + for value in split_multi_value_field(str(raw)):
711 semantic_list.append( 703 semantic_list.append(
712 { 704 {
713 "lang": lang, 705 "lang": lang,
indexer/product_enrich.py
@@ -144,6 +144,20 @@ if _missing_prompt_langs: @@ -144,6 +144,20 @@ if _missing_prompt_langs:
144 ) 144 )
145 145
146 146
  147 +# 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与换行、制表符(不含空格,短语内部可保留空格)
  148 +_MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+")
  149 +
  150 +
  151 +def split_multi_value_field(text: Optional[str]) -> List[str]:
  152 + """将 LLM/业务中的多值字符串拆成短语列表(strip 后去空)。"""
  153 + if text is None:
  154 + return []
  155 + s = str(text).strip()
  156 + if not s:
  157 + return []
  158 + return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()]
  159 +
  160 +
147 def _normalize_space(text: str) -> str: 161 def _normalize_space(text: str) -> str:
148 return re.sub(r"\s+", " ", (text or "").strip()) 162 return re.sub(r"\s+", " ", (text or "").strip())
149 163
query/query_parser.py
1 """ 1 """
2 Query parser - main module for query processing. 2 Query parser - main module for query processing.
3 3
4 -Handles query rewriting, translation, and embedding generation. 4 +Responsibilities are intentionally narrow:
  5 +- normalize and rewrite the incoming query
  6 +- detect language and tokenize with HanLP
  7 +- run translation and embedding requests concurrently
  8 +- return parser facts, not Elasticsearch language-planning data
5 """ 9 """
6 10
7 -from typing import Dict, List, Optional, Any, Union, Tuple 11 +from dataclasses import dataclass, field
  12 +from typing import Any, Callable, Dict, List, Optional, Tuple
8 import numpy as np 13 import numpy as np
9 import logging 14 import logging
10 import re 15 import re
@@ -18,15 +23,12 @@ from .query_rewriter import QueryRewriter, QueryNormalizer @@ -18,15 +23,12 @@ from .query_rewriter import QueryRewriter, QueryNormalizer
18 23
19 logger = logging.getLogger(__name__) 24 logger = logging.getLogger(__name__)
20 25
21 -try:  
22 - import hanlp # type: ignore  
23 -except Exception: # pragma: no cover  
24 - hanlp = None 26 +import hanlp # type: ignore
25 27
26 28
27 def simple_tokenize_query(text: str) -> List[str]: 29 def simple_tokenize_query(text: str) -> List[str]:
28 """ 30 """
29 - Lightweight tokenizer for suggestion length / analysis (aligned with QueryParser fallback). 31 + Lightweight tokenizer for suggestion-side heuristics only.
30 32
31 - Consecutive CJK characters form one token 33 - Consecutive CJK characters form one token
32 - Latin / digit runs (with internal hyphens) form tokens 34 - Latin / digit runs (with internal hyphens) form tokens
@@ -37,63 +39,32 @@ def simple_tokenize_query(text: str) -&gt; List[str]: @@ -37,63 +39,32 @@ def simple_tokenize_query(text: str) -&gt; List[str]:
37 return pattern.findall(text) 39 return pattern.findall(text)
38 40
39 41
  42 +@dataclass(slots=True)
40 class ParsedQuery: 43 class ParsedQuery:
41 - """Container for parsed query results."""  
42 -  
43 - def __init__(  
44 - self,  
45 - original_query: str,  
46 - query_normalized: str,  
47 - rewritten_query: Optional[str] = None,  
48 - detected_language: Optional[str] = None,  
49 - translations: Dict[str, str] = None,  
50 - query_vector: Optional[np.ndarray] = None,  
51 - domain: str = "default",  
52 - keywords: str = "",  
53 - token_count: int = 0,  
54 - query_tokens: Optional[List[str]] = None,  
55 - query_text_by_lang: Optional[Dict[str, str]] = None,  
56 - search_langs: Optional[List[str]] = None,  
57 - index_languages: Optional[List[str]] = None,  
58 - source_in_index_languages: bool = True,  
59 - contains_chinese: bool = False,  
60 - contains_english: bool = False,  
61 - ):  
62 - self.original_query = original_query  
63 - self.query_normalized = query_normalized  
64 - self.rewritten_query = rewritten_query or query_normalized  
65 - self.detected_language = detected_language  
66 - self.translations = translations or {}  
67 - self.query_vector = query_vector  
68 - self.domain = domain  
69 - # Query analysis fields  
70 - self.keywords = keywords  
71 - self.token_count = token_count  
72 - self.query_tokens = query_tokens or []  
73 - self.query_text_by_lang = query_text_by_lang or {}  
74 - self.search_langs = search_langs or []  
75 - self.index_languages = index_languages or []  
76 - self.source_in_index_languages = bool(source_in_index_languages)  
77 - self.contains_chinese = bool(contains_chinese)  
78 - self.contains_english = bool(contains_english) 44 + """Container for query parser facts."""
  45 +
  46 + original_query: str
  47 + query_normalized: str
  48 + rewritten_query: str
  49 + detected_language: Optional[str] = None
  50 + translations: Dict[str, str] = field(default_factory=dict)
  51 + query_vector: Optional[np.ndarray] = None
  52 + query_tokens: List[str] = field(default_factory=list)
  53 + contains_chinese: bool = False
  54 + contains_english: bool = False
79 55
80 def to_dict(self) -> Dict[str, Any]: 56 def to_dict(self) -> Dict[str, Any]:
81 """Convert to dictionary representation.""" 57 """Convert to dictionary representation."""
82 - result = { 58 + return {
83 "original_query": self.original_query, 59 "original_query": self.original_query,
84 "query_normalized": self.query_normalized, 60 "query_normalized": self.query_normalized,
85 "rewritten_query": self.rewritten_query, 61 "rewritten_query": self.rewritten_query,
86 "detected_language": self.detected_language, 62 "detected_language": self.detected_language,
87 "translations": self.translations, 63 "translations": self.translations,
88 - "domain": self.domain 64 + "query_tokens": self.query_tokens,
  65 + "contains_chinese": self.contains_chinese,
  66 + "contains_english": self.contains_english,
89 } 67 }
90 - result["query_text_by_lang"] = self.query_text_by_lang  
91 - result["search_langs"] = self.search_langs  
92 - result["index_languages"] = self.index_languages  
93 - result["source_in_index_languages"] = self.source_in_index_languages  
94 - result["contains_chinese"] = self.contains_chinese  
95 - result["contains_english"] = self.contains_english  
96 - return result  
97 68
98 69
99 class QueryParser: 70 class QueryParser:
@@ -102,7 +73,7 @@ class QueryParser: @@ -102,7 +73,7 @@ class QueryParser:
102 1. Normalization 73 1. Normalization
103 2. Query rewriting (brand/category mappings, synonyms) 74 2. Query rewriting (brand/category mappings, synonyms)
104 3. Language detection 75 3. Language detection
105 - 4. Translation to target languages 76 + 4. Translation to caller-provided target languages
106 5. Text embedding generation (for semantic search) 77 5. Text embedding generation (for semantic search)
107 """ 78 """
108 79
@@ -110,7 +81,8 @@ class QueryParser: @@ -110,7 +81,8 @@ class QueryParser:
110 self, 81 self,
111 config: SearchConfig, 82 config: SearchConfig,
112 text_encoder: Optional[TextEmbeddingEncoder] = None, 83 text_encoder: Optional[TextEmbeddingEncoder] = None,
113 - translator: Optional[Any] = None 84 + translator: Optional[Any] = None,
  85 + tokenizer: Optional[Callable[[str], Any]] = None,
114 ): 86 ):
115 """ 87 """
116 Initialize query parser. 88 Initialize query parser.
@@ -128,23 +100,7 @@ class QueryParser: @@ -128,23 +100,7 @@ class QueryParser:
128 self.normalizer = QueryNormalizer() 100 self.normalizer = QueryNormalizer()
129 self.language_detector = LanguageDetector() 101 self.language_detector = LanguageDetector()
130 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) 102 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary)
131 -  
132 - # Optional HanLP components (heavy). If unavailable, fall back to a lightweight tokenizer.  
133 - self._tok = None  
134 - self._pos_tag = None  
135 - if hanlp is not None:  
136 - try:  
137 - logger.info("Initializing HanLP components...")  
138 - self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)  
139 - self._tok.config.output_spans = True  
140 - self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)  
141 - logger.info("HanLP components initialized")  
142 - except Exception as e:  
143 - logger.warning(f"HanLP init failed, falling back to simple tokenizer: {e}")  
144 - self._tok = None  
145 - self._pos_tag = None  
146 - else:  
147 - logger.info("HanLP not installed; using simple tokenizer") 103 + self._tokenizer = tokenizer or self._build_tokenizer()
148 104
149 # Eager initialization (startup-time failure visibility, no lazy init in request path) 105 # Eager initialization (startup-time failure visibility, no lazy init in request path)
150 if self.config.query_config.enable_text_embedding and self._text_encoder is None: 106 if self.config.query_config.enable_text_embedding and self._text_encoder is None:
@@ -170,57 +126,81 @@ class QueryParser: @@ -170,57 +126,81 @@ class QueryParser:
170 """Return pre-initialized translator.""" 126 """Return pre-initialized translator."""
171 return self._translator 127 return self._translator
172 128
  129 + def _build_tokenizer(self) -> Callable[[str], Any]:
  130 + """Build the tokenizer used by query parsing. No fallback path by design."""
  131 + if hanlp is None:
  132 + raise RuntimeError("HanLP is required for QueryParser tokenization")
  133 + logger.info("Initializing HanLP tokenizer...")
  134 + tokenizer = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)
  135 + tokenizer.config.output_spans = True
  136 + logger.info("HanLP tokenizer initialized")
  137 + return tokenizer
  138 +
173 @staticmethod 139 @staticmethod
174 - def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str: 140 + def _pick_query_translation_model(
  141 + source_lang: str,
  142 + target_lang: str,
  143 + config: SearchConfig,
  144 + source_language_in_index: bool,
  145 + ) -> str:
175 """Pick the translation capability for query-time translation (configurable).""" 146 """Pick the translation capability for query-time translation (configurable)."""
176 src = str(source_lang or "").strip().lower() 147 src = str(source_lang or "").strip().lower()
177 tgt = str(target_lang or "").strip().lower() 148 tgt = str(target_lang or "").strip().lower()
  149 + qc = config.query_config
  150 +
  151 + if source_language_in_index:
  152 + if src == "zh" and tgt == "en":
  153 + return qc.zh_to_en_model
  154 + if src == "en" and tgt == "zh":
  155 + return qc.en_to_zh_model
  156 + return qc.default_translation_model
178 157
179 - # Use dedicated models for zh<->en if configured  
180 if src == "zh" and tgt == "en": 158 if src == "zh" and tgt == "en":
181 - return config.query_config.zh_to_en_model 159 + return qc.zh_to_en_model_source_not_in_index or qc.zh_to_en_model
182 if src == "en" and tgt == "zh": 160 if src == "en" and tgt == "zh":
183 - return config.query_config.en_to_zh_model  
184 -  
185 - # For any other language pairs, fall back to the configurable default model.  
186 - # By default this is `nllb-200-distilled-600m` (multi-lingual local model).  
187 - return config.query_config.default_translation_model  
188 -  
189 - def _simple_tokenize(self, text: str) -> List[str]:  
190 - return simple_tokenize_query(text)  
191 -  
192 - def _extract_keywords(self, query: str) -> str:  
193 - """Extract keywords (nouns with length > 1) from query."""  
194 - if self._tok is not None and self._pos_tag is not None:  
195 - tok_result = self._tok(query)  
196 - if not tok_result:  
197 - return ""  
198 - words = [x[0] for x in tok_result]  
199 - pos_tags = self._pos_tag(words)  
200 - keywords = []  
201 - for word, pos in zip(words, pos_tags):  
202 - if len(word) > 1 and isinstance(pos, str) and pos.startswith("N"):  
203 - keywords.append(word)  
204 - return " ".join(keywords)  
205 -  
206 - # Fallback: treat tokens with length > 1 as "keywords"  
207 - tokens = self._simple_tokenize(query)  
208 - keywords = [t for t in tokens if len(t) > 1]  
209 - return " ".join(keywords)  
210 -  
211 - def _get_token_count(self, query: str) -> int:  
212 - """Get token count (HanLP if available, otherwise simple)."""  
213 - if self._tok is not None:  
214 - tok_result = self._tok(query)  
215 - return len(tok_result) if tok_result else 0  
216 - return len(self._simple_tokenize(query)) 161 + return qc.en_to_zh_model_source_not_in_index or qc.en_to_zh_model
  162 + return qc.default_translation_model_source_not_in_index or qc.default_translation_model
  163 +
  164 + @staticmethod
  165 + def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]:
  166 + normalized: List[str] = []
  167 + seen = set()
  168 + for language in languages or []:
  169 + token = str(language or "").strip().lower()
  170 + if not token or token in seen:
  171 + continue
  172 + seen.add(token)
  173 + normalized.append(token)
  174 + return normalized
  175 +
  176 + @staticmethod
  177 + def _extract_tokens(tokenizer_result: Any) -> List[str]:
  178 + """Normalize tokenizer output into a flat token string list."""
  179 + if not tokenizer_result:
  180 + return []
  181 + if isinstance(tokenizer_result, str):
  182 + token = tokenizer_result.strip()
  183 + return [token] if token else []
  184 +
  185 + tokens: List[str] = []
  186 + for item in tokenizer_result:
  187 + token: Optional[str] = None
  188 + if isinstance(item, str):
  189 + token = item
  190 + elif isinstance(item, (list, tuple)) and item:
  191 + token = str(item[0])
  192 + elif item is not None:
  193 + token = str(item)
  194 +
  195 + if token is None:
  196 + continue
  197 + token = token.strip()
  198 + if token:
  199 + tokens.append(token)
  200 + return tokens
217 201
218 def _get_query_tokens(self, query: str) -> List[str]: 202 def _get_query_tokens(self, query: str) -> List[str]:
219 - """Get token list (HanLP if available, otherwise simple)."""  
220 - if self._tok is not None:  
221 - tok_result = self._tok(query)  
222 - return [x[0] for x in tok_result] if tok_result else []  
223 - return self._simple_tokenize(query) 203 + return self._extract_tokens(self._tokenizer(query))
224 204
225 @staticmethod 205 @staticmethod
226 def _contains_cjk(text: str) -> bool: 206 def _contains_cjk(text: str) -> bool:
@@ -237,64 +217,24 @@ class QueryParser: @@ -237,64 +217,24 @@ class QueryParser:
237 return False 217 return False
238 return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token)) 218 return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token))
239 219
240 - @staticmethod  
241 - def _extract_latin_tokens(text: str) -> List[str]:  
242 - """Extract latin word tokens from query text."""  
243 - return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "")  
244 -  
245 - def _infer_supplemental_search_langs(  
246 - self,  
247 - query_text: str,  
248 - detected_lang: str,  
249 - index_langs: List[str],  
250 - ) -> List[str]:  
251 - """  
252 - Infer extra languages to search when the query mixes scripts.  
253 -  
254 - Rules:  
255 - - If any Chinese characters appear, include `zh` when available.  
256 - - If the query contains meaningful latin tokens, include `en` when available.  
257 - "Meaningful" means either:  
258 - 1) at least 2 latin tokens with length >= 4, or  
259 - 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars.  
260 - """  
261 - supplemental: List[str] = []  
262 - normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs}  
263 - normalized_detected = str(detected_lang or "").strip().lower()  
264 - query_text = str(query_text or "")  
265 -  
266 - if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh":  
267 - supplemental.append("zh")  
268 -  
269 - latin_tokens = self._extract_latin_tokens(query_text)  
270 - significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4]  
271 - latin_chars = sum(len(tok) for tok in latin_tokens)  
272 - non_space_chars = len(re.sub(r"\s+", "", query_text))  
273 - latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0  
274 - has_meaningful_english = (  
275 - len(significant_latin_tokens) >= 2 or  
276 - (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2)  
277 - )  
278 -  
279 - if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en":  
280 - supplemental.append("en")  
281 -  
282 - return supplemental  
283 -  
284 def parse( 220 def parse(
285 self, 221 self,
286 query: str, 222 query: str,
287 tenant_id: Optional[str] = None, 223 tenant_id: Optional[str] = None,
288 generate_vector: bool = True, 224 generate_vector: bool = True,
289 - context: Optional[Any] = None 225 + context: Optional[Any] = None,
  226 + target_languages: Optional[List[str]] = None,
290 ) -> ParsedQuery: 227 ) -> ParsedQuery:
291 """ 228 """
292 Parse query through all processing stages. 229 Parse query through all processing stages.
293 230
294 Args: 231 Args:
295 query: Raw query string 232 query: Raw query string
  233 + tenant_id: Deprecated and ignored by QueryParser. Kept temporarily
  234 + to avoid a wider refactor in this first step.
296 generate_vector: Whether to generate query embedding 235 generate_vector: Whether to generate query embedding
297 context: Optional request context for tracking and logging 236 context: Optional request context for tracking and logging
  237 + target_languages: Translation target languages decided by the caller
298 238
299 Returns: 239 Returns:
300 ParsedQuery object with all processing results 240 ParsedQuery object with all processing results
@@ -325,15 +265,9 @@ class QueryParser: @@ -325,15 +265,9 @@ class QueryParser:
325 if context: 265 if context:
326 context.store_intermediate_result('query_normalized', normalized) 266 context.store_intermediate_result('query_normalized', normalized)
327 267
328 - # Extract domain if present (e.g., "brand:Nike" -> domain="brand", query="Nike")  
329 - domain, query_text = self.normalizer.extract_domain_query(normalized)  
330 - log_debug(f"Domain extraction | Domain: '{domain}', Query: '{query_text}'")  
331 - if context:  
332 - context.store_intermediate_result('extracted_domain', domain)  
333 - context.store_intermediate_result('domain_query', query_text)  
334 -  
335 # Stage 2: Query rewriting 268 # Stage 2: Query rewriting
336 - rewritten = None 269 + query_text = normalized
  270 + rewritten = normalized
337 if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists 271 if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists
338 rewritten = self.rewriter.rewrite(query_text) 272 rewritten = self.rewriter.rewrite(query_text)
339 if rewritten != query_text: 273 if rewritten != query_text:
@@ -351,43 +285,57 @@ class QueryParser: @@ -351,43 +285,57 @@ class QueryParser:
351 log_info(f"Language detection | Detected language: {detected_lang}") 285 log_info(f"Language detection | Detected language: {detected_lang}")
352 if context: 286 if context:
353 context.store_intermediate_result('detected_language', detected_lang) 287 context.store_intermediate_result('detected_language', detected_lang)
  288 + # Stage 4: Query analysis (tokenization + script flags)
  289 + query_tokens = self._get_query_tokens(query_text)
  290 + contains_chinese = self._contains_cjk(query_text)
  291 + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)
  292 +
  293 + log_debug(
  294 + f"Query analysis | Query tokens: {query_tokens} | "
  295 + f"contains_chinese={contains_chinese} | contains_english={contains_english}"
  296 + )
  297 + if context:
  298 + context.store_intermediate_result('query_tokens', query_tokens)
  299 + context.store_intermediate_result('contains_chinese', contains_chinese)
  300 + context.store_intermediate_result('contains_english', contains_english)
354 301
355 - # Stage 4: Translation — always submit to thread pool; results are collected together with  
356 - # embedding in one wait() that uses a configurable budget (short vs long by source-in-index). 302 + # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the
  303 + # caller decides translation targets and later search-field planning.
357 translations: Dict[str, str] = {} 304 translations: Dict[str, str] = {}
358 - translation_futures: Dict[str, Any] = {}  
359 - translation_executor: Optional[ThreadPoolExecutor] = None  
360 - index_langs: List[str] = [] 305 + future_to_task: Dict[Any, Tuple[str, Optional[str]]] = {}
  306 + async_executor: Optional[ThreadPoolExecutor] = None
361 detected_norm = str(detected_lang or "").strip().lower() 307 detected_norm = str(detected_lang or "").strip().lower()
  308 + normalized_targets = self._normalize_language_codes(target_languages)
  309 + translation_targets = [lang for lang in normalized_targets if lang != detected_norm]
  310 + source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets
  311 +
  312 + # Stage 6: Text embedding - async execution
  313 + query_vector = None
  314 + should_generate_embedding = (
  315 + generate_vector and
  316 + self.config.query_config.enable_text_embedding
  317 + )
  318 +
  319 + task_count = len(translation_targets) + (1 if should_generate_embedding else 0)
  320 + if task_count > 0:
  321 + async_executor = ThreadPoolExecutor(
  322 + max_workers=max(1, min(task_count, 4)),
  323 + thread_name_prefix="query-enrichment",
  324 + )
362 325
363 try: 326 try:
364 - # 根据租户配置的 index_languages 决定翻译目标语言  
365 - from config.tenant_config_loader import get_tenant_config_loader  
366 - tenant_loader = get_tenant_config_loader()  
367 - tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default")  
368 - raw_index_langs = tenant_cfg.get("index_languages") or []  
369 - index_langs = []  
370 - seen_langs = set()  
371 - for lang in raw_index_langs:  
372 - norm_lang = str(lang or "").strip().lower()  
373 - if not norm_lang or norm_lang in seen_langs:  
374 - continue  
375 - seen_langs.add(norm_lang)  
376 - index_langs.append(norm_lang)  
377 -  
378 - target_langs_for_translation = [lang for lang in index_langs if lang != detected_norm]  
379 -  
380 - if target_langs_for_translation:  
381 - translation_executor = ThreadPoolExecutor(  
382 - max_workers=max(1, min(len(target_langs_for_translation), 4)),  
383 - thread_name_prefix="query-translation",  
384 - )  
385 - for lang in target_langs_for_translation:  
386 - model_name = self._pick_query_translation_model(detected_lang, lang, self.config) 327 + if async_executor is not None:
  328 + for lang in translation_targets:
  329 + model_name = self._pick_query_translation_model(
  330 + detected_lang,
  331 + lang,
  332 + self.config,
  333 + source_language_in_index,
  334 + )
387 log_debug( 335 log_debug(
388 f"Submitting query translation | source={detected_lang} target={lang} model={model_name}" 336 f"Submitting query translation | source={detected_lang} target={lang} model={model_name}"
389 ) 337 )
390 - translation_futures[lang] = translation_executor.submit( 338 + future = async_executor.submit(
391 self.translator.translate, 339 self.translator.translate,
392 query_text, 340 query_text,
393 lang, 341 lang,
@@ -395,107 +343,61 @@ class QueryParser: @@ -395,107 +343,61 @@ class QueryParser:
395 "ecommerce_search_query", 343 "ecommerce_search_query",
396 model_name, 344 model_name,
397 ) 345 )
398 -  
399 - if context:  
400 - context.store_intermediate_result('translations', translations)  
401 - for lang, translation in translations.items():  
402 - if translation:  
403 - context.store_intermediate_result(f'translation_{lang}', translation)  
404 - 346 + future_to_task[future] = ("translation", lang)
  347 +
  348 + if should_generate_embedding:
  349 + if self.text_encoder is None:
  350 + raise RuntimeError("Text embedding is enabled but text encoder is not initialized")
  351 + log_debug("Submitting query vector generation")
  352 +
  353 + def _encode_query_vector() -> Optional[np.ndarray]:
  354 + arr = self.text_encoder.encode([query_text], priority=1)
  355 + if arr is None or len(arr) == 0:
  356 + return None
  357 + vec = arr[0]
  358 + if vec is None:
  359 + return None
  360 + return np.asarray(vec, dtype=np.float32)
  361 +
  362 + future = async_executor.submit(_encode_query_vector)
  363 + future_to_task[future] = ("embedding", None)
405 except Exception as e: 364 except Exception as e:
406 - error_msg = f"Translation failed | Error: {str(e)}" 365 + error_msg = f"Async query enrichment submission failed | Error: {str(e)}"
407 log_info(error_msg) 366 log_info(error_msg)
408 if context: 367 if context:
409 context.add_warning(error_msg) 368 context.add_warning(error_msg)
  369 + if async_executor is not None:
  370 + async_executor.shutdown(wait=False)
  371 + async_executor = None
  372 + future_to_task.clear()
410 373
411 - # Stage 5: Query analysis (keywords, token count, query_tokens)  
412 - keywords = self._extract_keywords(query_text)  
413 - query_tokens = self._get_query_tokens(query_text)  
414 - token_count = len(query_tokens)  
415 - contains_chinese = self._contains_cjk(query_text)  
416 - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)  
417 -  
418 - log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | "  
419 - f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | "  
420 - f"contains_english={contains_english}")  
421 - if context:  
422 - context.store_intermediate_result('keywords', keywords)  
423 - context.store_intermediate_result('token_count', token_count)  
424 - context.store_intermediate_result('query_tokens', query_tokens)  
425 - context.store_intermediate_result('contains_chinese', contains_chinese)  
426 - context.store_intermediate_result('contains_english', contains_english)  
427 -  
428 - # Stage 6: Text embedding (only for non-short queries) - async execution  
429 - query_vector = None  
430 - embedding_future = None  
431 - should_generate_embedding = (  
432 - generate_vector and  
433 - self.config.query_config.enable_text_embedding and  
434 - domain == "default"  
435 - )  
436 -  
437 - encoding_executor = None  
438 - if should_generate_embedding:  
439 - try:  
440 - if self.text_encoder is None:  
441 - raise RuntimeError("Text embedding is enabled but text encoder is not initialized")  
442 - log_debug("Starting query vector generation (async)")  
443 - # Submit encoding task to thread pool for async execution  
444 - encoding_executor = ThreadPoolExecutor(max_workers=1)  
445 - def _encode_query_vector() -> Optional[np.ndarray]:  
446 - arr = self.text_encoder.encode([query_text], priority=1)  
447 - if arr is None or len(arr) == 0:  
448 - return None  
449 - vec = arr[0]  
450 - return vec if isinstance(vec, np.ndarray) else None  
451 - embedding_future = encoding_executor.submit(  
452 - _encode_query_vector  
453 - )  
454 - except Exception as e:  
455 - error_msg = f"Query vector generation task submission failed | Error: {str(e)}"  
456 - log_info(error_msg)  
457 - if context:  
458 - context.add_warning(error_msg)  
459 - encoding_executor = None  
460 - embedding_future = None  
461 -  
462 - # Wait for translation + embedding concurrently; shared budget (ms) depends on whether  
463 - # the detected language is in tenant index_languages. 374 + # Wait for translation + embedding concurrently; shared budget depends on whether
  375 + # the detected language belongs to caller-provided target_languages.
464 qc = self.config.query_config 376 qc = self.config.query_config
465 - source_in_index_for_budget = detected_norm in index_langs 377 + source_in_target_languages = bool(normalized_targets) and detected_norm in normalized_targets
466 budget_ms = ( 378 budget_ms = (
467 qc.translation_embedding_wait_budget_ms_source_in_index 379 qc.translation_embedding_wait_budget_ms_source_in_index
468 - if source_in_index_for_budget 380 + if source_in_target_languages
469 else qc.translation_embedding_wait_budget_ms_source_not_in_index 381 else qc.translation_embedding_wait_budget_ms_source_not_in_index
470 ) 382 )
471 budget_sec = max(0.0, float(budget_ms) / 1000.0) 383 budget_sec = max(0.0, float(budget_ms) / 1000.0)
472 384
473 - if translation_futures: 385 + if translation_targets:
474 log_info( 386 log_info(
475 f"Translation+embedding shared wait budget | budget_ms={budget_ms} | " 387 f"Translation+embedding shared wait budget | budget_ms={budget_ms} | "
476 - f"source_in_index_languages={source_in_index_for_budget} | "  
477 - f"translation_targets={list(translation_futures.keys())}" 388 + f"source_in_target_languages={source_in_target_languages} | "
  389 + f"translation_targets={translation_targets}"
478 ) 390 )
479 391
480 - if translation_futures or embedding_future: 392 + if future_to_task:
481 log_debug( 393 log_debug(
482 f"Waiting for async tasks (translation+embedding) | budget_ms={budget_ms} | " 394 f"Waiting for async tasks (translation+embedding) | budget_ms={budget_ms} | "
483 - f"source_in_index_languages={source_in_index_for_budget}" 395 + f"source_in_target_languages={source_in_target_languages}"
484 ) 396 )
485 397
486 - all_futures: List[Any] = []  
487 - future_to_lang: Dict[Any, tuple] = {}  
488 - for lang, future in translation_futures.items():  
489 - all_futures.append(future)  
490 - future_to_lang[future] = ("translation", lang)  
491 -  
492 - if embedding_future:  
493 - all_futures.append(embedding_future)  
494 - future_to_lang[embedding_future] = ("embedding", None)  
495 -  
496 - done, not_done = wait(all_futures, timeout=budget_sec) 398 + done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec)
497 for future in done: 399 for future in done:
498 - task_type, lang = future_to_lang[future] 400 + task_type, lang = future_to_task[future]
499 try: 401 try:
500 result = future.result() 402 result = future.result()
501 if task_type == "translation": 403 if task_type == "translation":
@@ -528,7 +430,7 @@ class QueryParser: @@ -528,7 +430,7 @@ class QueryParser:
528 430
529 if not_done: 431 if not_done:
530 for future in not_done: 432 for future in not_done:
531 - task_type, lang = future_to_lang[future] 433 + task_type, lang = future_to_task[future]
532 if task_type == "translation": 434 if task_type == "translation":
533 timeout_msg = ( 435 timeout_msg = (
534 f"Translation timeout (>{budget_ms}ms) | Language: {lang} | " 436 f"Translation timeout (>{budget_ms}ms) | Language: {lang} | "
@@ -542,68 +444,21 @@ class QueryParser: @@ -542,68 +444,21 @@ class QueryParser:
542 if context: 444 if context:
543 context.add_warning(timeout_msg) 445 context.add_warning(timeout_msg)
544 446
545 - if encoding_executor:  
546 - encoding_executor.shutdown(wait=False)  
547 - if translation_executor:  
548 - translation_executor.shutdown(wait=False) 447 + if async_executor:
  448 + async_executor.shutdown(wait=False)
549 449
550 if translations and context: 450 if translations and context:
551 context.store_intermediate_result("translations", translations) 451 context.store_intermediate_result("translations", translations)
552 -  
553 - # Build language-scoped query plan: source language + available translations  
554 - query_text_by_lang: Dict[str, str] = {}  
555 - if query_text:  
556 - query_text_by_lang[detected_lang] = query_text  
557 - for lang, translated_text in (translations or {}).items():  
558 - if translated_text and str(translated_text).strip():  
559 - query_text_by_lang[str(lang).strip().lower()] = str(translated_text)  
560 -  
561 - supplemental_search_langs = self._infer_supplemental_search_langs(  
562 - query_text=query_text,  
563 - detected_lang=detected_lang,  
564 - index_langs=index_langs,  
565 - )  
566 - for lang in supplemental_search_langs:  
567 - if lang not in query_text_by_lang and query_text:  
568 - # Use the original mixed-script query as a robust fallback probe for that language field set.  
569 - query_text_by_lang[lang] = query_text  
570 -  
571 - source_in_index_languages = detected_norm in index_langs  
572 - ordered_search_langs: List[str] = []  
573 - seen_order = set()  
574 - if detected_lang in query_text_by_lang:  
575 - ordered_search_langs.append(detected_lang)  
576 - seen_order.add(detected_lang)  
577 - for lang in index_langs:  
578 - if lang in query_text_by_lang and lang not in seen_order:  
579 - ordered_search_langs.append(lang)  
580 - seen_order.add(lang)  
581 - for lang in query_text_by_lang.keys():  
582 - if lang not in seen_order:  
583 - ordered_search_langs.append(lang)  
584 - seen_order.add(lang)  
585 -  
586 - if context:  
587 - context.store_intermediate_result("search_langs", ordered_search_langs)  
588 - context.store_intermediate_result("query_text_by_lang", query_text_by_lang)  
589 - context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs)  
590 452
591 # Build result 453 # Build result
592 result = ParsedQuery( 454 result = ParsedQuery(
593 original_query=query, 455 original_query=query,
594 query_normalized=normalized, 456 query_normalized=normalized,
595 - rewritten_query=rewritten, 457 + rewritten_query=query_text,
596 detected_language=detected_lang, 458 detected_language=detected_lang,
597 translations=translations, 459 translations=translations,
598 query_vector=query_vector, 460 query_vector=query_vector,
599 - domain=domain,  
600 - keywords=keywords,  
601 - token_count=token_count,  
602 query_tokens=query_tokens, 461 query_tokens=query_tokens,
603 - query_text_by_lang=query_text_by_lang,  
604 - search_langs=ordered_search_langs,  
605 - index_languages=index_langs,  
606 - source_in_index_languages=source_in_index_languages,  
607 contains_chinese=contains_chinese, 462 contains_chinese=contains_chinese,
608 contains_english=contains_english, 463 contains_english=contains_english,
609 ) 464 )
@@ -611,14 +466,13 @@ class QueryParser: @@ -611,14 +466,13 @@ class QueryParser:
611 if context and hasattr(context, 'logger'): 466 if context and hasattr(context, 'logger'):
612 context.logger.info( 467 context.logger.info(
613 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " 468 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | "
614 - f"Language: {detected_lang} | Domain: {domain} | "  
615 f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}", 469 f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}",
616 extra={'reqid': context.reqid, 'uid': context.uid} 470 extra={'reqid': context.reqid, 'uid': context.uid}
617 ) 471 )
618 else: 472 else:
619 logger.info( 473 logger.info(
620 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " 474 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | "
621 - f"Language: {detected_lang} | Domain: {domain}" 475 + f"Language: {detected_lang}"
622 ) 476 )
623 477
624 return result 478 return result
requirements_hanlp.txt 0 → 100644
@@ -0,0 +1,13 @@ @@ -0,0 +1,13 @@
  1 +# Optional: HanLP query tokenization for the main backend venv (QueryParser).
  2 +#
  3 +# Install:
  4 +# source activate.sh
  5 +# pip install -r requirements_hanlp.txt
  6 +#
  7 +# Why pin transformers<5:
  8 +# transformers 5.x no longer exposes `encode_plus` on `BertTokenizer`, but HanLP 2.1.x
  9 +# still calls it → AttributeError during `hanlp.load(...)`.
  10 +# Use transformers 4.44+ (4.x) which remains API-compatible with HanLP.
  11 +
  12 +hanlp>=2.1.0
  13 +transformers>=4.44,<5
scripts/eval_search_quality.py
@@ -83,7 +83,6 @@ class RankedItem: @@ -83,7 +83,6 @@ class RankedItem:
83 text_score: float | None 83 text_score: float | None
84 text_source_score: float | None 84 text_source_score: float | None
85 text_translation_score: float | None 85 text_translation_score: float | None
86 - text_fallback_score: float | None  
87 text_primary_score: float | None 86 text_primary_score: float | None
88 text_support_score: float | None 87 text_support_score: float | None
89 knn_score: float | None 88 knn_score: float | None
@@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -&gt; Dict[str, Any]: @@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -&gt; Dict[str, Any]:
146 text_score=_to_float(debug_item.get("text_score")), 145 text_score=_to_float(debug_item.get("text_score")),
147 text_source_score=_to_float(debug_item.get("text_source_score")), 146 text_source_score=_to_float(debug_item.get("text_source_score")),
148 text_translation_score=_to_float(debug_item.get("text_translation_score")), 147 text_translation_score=_to_float(debug_item.get("text_translation_score")),
149 - text_fallback_score=_to_float(debug_item.get("text_fallback_score")),  
150 text_primary_score=_to_float(debug_item.get("text_primary_score")), 148 text_primary_score=_to_float(debug_item.get("text_primary_score")),
151 text_support_score=_to_float(debug_item.get("text_support_score")), 149 text_support_score=_to_float(debug_item.get("text_support_score")),
152 knn_score=_to_float(debug_item.get("knn_score")), 150 knn_score=_to_float(debug_item.get("knn_score")),
@@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -&gt; str: @@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -&gt; str:
185 f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}" 183 f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}"
186 ) 184 )
187 lines.append( 185 lines.append(
188 - f"- detected_language={qa.get('detected_language')} search_langs={qa.get('search_langs')} supplemental_search_langs={qa.get('supplemental_search_langs')}" 186 + f"- detected_language={qa.get('detected_language')} translations={qa.get('translations')}"
189 ) 187 )
190 - lines.append(f"- query_text_by_lang={qa.get('query_text_by_lang')}")  
191 lines.append("") 188 lines.append("")
192 - lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | text_fb | knn | es | matched_queries |")  
193 - lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |") 189 + lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | knn | es | matched_queries |")
  190 + lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |")
194 for item in entry.get("top20", []): 191 for item in entry.get("top20", []):
195 title = str(item.get("title", "")).replace("|", "/") 192 title = str(item.get("title", "")).replace("|", "/")
196 matched = json.dumps(item.get("matched_queries"), ensure_ascii=False) 193 matched = json.dumps(item.get("matched_queries"), ensure_ascii=False)
@@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -&gt; str: @@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -&gt; str:
199 f"| {item.get('rank')} | {item.get('spu_id')} | {title} | " 196 f"| {item.get('rank')} | {item.get('spu_id')} | {title} | "
200 f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | " 197 f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | "
201 f"{item.get('text_source_score')} | {item.get('text_translation_score')} | " 198 f"{item.get('text_source_score')} | {item.get('text_translation_score')} | "
202 - f"{item.get('text_fallback_score')} | {item.get('knn_score')} | {item.get('es_score')} | {matched} |" 199 + f"{item.get('knn_score')} | {item.get('es_score')} | {matched} |"
203 ) 200 )
204 lines.append("") 201 lines.append("")
205 return "\n".join(lines) 202 return "\n".join(lines)
search/es_query_builder.py
@@ -36,11 +36,12 @@ class ESQueryBuilder: @@ -36,11 +36,12 @@ class ESQueryBuilder:
36 base_minimum_should_match: str = "70%", 36 base_minimum_should_match: str = "70%",
37 translation_minimum_should_match: str = "70%", 37 translation_minimum_should_match: str = "70%",
38 translation_boost: float = 0.4, 38 translation_boost: float = 0.4,
39 - translation_boost_when_source_missing: float = 1.0,  
40 - source_boost_when_missing: float = 0.6,  
41 - original_query_fallback_boost_when_translation_missing: float = 0.2,  
42 tie_breaker_base_query: float = 0.9, 39 tie_breaker_base_query: float = 0.9,
43 mixed_script_merged_field_boost_scale: float = 0.6, 40 mixed_script_merged_field_boost_scale: float = 0.6,
  41 + phrase_match_base_fields: Optional[Tuple[str, ...]] = None,
  42 + phrase_match_slop: int = 2,
  43 + phrase_match_tie_breaker: float = 0.4,
  44 + phrase_match_boost: float = 3.0,
44 ): 45 ):
45 """ 46 """
46 Initialize query builder. 47 Initialize query builder.
@@ -74,13 +75,12 @@ class ESQueryBuilder: @@ -74,13 +75,12 @@ class ESQueryBuilder:
74 self.base_minimum_should_match = base_minimum_should_match 75 self.base_minimum_should_match = base_minimum_should_match
75 self.translation_minimum_should_match = translation_minimum_should_match 76 self.translation_minimum_should_match = translation_minimum_should_match
76 self.translation_boost = float(translation_boost) 77 self.translation_boost = float(translation_boost)
77 - self.translation_boost_when_source_missing = float(translation_boost_when_source_missing)  
78 - self.source_boost_when_missing = float(source_boost_when_missing)  
79 - self.original_query_fallback_boost_when_translation_missing = float(  
80 - original_query_fallback_boost_when_translation_missing  
81 - )  
82 self.tie_breaker_base_query = float(tie_breaker_base_query) 78 self.tie_breaker_base_query = float(tie_breaker_base_query)
83 self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) 79 self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale)
  80 + self.phrase_match_base_fields = tuple(phrase_match_base_fields or ("title", "qanchors"))
  81 + self.phrase_match_slop = int(phrase_match_slop)
  82 + self.phrase_match_tie_breaker = float(phrase_match_tie_breaker)
  83 + self.phrase_match_boost = float(phrase_match_boost)
84 84
85 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: 85 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None:
86 """ 86 """
@@ -159,7 +159,8 @@ class ESQueryBuilder: @@ -159,7 +159,8 @@ class ESQueryBuilder:
159 knn_k: int = 50, 159 knn_k: int = 50,
160 knn_num_candidates: int = 200, 160 knn_num_candidates: int = 200,
161 min_score: Optional[float] = None, 161 min_score: Optional[float] = None,
162 - parsed_query: Optional[Any] = None 162 + parsed_query: Optional[Any] = None,
  163 + index_languages: Optional[List[str]] = None,
163 ) -> Dict[str, Any]: 164 ) -> Dict[str, Any]:
164 """ 165 """
165 Build complete ES query with post_filter support for multi-select faceting. 166 Build complete ES query with post_filter support for multi-select faceting.
@@ -167,7 +168,7 @@ class ESQueryBuilder: @@ -167,7 +168,7 @@ class ESQueryBuilder:
167 结构:filters and (text_recall or embedding_recall) + post_filter 168 结构:filters and (text_recall or embedding_recall) + post_filter
168 - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合) 169 - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合)
169 - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合) 170 - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合)
170 - - text_recall: 文本相关性召回(按 search_langs 动态语言字段) 171 + - text_recall: 文本相关性召回(按实际 clause 语言动态字段)
171 - embedding_recall: 向量召回(KNN) 172 - embedding_recall: 向量召回(KNN)
172 - function_score: 包装召回部分,支持提权字段 173 - function_score: 包装召回部分,支持提权字段
173 174
@@ -202,7 +203,11 @@ class ESQueryBuilder: @@ -202,7 +203,11 @@ class ESQueryBuilder:
202 # Text recall (always include if query_text exists) 203 # Text recall (always include if query_text exists)
203 if query_text: 204 if query_text:
204 # Unified text query strategy 205 # Unified text query strategy
205 - text_query = self._build_advanced_text_query(query_text, parsed_query) 206 + text_query = self._build_advanced_text_query(
  207 + query_text,
  208 + parsed_query,
  209 + index_languages=index_languages,
  210 + )
206 recall_clauses.append(text_query) 211 recall_clauses.append(text_query)
207 212
208 # Embedding recall (KNN - separate from query, handled below) 213 # Embedding recall (KNN - separate from query, handled below)
@@ -456,6 +461,44 @@ class ESQueryBuilder: @@ -456,6 +461,44 @@ class ESQueryBuilder:
456 """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``.""" 461 """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``."""
457 return [self._format_field_with_boost(path, boost) for path, boost in specs] 462 return [self._format_field_with_boost(path, boost) for path, boost in specs]
458 463
  464 + def _build_phrase_match_fields(self, language: str) -> List[str]:
  465 + """Fields for phrase multi_match: base names × ``.{lang}`` with ``field_boosts``."""
  466 + lang = (language or "").strip().lower()
  467 + if not lang:
  468 + return []
  469 + out: List[str] = []
  470 + for base in self.phrase_match_base_fields:
  471 + path = f"{base}.{lang}"
  472 + boost = self._get_field_boost(base, lang)
  473 + out.append(self._format_field_with_boost(path, boost))
  474 + return out
  475 +
  476 + def _append_phrase_should_clause(
  477 + self,
  478 + should_clauses: List[Dict[str, Any]],
  479 + lang: str,
  480 + lang_query: str,
  481 + clause_name: str
  482 + ) -> None:
  483 + text = (lang_query or "").strip()
  484 + if not text:
  485 + return
  486 + phrase_fields = self._build_phrase_match_fields(lang)
  487 + if not phrase_fields:
  488 + return
  489 + boost = self.phrase_match_boost
  490 + should_clauses.append({
  491 + "multi_match": {
  492 + "_name": f"{clause_name}_phrase",
  493 + "query": lang_query,
  494 + "type": "phrase",
  495 + "fields": phrase_fields,
  496 + "slop": self.phrase_match_slop,
  497 + "tie_breaker": self.phrase_match_tie_breaker,
  498 + "boost": boost,
  499 + }
  500 + })
  501 +
459 def _merge_supplemental_lang_field_specs( 502 def _merge_supplemental_lang_field_specs(
460 self, 503 self,
461 specs: List[MatchFieldSpec], 504 specs: List[MatchFieldSpec],
@@ -479,6 +522,7 @@ class ESQueryBuilder: @@ -479,6 +522,7 @@ class ESQueryBuilder:
479 contains_chinese: bool, 522 contains_chinese: bool,
480 contains_english: bool, 523 contains_english: bool,
481 index_languages: List[str], 524 index_languages: List[str],
  525 + is_source: bool = False
482 ) -> List[MatchFieldSpec]: 526 ) -> List[MatchFieldSpec]:
483 """ 527 """
484 When the query mixes scripts, widen each clause to indexed fields for the other script 528 When the query mixes scripts, widen each clause to indexed fields for the other script
@@ -492,10 +536,11 @@ class ESQueryBuilder: @@ -492,10 +536,11 @@ class ESQueryBuilder:
492 536
493 out = list(specs) 537 out = list(specs)
494 lnorm = (lang or "").strip().lower() 538 lnorm = (lang or "").strip().lower()
495 - if contains_english and lnorm != "en" and can_use("en"):  
496 - out = self._merge_supplemental_lang_field_specs(out, "en")  
497 - if contains_chinese and lnorm != "zh" and can_use("zh"):  
498 - out = self._merge_supplemental_lang_field_specs(out, "zh") 539 + if is_source:
  540 + if contains_english and lnorm != "en" and can_use("en"):
  541 + out = self._merge_supplemental_lang_field_specs(out, "en")
  542 + if contains_chinese and lnorm != "zh" and can_use("zh"):
  543 + out = self._merge_supplemental_lang_field_specs(out, "zh")
499 return out 544 return out
500 545
501 def _get_embedding_field(self, language: str) -> str: 546 def _get_embedding_field(self, language: str) -> str:
@@ -503,13 +548,31 @@ class ESQueryBuilder: @@ -503,13 +548,31 @@ class ESQueryBuilder:
503 # Currently using unified embedding field 548 # Currently using unified embedding field
504 return self.text_embedding_field or "title_embedding" 549 return self.text_embedding_field or "title_embedding"
505 550
506 - def _build_advanced_text_query(self, query_text: str, parsed_query: Optional[Any] = None) -> Dict[str, Any]: 551 + @staticmethod
  552 + def _normalize_language_list(languages: Optional[List[str]]) -> List[str]:
  553 + normalized: List[str] = []
  554 + seen = set()
  555 + for language in languages or []:
  556 + token = str(language or "").strip().lower()
  557 + if not token or token in seen:
  558 + continue
  559 + seen.add(token)
  560 + normalized.append(token)
  561 + return normalized
  562 +
  563 + def _build_advanced_text_query(
  564 + self,
  565 + query_text: str,
  566 + parsed_query: Optional[Any] = None,
  567 + *,
  568 + index_languages: Optional[List[str]] = None,
  569 + ) -> Dict[str, Any]:
507 """ 570 """
508 - Build advanced text query using should clauses with primary and fallback lexical strategies. 571 + Build advanced text query using base and translated lexical clauses.
509 572
510 Unified implementation: 573 Unified implementation:
511 - base_query: source-language clause 574 - base_query: source-language clause
512 - - translation queries: target-language clauses from search_langs/query_text_by_lang 575 + - translation queries: target-language clauses from translations
513 - KNN query: added separately in build_query 576 - KNN query: added separately in build_query
514 577
515 Args: 578 Args:
@@ -520,66 +583,41 @@ class ESQueryBuilder: @@ -520,66 +583,41 @@ class ESQueryBuilder:
520 ES bool query with should clauses 583 ES bool query with should clauses
521 """ 584 """
522 should_clauses = [] 585 should_clauses = []
523 -  
524 - # Get query analysis from parsed_query  
525 - query_text_by_lang: Dict[str, str] = {}  
526 - search_langs: List[str] = []  
527 source_lang = self.default_language 586 source_lang = self.default_language
528 - source_in_index_languages = True  
529 - index_languages: List[str] = []  
530 - 587 + translations: Dict[str, str] = {}
531 contains_chinese = False 588 contains_chinese = False
532 contains_english = False 589 contains_english = False
  590 + normalized_index_languages = self._normalize_language_list(index_languages)
  591 +
533 if parsed_query: 592 if parsed_query:
534 - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {}  
535 - search_langs = getattr(parsed_query, "search_langs", None) or []  
536 detected_lang = getattr(parsed_query, "detected_language", None) 593 detected_lang = getattr(parsed_query, "detected_language", None)
537 source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language 594 source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language
538 - source_in_index_languages = bool(  
539 - getattr(parsed_query, "source_in_index_languages", True)  
540 - )  
541 - index_languages = getattr(parsed_query, "index_languages", None) or [] 595 + translations = getattr(parsed_query, "translations", None) or {}
542 contains_chinese = bool(getattr(parsed_query, "contains_chinese", False)) 596 contains_chinese = bool(getattr(parsed_query, "contains_chinese", False))
543 contains_english = bool(getattr(parsed_query, "contains_english", False)) 597 contains_english = bool(getattr(parsed_query, "contains_english", False))
544 598
545 - if not query_text_by_lang:  
546 - query_text_by_lang = {source_lang: query_text}  
547 - if source_lang not in query_text_by_lang and query_text:  
548 - query_text_by_lang[source_lang] = query_text  
549 - if not search_langs:  
550 - search_langs = list(query_text_by_lang.keys())  
551 -  
552 - # Base + translated clauses based on language plan.  
553 - for lang in search_langs:  
554 - lang_query = query_text_by_lang.get(lang)  
555 - if not lang_query:  
556 - continue 599 + source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language
  600 + base_query_text = (
  601 + getattr(parsed_query, "rewritten_query", None) if parsed_query else None
  602 + ) or query_text
  603 +
  604 + def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None:
  605 + nonlocal should_clauses
557 all_specs, _ = self._build_match_field_specs(lang) 606 all_specs, _ = self._build_match_field_specs(lang)
558 expanded_specs = self._expand_match_field_specs_for_mixed_script( 607 expanded_specs = self._expand_match_field_specs_for_mixed_script(
559 lang, 608 lang,
560 all_specs, 609 all_specs,
561 contains_chinese, 610 contains_chinese,
562 contains_english, 611 contains_english,
563 - index_languages, 612 + normalized_index_languages,
  613 + is_source,
564 ) 614 )
565 match_fields = self._format_match_field_specs(expanded_specs) 615 match_fields = self._format_match_field_specs(expanded_specs)
566 if not match_fields: 616 if not match_fields:
567 - continue  
568 -  
569 - is_source = (lang == source_lang)  
570 - clause_boost = 1.0  
571 - clause_name = "base_query" if is_source else f"base_query_trans_{lang}" 617 + return
572 minimum_should_match = ( 618 minimum_should_match = (
573 self.base_minimum_should_match if is_source else self.translation_minimum_should_match 619 self.base_minimum_should_match if is_source else self.translation_minimum_should_match
574 ) 620 )
575 - if is_source and not source_in_index_languages:  
576 - clause_boost = self.source_boost_when_missing  
577 - elif not is_source:  
578 - clause_boost = (  
579 - self.translation_boost  
580 - if source_in_index_languages  
581 - else self.translation_boost_when_source_missing  
582 - )  
583 621
584 clause = { 622 clause = {
585 "multi_match": { 623 "multi_match": {
@@ -590,55 +628,34 @@ class ESQueryBuilder: @@ -590,55 +628,34 @@ class ESQueryBuilder:
590 "tie_breaker": self.tie_breaker_base_query, 628 "tie_breaker": self.tie_breaker_base_query,
591 } 629 }
592 } 630 }
593 - if abs(clause_boost - 1.0) > 1e-9:  
594 - clause["multi_match"]["boost"] = clause_boost 631 + # base_query: never set multi_match.boost (ES default 1.0).
  632 + # Translation clauses: single knob from config — translation_boost.
  633 + if not is_source:
  634 + tb = float(self.translation_boost)
  635 + clause["multi_match"]["boost"] = tb
595 should_clauses.append({ 636 should_clauses.append({
596 "multi_match": clause["multi_match"] 637 "multi_match": clause["multi_match"]
597 }) 638 })
  639 + self._append_phrase_should_clause(
  640 + should_clauses, lang, lang_query, clause_name
  641 + )
598 642
599 - # Fallback: source language is not indexed and translation for some index languages is missing.  
600 - # Use original query text on missing index-language fields with a low boost.  
601 - if not source_in_index_languages and query_text and index_languages:  
602 - normalized_index_langs: List[str] = []  
603 - seen_langs = set()  
604 - for lang in index_languages:  
605 - norm_lang = str(lang or "").strip().lower()  
606 - if not norm_lang or norm_lang in seen_langs:  
607 - continue  
608 - seen_langs.add(norm_lang)  
609 - normalized_index_langs.append(norm_lang) 643 + if base_query_text:
  644 + append_clause(source_lang, base_query_text, "base_query", True)
610 645
611 - for lang in normalized_index_langs:  
612 - if lang == source_lang:  
613 - continue  
614 - if lang in query_text_by_lang:  
615 - continue  
616 - fb_specs, _ = self._build_match_field_specs(lang)  
617 - expanded_fb = self._expand_match_field_specs_for_mixed_script(  
618 - lang,  
619 - fb_specs,  
620 - contains_chinese,  
621 - contains_english,  
622 - index_languages,  
623 - )  
624 - match_fields = self._format_match_field_specs(expanded_fb)  
625 - if not match_fields:  
626 - continue  
627 - should_clauses.append({  
628 - "multi_match": {  
629 - "_name": f"fallback_original_query_{lang}",  
630 - "query": query_text,  
631 - "fields": match_fields,  
632 - "minimum_should_match": self.translation_minimum_should_match,  
633 - "tie_breaker": self.tie_breaker_base_query,  
634 - "boost": self.original_query_fallback_boost_when_translation_missing,  
635 - }  
636 - }) 646 + for lang, translated_text in translations.items():
  647 + normalized_lang = str(lang or "").strip().lower()
  648 + normalized_text = str(translated_text or "").strip()
  649 + if not normalized_lang or not normalized_text:
  650 + continue
  651 + if normalized_lang == source_lang and normalized_text == base_query_text:
  652 + continue
  653 + append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False)
637 654
638 # Fallback to a simple query when language fields cannot be resolved. 655 # Fallback to a simple query when language fields cannot be resolved.
639 if not should_clauses: 656 if not should_clauses:
640 fallback_fields = self.match_fields or ["title.en^1.0"] 657 fallback_fields = self.match_fields or ["title.en^1.0"]
641 - return { 658 + fallback_lexical = {
642 "multi_match": { 659 "multi_match": {
643 "_name": "base_query_fallback", 660 "_name": "base_query_fallback",
644 "query": query_text, 661 "query": query_text,
@@ -647,6 +664,21 @@ class ESQueryBuilder: @@ -647,6 +664,21 @@ class ESQueryBuilder:
647 "tie_breaker": self.tie_breaker_base_query, 664 "tie_breaker": self.tie_breaker_base_query,
648 } 665 }
649 } 666 }
  667 + fb_should: List[Dict[str, Any]] = [fallback_lexical]
  668 + self._append_phrase_should_clause(
  669 + fb_should,
  670 + self.default_language,
  671 + query_text,
  672 + "base_query_fallback"
  673 + )
  674 + if len(fb_should) == 1:
  675 + return fallback_lexical
  676 + return {
  677 + "bool": {
  678 + "should": fb_should,
  679 + "minimum_should_match": 1,
  680 + }
  681 + }
650 682
651 # Return bool query with should clauses 683 # Return bool query with should clauses
652 if len(should_clauses) == 1: 684 if len(should_clauses) == 1:
search/rerank_client.py
@@ -116,7 +116,6 @@ def _extract_named_query_score(matched_queries: Any, name: str) -> float: @@ -116,7 +116,6 @@
116 def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]: 116 def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]:
117 source_score = _extract_named_query_score(matched_queries, "base_query") 117 source_score = _extract_named_query_score(matched_queries, "base_query")
118 translation_score = 0.0 118 translation_score = 0.0
119 - fallback_score = 0.0  
120 119
121 if isinstance(matched_queries, dict): 120 if isinstance(matched_queries, dict):
122 for query_name, score in matched_queries.items(): 121 for query_name, score in matched_queries.items():
@@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa @@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa
125 numeric_score = _to_score(score) 124 numeric_score = _to_score(score)
126 if query_name.startswith("base_query_trans_"): 125 if query_name.startswith("base_query_trans_"):
127 translation_score = max(translation_score, numeric_score) 126 translation_score = max(translation_score, numeric_score)
128 - elif query_name.startswith("fallback_original_query_"):  
129 - fallback_score = max(fallback_score, numeric_score)  
130 elif isinstance(matched_queries, list): 127 elif isinstance(matched_queries, list):
131 for query_name in matched_queries: 128 for query_name in matched_queries:
132 if not isinstance(query_name, str): 129 if not isinstance(query_name, str):
133 continue 130 continue
134 if query_name.startswith("base_query_trans_"): 131 if query_name.startswith("base_query_trans_"):
135 translation_score = 1.0 132 translation_score = 1.0
136 - elif query_name.startswith("fallback_original_query_"):  
137 - fallback_score = 1.0  
138 133
139 weighted_source = source_score 134 weighted_source = source_score
140 weighted_translation = 0.8 * translation_score 135 weighted_translation = 0.8 * translation_score
141 - weighted_fallback = 0.55 * fallback_score  
142 - weighted_components = [weighted_source, weighted_translation, weighted_fallback] 136 + weighted_components = [weighted_source, weighted_translation]
143 primary_text_score = max(weighted_components) 137 primary_text_score = max(weighted_components)
144 support_text_score = sum(weighted_components) - primary_text_score 138 support_text_score = sum(weighted_components) - primary_text_score
145 text_score = primary_text_score + 0.25 * support_text_score 139 text_score = primary_text_score + 0.25 * support_text_score
@@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa @@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa
153 return { 147 return {
154 "source_score": source_score, 148 "source_score": source_score,
155 "translation_score": translation_score, 149 "translation_score": translation_score,
156 - "fallback_score": fallback_score,  
157 "weighted_source_score": weighted_source, 150 "weighted_source_score": weighted_source,
158 "weighted_translation_score": weighted_translation, 151 "weighted_translation_score": weighted_translation,
159 - "weighted_fallback_score": weighted_fallback,  
160 "primary_text_score": primary_text_score, 152 "primary_text_score": primary_text_score,
161 "support_text_score": support_text_score, 153 "support_text_score": support_text_score,
162 "text_score": text_score, 154 "text_score": text_score,
@@ -219,7 +211,6 @@ def fuse_scores_and_resort( @@ -219,7 +211,6 @@ def fuse_scores_and_resort(
219 hit["_knn_score"] = knn_score 211 hit["_knn_score"] = knn_score
220 hit["_text_source_score"] = text_components["source_score"] 212 hit["_text_source_score"] = text_components["source_score"]
221 hit["_text_translation_score"] = text_components["translation_score"] 213 hit["_text_translation_score"] = text_components["translation_score"]
222 - hit["_text_fallback_score"] = text_components["fallback_score"]  
223 hit["_text_primary_score"] = text_components["primary_text_score"] 214 hit["_text_primary_score"] = text_components["primary_text_score"]
224 hit["_text_support_score"] = text_components["support_text_score"] 215 hit["_text_support_score"] = text_components["support_text_score"]
225 hit["_fused_score"] = fused 216 hit["_fused_score"] = fused
@@ -231,7 +222,6 @@ def fuse_scores_and_resort( @@ -231,7 +222,6 @@ def fuse_scores_and_resort(
231 "text_score": text_score, 222 "text_score": text_score,
232 "text_source_score": text_components["source_score"], 223 "text_source_score": text_components["source_score"],
233 "text_translation_score": text_components["translation_score"], 224 "text_translation_score": text_components["translation_score"],
234 - "text_fallback_score": text_components["fallback_score"],  
235 "text_primary_score": text_components["primary_text_score"], 225 "text_primary_score": text_components["primary_text_score"],
236 "text_support_score": text_components["support_text_score"], 226 "text_support_score": text_components["support_text_score"],
237 "knn_score": knn_score, 227 "knn_score": knn_score,
search/searcher.py
@@ -132,11 +132,6 @@ class Searcher: @@ -132,11 +132,6 @@ class Searcher:
132 base_minimum_should_match=self.config.query_config.base_minimum_should_match, 132 base_minimum_should_match=self.config.query_config.base_minimum_should_match,
133 translation_minimum_should_match=self.config.query_config.translation_minimum_should_match, 133 translation_minimum_should_match=self.config.query_config.translation_minimum_should_match,
134 translation_boost=self.config.query_config.translation_boost, 134 translation_boost=self.config.query_config.translation_boost,
135 - translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing,  
136 - source_boost_when_missing=self.config.query_config.source_boost_when_missing,  
137 - original_query_fallback_boost_when_translation_missing=(  
138 - self.config.query_config.original_query_fallback_boost_when_translation_missing  
139 - ),  
140 tie_breaker_base_query=self.config.query_config.tie_breaker_base_query, 135 tie_breaker_base_query=self.config.query_config.tie_breaker_base_query,
141 ) 136 )
142 137
@@ -267,13 +262,6 @@ class Searcher: @@ -267,13 +262,6 @@ class Searcher:
267 if normalized: 262 if normalized:
268 candidates.append(normalized) 263 candidates.append(normalized)
269 264
270 - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", {}) or {}  
271 - if isinstance(query_text_by_lang, dict):  
272 - for text in query_text_by_lang.values():  
273 - normalized = self._normalize_sku_match_text(text)  
274 - if normalized:  
275 - candidates.append(normalized)  
276 -  
277 translations = getattr(parsed_query, "translations", {}) or {} 265 translations = getattr(parsed_query, "translations", {}) or {}
278 if isinstance(translations, dict): 266 if isinstance(translations, dict):
279 for text in translations.values(): 267 for text in translations.values():
@@ -516,10 +504,19 @@ class Searcher: @@ -516,10 +504,19 @@ class Searcher:
516 range_filters: Range filters for numeric fields 504 range_filters: Range filters for numeric fields
517 facets: Facet configurations for faceted search 505 facets: Facet configurations for faceted search
518 min_score: Minimum score threshold 506 min_score: Minimum score threshold
519 - context: Request context for tracking (created if not provided) 507 + context: Request context for tracking (required)
520 sort_by: Field name for sorting 508 sort_by: Field name for sorting
521 sort_order: Sort order: 'asc' or 'desc' 509 sort_order: Sort order: 'asc' or 'desc'
522 debug: Enable debug information output 510 debug: Enable debug information output
  511 + language: Response / field selection language hint (e.g. zh, en)
  512 + sku_filter_dimension: SKU grouping dimensions for per-SPU variant pick
  513 + enable_rerank: If None, use ``config.rerank.enabled``; if set, overrides
  514 + whether the rerank provider is invoked (subject to rerank window).
  515 + rerank_query_template: Override for rerank query text template; None uses
  516 + ``config.rerank.rerank_query_template`` (e.g. ``"{query}"``).
  517 + rerank_doc_template: Override for per-hit document text passed to rerank;
  518 + None uses ``config.rerank.rerank_doc_template``. Placeholders are
  519 + resolved in ``search/rerank_client.py``.
523 520
524 Returns: 521 Returns:
525 SearchResult object with formatted results 522 SearchResult object with formatted results
@@ -592,7 +589,8 @@ class Searcher: @@ -592,7 +589,8 @@ class Searcher:
592 query, 589 query,
593 tenant_id=tenant_id, 590 tenant_id=tenant_id,
594 generate_vector=enable_embedding, 591 generate_vector=enable_embedding,
595 - context=context 592 + context=context,
  593 + target_languages=index_langs if enable_translation else [],
596 ) 594 )
597 # Store query analysis results in context 595 # Store query analysis results in context
598 context.store_query_analysis( 596 context.store_query_analysis(
@@ -602,7 +600,7 @@ class Searcher: @@ -602,7 +600,7 @@ class Searcher:
602 detected_language=parsed_query.detected_language, 600 detected_language=parsed_query.detected_language,
603 translations=parsed_query.translations, 601 translations=parsed_query.translations,
604 query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, 602 query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None,
605 - domain=parsed_query.domain, 603 + domain="default",
606 is_simple_query=True 604 is_simple_query=True
607 ) 605 )
608 606
@@ -610,7 +608,6 @@ class Searcher: @@ -610,7 +608,6 @@ class Searcher:
610 f"查询解析完成 | 原查询: '{parsed_query.original_query}' | " 608 f"查询解析完成 | 原查询: '{parsed_query.original_query}' | "
611 f"重写后: '{parsed_query.rewritten_query}' | " 609 f"重写后: '{parsed_query.rewritten_query}' | "
612 f"语言: {parsed_query.detected_language} | " 610 f"语言: {parsed_query.detected_language} | "
613 - f"域: {parsed_query.domain} | "  
614 f"向量: {'是' if parsed_query.query_vector is not None else '否'}", 611 f"向量: {'是' if parsed_query.query_vector is not None else '否'}",
615 extra={'reqid': context.reqid, 'uid': context.uid} 612 extra={'reqid': context.reqid, 'uid': context.uid}
616 ) 613 )
@@ -643,7 +640,8 @@ class Searcher: @@ -643,7 +640,8 @@ class Searcher:
643 from_=es_fetch_from, 640 from_=es_fetch_from,
644 enable_knn=enable_embedding and parsed_query.query_vector is not None, 641 enable_knn=enable_embedding and parsed_query.query_vector is not None,
645 min_score=min_score, 642 min_score=min_score,
646 - parsed_query=parsed_query 643 + parsed_query=parsed_query,
  644 + index_languages=index_langs,
647 ) 645 )
648 646
649 # Add facets for faceted search 647 # Add facets for faceted search
@@ -933,7 +931,6 @@ class Searcher: @@ -933,7 +931,6 @@ class Searcher:
933 debug_entry["text_score"] = rerank_debug.get("text_score") 931 debug_entry["text_score"] = rerank_debug.get("text_score")
934 debug_entry["text_source_score"] = rerank_debug.get("text_source_score") 932 debug_entry["text_source_score"] = rerank_debug.get("text_source_score")
935 debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score") 933 debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score")
936 - debug_entry["text_fallback_score"] = rerank_debug.get("text_fallback_score")  
937 debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score") 934 debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score")
938 debug_entry["text_support_score"] = rerank_debug.get("text_support_score") 935 debug_entry["text_support_score"] = rerank_debug.get("text_support_score")
939 debug_entry["knn_score"] = rerank_debug.get("knn_score") 936 debug_entry["knn_score"] = rerank_debug.get("knn_score")
@@ -985,9 +982,6 @@ class Searcher: @@ -985,9 +982,6 @@ class Searcher:
985 "rewritten_query": context.query_analysis.rewritten_query, 982 "rewritten_query": context.query_analysis.rewritten_query,
986 "detected_language": context.query_analysis.detected_language, 983 "detected_language": context.query_analysis.detected_language,
987 "translations": context.query_analysis.translations, 984 "translations": context.query_analysis.translations,
988 - "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}),  
989 - "search_langs": context.get_intermediate_result("search_langs", []),  
990 - "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []),  
991 "has_vector": context.query_analysis.query_vector is not None, 985 "has_vector": context.query_analysis.query_vector is not None,
992 "is_simple_query": context.query_analysis.is_simple_query, 986 "is_simple_query": context.query_analysis.is_simple_query,
993 "domain": context.query_analysis.domain 987 "domain": context.query_analysis.domain
suggestion/builder.py
@@ -147,7 +147,7 @@ class SuggestionIndexBuilder: @@ -147,7 +147,7 @@ class SuggestionIndexBuilder:
147 raw = str(value).strip() 147 raw = str(value).strip()
148 if not raw: 148 if not raw:
149 return [] 149 return []
150 - parts = re.split(r"[,;|/\n\t]+", raw) 150 + parts = re.split(r"[,、,;|/\n\t]+", raw)
151 out = [p.strip() for p in parts if p and p.strip()] 151 out = [p.strip() for p in parts if p and p.strip()]
152 if not out: 152 if not out:
153 return [raw] 153 return [raw]
@@ -162,7 +162,7 @@ class SuggestionIndexBuilder: @@ -162,7 +162,7 @@ class SuggestionIndexBuilder:
162 s = str(raw).strip() 162 s = str(raw).strip()
163 if not s: 163 if not s:
164 return [] 164 return []
165 - parts = re.split(r"[,;|/\n\t]+", s) 165 + parts = re.split(r"[,、,;|/\n\t]+", s)
166 out = [p.strip() for p in parts if p and p.strip()] 166 out = [p.strip() for p in parts if p and p.strip()]
167 return out if out else [s] 167 return out if out else [s]
168 168
tests/test_embedding_pipeline.py
@@ -73,6 +73,10 @@ class _FakeQueryEncoder: @@ -73,6 +73,10 @@ class _FakeQueryEncoder:
73 return np.array([np.array([0.11, 0.22, 0.33], dtype=np.float32) for _ in sentences], dtype=object) 73 return np.array([np.array([0.11, 0.22, 0.33], dtype=np.float32) for _ in sentences], dtype=object)
74 74
75 75
  76 +def _tokenizer(text):
  77 + return str(text).split()
  78 +
  79 +
76 class _FakeEmbeddingCache: 80 class _FakeEmbeddingCache:
77 def __init__(self): 81 def __init__(self):
78 self.store: Dict[str, np.ndarray] = {} 82 self.store: Dict[str, np.ndarray] = {}
@@ -210,6 +214,7 @@ def test_query_parser_generates_query_vector_with_encoder(): @@ -210,6 +214,7 @@ def test_query_parser_generates_query_vector_with_encoder():
210 config=_build_test_config(), 214 config=_build_test_config(),
211 text_encoder=encoder, 215 text_encoder=encoder,
212 translator=_FakeTranslator(), 216 translator=_FakeTranslator(),
  217 + tokenizer=_tokenizer,
213 ) 218 )
214 219
215 parsed = parser.parse("red dress", tenant_id="162", generate_vector=True) 220 parsed = parser.parse("red dress", tenant_id="162", generate_vector=True)
@@ -224,6 +229,7 @@ def test_query_parser_skips_query_vector_when_disabled(): @@ -224,6 +229,7 @@ def test_query_parser_skips_query_vector_when_disabled():
224 config=_build_test_config(), 229 config=_build_test_config(),
225 text_encoder=_FakeQueryEncoder(), 230 text_encoder=_FakeQueryEncoder(),
226 translator=_FakeTranslator(), 231 translator=_FakeTranslator(),
  232 + tokenizer=_tokenizer,
227 ) 233 )
228 234
229 parsed = parser.parse("red dress", tenant_id="162", generate_vector=False) 235 parsed = parser.parse("red dress", tenant_id="162", generate_vector=False)
tests/test_es_query_builder.py
1 from types import SimpleNamespace 1 from types import SimpleNamespace
  2 +from typing import Any, Dict
2 3
3 import numpy as np 4 import numpy as np
4 5
@@ -13,6 +14,21 @@ def _builder() -&gt; ESQueryBuilder: @@ -13,6 +14,21 @@ def _builder() -&gt; ESQueryBuilder:
13 ) 14 )
14 15
15 16
  17 +def _lexical_multi_match_fields(query_root: Dict[str, Any]) -> list:
  18 + """Fields from the non-phrase multi_match (bool.should or single clause)."""
  19 + if "multi_match" in query_root:
  20 + mm = query_root["multi_match"]
  21 + if mm.get("type") == "phrase":
  22 + raise AssertionError("root multi_match is phrase-only")
  23 + return mm["fields"]
  24 + for clause in query_root.get("bool", {}).get("should", []):
  25 + mm = clause.get("multi_match") or {}
  26 + if mm.get("type") == "phrase":
  27 + continue
  28 + return mm["fields"]
  29 + raise AssertionError("no lexical multi_match in query_root")
  30 +
  31 +
16 def test_knn_prefilter_includes_range_filters(): 32 def test_knn_prefilter_includes_range_filters():
17 qb = _builder() 33 qb = _builder()
18 q = qb.build_query( 34 q = qb.build_query(
@@ -65,21 +81,49 @@ def test_knn_prefilter_not_added_without_filters(): @@ -65,21 +81,49 @@ def test_knn_prefilter_not_added_without_filters():
65 assert q["knn"]["_name"] == "knn_query" 81 assert q["knn"]["_name"] == "knn_query"
66 82
67 83
68 -def test_text_query_contains_only_base_translation_and_fallback_named_queries(): 84 +def test_text_query_contains_only_base_and_translation_named_queries():
69 qb = _builder() 85 qb = _builder()
70 parsed_query = SimpleNamespace( 86 parsed_query = SimpleNamespace(
71 - query_text_by_lang={"en": "dress", "zh": "连衣裙"},  
72 - search_langs=["en", "zh"], 87 + rewritten_query="dress",
73 detected_language="en", 88 detected_language="en",
74 - source_in_index_languages=False,  
75 - index_languages=["en", "zh", "fr"], 89 + translations={"en": "dress", "zh": "连衣裙"},
76 ) 90 )
77 91
78 - q = qb.build_query(query_text="dress", parsed_query=parsed_query, enable_knn=False) 92 + q = qb.build_query(
  93 + query_text="dress",
  94 + parsed_query=parsed_query,
  95 + enable_knn=False,
  96 + index_languages=["en", "zh", "fr"],
  97 + )
79 should = q["query"]["bool"]["should"] 98 should = q["query"]["bool"]["should"]
80 names = [clause["multi_match"]["_name"] for clause in should] 99 names = [clause["multi_match"]["_name"] for clause in should]
81 100
82 - assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"] 101 + assert names == [
  102 + "base_query",
  103 + "base_query_phrase",
  104 + "base_query_trans_zh",
  105 + "base_query_trans_zh_phrase",
  106 + ]
  107 +
  108 +
  109 +def test_text_query_skips_duplicate_translation_same_as_base():
  110 + qb = _builder()
  111 + parsed_query = SimpleNamespace(
  112 + rewritten_query="dress",
  113 + detected_language="en",
  114 + translations={"en": "dress"},
  115 + )
  116 +
  117 + q = qb.build_query(
  118 + query_text="dress",
  119 + parsed_query=parsed_query,
  120 + enable_knn=False,
  121 + index_languages=["en", "zh"],
  122 + )
  123 +
  124 + root = q["query"]
  125 + assert root["bool"]["should"][0]["multi_match"]["_name"] == "base_query"
  126 + assert root["bool"]["should"][1]["multi_match"]["_name"] == "base_query_phrase"
83 127
84 128
85 def test_mixed_script_merges_en_fields_into_zh_clause(): 129 def test_mixed_script_merges_en_fields_into_zh_clause():
@@ -91,22 +135,25 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): @@ -91,22 +135,25 @@ def test_mixed_script_merges_en_fields_into_zh_clause():
91 default_language="en", 135 default_language="en",
92 ) 136 )
93 parsed_query = SimpleNamespace( 137 parsed_query = SimpleNamespace(
94 - query_text_by_lang={"zh": "法式 dress"},  
95 - search_langs=["zh"], 138 + rewritten_query="法式 dress",
96 detected_language="zh", 139 detected_language="zh",
97 - source_in_index_languages=True,  
98 - index_languages=["zh", "en"], 140 + translations={},
99 contains_chinese=True, 141 contains_chinese=True,
100 contains_english=True, 142 contains_english=True,
101 ) 143 )
102 - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)  
103 - fields = q["query"]["multi_match"]["fields"] 144 + q = qb.build_query(
  145 + query_text="法式 dress",
  146 + parsed_query=parsed_query,
  147 + enable_knn=False,
  148 + index_languages=["zh", "en"],
  149 + )
  150 + fields = _lexical_multi_match_fields(q["query"])
104 bases = {f.split("^", 1)[0] for f in fields} 151 bases = {f.split("^", 1)[0] for f in fields}
105 assert "title.zh" in bases and "title.en" in bases 152 assert "title.zh" in bases and "title.en" in bases
106 assert "brief.zh" in bases and "brief.en" in bases 153 assert "brief.zh" in bases and "brief.en" in bases
107 - # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8)  
108 - assert "title.en^0.8" in fields  
109 - assert "brief.en^0.8" in fields 154 + # Merged supplemental language fields use boost * 0.6 by default.
  155 + assert "title.en^0.6" in fields
  156 + assert "brief.en^0.6" in fields
110 157
111 158
112 def test_mixed_script_merges_zh_fields_into_en_clause(): 159 def test_mixed_script_merges_zh_fields_into_en_clause():
@@ -118,19 +165,22 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): @@ -118,19 +165,22 @@ def test_mixed_script_merges_zh_fields_into_en_clause():
118 default_language="en", 165 default_language="en",
119 ) 166 )
120 parsed_query = SimpleNamespace( 167 parsed_query = SimpleNamespace(
121 - query_text_by_lang={"en": "red 连衣裙"},  
122 - search_langs=["en"], 168 + rewritten_query="red 连衣裙",
123 detected_language="en", 169 detected_language="en",
124 - source_in_index_languages=True,  
125 - index_languages=["zh", "en"], 170 + translations={},
126 contains_chinese=True, 171 contains_chinese=True,
127 contains_english=True, 172 contains_english=True,
128 ) 173 )
129 - q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False)  
130 - fields = q["query"]["multi_match"]["fields"] 174 + q = qb.build_query(
  175 + query_text="red 连衣裙",
  176 + parsed_query=parsed_query,
  177 + enable_knn=False,
  178 + index_languages=["zh", "en"],
  179 + )
  180 + fields = _lexical_multi_match_fields(q["query"])
131 bases = {f.split("^", 1)[0] for f in fields} 181 bases = {f.split("^", 1)[0] for f in fields}
132 assert "title.en" in bases and "title.zh" in bases 182 assert "title.en" in bases and "title.zh" in bases
133 - assert "title.zh^0.8" in fields 183 + assert "title.zh^0.6" in fields
134 184
135 185
136 def test_mixed_script_merged_fields_scale_configured_boosts(): 186 def test_mixed_script_merged_fields_scale_configured_boosts():
@@ -143,18 +193,21 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): @@ -143,18 +193,21 @@ def test_mixed_script_merged_fields_scale_configured_boosts():
143 default_language="en", 193 default_language="en",
144 ) 194 )
145 parsed_query = SimpleNamespace( 195 parsed_query = SimpleNamespace(
146 - query_text_by_lang={"zh": "法式 dress"},  
147 - search_langs=["zh"], 196 + rewritten_query="法式 dress",
148 detected_language="zh", 197 detected_language="zh",
149 - source_in_index_languages=True,  
150 - index_languages=["zh", "en"], 198 + translations={},
151 contains_chinese=True, 199 contains_chinese=True,
152 contains_english=True, 200 contains_english=True,
153 ) 201 )
154 - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)  
155 - fields = q["query"]["multi_match"]["fields"] 202 + q = qb.build_query(
  203 + query_text="法式 dress",
  204 + parsed_query=parsed_query,
  205 + enable_knn=False,
  206 + index_languages=["zh", "en"],
  207 + )
  208 + fields = _lexical_multi_match_fields(q["query"])
156 assert "title.zh^5.0" in fields 209 assert "title.zh^5.0" in fields
157 - assert "title.en^8.0" in fields # 10.0 * 0.8 210 + assert "title.en^6.0" in fields # 10.0 * 0.6
158 211
159 212
160 def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): 213 def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
@@ -166,16 +219,19 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): @@ -166,16 +219,19 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
166 default_language="zh", 219 default_language="zh",
167 ) 220 )
168 parsed_query = SimpleNamespace( 221 parsed_query = SimpleNamespace(
169 - query_text_by_lang={"zh": "法式 dress"},  
170 - search_langs=["zh"], 222 + rewritten_query="法式 dress",
171 detected_language="zh", 223 detected_language="zh",
172 - source_in_index_languages=True,  
173 - index_languages=["zh"], 224 + translations={},
174 contains_chinese=True, 225 contains_chinese=True,
175 contains_english=True, 226 contains_english=True,
176 ) 227 )
177 - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)  
178 - fields = q["query"]["multi_match"]["fields"] 228 + q = qb.build_query(
  229 + query_text="法式 dress",
  230 + parsed_query=parsed_query,
  231 + enable_knn=False,
  232 + index_languages=["zh"],
  233 + )
  234 + fields = _lexical_multi_match_fields(q["query"])
179 bases = {f.split("^", 1)[0] for f in fields} 235 bases = {f.split("^", 1)[0] for f in fields}
180 assert "title.zh" in bases 236 assert "title.zh" in bases
181 assert "title.en" not in bases 237 assert "title.en" not in bases
tests/test_es_query_builder_text_recall_languages.py 0 → 100644
@@ -0,0 +1,453 @@ @@ -0,0 +1,453 @@
  1 +"""
  2 +ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*.
  3 +
  4 +Covers combinations of query language vs tenant index_languages, translations,
  5 +and mixed Chinese/English queries. Asserts multi_match _name, query text, and
  6 +target language fields (title.{lang}).
  7 +"""
  8 +
  9 +from types import SimpleNamespace
  10 +from typing import Any, Dict, List
  11 +
  12 +import numpy as np
  13 +
  14 +from search.es_query_builder import ESQueryBuilder
  15 +
  16 +
def _builder_multilingual_title_only(
    *,
    default_language: str = "en",
    mixed_script_scale: float = 0.6,
) -> ESQueryBuilder:
    """Build a minimal ESQueryBuilder exposing only ``title.{lang}`` fields.

    Keeping the field set tiny makes field assertions in the tests trivial.
    """
    settings = dict(
        match_fields=["title.en^1.0"],
        multilingual_fields=["title"],
        shared_fields=[],
        text_embedding_field="title_embedding",
        default_language=default_language,
        mixed_script_merged_field_boost_scale=mixed_script_scale,
        function_score_config=None,
    )
    return ESQueryBuilder(**settings)
  32 +
  33 +
  34 +def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]:
  35 + """Navigate bool.must / function_score wrappers to the text recall root."""
  36 + q = es_body.get("query") or {}
  37 + if "bool" in q and "must" in q["bool"] and q["bool"]["must"]:
  38 + q = q["bool"]["must"][0]
  39 + if "function_score" in q:
  40 + q = q["function_score"]["query"]
  41 + return q
  42 +
  43 +
  44 +def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]:
  45 + inner = _unwrap_inner_query(es_body)
  46 + if "multi_match" in inner:
  47 + return [inner["multi_match"]]
  48 + should = (inner.get("bool") or {}).get("should") or []
  49 + return [c["multi_match"] for c in should if "multi_match" in c]
  50 +
  51 +
def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Index the body's multi_match clauses by their ``_name`` label."""
    named: Dict[str, Dict[str, Any]] = {}
    for clause in _extract_multi_match_clauses(es_body):
        label = clause.get("_name")
        if label:
            named[str(label)] = clause
    return named
  60 +
  61 +
  62 +def _with_phrase(lexical_names: set[str]) -> set[str]:
  63 + """Each lexical recall clause has a companion ``*_phrase`` multi_match."""
  64 + return lexical_names | {f"{n}_phrase" for n in lexical_names}
  65 +
  66 +
  67 +def _title_fields(mm: Dict[str, Any]) -> List[str]:
  68 + fields = mm.get("fields") or []
  69 + return [f for f in fields if str(f).startswith("title.")]
  70 +
  71 +
  72 +def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool:
  73 + """True if any field is title.{lang} with optional ^boost suffix."""
  74 + prefix = f"title.{lang}"
  75 + for f in mm.get("fields") or []:
  76 + s = str(f)
  77 + if s == prefix or s.startswith(prefix + "^"):
  78 + return True
  79 + return False
  80 +
  81 +
def _build(
    qb: ESQueryBuilder,
    *,
    query_text: str,
    rewritten: str,
    detected_language: str,
    translations: Dict[str, str],
    index_languages: List[str],
    contains_chinese: bool = False,
    contains_english: bool = False,
) -> Dict[str, Any]:
    """Run ``build_query`` against a minimal parsed-query stub."""
    stub = SimpleNamespace(
        rewritten_query=rewritten,
        detected_language=detected_language,
        translations=dict(translations),  # defensive copy; builder may mutate
        contains_chinese=contains_chinese,
        contains_english=contains_english,
    )
    return qb.build_query(
        query_text=query_text,
        parsed_query=stub,
        enable_knn=False,
        index_languages=index_languages,
    )
  106 +
  107 +
  108 +# --- 检测语言在 index_languages 内:主召回 + 翻译补召回 ---
  109 +
  110 +
def test_zh_query_index_zh_en_includes_base_zh_and_trans_en():
    """zh query, zh+en index: base on title.zh plus an en translation clause."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="连衣裙",
        rewritten="连衣裙",
        detected_language="zh",
        translations={"en": "dress"},
        index_languages=["zh", "en"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_en"})
    base = clauses["base_query"]
    assert base["query"] == "连衣裙"
    assert "title.zh" in _title_fields(base)
    trans = clauses["base_query_trans_en"]
    assert trans["query"] == "dress"
    assert "title.en" in _title_fields(trans)
  127 +
  128 +
def test_en_query_index_zh_en_includes_base_en_and_trans_zh():
    """en query, en+zh index: base on title.en plus a zh translation clause."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_zh"})
    base = clauses["base_query"]
    assert base["query"] == "dress"
    assert "title.en" in _title_fields(base)
    trans = clauses["base_query_trans_zh"]
    assert trans["query"] == "连衣裙"
    assert "title.zh" in _title_fields(trans)
  145 +
  146 +
def test_de_query_index_de_en_fr_includes_base_and_two_translations():
    """de query, de+en+fr index: base clause plus one clause per translation."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="kleid",
        rewritten="kleid",
        detected_language="de",
        translations={"en": "dress", "fr": "robe"},
        index_languages=["de", "en", "fr"],
    )
    clauses = _clauses_index(body)
    expected = {"base_query", "base_query_trans_en", "base_query_trans_fr"}
    assert set(clauses) == _with_phrase(expected)
    assert clauses["base_query"]["query"] == "kleid"
    assert "title.de" in _title_fields(clauses["base_query"])
    assert clauses["base_query_trans_en"]["query"] == "dress"
    assert clauses["base_query_trans_fr"]["query"] == "robe"
  165 +
  166 +
  167 +# --- 检测语言不在 index_languages:仍有 base(弱)+ 翻译(强) ---
  168 +
  169 +
def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
    """Detected language outside index_languages: unboosted base, boosted translations."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="schuh",
        rewritten="schuh",
        detected_language="de",
        translations={"en": "shoe", "zh": "鞋"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    expected = {"base_query", "base_query_trans_en", "base_query_trans_zh"}
    assert set(clauses) == _with_phrase(expected)
    base = clauses["base_query"]
    assert base["query"] == "schuh"
    assert "title.de" in _title_fields(base)
    assert "boost" not in base
    for lang, text in (("en", "shoe"), ("zh", "鞋")):
        trans = clauses[f"base_query_trans_{lang}"]
        assert trans["query"] == text
        assert trans["boost"] == builder.translation_boost
  191 +
  192 +
  193 +# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 ---
  194 +
  195 +
def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
    """Mixed zh/en text: base clause covers both languages; translation stays separate."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="红色 dress",
        rewritten="红色 dress",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_en"})
    base = clauses["base_query"]
    assert base["query"] == "红色 dress"
    assert _has_title_lang(base, "zh")
    assert _has_title_lang(base, "en")
    trans = clauses["base_query_trans_en"]
    assert trans["query"] == "red dress"
    assert _has_title_lang(trans, "en")
  214 +
  215 +
def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
    """Mixed text detected as en: base clause still spans both en and zh fields."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="nike 运动鞋",
        rewritten="nike 运动鞋",
        detected_language="en",
        translations={"zh": "耐克运动鞋"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_zh"})
    base = clauses["base_query"]
    assert base["query"] == "nike 运动鞋"
    assert _has_title_lang(base, "en")
    assert _has_title_lang(base, "zh")
    assert clauses["base_query_trans_zh"]["query"] == "耐克运动鞋"
  233 +
  234 +
def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
    """en not in index_languages: mixed text must not widen base onto title.en."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="法式 dress",
        rewritten="法式 dress",
        detected_language="zh",
        translations={},
        index_languages=["zh"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(body)
    assert set(clauses) == _with_phrase({"base_query"})
    field_bases = {field.split("^", 1)[0] for field in _title_fields(clauses["base_query"])}
    assert field_bases == {"title.zh"}
  251 +
  252 +
  253 +# --- 去重:与 base 同语言同文本的翻译项跳过 ---
  254 +
  255 +
def test_skips_translation_when_same_lang_and_same_text_as_base():
    """A translation equal to the base text in the base language is dropped."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"en": "NIKE", "zh": "耐克"},
        index_languages=["en", "zh"],
    )
    assert set(_clauses_index(body)) == _with_phrase({"base_query", "base_query_trans_zh"})
  268 +
  269 +
def test_keeps_translation_when_same_text_but_different_lang_than_base():
    """Identical text under a different language key still produces a clause."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"zh": "NIKE"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_zh"})
    assert clauses["base_query_trans_zh"]["query"] == "NIKE"
  283 +
  284 +
  285 +# --- 翻译 key 规范化、空翻译跳过 ---
  286 +
  287 +
def test_translation_language_key_is_normalized_case_insensitive():
    """Upper-case translation keys are folded to lower case in clause names."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"ZH": "连衣裙"},
        index_languages=["en", "zh"],
    )
    trans = _clauses_index(body).get("base_query_trans_zh")
    assert trans is not None
    assert trans["query"] == "连衣裙"
  301 +
  302 +
def test_empty_translation_value_is_skipped():
    """Whitespace-only translations generate no clause; real ones still do."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": " ", "fr": "robe"},
        index_languages=["en", "zh", "fr"],
    )
    clauses = _clauses_index(body)
    assert "base_query_trans_zh" not in clauses
    assert "base_query_trans_fr" in clauses
  316 +
  317 +
  318 +# --- index_languages 为空:视为「未约束」source_in_index 为 True ---
  319 +
  320 +
def test_empty_index_languages_treats_source_as_in_index_boosts():
    """Empty index_languages counts as unconstrained: base unboosted, trans boosted."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="x",
        rewritten="x",
        detected_language="de",
        translations={"en": "y"},
        index_languages=[],
    )
    clauses = _clauses_index(body)
    assert "boost" not in clauses["base_query"]
    assert clauses["base_query_trans_en"]["boost"] == builder.translation_boost
    # Both phrase companions carry the phrase-match boost.
    assert clauses["base_query_phrase"]["boost"] == builder.phrase_match_boost
    assert clauses["base_query_trans_en_phrase"]["boost"] == builder.phrase_match_boost
  336 +
  337 +
  338 +# --- 无翻译:仅 base_query ---
  339 +
  340 +
def test_no_translations_only_base_query():
    """Without translations the body contains only the base clause pair."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="hello",
        rewritten="hello",
        detected_language="en",
        translations={},
        index_languages=["en", "zh"],
    )
    assert set(_clauses_index(body)) == _with_phrase({"base_query"})
  353 +
  354 +
  355 +# --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) ---
  356 +
  357 +
def test_text_clauses_present_alongside_knn():
    """Top-level knn does not disturb the text-recall clause structure."""
    builder = _builder_multilingual_title_only(default_language="en")
    stub = SimpleNamespace(
        rewritten_query="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        contains_chinese=False,
        contains_english=True,
    )
    body = builder.build_query(
        query_text="dress",
        query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32),
        parsed_query=stub,
        enable_knn=True,
        index_languages=["en", "zh"],
    )
    assert "knn" in body
    assert set(_clauses_index(body)) == _with_phrase({"base_query", "base_query_trans_zh"})
  377 +
  378 +
def test_detected_language_unknown_falls_back_to_default_language():
    """Mirrors QueryConfig.default_language fallback when LanguageDetector fails."""
    builder = _builder_multilingual_title_only(default_language="en")
    stub = SimpleNamespace(
        rewritten_query="shirt",
        detected_language="unknown",
        translations={"zh": "衬衫"},
        contains_chinese=False,
        contains_english=True,
    )
    body = builder.build_query(
        query_text="shirt",
        parsed_query=stub,
        enable_knn=False,
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_zh"})
    base = clauses["base_query"]
    assert base["query"] == "shirt"
    assert _has_title_lang(base, "en")
  399 +
  400 +
def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
    """ru query, ru+en index: base on title.ru plus an en translation clause."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="платье",
        rewritten="платье",
        detected_language="ru",
        translations={"en": "dress"},
        index_languages=["ru", "en"],
    )
    clauses = _clauses_index(body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_en"})
    base = clauses["base_query"]
    assert base["query"] == "платье"
    assert _has_title_lang(base, "ru")
    assert clauses["base_query_trans_en"]["query"] == "dress"
  416 +
  417 +
def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():
    """
    Current behavior: every non-empty translations entry yields a clause;
    index_languages only constrains mixed-script widening, it does not
    filter translation clauses.
    """
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙", "de": "Kleid"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(body)
    de_trans = clauses.get("base_query_trans_de")
    assert de_trans is not None
    assert de_trans["query"] == "Kleid"
    assert _has_title_lang(de_trans, "de")
  436 +
  437 +
def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base():
    """base_query always carries rewritten_query, never just the raw query_text."""
    builder = _builder_multilingual_title_only(default_language="en")
    body = _build(
        builder,
        query_text=" 红色 ",
        rewritten="红色连衣裙",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=False,
    )
    clauses = _clauses_index(body)
    assert clauses["base_query"]["query"] == "红色连衣裙"
    assert clauses["base_query_trans_en"]["query"] == "red dress"
tests/test_query_parser_mixed_language.py
1 -from types import SimpleNamespace  
2 -  
3 from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig 1 from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
4 from query.query_parser import QueryParser 2 from query.query_parser import QueryParser
5 3
@@ -9,6 +7,10 @@ class _DummyTranslator: @@ -9,6 +7,10 @@ class _DummyTranslator:
9 return f"{text}-{target_lang}" 7 return f"{text}-{target_lang}"
10 8
11 9
  10 +def _tokenizer(text):
  11 + return str(text).split()
  12 +
  13 +
12 def test_pure_english_word_token_length_and_script(): 14 def test_pure_english_word_token_length_and_script():
13 assert QueryParser._is_pure_english_word_token("ab") is False 15 assert QueryParser._is_pure_english_word_token("ab") is False
14 assert QueryParser._is_pure_english_word_token("abc") is True 16 assert QueryParser._is_pure_english_word_token("abc") is True
@@ -35,59 +37,57 @@ def _build_config() -&gt; SearchConfig: @@ -35,59 +37,57 @@ def _build_config() -&gt; SearchConfig:
35 37
36 38
37 def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): 39 def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch):
38 - parser = QueryParser(_build_config(), translator=_DummyTranslator()) 40 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
39 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") 41 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh")
40 - monkeypatch.setattr(  
41 - "query.query_parser.get_tenant_config_loader",  
42 - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}),  
43 - raising=False,  
44 - )  
45 42
46 - result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) 43 + result = parser.parse(
  44 + "法式 dress 连衣裙",
  45 + tenant_id="162",
  46 + generate_vector=False,
  47 + target_languages=["zh", "en"],
  48 + )
47 49
48 assert result.detected_language == "zh" 50 assert result.detected_language == "zh"
49 assert result.contains_chinese is True 51 assert result.contains_chinese is True
50 assert result.contains_english is True 52 assert result.contains_english is True
51 - assert "en" in result.search_langs  
52 - # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测)  
53 - assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en"  
54 - assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙" 53 + assert result.translations == {"en": "法式 dress 连衣裙-en"}
  54 + assert result.query_tokens == ["法式", "dress", "连衣裙"]
  55 + assert not hasattr(result, "query_text_by_lang")
  56 + assert not hasattr(result, "search_langs")
55 57
56 58
57 def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): 59 def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
58 - parser = QueryParser(_build_config(), translator=_DummyTranslator()) 60 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
59 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") 61 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
60 - monkeypatch.setattr(  
61 - "query.query_parser.get_tenant_config_loader",  
62 - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),  
63 - raising=False,  
64 - )  
65 62
66 - result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) 63 + result = parser.parse(
  64 + "red 连衣裙",
  65 + tenant_id="0",
  66 + generate_vector=False,
  67 + target_languages=["en", "zh"],
  68 + )
67 69
68 assert result.detected_language == "en" 70 assert result.detected_language == "en"
69 assert result.contains_chinese is True 71 assert result.contains_chinese is True
70 assert result.contains_english is True 72 assert result.contains_english is True
71 - assert "zh" in result.search_langs  
72 - assert result.query_text_by_lang["zh"] == "red 连衣裙-zh"  
73 - assert result.query_text_by_lang["en"] == "red 连衣裙" 73 + assert result.translations == {"zh": "red 连衣裙-zh"}
  74 + assert result.query_tokens == ["red", "连衣裙"]
74 75
75 76
76 def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch): 77 def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
77 """en 在 index_languages 内时仍应等待并采纳 en->zh 翻译结果(与向量共用预算)。""" 78 """en 在 index_languages 内时仍应等待并采纳 en->zh 翻译结果(与向量共用预算)。"""
78 - parser = QueryParser(_build_config(), translator=_DummyTranslator()) 79 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
79 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") 80 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
80 - monkeypatch.setattr(  
81 - "query.query_parser.get_tenant_config_loader",  
82 - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),  
83 - raising=False,  
84 - )  
85 81
86 - result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False) 82 + result = parser.parse(
  83 + "off shoulder top",
  84 + tenant_id="0",
  85 + generate_vector=False,
  86 + target_languages=["en", "zh"],
  87 + )
87 88
88 assert result.detected_language == "en" 89 assert result.detected_language == "en"
89 assert result.contains_chinese is False 90 assert result.contains_chinese is False
90 assert result.contains_english is True 91 assert result.contains_english is True
91 assert result.translations.get("zh") == "off shoulder top-zh" 92 assert result.translations.get("zh") == "off shoulder top-zh"
92 - assert result.query_text_by_lang.get("zh") == "off shoulder top-zh"  
93 - assert result.source_in_index_languages is True 93 + assert not hasattr(result, "source_in_index_languages")
tests/test_rerank_client.py
@@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim @@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
11 "matched_queries": { 11 "matched_queries": {
12 "base_query": 2.4, 12 "base_query": 2.4,
13 "base_query_trans_zh": 1.8, 13 "base_query_trans_zh": 1.8,
14 - "fallback_original_query_zh": 1.2,  
15 "knn_query": 0.8, 14 "knn_query": 0.8,
16 }, 15 },
17 }, 16 },
@@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim @@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
27 26
28 debug = fuse_scores_and_resort(hits, [0.9, 0.7]) 27 debug = fuse_scores_and_resort(hits, [0.9, 0.7])
29 28
30 - expected_text_1 = 2.4 + 0.25 * ((0.8 * 1.8) + (0.55 * 1.2)) 29 + expected_text_1 = 2.4 + 0.25 * (0.8 * 1.8)
31 expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2) 30 expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2)
32 expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2) 31 expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2)
33 32
@@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim @@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
38 assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9) 37 assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9)
39 assert debug[0]["text_source_score"] == 2.4 38 assert debug[0]["text_source_score"] == 2.4
40 assert debug[0]["text_translation_score"] == 1.8 39 assert debug[0]["text_translation_score"] == 1.8
41 - assert debug[0]["text_fallback_score"] == 1.2  
42 assert debug[0]["knn_score"] == 0.8 40 assert debug[0]["knn_score"] == 0.8
43 assert [hit["_id"] for hit in hits] == ["2", "1"] 41 assert [hit["_id"] for hit in hits] == ["2", "1"]
44 42
tests/test_search_rerank_window.py
@@ -43,7 +43,14 @@ class _FakeParsedQuery: @@ -43,7 +43,14 @@ class _FakeParsedQuery:
43 43
44 44
45 class _FakeQueryParser: 45 class _FakeQueryParser:
46 - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): 46 + def parse(
  47 + self,
  48 + query: str,
  49 + tenant_id: str,
  50 + generate_vector: bool,
  51 + context: Any,
  52 + target_languages: Any = None,
  53 + ):
47 return _FakeParsedQuery( 54 return _FakeParsedQuery(
48 original_query=query, 55 original_query=query,
49 query_normalized=query, 56 query_normalized=query,
@@ -191,6 +198,66 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path): @@ -191,6 +198,66 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path):
191 "field_boosts": {"title.en": 3.0}, 198 "field_boosts": {"title.en": 3.0},
192 "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}], 199 "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}],
193 "query_config": {"supported_languages": ["en"], "default_language": "en"}, 200 "query_config": {"supported_languages": ["en"], "default_language": "en"},
  201 + "services": {
  202 + "translation": {
  203 + "service_url": "http://localhost:6005",
  204 + "timeout_sec": 3.0,
  205 + "default_model": "dummy-model",
  206 + "default_scene": "general",
  207 + "cache": {
  208 + "ttl_seconds": 60,
  209 + "sliding_expiration": True,
  210 + },
  211 + "capabilities": {
  212 + "dummy-model": {
  213 + "enabled": True,
  214 + "backend": "llm",
  215 + "use_cache": True,
  216 + "model": "dummy-model",
  217 + "base_url": "http://localhost:6005/v1",
  218 + "timeout_sec": 3.0,
  219 + }
  220 + },
  221 + },
  222 + "embedding": {
  223 + "provider": "http",
  224 + "providers": {
  225 + "http": {
  226 + "text_base_url": "http://localhost:6005",
  227 + "image_base_url": "http://localhost:6008",
  228 + }
  229 + },
  230 + "backend": "tei",
  231 + "backends": {
  232 + "tei": {
  233 + "base_url": "http://localhost:8080",
  234 + "timeout_sec": 3.0,
  235 + "model_id": "dummy-embedding-model",
  236 + }
  237 + },
  238 + },
  239 + "rerank": {
  240 + "provider": "http",
  241 + "providers": {
  242 + "http": {
  243 + "base_url": "http://localhost:6007",
  244 + "service_url": "http://localhost:6007/rerank",
  245 + }
  246 + },
  247 + "backend": "bge",
  248 + "backends": {
  249 + "bge": {
  250 + "model_name": "dummy-rerank-model",
  251 + "device": "cpu",
  252 + "use_fp16": False,
  253 + "batch_size": 8,
  254 + "max_length": 128,
  255 + "cache_dir": "./model_cache",
  256 + "enable_warmup": False,
  257 + }
  258 + },
  259 + },
  260 + },
194 "spu_config": {"enabled": False}, 261 "spu_config": {"enabled": False},
195 "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []}, 262 "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []},
196 "rerank": {"rerank_window": 384}, 263 "rerank": {"rerank_window": 384},
@@ -354,7 +421,14 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch @@ -354,7 +421,14 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch
354 class _TranslatedQueryParser: 421 class _TranslatedQueryParser:
355 text_encoder = None 422 text_encoder = None
356 423
357 - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): 424 + def parse(
  425 + self,
  426 + query: str,
  427 + tenant_id: str,
  428 + generate_vector: bool,
  429 + context: Any,
  430 + target_languages: Any = None,
  431 + ):
358 return _FakeParsedQuery( 432 return _FakeParsedQuery(
359 original_query=query, 433 original_query=query,
360 query_normalized=query, 434 query_normalized=query,
@@ -407,15 +481,22 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc @@ -407,15 +481,22 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc
407 encoder = _FakeTextEncoder( 481 encoder = _FakeTextEncoder(
408 { 482 {
409 "linen summer dress": [0.8, 0.2], 483 "linen summer dress": [0.8, 0.2],
410 - "color:Red": [1.0, 0.0],  
411 - "color:Blue": [0.0, 1.0], 484 + "color:red": [1.0, 0.0],
  485 + "color:blue": [0.0, 1.0],
412 } 486 }
413 ) 487 )
414 488
415 class _EmbeddingQueryParser: 489 class _EmbeddingQueryParser:
416 text_encoder = encoder 490 text_encoder = encoder
417 491
418 - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): 492 + def parse(
  493 + self,
  494 + query: str,
  495 + tenant_id: str,
  496 + generate_vector: bool,
  497 + context: Any,
  498 + target_languages: Any = None,
  499 + ):
419 return _FakeParsedQuery( 500 return _FakeParsedQuery(
420 original_query=query, 501 original_query=query,
421 query_normalized=query, 502 query_normalized=query,
tests/test_translator_failure_semantics.py
1 import logging 1 import logging
2 2
  3 +import pytest
  4 +
3 from translation.cache import TranslationCache 5 from translation.cache import TranslationCache
4 from translation.logging_utils import ( 6 from translation.logging_utils import (
5 TranslationRequestFilter, 7 TranslationRequestFilter,
@@ -7,6 +9,7 @@ from translation.logging_utils import ( @@ -7,6 +9,7 @@ from translation.logging_utils import (
7 reset_translation_request_id, 9 reset_translation_request_id,
8 ) 10 )
9 from translation.service import TranslationService 11 from translation.service import TranslationService
  12 +from translation.settings import build_translation_config, translation_cache_probe_models
10 13
11 14
12 class _FakeCache: 15 class _FakeCache:
@@ -16,7 +19,8 @@ class _FakeCache: @@ -16,7 +19,8 @@ class _FakeCache:
16 self.get_calls = [] 19 self.get_calls = []
17 self.set_calls = [] 20 self.set_calls = []
18 21
19 - def get(self, *, model, target_lang, source_text): 22 + def get(self, *, model, target_lang, source_text, log_lookup=True):
  23 + del log_lookup
20 self.get_calls.append((model, target_lang, source_text)) 24 self.get_calls.append((model, target_lang, source_text))
21 return self.storage.get((model, target_lang, source_text)) 25 return self.storage.get((model, target_lang, source_text))
22 26
@@ -191,3 +195,262 @@ def test_translation_route_log_focuses_on_routing_decision(monkeypatch, caplog): @@ -191,3 +195,262 @@ def test_translation_route_log_focuses_on_routing_decision(monkeypatch, caplog):
191 assert route_messages == [ 195 assert route_messages == [
192 "Translation route | backend=llm request_type=single use_cache=True cache_available=False" 196 "Translation route | backend=llm request_type=single use_cache=True cache_available=False"
193 ] 197 ]
  198 +
  199 +
  200 +def test_translation_cache_probe_models_order():
  201 + cfg = {"cache": {"model_quality_tiers": {"low": 10, "high": 50, "mid": 30}}}
  202 + assert translation_cache_probe_models(cfg, "low") == ["high", "mid", "low"]
  203 + assert translation_cache_probe_models(cfg, "mid") == ["high", "mid"]
  204 + assert translation_cache_probe_models(cfg, "high") == ["high"]
  205 + assert translation_cache_probe_models(cfg, "unknown") == ["unknown"]
  206 +
  207 +
  208 +def test_translation_cache_probe_models_respects_enable_switch():
  209 + cfg = {
  210 + "cache": {
  211 + "enable_model_quality_tier_cache": False,
  212 + "model_quality_tiers": {"peer-a": 50, "peer-b": 50, "top": 100},
  213 + }
  214 + }
  215 + assert translation_cache_probe_models(cfg, "peer-a") == ["peer-a"]
  216 +
  217 +
  218 +def test_translation_cache_probe_models_same_tier_included():
  219 + """Same numeric tier: all peers are probed (higher tier first, then name order)."""
  220 + cfg = {"cache": {"model_quality_tiers": {"peer-a": 50, "peer-b": 50, "top": 100}}}
  221 + assert translation_cache_probe_models(cfg, "peer-a") == ["top", "peer-a", "peer-b"]
  222 + assert translation_cache_probe_models(cfg, "peer-b") == ["top", "peer-b", "peer-a"]
  223 +
  224 +
  225 +def test_model_quality_tiers_unknown_capability_raises():
  226 + with pytest.raises(ValueError, match="unknown capability"):
  227 + build_translation_config(
  228 + {
  229 + "service_url": "http://127.0.0.1:6006",
  230 + "timeout_sec": 10.0,
  231 + "default_model": "llm",
  232 + "default_scene": "general",
  233 + "cache": {
  234 + "ttl_seconds": 60,
  235 + "sliding_expiration": True,
  236 + "model_quality_tiers": {"ghost": 1},
  237 + },
  238 + "capabilities": {
  239 + "llm": {
  240 + "enabled": True,
  241 + "backend": "llm",
  242 + "model": "dummy-llm",
  243 + "base_url": "https://example.com",
  244 + "timeout_sec": 10.0,
  245 + "use_cache": True,
  246 + }
  247 + },
  248 + }
  249 + )
  250 +
  251 +
  252 +def test_tiered_cache_reuses_higher_tier_entry(monkeypatch):
  253 + monkeypatch.setattr(TranslationCache, "_init_redis_client", staticmethod(lambda: None))
  254 + translate_calls = []
  255 +
  256 + def _fake_create_backend(self, *, name, backend_type, cfg):
  257 + del self, backend_type, cfg
  258 +
  259 + class _Backend:
  260 + model = name
  261 +
  262 + @property
  263 + def supports_batch(self):
  264 + return True
  265 +
  266 + def translate(self, text, target_lang, source_lang=None, scene=None):
  267 + del target_lang, source_lang, scene
  268 + translate_calls.append((name, text))
  269 + if isinstance(text, list):
  270 + return [f"{name}:{item}" for item in text]
  271 + return f"{name}:{text}"
  272 +
  273 + return _Backend()
  274 +
  275 + monkeypatch.setattr(TranslationService, "_create_backend", _fake_create_backend)
  276 + config = {
  277 + "service_url": "http://127.0.0.1:6006",
  278 + "timeout_sec": 10.0,
  279 + "default_model": "opus-mt-zh-en",
  280 + "default_scene": "general",
  281 + "capabilities": {
  282 + "deepl": {
  283 + "enabled": True,
  284 + "backend": "deepl",
  285 + "api_url": "https://api.deepl.com/v2/translate",
  286 + "timeout_sec": 10.0,
  287 + "use_cache": True,
  288 + },
  289 + "opus-mt-zh-en": {
  290 + "enabled": True,
  291 + "backend": "local_marian",
  292 + "model_id": "dummy",
  293 + "model_dir": "dummy",
  294 + "device": "cpu",
  295 + "torch_dtype": "float32",
  296 + "batch_size": 8,
  297 + "max_input_length": 16,
  298 + "max_new_tokens": 16,
  299 + "num_beams": 1,
  300 + "use_cache": True,
  301 + },
  302 + },
  303 + "cache": {
  304 + "ttl_seconds": 60,
  305 + "sliding_expiration": True,
  306 + "model_quality_tiers": {"deepl": 100, "opus-mt-zh-en": 40},
  307 + },
  308 + }
  309 +
  310 + service = TranslationService(config)
  311 + fake_cache = _FakeCache()
  312 + fake_cache.storage[("deepl", "en", "商品标题")] = "from-deepl"
  313 + service._translation_cache = fake_cache
  314 +
  315 + out = service.translate("商品标题", target_lang="en", source_lang="zh", model="opus-mt-zh-en")
  316 + assert out == "from-deepl"
  317 + assert translate_calls == []
  318 + assert fake_cache.get_calls == [("deepl", "en", "商品标题")]
  319 +
  320 +
  321 +def test_tiered_cache_reuses_same_tier_peer(monkeypatch):
  322 + """Model A may use cache written under model B when both share the same tier."""
  323 + monkeypatch.setattr(TranslationCache, "_init_redis_client", staticmethod(lambda: None))
  324 + translate_calls = []
  325 +
  326 + def _fake_create_backend(self, *, name, backend_type, cfg):
  327 + del self, backend_type, cfg
  328 +
  329 + class _Backend:
  330 + model = name
  331 +
  332 + @property
  333 + def supports_batch(self):
  334 + return True
  335 +
  336 + def translate(self, text, target_lang, source_lang=None, scene=None):
  337 + del target_lang, source_lang, scene
  338 + translate_calls.append((name, text))
  339 + if isinstance(text, list):
  340 + return [f"{name}:{item}" for item in text]
  341 + return f"{name}:{text}"
  342 +
  343 + return _Backend()
  344 +
  345 + monkeypatch.setattr(TranslationService, "_create_backend", _fake_create_backend)
  346 + marian_cap = {
  347 + "enabled": True,
  348 + "backend": "local_marian",
  349 + "model_id": "dummy",
  350 + "model_dir": "dummy",
  351 + "device": "cpu",
  352 + "torch_dtype": "float32",
  353 + "batch_size": 8,
  354 + "max_input_length": 16,
  355 + "max_new_tokens": 16,
  356 + "num_beams": 1,
  357 + "use_cache": True,
  358 + }
  359 + config = {
  360 + "service_url": "http://127.0.0.1:6006",
  361 + "timeout_sec": 10.0,
  362 + "default_model": "opus-mt-en-zh",
  363 + "default_scene": "general",
  364 + "capabilities": {
  365 + "opus-mt-zh-en": dict(marian_cap),
  366 + "opus-mt-en-zh": dict(marian_cap),
  367 + },
  368 + "cache": {
  369 + "ttl_seconds": 60,
  370 + "sliding_expiration": True,
  371 + "model_quality_tiers": {"opus-mt-zh-en": 50, "opus-mt-en-zh": 50},
  372 + },
  373 + }
  374 +
  375 + service = TranslationService(config)
  376 + fake_cache = _FakeCache()
  377 + fake_cache.storage[("opus-mt-zh-en", "en", "hello")] = "from-zh-en"
  378 + service._translation_cache = fake_cache
  379 +
  380 + out = service.translate("hello", target_lang="en", source_lang="zh", model="opus-mt-en-zh")
  381 + assert out == "from-zh-en"
  382 + assert translate_calls == []
  383 + assert fake_cache.get_calls == [
  384 + ("opus-mt-en-zh", "en", "hello"),
  385 + ("opus-mt-zh-en", "en", "hello"),
  386 + ]
  387 +
  388 +
  389 +def test_tiered_cache_switch_off_uses_exact_model_only(monkeypatch):
  390 + monkeypatch.setattr(TranslationCache, "_init_redis_client", staticmethod(lambda: None))
  391 + translate_calls = []
  392 +
  393 + def _fake_create_backend(self, *, name, backend_type, cfg):
  394 + del self, backend_type, cfg
  395 +
  396 + class _Backend:
  397 + model = name
  398 +
  399 + @property
  400 + def supports_batch(self):
  401 + return True
  402 +
  403 + def translate(self, text, target_lang, source_lang=None, scene=None):
  404 + del target_lang, source_lang, scene
  405 + translate_calls.append((name, text))
  406 + if isinstance(text, list):
  407 + return [f"{name}:{item}" for item in text]
  408 + return f"{name}:{text}"
  409 +
  410 + return _Backend()
  411 +
  412 + monkeypatch.setattr(TranslationService, "_create_backend", _fake_create_backend)
  413 + config = {
  414 + "service_url": "http://127.0.0.1:6006",
  415 + "timeout_sec": 10.0,
  416 + "default_model": "opus-mt-zh-en",
  417 + "default_scene": "general",
  418 + "capabilities": {
  419 + "deepl": {
  420 + "enabled": True,
  421 + "backend": "deepl",
  422 + "api_url": "https://api.deepl.com/v2/translate",
  423 + "timeout_sec": 10.0,
  424 + "use_cache": True,
  425 + },
  426 + "opus-mt-zh-en": {
  427 + "enabled": True,
  428 + "backend": "local_marian",
  429 + "model_id": "dummy",
  430 + "model_dir": "dummy",
  431 + "device": "cpu",
  432 + "torch_dtype": "float32",
  433 + "batch_size": 8,
  434 + "max_input_length": 16,
  435 + "max_new_tokens": 16,
  436 + "num_beams": 1,
  437 + "use_cache": True,
  438 + },
  439 + },
  440 + "cache": {
  441 + "ttl_seconds": 60,
  442 + "sliding_expiration": True,
  443 + "enable_model_quality_tier_cache": False,
  444 + "model_quality_tiers": {"deepl": 100, "opus-mt-zh-en": 40},
  445 + },
  446 + }
  447 +
  448 + service = TranslationService(config)
  449 + fake_cache = _FakeCache()
  450 + fake_cache.storage[("deepl", "en", "商品标题")] = "from-deepl"
  451 + service._translation_cache = fake_cache
  452 +
  453 + out = service.translate("商品标题", target_lang="en", source_lang="zh", model="opus-mt-zh-en")
  454 + assert out == "opus-mt-zh-en:商品标题"
  455 + assert translate_calls == [("opus-mt-zh-en", "商品标题")]
  456 + assert fake_cache.get_calls == [("opus-mt-zh-en", "en", "商品标题")]
translation/cache.py
@@ -36,7 +36,13 @@ class TranslationCache: @@ -36,7 +36,13 @@ class TranslationCache:
36 digest = hashlib.sha256(text.encode("utf-8")).hexdigest() 36 digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
37 return f"trans:{normalized_model}:{normalized_target_lang}:{text_prefix}{digest}" 37 return f"trans:{normalized_model}:{normalized_target_lang}:{text_prefix}{digest}"
38 38
39 - def get(self, *, model: str, target_lang: str, source_text: str) -> Optional[str]: 39 + def get(
  40 + self,
  41 + *,
  42 + model: str,
  43 + target_lang: str,
  44 + source_text: str
  45 + ) -> Optional[str]:
40 if self.redis_client is None: 46 if self.redis_client is None:
41 return None 47 return None
42 key = self.build_key(model=model, target_lang=target_lang, source_text=source_text) 48 key = self.build_key(model=model, target_lang=target_lang, source_text=source_text)
translation/service.py
@@ -3,7 +3,7 @@ @@ -3,7 +3,7 @@
3 from __future__ import annotations 3 from __future__ import annotations
4 4
5 import logging 5 import logging
6 -from typing import Dict, List, Optional 6 +from typing import Dict, List, Optional, Tuple
7 7
8 from config.loader import get_app_config 8 from config.loader import get_app_config
9 from config.schema import AppConfig 9 from config.schema import AppConfig
@@ -15,6 +15,7 @@ from translation.settings import ( @@ -15,6 +15,7 @@ from translation.settings import (
15 get_translation_capability, 15 get_translation_capability,
16 normalize_translation_model, 16 normalize_translation_model,
17 normalize_translation_scene, 17 normalize_translation_scene,
  18 + translation_cache_probe_models,
18 ) 19 )
19 20
20 logger = logging.getLogger(__name__) 21 logger = logging.getLogger(__name__)
@@ -247,7 +248,11 @@ class TranslationService: @@ -247,7 +248,11 @@ class TranslationService:
247 ) -> Optional[str]: 248 ) -> Optional[str]:
248 if not text.strip(): 249 if not text.strip():
249 return text 250 return text
250 - cached = self._translation_cache.get(model=model, target_lang=target_lang, source_text=text) 251 + cached, _served = self._tiered_cache_get(
  252 + request_model=model,
  253 + target_lang=target_lang,
  254 + source_text=text,
  255 + )
251 if cached is not None: 256 if cached is not None:
252 logger.info( 257 logger.info(
253 "Translation cache served | request_type=single text_len=%s", 258 "Translation cache served | request_type=single text_len=%s",
@@ -279,6 +284,30 @@ class TranslationService: @@ -279,6 +284,30 @@ class TranslationService:
279 ) 284 )
280 return translated 285 return translated
281 286
  287 + def _tiered_cache_get(
  288 + self,
  289 + *,
  290 + request_model: str,
  291 + target_lang: str,
  292 + source_text: str,
  293 + ) -> Tuple[Optional[str], Optional[str]]:
  294 + """Redis lookup: cache from higher-tier or **same-tier** models may satisfy A.
  295 +
  296 + Lower-tier entries are never read. Returns ``(translated, served_model)``.
  297 + """
  298 + probe_models = translation_cache_probe_models(self.config, request_model)
  299 +
  300 + for probe_model in probe_models:
  301 + hit = self._translation_cache.get(
  302 + model=probe_model,
  303 + target_lang=target_lang,
  304 + source_text=source_text,
  305 + )
  306 + if hit is not None:
  307 + return hit, probe_model
  308 +
  309 + return None, None
  310 +
282 def _translate_batch_with_cache( 311 def _translate_batch_with_cache(
283 self, 312 self,
284 *, 313 *,
@@ -300,8 +329,8 @@ class TranslationService: @@ -300,8 +329,8 @@ class TranslationService:
300 if not normalized_text.strip(): 329 if not normalized_text.strip():
301 results[idx] = normalized_text 330 results[idx] = normalized_text
302 continue 331 continue
303 - cached = self._translation_cache.get(  
304 - model=model, 332 + cached, _served = self._tiered_cache_get(
  333 + request_model=model,
305 target_lang=target_lang, 334 target_lang=target_lang,
306 source_text=normalized_text, 335 source_text=normalized_text,
307 ) 336 )
translation/settings.py
@@ -2,7 +2,7 @@ @@ -2,7 +2,7 @@
2 2
3 from __future__ import annotations 3 from __future__ import annotations
4 4
5 -from typing import Any, Dict, List, Mapping, Optional 5 +from typing import Any, Dict, List, Mapping, Optional, Tuple
6 6
7 from translation.scenes import normalize_scene_name 7 from translation.scenes import normalize_scene_name
8 8
@@ -38,6 +38,7 @@ def build_translation_config(raw_cfg: Mapping[str, Any]) -> TranslationConfig: @@ -38,6 +38,7 @@ def build_translation_config(raw_cfg: Mapping[str, Any]) -> TranslationConfig:
38 if not get_enabled_translation_models(config): 38 if not get_enabled_translation_models(config):
39 raise ValueError("At least one translation capability must be enabled") 39 raise ValueError("At least one translation capability must be enabled")
40 40
  41 + _validate_model_quality_tiers(config)
41 return config 42 return config
42 43
43 44
@@ -86,18 +87,107 @@ def get_translation_cache(config: Mapping[str, Any]) -> Dict[str, Any]: @@ -86,18 +87,107 @@ def get_translation_cache(config: Mapping[str, Any]) -> Dict[str, Any]:
86 return dict(cache) 87 return dict(cache)
87 88
88 89
  90 +def translation_cache_probe_models(config: Mapping[str, Any], request_model: str) -> List[str]:
  91 + """Redis cache key models to try.
  92 +
  93 + Sort order: (1) **tier** descending (higher quality first); (2) within the same tier,
  94 + the **request model** before other peers; (3) remaining ties by model name.
  95 +
  96 + For a request to model A with tier T, probes every configured model whose tier is
  97 + **greater than or equal to** T. Lower tiers are never used.
  98 +
  99 + When ``enable_model_quality_tier_cache`` is false, only the request model is probed.
  100 +
  101 + When ``model_quality_tiers`` is empty or ``request_model`` is not listed, only the
  102 + request model is probed (legacy exact-match behavior).
  103 + """
  104 + rm = str(request_model or "").strip().lower()
  105 + cache = config.get("cache")
  106 + if not isinstance(cache, Mapping):
  107 + return [rm]
  108 + if not bool(cache.get("enable_model_quality_tier_cache", True)):
  109 + return [rm]
  110 + tiers = cache.get("model_quality_tiers")
  111 + if not isinstance(tiers, Mapping) or not tiers:
  112 + return [rm]
  113 + if rm not in tiers:
  114 + return [rm]
  115 + threshold = int(tiers[rm])
  116 + scored: List[Tuple[int, str]] = []
  117 + for name, tier_val in tiers.items():
  118 + n = str(name).strip().lower()
  119 + t = int(tier_val)
  120 + if t >= threshold:
  121 + scored.append((t, n))
  122 + scored.sort(
  123 + key=lambda item: (
  124 + -item[0],
  125 + 0 if item[1] == rm else 1,
  126 + item[1],
  127 + )
  128 + )
  129 + out: List[str] = []
  130 + seen: set[str] = set()
  131 + for _t, n in scored:
  132 + if n not in seen:
  133 + seen.add(n)
  134 + out.append(n)
  135 + return out
  136 +
  137 +
89 def _build_cache_config(raw_cache: Any) -> Dict[str, Any]: 138 def _build_cache_config(raw_cache: Any) -> Dict[str, Any]:
90 if not isinstance(raw_cache, Mapping): 139 if not isinstance(raw_cache, Mapping):
91 raise ValueError("services.translation.cache must be a mapping") 140 raise ValueError("services.translation.cache must be a mapping")
  141 + if "enable_model_quality_tier_cache" in raw_cache:
  142 + enable_tier_cache = _require_bool(
  143 + raw_cache["enable_model_quality_tier_cache"],
  144 + "services.translation.cache.enable_model_quality_tier_cache",
  145 + )
  146 + else:
  147 + enable_tier_cache = True
92 return { 148 return {
93 "ttl_seconds": _require_positive_int(raw_cache.get("ttl_seconds"), "services.translation.cache.ttl_seconds"), 149 "ttl_seconds": _require_positive_int(raw_cache.get("ttl_seconds"), "services.translation.cache.ttl_seconds"),
94 "sliding_expiration": _require_bool( 150 "sliding_expiration": _require_bool(
95 raw_cache.get("sliding_expiration"), 151 raw_cache.get("sliding_expiration"),
96 "services.translation.cache.sliding_expiration", 152 "services.translation.cache.sliding_expiration",
97 ), 153 ),
  154 + "enable_model_quality_tier_cache": enable_tier_cache,
  155 + "model_quality_tiers": _build_model_quality_tiers(raw_cache.get("model_quality_tiers")),
98 } 156 }
99 157
100 158
  159 +def _build_model_quality_tiers(raw: Any) -> Dict[str, int]:
  160 + if raw is None:
  161 + return {}
  162 + if not isinstance(raw, Mapping):
  163 + raise ValueError("services.translation.cache.model_quality_tiers must be a mapping")
  164 + resolved: Dict[str, int] = {}
  165 + for name, tier_val in raw.items():
  166 + cap = _require_string(name, "services.translation.cache.model_quality_tiers key").lower()
  167 + field = f"services.translation.cache.model_quality_tiers.{cap}"
  168 + resolved[cap] = _require_non_negative_int(tier_val, field)
  169 + return resolved
  170 +
  171 +
  172 +def _validate_model_quality_tiers(config: TranslationConfig) -> None:
  173 + tiers = config["cache"].get("model_quality_tiers")
  174 + if not isinstance(tiers, Mapping) or not tiers:
  175 + return
  176 + caps = config["capabilities"]
  177 + for name in tiers:
  178 + if name not in caps:
  179 + raise ValueError(
  180 + f"services.translation.cache.model_quality_tiers references unknown capability '{name}'"
  181 + )
  182 +
  183 +
  184 +def _require_non_negative_int(value: Any, field_name: str) -> int:
  185 + parsed = _require_int(value, field_name)
  186 + if parsed < 0:
  187 + raise ValueError(f"{field_name} must be >= 0")
  188 + return parsed
  189 +
  190 +
101 def _build_capabilities(raw_capabilities: Any) -> Dict[str, Dict[str, Any]]: 191 def _build_capabilities(raw_capabilities: Any) -> Dict[str, Dict[str, Any]]:
102 if not isinstance(raw_capabilities, Mapping): 192 if not isinstance(raw_capabilities, Mapping):
103 raise ValueError("services.translation.capabilities must be a mapping") 193 raise ValueError("services.translation.capabilities must be a mapping")