Compare View

switch
from
...
to
 
Commits (6)
api/routes/indexer.py
... ... @@ -449,7 +449,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages:
449 449 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM,
450 450 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。
451 451 """
452   - from indexer.product_enrich import analyze_products
  452 + from indexer.product_enrich import analyze_products, split_multi_value_field
453 453  
454 454 llm_langs = list(dict.fromkeys(languages)) or ["en"]
455 455  
... ... @@ -510,10 +510,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages:
510 510 raw = row.get(name)
511 511 if not raw:
512 512 continue
513   - for part in re.split(r"[,;|/\n\t]+", str(raw)):
514   - value = part.strip()
515   - if not value:
516   - continue
  513 + for value in split_multi_value_field(str(raw)):
517 514 rec["semantic_attributes"].append({"lang": lang, "name": name, "value": value})
518 515 if name == "tags":
519 516 rec["tags"].append(value)
... ...
config/config.yaml
1 1 # Unified Configuration for Multi-Tenant Search Engine
2 2 # 统一配置文件,所有租户共用一套配置
3 3 # 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为
  4 +#
  5 +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项
  6 +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。
  7 +
  8 +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义)
  9 +runtime:
  10 + environment: "prod"
  11 + index_namespace: ""
  12 + api_host: "0.0.0.0"
  13 + api_port: 6002
  14 + indexer_host: "0.0.0.0"
  15 + indexer_port: 6004
  16 + embedding_host: "0.0.0.0"
  17 + embedding_port: 6005
  18 + embedding_text_port: 6005
  19 + embedding_image_port: 6008
  20 + translator_host: "127.0.0.1"
  21 + translator_port: 6006
  22 + reranker_host: "127.0.0.1"
  23 + reranker_port: 6007
  24 +
  25 +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
  26 +infrastructure:
  27 + elasticsearch:
  28 + host: "http://localhost:9200"
  29 + username: null
  30 + password: null
  31 + redis:
  32 + host: "localhost"
  33 + port: 6479
  34 + snapshot_db: 0
  35 + password: null
  36 + socket_timeout: 1
  37 + socket_connect_timeout: 1
  38 + retry_on_timeout: false
  39 + cache_expire_days: 720
  40 + embedding_cache_prefix: "embedding"
  41 + anchor_cache_prefix: "product_anchors"
  42 + anchor_cache_expire_days: 30
  43 + database:
  44 + host: null
  45 + port: 3306
  46 + database: null
  47 + username: null
  48 + password: null
  49 + secrets:
  50 + dashscope_api_key: null
  51 + deepl_auth_key: null
4 52  
5 53 # Elasticsearch Index
6 54 es_index_name: "search_products"
7 55  
  56 +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出)
  57 +indexes: []
  58 +
8 59 # Config assets
9 60 assets:
10 61 query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict"
... ... @@ -20,20 +71,19 @@ es_settings:
20 71 refresh_interval: "30s"
21 72  
22 73 # 字段权重配置(用于搜索时的字段boost)
23   -# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。
  74 +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。
24 75 # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
25 76 field_boosts:
26 77 title: 3.0
  78 + qanchors: 2.5
  79 + tags: 2.0
  80 + category_name_text: 2.0
  81 + category_path: 2.0
27 82 brief: 1.5
28   - description: 1.0
29   - qanchors: 1.5
30   - vendor: 1.5
31   - category_path: 1.5
32   - category_name_text: 1.5
33   - tags: 1.0
34   - option1_values: 0.6
35   - option2_values: 0.4
36   - option3_values: 0.4
  83 + description: 1.5
  84 + option1_values: 1.5
  85 + option2_values: 1.5
  86 + option3_values: 1.5
37 87  
38 88 # Query Configuration(查询配置)
39 89 query_config:
... ... @@ -47,10 +97,23 @@ query_config:
47 97 enable_text_embedding: true
48 98 enable_query_rewrite: true
49 99  
  100 + # 查询翻译模型(须与 services.translation.capabilities 中某项一致)
  101 + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。
  102 + # zh_to_en_model: "opus-mt-zh-en"
  103 + # en_to_zh_model: "opus-mt-en-zh"
  104 + # default_translation_model: "nllb-200-distilled-600m"
  105 + zh_to_en_model: "deepl"
  106 + en_to_zh_model: "deepl"
  107 + default_translation_model: "deepl"
  108 + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同)
  109 + zh_to_en_model__source_not_in_index: "deepl"
  110 + en_to_zh_model__source_not_in_index: "deepl"
  111 + default_translation_model__source_not_in_index: "deepl"
  112 +
50 113 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
51 114 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
52   - translation_embedding_wait_budget_ms_source_in_index: 80
53   - translation_embedding_wait_budget_ms_source_not_in_index: 200
  115 + translation_embedding_wait_budget_ms_source_in_index: 500 # 80
  116 + translation_embedding_wait_budget_ms_source_not_in_index: 500 #200
54 117  
55 118 # 动态多语言检索字段配置
56 119 # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
... ... @@ -58,11 +121,11 @@ query_config:
58 121 search_fields:
59 122 multilingual_fields:
60 123 - "title"
61   - - "brief"
62   - - "description"
63   - - "vendor"
  124 + - "qanchors"
64 125 - "category_path"
65 126 - "category_name_text"
  127 + - "brief"
  128 + - "description"
66 129 shared_fields:
67 130 - "tags"
68 131 - "option1_values"
... ... @@ -71,18 +134,14 @@ query_config:
71 134 core_multilingual_fields:
72 135 - "title"
73 136 - "brief"
74   - - "vendor"
75 137 - "category_name_text"
76 138  
77   - # 统一文本召回策略(主查询 + 翻译查询 + 原始查询兜底
  139 + # 统一文本召回策略(主查询 + 翻译查询)
78 140 text_query_strategy:
79 141 base_minimum_should_match: "75%"
80 142 translation_minimum_should_match: "75%"
81   - translation_boost: 0.4
82   - translation_boost_when_source_missing: 1.0
83   - source_boost_when_missing: 0.6
84   - original_query_fallback_boost_when_translation_missing: 0.2
85   - tie_breaker_base_query: 0.9
  143 + translation_boost: 0.75
  144 + tie_breaker_base_query: 0.5
86 145  
87 146 # Embedding字段名称
88 147 text_embedding_field: "title_embedding"
... ... @@ -120,7 +179,7 @@ query_config:
120 179 - skus
121 180  
122 181 # KNN boost配置(向量召回的boost值)
123   - knn_boost: 0.25 # Lower boost for embedding recall
  182 + knn_boost: 2.0 # Boost for embedding (KNN) recall
124 183  
125 184 # Function Score配置(ES层打分规则)
126 185 function_score:
... ... @@ -148,6 +207,17 @@ services:
148 207 cache:
149 208 ttl_seconds: 62208000
150 209 sliding_expiration: true
  210 + # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups).
  211 + enable_model_quality_tier_cache: true
  212 + # Higher tier = better quality. Multiple models may share one tier (同级).
  213 + # A request using a model of tier A may reuse Redis keys from models with tier > A or tier == A (not from lower tiers).
  214 + model_quality_tiers:
  215 + deepl: 30
  216 + qwen-mt: 30
  217 + llm: 30
  218 + nllb-200-distilled-600m: 20
  219 + opus-mt-zh-en: 10
  220 + opus-mt-en-zh: 10
151 221 capabilities:
152 222 qwen-mt:
153 223 enabled: true
... ... @@ -290,7 +360,7 @@ services:
290 360 engine: "vllm"
291 361 max_model_len: 160
292 362 tensor_parallel_size: 1
293   - gpu_memory_utilization: 0.36
  363 + gpu_memory_utilization: 0.20
294 364 dtype: "float16"
295 365 enable_prefix_caching: true
296 366 enforce_eager: false
... ...
config/loader.py
... ... @@ -284,19 +284,30 @@ class AppConfigLoader:
284 284 base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")),
285 285 translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")),
286 286 translation_boost=float(text_strategy.get("translation_boost", 0.4)),
287   - translation_boost_when_source_missing=float(
288   - text_strategy.get("translation_boost_when_source_missing", 1.0)
289   - ),
290   - source_boost_when_missing=float(text_strategy.get("source_boost_when_missing", 0.6)),
291   - original_query_fallback_boost_when_translation_missing=float(
292   - text_strategy.get("original_query_fallback_boost_when_translation_missing", 0.2)
293   - ),
294 287 tie_breaker_base_query=float(text_strategy.get("tie_breaker_base_query", 0.9)),
295 288 zh_to_en_model=str(query_cfg.get("zh_to_en_model") or "opus-mt-zh-en"),
296 289 en_to_zh_model=str(query_cfg.get("en_to_zh_model") or "opus-mt-en-zh"),
297 290 default_translation_model=str(
298 291 query_cfg.get("default_translation_model") or "nllb-200-distilled-600m"
299 292 ),
  293 + zh_to_en_model_source_not_in_index=(
  294 + str(v)
  295 + if (v := query_cfg.get("zh_to_en_model__source_not_in_index"))
  296 + not in (None, "")
  297 + else None
  298 + ),
  299 + en_to_zh_model_source_not_in_index=(
  300 + str(v)
  301 + if (v := query_cfg.get("en_to_zh_model__source_not_in_index"))
  302 + not in (None, "")
  303 + else None
  304 + ),
  305 + default_translation_model_source_not_in_index=(
  306 + str(v)
  307 + if (v := query_cfg.get("default_translation_model__source_not_in_index"))
  308 + not in (None, "")
  309 + else None
  310 + ),
300 311 translation_embedding_wait_budget_ms_source_in_index=int(
301 312 query_cfg.get("translation_embedding_wait_budget_ms_source_in_index", 80)
302 313 ),
... ...
config/schema.py
... ... @@ -54,13 +54,14 @@ class QueryConfig:
54 54 base_minimum_should_match: str = "70%"
55 55 translation_minimum_should_match: str = "70%"
56 56 translation_boost: float = 0.4
57   - translation_boost_when_source_missing: float = 1.0
58   - source_boost_when_missing: float = 0.6
59   - original_query_fallback_boost_when_translation_missing: float = 0.2
60 57 tie_breaker_base_query: float = 0.9
61 58 zh_to_en_model: str = "opus-mt-zh-en"
62 59 en_to_zh_model: str = "opus-mt-en-zh"
63 60 default_translation_model: str = "nllb-200-distilled-600m"
  61 + # 检测语种不在租户 index_languages(无可直接命中的多语字段)时使用;None 表示与上一组同模型。
  62 + zh_to_en_model_source_not_in_index: Optional[str] = None
  63 + en_to_zh_model_source_not_in_index: Optional[str] = None
  64 + default_translation_model_source_not_in_index: Optional[str] = None
64 65 # 查询阶段:翻译与向量生成并发提交后,共用同一等待预算(毫秒)。
65 66 # 检测语言已在租户 index_languages 内:偏快返回,预算较短。
66 67 # 检测语言不在 index_languages 内:翻译对召回更关键,预算较长。
... ...
docs/DEVELOPER_GUIDE.md
... ... @@ -147,7 +147,7 @@ docs/ # 文档(含本指南)
147 147  
148 148 ### 4.4 query
149 149  
150   -- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出可供 Searcher 使用的结构化查询信息(含 search_langs 语言计划)
  150 +- **职责**:查询解析与预处理:规范化、语言检测、改写(词典)、翻译、文本向量化;输出解析事实(如 `rewritten_query`、`detected_language`、`translations`、`query_vector`),不再承担 ES 语言计划拼装
151 151 - **原则**:翻译/向量通过 `providers` 获取,不直接依赖具体服务 URL 或实现;支持按配置关闭翻译/向量(如短查询、typing 场景)。
152 152  
153 153 ### 4.5 search
... ...
docs/QUICKSTART.md
... ... @@ -558,6 +558,21 @@ lsof -i :6004
558 558  
559 559 更完整的运行排障(多环境切换、Suggestion 构建、FAQ)见 `docs/Usage-Guide.md`。
560 560  
  561 +### 5.4 HanLP 与 `transformers` 版本(`BertTokenizer.encode_plus`)
  562 +
  563 +若日志出现 **`AttributeError: BertTokenizer has no attribute encode_plus`**,通常是 **同一 venv 里装了 `transformers` 5.x**,与 **HanLP 2.1.x** 不兼容(HanLP 仍调用已移除的 `encode_plus`)。
  564 +
  565 +**处理:** 将 `transformers` 固定到 **4.x**(例如 4.44+),然后重装/校验 HanLP:
  566 +
  567 +```bash
  568 +source activate.sh
  569 +pip install -r requirements_hanlp.txt
  570 +python -c "from transformers import BertTokenizer; import transformers as t; print(t.__version__, hasattr(BertTokenizer, 'encode_plus'))"
  571 +# 期望:4.x 且 True
  572 +```
  573 +
  574 +**说明:** 重排/TEI 等若使用 **独立 venv**(如 `.venv-reranker`),可与主 venv 的 `transformers` 版本分离;主 venv 只要装了 HanLP 做查询分词,就不要把 `transformers` 升到 5。
  575 +
561 576 ---
562 577  
563 578 ## 6. 相关文档
... ...
docs/TODO-ES能力提升.md 0 → 100644
... ... @@ -0,0 +1,69 @@
  1 +ES 付费版本 or 定制开发(建议先看下付费版本价格)
  2 +ES定制开发:
  3 +RRF / retrievers
  4 +
  5 +Elastic 的订阅矩阵里明确列了这些相关能力:Retrievers: linear, rule, RRF, text similarity re-ranker,以及 Reciprocal Rank Fusion (RRF) for hybrid search。
  6 +
  7 +这类能力最有价值的点是:
  8 +它们把混合检索从“自己拼 DSL 和手搓打分”变成了官方支持的多阶段检索框架。重排:text similarity re-ranker / Elastic Rerank. text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。
  9 +
  10 +{
  11 + "retriever": {
  12 + "rrf": {
  13 + "retrievers": [
  14 + { "standard": { "query": { ... } } },
  15 + { "knn": { ... } }
  16 + ]
  17 + }
  18 + }
  19 +}
  20 +
  21 +
  22 +加reranker:
  23 +text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。
  24 +
  25 +{
  26 + "retriever": {
  27 + "text_similarity_reranker": {
  28 + "retriever": {
  29 + "rrf": { ... }
  30 + },
  31 + ...
  32 + }
  33 + }
  34 +}
  35 +
  36 +{
  37 + "retriever": {
  38 + "text_similarity_reranker": {
  39 + "retriever": {
  40 + "rrf": {
  41 + "retrievers": [
  42 + {
  43 + "standard": {
  44 + "query": {
  45 + "...": "..."
  46 + }
  47 + }
  48 + },
  49 + {
  50 + "knn": {
  51 + "...": "..."
  52 + }
  53 + }
  54 + ],
  55 + "rank_window_size": 100,
  56 + "rank_constant": 20
  57 + }
  58 + },
  59 + "field": "your_rerank_text_field",
  60 + "inference_text": "白色 oversized T-shirt",
  61 + "inference_id": ".rerank-v1-elasticsearch",
  62 + "rank_window_size": 50
  63 + }
  64 + },
  65 + "size": 20
  66 +}
  67 +
  68 +
  69 +
... ...
docs/TODO.txt
1 1  
2 2  
3   -@reranker/backends/qwen3_vllm.py 单次 generate 前有进程内锁,同一进程里不会并行多路 vLLM 推理,这个锁有必要吗?是否会影响性能?是否能够打开,使得性能更好?比如这个场景,我一次请求 400 条,分成每64个一个batch,基于我现在的gpu配置,可以再提高并发度吗?
4   -测试了,让每个批次都并发地进行,耗时没有变化
  3 +
  4 +本地部署一个7b Q4量化的大模型
  5 +es需要licence的两个功能,如果费用低,开通下licence,或者改es源码定制开发下,支持 rank.rrf,reranker
  6 +
  7 +
  8 +
  9 +把knn跟文本相关性的融合方式修改为 "rank": {"rrf": {} }需要licence,可以帮我修改源码支持吗?
  10 +
  11 + knn_boost: 2.0
  12 +
  13 +
  14 +{
  15 + "query": { ...全文检索... },
  16 + "knn": { ...向量检索... },
  17 + "rank": {
  18 + "rrf": {}
  19 + }
  20 +}
  21 +
  22 +
  23 +"image_embedding": {
  24 + "type": "nested",
  25 + "properties": {
  26 + "vector": {
  27 + "type": "dense_vector",
  28 + "dims": 1024,
  29 + "index": true,
  30 + "similarity": "dot_product",
  31 + "element_type": "bfloat16"
  32 + },
  33 + "url": {
  34 + "type": "text"
  35 + }
  36 + }
  37 +},
  38 +去掉 image_embedding_512
  39 +image_embedding改为,一个spu有多个sku向量,每个向量内部properties:
  40 +除了vector url还应该包括,该图片是对应哪些sku
  41 +"image_embedding": {
  42 + "type": "nested",
  43 + "properties": {
  44 + "vector": {
  45 + "type": "dense_vector",
  46 + "dims": 1024,
  47 + "index": true,
  48 + "similarity": "dot_product",
  49 + "element_type": "bfloat16"
  50 + },
  51 + "url": {
  52 + "type": "text"
  53 + }
  54 + }
  55 +},
  56 +
  57 +
  58 +
  59 +
  60 +tags字段使用的优化:
  61 +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。
  62 +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样)
  63 +
  64 +
  65 +
  66 +外部需求:
  67 +1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内
  68 +2. ES支持reranker pipline?
  69 +
  70 +
  71 +
  72 +
  73 +
5 74  
6 75 增加款式意图识别模块
7 76  
8   -意图类型: 颜色,尺(目前只需要支持这两种)
  77 +意图类型: 颜色,尺码(目前只需要支持这两种)
9 78  
10 79 意图召回层:
11 80 每种意图,有一个召回词集合
12 81 对query(包括原始query、各种翻译query 都做匹配)
13 82  
14   -意图识别层:
15   -如果召回 判断有款式需求,
  83 +以颜色意图为例:
  84 +有一个词表,每一行 都逗号分割,互为同义词,行内第一个为标准化词
  85 +query匹配了其中任何一个词,都认为,具有颜色意图
  86 +匹配规则: 用细粒度、粗粒度分词,看是否有在词表中的。原始query分词、和每种翻译的分词,都要用。
  87 +
  88 +意图判断: 暂时留空,直接返回true。目前没有模型,即只要召回了(词表匹配了),即认为有该维度款式需求。
  89 +
  90 +
  91 +
  92 +意图使用:
  93 +
  94 +我们第一阶段,使用 参与ES提权。
  95 +
  96 +一、参与ES提权
  97 +
  98 +
  99 +二、参与reranker
16 100  
17 101  
18   -是否有:
19   -颜色需求
20   -尺码需求
21 102 如果有: 先做sku筛选,然后把最优的拼接到名称中,参与reranker。
22 103  
23 104  
24 105 现在在reranker、分页之后、做填充的时候,已经有做sku的筛选。
25 106 需要优化:
26 107 现在是,先做包含的判断,找到第一个 option_value被query包含的,则直接认为匹配。改为
27   -1. 第一轮:遍历完,如果有且仅有一个才这样。
28   -2. 第二轮:如果有多个,跳到3。如果没有,对每个词都走泛化词表进行匹配。
  108 +1. 第一轮:遍历完,如果有且仅有一个被query包含,那么认为匹配。
  109 +2. 第二轮:如果有多个符合(被query包含),跳到3。如果没有,对每个词都走泛化词表进行匹配。
29 110 3. 第三轮:如果有多个,那么对这多个,走embedding相关性取最高的。如果一个也没有,则对所有的走embedding相关性取最高的
30   -这个sku筛选也需要提取为一个独立的模块
31   -
32   -
33   -
34   -2026-03-21 10:29:23,698 - elastic_transport.transport - INFO - POST http://localhost:9200/search_products_tenant_163/_search?include_named_queries_score=false [status:200 duration:0.009s]
35   -2026-03-21 10:29:23,700 - request_context - INFO - 分页详情回填 | ids=20 | filled=20 | took=7ms
36   -2026-03-21 10:29:23,700 - request_context - INFO - 重排分页切片 | from=20, size=20, 返回=20条
37   -2026-03-21 10:29:23,720 - embeddings.text_encoder - ERROR - TextEmbeddingEncoder service request failed: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1
38   -Traceback (most recent call last):
39   - File "/data/saas-search/embeddings/text_encoder.py", line 63, in _call_service
40   - response.raise_for_status()
41   - File "/data/saas-search/.venv/lib/python3.12/site-packages/requests/models.py", line 1026, in raise_for_status
42   - raise HTTPError(http_error_msg, response=self)
43   -requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1
44   -2026-03-21 10:29:23,720 - search.searcher - WARNING - Failed to encode SKU option1 values for final-page sorting: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1
45   -Traceback (most recent call last):
46   - File "/data/saas-search/search/searcher.py", line 448, in _apply_sku_sorting_for_page_hits
47   - encoded_option_vectors = text_encoder.encode(option1_values_to_encode, priority=1)
48   - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
49   - File "/data/saas-search/embeddings/text_encoder.py", line 112, in encode
50   - response_data = self._call_service(
51   - ^^^^^^^^^^^^^^^^^^^
52   - File "/data/saas-search/embeddings/text_encoder.py", line 63, in _call_service
53   - response.raise_for_status()
54   - File "/data/saas-search/.venv/lib/python3.12/site-packages/requests/models.py", line 1026, in raise_for_status
55   - raise HTTPError(http_error_msg, response=self)
56   -requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1
57   -2026-03-21 10:29:23,721 - request_context - WARNING - SKU option embedding failed: 502 Server Error: Bad Gateway for url: http://127.0.0.1:6005/embed/text?normalize=true&priority=1
  111 +这个sku筛选也需要提取为一个独立的模块。
  112 +
  113 +
  114 +另外:现在是reranker、分页之后做sku筛选,要改为:
  115 +1. 有款式意图的时候,才做sku筛选
  116 +2. sku筛选的时机,改为在reranker之前,对所有内容做sku筛选,然后
  117 +3. 从仅 option1 扩展到多个维度,识别的意图,包含意图的维度名(color)和维度名的泛化词list(color、颜色、colour、colors 等),遍历option1_name,option2_name,option3_name,看哪个能匹配上意图的维度名list,哪个匹配上了,则在这个维度筛选。
  118 +4. Rerank doc (有款式意图的时候)要带上属性后缀,拼接到title后面。在调用 run_rerank 前,对每条 hit 生成「用于重排的 doc 文本」(标题 + 可选后缀)
  119 +5. TODO : 还有一个问题。 目前,sku只返回一个维度(店铺主维度。默认应该是option1,不是所有维度的sku信息都返回的。所以,如果有款式意图,但是主维度是颜色,那么拿不到全的款式sku)
  120 +
58 121  
59 122  
60 123  
  124 +当前项目功能已经较多,但是有清晰的框架,请务必基于现有框架进行改造,不要进行补丁式的修改,避免代码逻辑分叉。
  125 +
  126 +请一步一步来,先设计意图识别模块,仔细思考需求,意图识别模块需要提供哪些内容,用于返回数据接口的定义,深度思考,定义一个合理的接口后,再给出合理的模块设计。
  127 +
  128 +
  129 +
  130 +
  131 +
  132 +
  133 +
  134 +
  135 +
  136 +
  137 +
  138 +
  139 +
  140 +是否需要:
  141 +当「源语言不在 index_languages」且「某些目标语言的翻译缺失」时,ES 里会额外加一层 用「原始 query 字符串」去撞缺失语种字段
  142 +
  143 +
61 144  
62 145 先阅读文本embedding相关的代码:
63 146 @embeddings/README.md @embeddings/server.py @docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @embeddings/text_encoder.py
... ... @@ -361,6 +444,31 @@ embeddings/image_encoder.py:requests.post(..., timeout=self.timeout_sec)
361 444  
362 445  
363 446  
  447 +
  448 +
  449 +
  450 +
  451 +
  452 +
  453 +
  454 +多reranker:
  455 +
  456 +改 reranker 服务,一次请求返回多路分
  457 +服务启动时 加载多个 backend(或按请求懒加载),/rerank 响应扩展为例如
  458 +scores: [...](兼容主后端)+ scores_by_backend: { "bge": [...], "qwen3_vllm": [...] }。
  459 +搜索侧解析多路分,再融合或只透传 debug。
  460 +优点:搜索侧仍只调一个 URL。缺点:单进程多大模型 显存压力很大;
  461 +
  462 +融合层要注意的一点
  463 +fuse_scores_and_resort 目前只消费 一条 rerank_scores 序列,并写入 _rerank_score
  464 +多 backend 之后需要rerank_scores 都参与融合
  465 +
  466 +
  467 +
  468 +
  469 +
  470 +
  471 +
364 472 product_enrich : Partial Mode : done
365 473 https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-menu-2400256.d_0_3_0_7.74a630119Ct6zR
366 474 需在messages 数组中将最后一条消息的 role 设置为 assistant,并在其 content 中提供前缀,在此消息中设置参数 "partial": true。messages格式如下:
... ... @@ -383,6 +491,8 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men
383 491  
384 492  
385 493 融合打分(已完成,2026-03)
  494 +
  495 +以下已经完成:
386 496 1. `fuse_scores_and_resort` 已改为乘法融合,并通过 `matched_queries` 提取:
387 497 - `base_query`
388 498 - `base_query_trans_*`
... ... @@ -397,7 +507,11 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men
397 507 - `docs/搜索API对接指南.md`
398 508 - `docs/Usage-Guide.md`
399 509  
400   -
  510 +未完成的:
  511 +(归一化、次序融合?还乘法公式?)
  512 +RRF:先把多路召回稳妥融合
  513 +linear + minmax:让你能精调 knn 和文本的权重
  514 +reranker:对前面召回出来的 top-k 再做“最后一刀”
401 515  
402 516  
403 517  
... ...
docs/搜索API对接指南-01-搜索接口.md
... ... @@ -66,9 +66,11 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
66 66 | `min_score` | float | N | null | 最小相关性分数阈值 |
67 67 | `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) |
68 68 | `debug` | boolean | N | false | 是否返回调试信息 |
69   -| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`(默认开启)。开启后会先对 ES TopN(`rerank_window`)重排,再按分页截取;若 `from+size>1000`,则不重排,直接按分页从 ES 返回 |
70   -| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端配置 |
71   -| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}`;不传则使用服务端配置 |
  69 +| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`。当有效开启且 `from + size <= rerank_window` 时:ES 先取前 `rerank_window` 条,重排后再按 `from`/`size` 截取当前页;若 `from + size > rerank_window`,则**不进行**窗口内重排,直接按请求的 `from`/`size` 查询 ES(`rerank_window` 见 `config.yaml` 的 `rerank.rerank_window`,仓库示例默认 400) |
  70 +| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端 `rerank.rerank_query_template` |
  71 +| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}` 等占位符(由 `search/rerank_client.py` 按语言字段拼装);不传则使用服务端 `rerank.rerank_doc_template` |
  72 +
  73 +**与后端代码的对应关系**(便于联调):HTTP `POST /search/` 请求体由 `api/models.py` 的 `SearchRequest` 校验;路由 `api/routes/search.py` 将字段原样传入 `Searcher.search(...)`(含上述三个重排相关字段)。CLI `python main.py search` 目前未暴露这些参数,走配置默认值。
72 74 | `user_id` | string | N | null | 用户ID(用于个性化,预留) |
73 75 | `session_id` | string | N | null | 会话ID(用于分析,预留) |
74 76  
... ... @@ -551,9 +553,6 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;})
551 553 | `rewritten_query` | string | 重写后的查询 |
552 554 | `detected_language` | string | 检测到的语言 |
553 555 | `translations` | object | 翻译结果 |
554   -| `query_text_by_lang` | object | 实际参与检索的多语言 query 文本 |
555   -| `search_langs` | array[string] | 实际参与检索的语言列表 |
556   -| `supplemental_search_langs` | array[string] | 因 mixed query 补入的附加语言列表 |
557 556 | `has_vector` | boolean | 是否生成了向量 |
558 557  
559 558 `debug_info.per_result[]` 常见字段:
... ... @@ -563,10 +562,9 @@ response = requests.post(url, headers=headers, json={&quot;query&quot;: &quot;芭比娃娃&quot;})
563 562 | `spu_id` | string | 结果 SPU ID |
564 563 | `es_score` | float | ES 原始 `_score` |
565 564 | `rerank_score` | float | 重排分数 |
566   -| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` / `fallback_original_query_*` 聚合而来) |
  565 +| `text_score` | float | 文本相关性大分(由 `base_query` / `base_query_trans_*` 聚合而来) |
567 566 | `text_source_score` | float | `base_query` 分数 |
568 567 | `text_translation_score` | float | `base_query_trans_*` 里的最大分数 |
569   -| `text_fallback_score` | float | `fallback_original_query_*` 里的最大分数 |
570 568 | `text_primary_score` | float | 文本大分中的主证据部分 |
571 569 | `text_support_score` | float | 文本大分中的辅助证据部分 |
572 570 | `knn_score` | float | `knn_query` 分数 |
... ...
docs/相关性检索优化说明.md
... ... @@ -2,11 +2,11 @@
2 2  
3 3 ## 1. 文档目标
4 4  
5   -本文描述当前线上代码的文本检索策略,重点覆盖:
  5 +本文描述当前代码中的文本检索策略,重点覆盖:
6 6  
7 7 - 多语言检索路由(`detector` / `translator` / `indexed` 的关系)
8 8 - 统一文本召回表达式(无布尔 AST 分支)
9   -- 翻译缺失时的兜底策略
  9 +- 解析层与检索表达式层的职责边界
10 10 - 重排融合打分与调试字段
11 11 - 典型场景下实际生成的 ES 查询结构
12 12  
... ... @@ -17,9 +17,11 @@
17 17 查询链路(文本相关):
18 18  
19 19 1. `QueryParser.parse()`
20   - 输出 `detected_language`、`query_text_by_lang`、`search_langs`、`index_languages`、`source_in_index_languages`;另输出 `contains_chinese` / `contains_english`(仅服务混写辅助召回,见 §4 末)。
  20 + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。
  21 +2. `Searcher.search()`
  22 + 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。
21 23 2. `ESQueryBuilder._build_advanced_text_query()`
22   - 按 `search_langs` 动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`);若命中混写辅助条件,在同一子句内并入另一语种列(§4 末)。
  24 + 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。
23 25 3. `build_query()`
24 26 统一走文本策略,不再有布尔 AST 枝路。
25 27  
... ... @@ -37,18 +39,18 @@
37 39 源语言字段做主召回;其他语言走翻译补召回(低权重)。
38 40 2. 若 `detected_language not in index_languages`:
39 41 翻译到 `index_languages` 是主路径;源语言字段仅作弱召回。
40   -3. 若第 2 步翻译部分失败或全部失败:
41   - 对缺失翻译的 `index_languages` 字段,追加“原文低权重兜底”子句,避免完全丢失这些语种索引面的召回机会。
  42 +3. 若翻译部分失败或全部失败:
  43 + 当前实现不会再额外生成“原文打到其他语种字段”的兜底子句;系统保留 `base_query` 并继续执行,可观测性由 `translations` / warning / 命名子句分数提供。
42 44  
43 45 ### 3.2 翻译与向量:并发提交与共享超时
44 46  
45   -`QueryParser.parse()` 内(Stage 4–6)对**离线调用**采用线程池提交 + **一次** `concurrent.futures.wait`:
  47 +`QueryParser.parse()` 内对翻译与向量采用线程池提交 + **一次** `concurrent.futures.wait`:
46 48  
47   -- **翻译**:对 `index_languages` 中除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。
48   -- **查询向量**(若开启 `enable_text_embedding` 且域为 default):再提交一个 `text_encoder.encode` 任务。
  49 +- **翻译**:对调用方传入的 `target_languages` 中、除 `detected_language` 外的每个目标语种各提交一个 `translator.translate` 任务(多目标时并发执行)。
  50 +- **查询向量**:若开启 `enable_text_embedding`,再提交一个 `text_encoder.encode` 任务。
49 51 - 上述任务进入**同一** future 集合;例如租户索引为 `[zh, en]` 且检测语种**不在**索引内时,常为 **2 路翻译 + 1 路向量,共 3 个任务并发**,共用超时。
50 52  
51   -**等待预算(毫秒)**由 `detected_language` 是否属于租户 `index_languages` 决定(`query_config`):
  53 +**等待预算(毫秒)**由 `detected_language` 是否属于调用方传入的 `target_languages` 决定(`query_config`):
52 54  
53 55 - **在索引内**:`translation_embedding_wait_budget_ms_source_in_index`(默认较短,如 80ms)— 主召回已能打在源语种字段,翻译/向量稍慢可容忍。
54 56 - **不在索引内**:`translation_embedding_wait_budget_ms_source_not_in_index`(默认较长,如 200ms)— 翻译对可检索文本更关键,给足时间。
... ... @@ -62,7 +64,7 @@
62 64 ```json
63 65 {
64 66 "multi_match": {
65   - "_name": "base_query|base_query_trans_xx|fallback_original_query_xx",
  67 + "_name": "base_query|base_query_trans_xx",
66 68 "query": "<text>",
67 69 "fields": ["title.xx^3.0", "brief.xx^1.5", "...", "tags", "option1_values^0.5", "..."],
68 70 "minimum_should_match": "75%",
... ... @@ -75,7 +77,7 @@
75 77 最终按 `bool.should` 组合,`minimum_should_match: 1`。
76 78  
77 79 > **附 — 混写辅助召回**
78   -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.8,`ESQueryBuilder` 构造参数)**。`fallback_original_query_*` 同样适用。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。
  80 +> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。
79 81  
80 82 ## 5. 关键配置项(文本策略)
81 83  
... ... @@ -88,20 +90,12 @@
88 90  
89 91 - `base_minimum_should_match`
90 92 - `translation_minimum_should_match`
91   -- `translation_boost`
92   -- `translation_boost_when_source_missing`
93   -- `source_boost_when_missing`
94   -- `original_query_fallback_boost_when_translation_missing`(新增)
  93 +- `translation_boost`(所有 `base_query_trans_*` 共用)
95 94 - `tie_breaker_base_query`
96 95  
97   -新增项说明:
98   -
99   -- `original_query_fallback_boost_when_translation_missing`:
100   - 当源语种不在索引语言且翻译缺失时,原文打到缺失目标语字段的低权重系数,默认 `0.2`。
101   -
102 96 说明:
103 97  
104   -- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*`、`fallback_original_query_*` 三类子句组成。
  98 +- `phrase_query` / `keywords_query` 已从当前实现中移除,文本相关性只由 `base_query`、`base_query_trans_*` 两类子句组成。
105 99  
106 100 ## 6. 典型场景与实际 DSL
107 101  
... ... @@ -111,11 +105,12 @@
111 105  
112 106 - `detected_language=de`
113 107 - `index_languages=[de,en]`
114   -- `query_text_by_lang={de:"herren schuhe", en:"men shoes"}`
  108 +- `rewritten_query="herren schuhe"`
  109 +- `translations={en:"men shoes"}`
115 110  
116 111 策略结果:
117 112  
118   -- `base_query`:德语字段,正常权重
  113 +- `base_query`:德语字段,**不写** `multi_match.boost`
119 114 - `base_query_trans_en`:英语字段,`boost=translation_boost`(默认 0.4)
120 115  
121 116 ### 场景 B:源语种不在索引语言中,部分翻译缺失
... ... @@ -126,38 +121,44 @@
126 121  
127 122 策略结果:
128 123  
129   -- `base_query`(德语字段):`boost=source_boost_when_missing`(默认 0.6)
130   -- `base_query_trans_en`(英文字段):`boost=translation_boost_when_source_missing`(默认 1.0)
131   -- `fallback_original_query_zh`(中文字段):原文低权重兜底(默认 0.2)
  124 +- `base_query`(德语字段):**不写** `multi_match.boost`(默认 1.0)
  125 +- `base_query_trans_en`(英文字段):`boost=translation_boost`(如 0.4)
  126 +- 不会生成额外中文兜底子句
132 127  
133 128 ### 场景 C:源语种不在索引语言中,翻译全部失败
134 129  
135 130 - `detected_language=de`
136 131 - `index_languages=[en,zh]`
137   -- `query_text_by_lang` 仅有 `de`
  132 +- `translations={}`
138 133  
139 134 策略结果:
140 135  
141   -- `base_query`(德语字段,低权重)
142   -- `fallback_original_query_en`(英文字段原文兜底)
143   -- `fallback_original_query_zh`(中文字段原文兜底)
  136 +- `base_query`(德语字段,**无** `boost` 字段)
  137 +- 不会生成 `base_query_trans_*`
144 138  
145   -这能避免“只有源语种字段查询,且该语种字段在商家索引中稀疏/为空”导致的弱召回问题
  139 +这意味着当前实现优先保证职责清晰与可解释性,而不是继续在 Builder 内部隐式制造“跨语种原文兜底”
146 140  
147   -## 7. QueryParser 与 ESBuilder 的职责分工
  141 +## 7. QueryParser 与 Searcher / ESBuilder 的职责分工
148 142  
149   -- `QueryParser` 负责“语言计划”与“可用文本”:
150   - - `search_langs`
151   - - `query_text_by_lang`
152   - - `source_in_index_languages`
153   - - `index_languages`
  143 +- `QueryParser` 负责“解析事实”:
  144 + - `query_normalized`
  145 + - `rewritten_query`
  146 + - `detected_language`
  147 + - `translations`
  148 + - `query_vector`
  149 + - `query_tokens`
154 150 - `contains_chinese` / `contains_english`
  151 +- `Searcher` 负责“租户语境”:
  152 + - `index_languages`
  153 + - 将其传给 parser 作为 `target_languages`
  154 + - 将其传给 builder 作为字段展开约束
155 155 - `ESQueryBuilder` 负责“表达式展开”:
156 156 - 动态字段组装
157 157 - 子句权重分配
158   - - 翻译缺失兜底子句拼接
  158 + - `base_query` / `base_query_trans_*` 子句拼接
  159 + - 跳过“与 base_query 文本和语言完全相同”的重复翻译子句
159 160  
160   -这种分层让策略调优主要落在配置和 Builder,不破坏 Parser 的职责边界
  161 +这种分层让 parser 不再返回 ES 专用的“语言计划字段”,职责边界更清晰
161 162  
162 163 ## 8. 融合打分(Rerank + Text + KNN)
163 164  
... ... @@ -165,24 +166,21 @@
165 166  
166 167 ### 8.1 文本相关性大分
167 168  
168   -文本大分由部分组成:
  169 +文本大分由两部分组成:
169 170  
170 171 - `base_query`
171 172 - `base_query_trans_*`
172   -- `fallback_original_query_*`
173 173  
174 174 聚合方式:
175 175  
176 176 1. `source_score = base_query`
177 177 2. `translation_score = max(base_query_trans_*)`
178   -3. `fallback_score = max(fallback_original_query_*)`
179   -4. 加权:
  178 +3. 加权:
180 179 - `weighted_source = source_score`
181 180 - `weighted_translation = 0.8 * translation_score`
182   - - `weighted_fallback = 0.55 * fallback_score`
183   -5. 合成:
184   - - `primary = max(weighted_source, weighted_translation, weighted_fallback)`
185   - - `support = weighted_source + weighted_translation + weighted_fallback - primary`
  181 +4. 合成:
  182 + - `primary = max(weighted_source, weighted_translation)`
  183 + - `support = weighted_source + weighted_translation - primary`
186 184 - `text_score = primary + 0.25 * support`
187 185  
188 186 如果以上子分都缺失,则回退到 ES `_score` 作为 `text_score`,避免纯文本召回被误打成 0。
... ... @@ -212,7 +210,6 @@ fused_score = (
212 210 - `text_score`
213 211 - `text_source_score`
214 212 - `text_translation_score`
215   -- `text_fallback_score`
216 213 - `text_primary_score`
217 214 - `text_support_score`
218 215 - `knn_score`
... ... @@ -221,9 +218,9 @@ fused_score = (
221 218  
222 219 `debug_info.query_analysis` 还会暴露:
223 220  
224   -- `query_text_by_lang`
225   -- `search_langs`
226   -- `supplemental_search_langs`
  221 +- `translations`
  222 +- `detected_language`
  223 +- `rewritten_query`
227 224  
228 225 这些字段用于检索效果评估与 bad case 归因。
229 226  
... ... @@ -231,7 +228,7 @@ fused_score = (
231 228  
232 229 1. 当前文本主链路已移除布尔 AST 分支。
233 230 2. 文档中的旧描述(如 `operator: AND` 固定开启)不再适用,当前实现未强制设置该参数。
234   -3. `HanLP` 为可选依赖;不可用时退化到轻量分词,不影响主链路可用性
  231 +3. `HanLP` 为必需依赖;当前 parser 不再提供轻量 fallback
235 232 4. 若后续扩展到更多语种,请确保:
236 233 - mapping 中存在对应 `.<lang>` 字段
237 234 - `index_languages` 配置在支持列表内
... ... @@ -263,10 +260,9 @@ python ./scripts/eval_search_quality.py
263 260 建议在 `tests/` 增加文本策略用例:
264 261  
265 262 1. 源语种在索引语言,翻译命中缓存
266   -2. 源语种不在索引语言,翻译部分失败(验证 fallback 子句)
267   -3. 源语种不在索引语言,翻译全部失败(验证多目标 fallback)
268   -4. 自定义 `original_query_fallback_boost_when_translation_missing` 生效
269   -5. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`)
  263 +2. 源语种不在索引语言,翻译部分失败(验证仅保留 `base_query` + 成功翻译子句)
  264 +3. 源语种不在索引语言,翻译全部失败(验证无 `base_query_trans_*` 时仍可正常执行)
  265 +4. 非 `zh/en` 语种字段动态拼接(如 `de/fr/es`)
270 266  
271 267  
272 268  
... ... @@ -281,3 +277,24 @@ title.en: 2026 Korean-style High-waisted Slimming Corduroy Skirt with Slit, Mid-
281 277 Rerank score: 0.9643
282 278 title.en: Black Half-high Collar Base Shirt Women's Autumn and Winter fleece-lined Contrast Color Pure Desire Design Sense Horn Sleeve Ruffled Inner Top
283 279 title.zh: 黑色高领半高领女士秋冬内搭加绒拼色纯欲设计荷叶边袖内衬上衣
  280 +
  281 +
  282 +
  283 +qwen3-0.6b的严重badcase:
  284 +q=牛仔裤
  285 +
  286 +Rerank score: 0.0002
  287 +title.en: Wrangler Womens Cowboy Cut Slim Fit Jean Bleach
  288 +title.zh: Wrangler 女士牛仔裤 牛仔剪裁 紧身版型 漂白色
  289 +
  290 +Rerank score: 0.0168
  291 +title.en: Fleece Lined Tights Sheer Women - Fake Translucent Warm Pantyhose Leggings Sheer Thick Tights for Winter
  292 +title.zh: 加绒透肤女士连裤袜 - 仿透视保暖长筒袜 冬季厚款透肤连裤袜
  293 +
  294 +Rerank score: 0.1366
  295 +title.en: Dockers Men's Classic Fit Workday Khaki Smart 360 FLEX Pants (Standard and Big & Tall)
  296 +title.zh: Dockers 男士经典版型工作日卡其色智能360度弹力裤(标准码与加大码)
  297 +
  298 +Rerank score: 0.0981
  299 +title.en: Lazy One Pajama Shorts for Men, Men's Pajama Bottoms, Sleepwear
  300 +title.zh: 懒人男士睡裤,男式家居裤,睡眠服饰
... ...
indexer/document_transformer.py
... ... @@ -13,7 +13,7 @@ import numpy as np
13 13 import logging
14 14 import re
15 15 from typing import Dict, Any, Optional, List
16   -from indexer.product_enrich import analyze_products
  16 +from indexer.product_enrich import analyze_products, split_multi_value_field
17 17  
18 18 logger = logging.getLogger(__name__)
19 19  
... ... @@ -121,7 +121,7 @@ class SPUDocumentTransformer:
121 121 # Tags
122 122 if pd.notna(spu_row.get('tags')):
123 123 tags_str = str(spu_row['tags'])
124   - doc['tags'] = [tag.strip() for tag in tags_str.split(',') if tag.strip()]
  124 + doc['tags'] = split_multi_value_field(tags_str)
125 125  
126 126 # Category相关字段
127 127 self._fill_category_fields(doc, spu_row)
... ... @@ -282,11 +282,7 @@ class SPUDocumentTransformer:
282 282 raw = row.get(name)
283 283 if not raw:
284 284 continue
285   - parts = re.split(r"[,;|/\n\t]+", str(raw))
286   - for part in parts:
287   - value = part.strip()
288   - if not value:
289   - continue
  285 + for value in split_multi_value_field(str(raw)):
290 286 semantic_list.append({"lang": lang, "name": name, "value": value})
291 287  
292 288 if qanchors_obj:
... ... @@ -703,11 +699,7 @@ class SPUDocumentTransformer:
703 699 raw = row.get(name)
704 700 if not raw:
705 701 continue
706   - parts = re.split(r"[,;|/\n\t]+", str(raw))
707   - for part in parts:
708   - value = part.strip()
709   - if not value:
710   - continue
  702 + for value in split_multi_value_field(str(raw)):
711 703 semantic_list.append(
712 704 {
713 705 "lang": lang,
... ...
indexer/product_enrich.py
... ... @@ -144,6 +144,20 @@ if _missing_prompt_langs:
144 144 )
145 145  
146 146  
  147 +# 多值字段分隔:英文逗号、中文逗号、顿号,及历史约定的 ; | / 与空白
  148 +_MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+")
  149 +
  150 +
  151 +def split_multi_value_field(text: Optional[str]) -> List[str]:
  152 + """将 LLM/业务中的多值字符串拆成短语列表(strip 后去空)。"""
  153 + if text is None:
  154 + return []
  155 + s = str(text).strip()
  156 + if not s:
  157 + return []
  158 + return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()]
  159 +
  160 +
147 161 def _normalize_space(text: str) -> str:
148 162 return re.sub(r"\s+", " ", (text or "").strip())
149 163  
... ...
query/query_parser.py
1 1 """
2 2 Query parser - main module for query processing.
3 3  
4   -Handles query rewriting, translation, and embedding generation.
  4 +Responsibilities are intentionally narrow:
  5 +- normalize and rewrite the incoming query
  6 +- detect language and tokenize with HanLP
  7 +- run translation and embedding requests concurrently
  8 +- return parser facts, not Elasticsearch language-planning data
5 9 """
6 10  
7   -from typing import Dict, List, Optional, Any, Union, Tuple
  11 +from dataclasses import dataclass, field
  12 +from typing import Any, Callable, Dict, List, Optional, Tuple
8 13 import numpy as np
9 14 import logging
10 15 import re
... ... @@ -18,15 +23,12 @@ from .query_rewriter import QueryRewriter, QueryNormalizer
18 23  
19 24 logger = logging.getLogger(__name__)
20 25  
21   -try:
22   - import hanlp # type: ignore
23   -except Exception: # pragma: no cover
24   - hanlp = None
  26 +import hanlp # type: ignore
25 27  
26 28  
27 29 def simple_tokenize_query(text: str) -> List[str]:
28 30 """
29   - Lightweight tokenizer for suggestion length / analysis (aligned with QueryParser fallback).
  31 + Lightweight tokenizer for suggestion-side heuristics only.
30 32  
31 33 - Consecutive CJK characters form one token
32 34 - Latin / digit runs (with internal hyphens) form tokens
... ... @@ -37,63 +39,32 @@ def simple_tokenize_query(text: str) -> List[str]:
37 39 return pattern.findall(text)
38 40  
39 41  
  42 +@dataclass(slots=True)
40 43 class ParsedQuery:
41   - """Container for parsed query results."""
42   -
43   - def __init__(
44   - self,
45   - original_query: str,
46   - query_normalized: str,
47   - rewritten_query: Optional[str] = None,
48   - detected_language: Optional[str] = None,
49   - translations: Dict[str, str] = None,
50   - query_vector: Optional[np.ndarray] = None,
51   - domain: str = "default",
52   - keywords: str = "",
53   - token_count: int = 0,
54   - query_tokens: Optional[List[str]] = None,
55   - query_text_by_lang: Optional[Dict[str, str]] = None,
56   - search_langs: Optional[List[str]] = None,
57   - index_languages: Optional[List[str]] = None,
58   - source_in_index_languages: bool = True,
59   - contains_chinese: bool = False,
60   - contains_english: bool = False,
61   - ):
62   - self.original_query = original_query
63   - self.query_normalized = query_normalized
64   - self.rewritten_query = rewritten_query or query_normalized
65   - self.detected_language = detected_language
66   - self.translations = translations or {}
67   - self.query_vector = query_vector
68   - self.domain = domain
69   - # Query analysis fields
70   - self.keywords = keywords
71   - self.token_count = token_count
72   - self.query_tokens = query_tokens or []
73   - self.query_text_by_lang = query_text_by_lang or {}
74   - self.search_langs = search_langs or []
75   - self.index_languages = index_languages or []
76   - self.source_in_index_languages = bool(source_in_index_languages)
77   - self.contains_chinese = bool(contains_chinese)
78   - self.contains_english = bool(contains_english)
  44 + """Container for query parser facts."""
  45 +
  46 + original_query: str
  47 + query_normalized: str
  48 + rewritten_query: str
  49 + detected_language: Optional[str] = None
  50 + translations: Dict[str, str] = field(default_factory=dict)
  51 + query_vector: Optional[np.ndarray] = None
  52 + query_tokens: List[str] = field(default_factory=list)
  53 + contains_chinese: bool = False
  54 + contains_english: bool = False
79 55  
80 56 def to_dict(self) -> Dict[str, Any]:
81 57 """Convert to dictionary representation."""
82   - result = {
  58 + return {
83 59 "original_query": self.original_query,
84 60 "query_normalized": self.query_normalized,
85 61 "rewritten_query": self.rewritten_query,
86 62 "detected_language": self.detected_language,
87 63 "translations": self.translations,
88   - "domain": self.domain
  64 + "query_tokens": self.query_tokens,
  65 + "contains_chinese": self.contains_chinese,
  66 + "contains_english": self.contains_english,
89 67 }
90   - result["query_text_by_lang"] = self.query_text_by_lang
91   - result["search_langs"] = self.search_langs
92   - result["index_languages"] = self.index_languages
93   - result["source_in_index_languages"] = self.source_in_index_languages
94   - result["contains_chinese"] = self.contains_chinese
95   - result["contains_english"] = self.contains_english
96   - return result
97 68  
98 69  
99 70 class QueryParser:
... ... @@ -102,7 +73,7 @@ class QueryParser:
102 73 1. Normalization
103 74 2. Query rewriting (brand/category mappings, synonyms)
104 75 3. Language detection
105   - 4. Translation to target languages
  76 + 4. Translation to caller-provided target languages
106 77 5. Text embedding generation (for semantic search)
107 78 """
108 79  
... ... @@ -110,7 +81,8 @@ class QueryParser:
110 81 self,
111 82 config: SearchConfig,
112 83 text_encoder: Optional[TextEmbeddingEncoder] = None,
113   - translator: Optional[Any] = None
  84 + translator: Optional[Any] = None,
  85 + tokenizer: Optional[Callable[[str], Any]] = None,
114 86 ):
115 87 """
116 88 Initialize query parser.
... ... @@ -128,23 +100,7 @@ class QueryParser:
128 100 self.normalizer = QueryNormalizer()
129 101 self.language_detector = LanguageDetector()
130 102 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary)
131   -
132   - # Optional HanLP components (heavy). If unavailable, fall back to a lightweight tokenizer.
133   - self._tok = None
134   - self._pos_tag = None
135   - if hanlp is not None:
136   - try:
137   - logger.info("Initializing HanLP components...")
138   - self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)
139   - self._tok.config.output_spans = True
140   - self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
141   - logger.info("HanLP components initialized")
142   - except Exception as e:
143   - logger.warning(f"HanLP init failed, falling back to simple tokenizer: {e}")
144   - self._tok = None
145   - self._pos_tag = None
146   - else:
147   - logger.info("HanLP not installed; using simple tokenizer")
  103 + self._tokenizer = tokenizer or self._build_tokenizer()
148 104  
149 105 # Eager initialization (startup-time failure visibility, no lazy init in request path)
150 106 if self.config.query_config.enable_text_embedding and self._text_encoder is None:
... ... @@ -170,57 +126,81 @@ class QueryParser:
170 126 """Return pre-initialized translator."""
171 127 return self._translator
172 128  
  129 + def _build_tokenizer(self) -> Callable[[str], Any]:
  130 + """Build the tokenizer used by query parsing. No fallback path by design."""
  131 + if hanlp is None:
  132 + raise RuntimeError("HanLP is required for QueryParser tokenization")
  133 + logger.info("Initializing HanLP tokenizer...")
  134 + tokenizer = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)
  135 + tokenizer.config.output_spans = True
  136 + logger.info("HanLP tokenizer initialized")
  137 + return tokenizer
  138 +
173 139 @staticmethod
174   - def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str:
  140 + def _pick_query_translation_model(
  141 + source_lang: str,
  142 + target_lang: str,
  143 + config: SearchConfig,
  144 + source_language_in_index: bool,
  145 + ) -> str:
175 146 """Pick the translation capability for query-time translation (configurable)."""
176 147 src = str(source_lang or "").strip().lower()
177 148 tgt = str(target_lang or "").strip().lower()
  149 + qc = config.query_config
  150 +
  151 + if source_language_in_index:
  152 + if src == "zh" and tgt == "en":
  153 + return qc.zh_to_en_model
  154 + if src == "en" and tgt == "zh":
  155 + return qc.en_to_zh_model
  156 + return qc.default_translation_model
178 157  
179   - # Use dedicated models for zh<->en if configured
180 158 if src == "zh" and tgt == "en":
181   - return config.query_config.zh_to_en_model
  159 + return qc.zh_to_en_model_source_not_in_index or qc.zh_to_en_model
182 160 if src == "en" and tgt == "zh":
183   - return config.query_config.en_to_zh_model
184   -
185   - # For any other language pairs, fall back to the configurable default model.
186   - # By default this is `nllb-200-distilled-600m` (multi-lingual local model).
187   - return config.query_config.default_translation_model
188   -
189   - def _simple_tokenize(self, text: str) -> List[str]:
190   - return simple_tokenize_query(text)
191   -
192   - def _extract_keywords(self, query: str) -> str:
193   - """Extract keywords (nouns with length > 1) from query."""
194   - if self._tok is not None and self._pos_tag is not None:
195   - tok_result = self._tok(query)
196   - if not tok_result:
197   - return ""
198   - words = [x[0] for x in tok_result]
199   - pos_tags = self._pos_tag(words)
200   - keywords = []
201   - for word, pos in zip(words, pos_tags):
202   - if len(word) > 1 and isinstance(pos, str) and pos.startswith("N"):
203   - keywords.append(word)
204   - return " ".join(keywords)
205   -
206   - # Fallback: treat tokens with length > 1 as "keywords"
207   - tokens = self._simple_tokenize(query)
208   - keywords = [t for t in tokens if len(t) > 1]
209   - return " ".join(keywords)
210   -
211   - def _get_token_count(self, query: str) -> int:
212   - """Get token count (HanLP if available, otherwise simple)."""
213   - if self._tok is not None:
214   - tok_result = self._tok(query)
215   - return len(tok_result) if tok_result else 0
216   - return len(self._simple_tokenize(query))
  161 + return qc.en_to_zh_model_source_not_in_index or qc.en_to_zh_model
  162 + return qc.default_translation_model_source_not_in_index or qc.default_translation_model
  163 +
  164 + @staticmethod
  165 + def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]:
  166 + normalized: List[str] = []
  167 + seen = set()
  168 + for language in languages or []:
  169 + token = str(language or "").strip().lower()
  170 + if not token or token in seen:
  171 + continue
  172 + seen.add(token)
  173 + normalized.append(token)
  174 + return normalized
  175 +
  176 + @staticmethod
  177 + def _extract_tokens(tokenizer_result: Any) -> List[str]:
  178 + """Normalize tokenizer output into a flat token string list."""
  179 + if not tokenizer_result:
  180 + return []
  181 + if isinstance(tokenizer_result, str):
  182 + token = tokenizer_result.strip()
  183 + return [token] if token else []
  184 +
  185 + tokens: List[str] = []
  186 + for item in tokenizer_result:
  187 + token: Optional[str] = None
  188 + if isinstance(item, str):
  189 + token = item
  190 + elif isinstance(item, (list, tuple)) and item:
  191 + token = str(item[0])
  192 + elif item is not None:
  193 + token = str(item)
  194 +
  195 + if token is None:
  196 + continue
  197 + token = token.strip()
  198 + if token:
  199 + tokens.append(token)
  200 + return tokens
217 201  
218 202 def _get_query_tokens(self, query: str) -> List[str]:
219   - """Get token list (HanLP if available, otherwise simple)."""
220   - if self._tok is not None:
221   - tok_result = self._tok(query)
222   - return [x[0] for x in tok_result] if tok_result else []
223   - return self._simple_tokenize(query)
  203 + return self._extract_tokens(self._tokenizer(query))
224 204  
225 205 @staticmethod
226 206 def _contains_cjk(text: str) -> bool:
... ... @@ -237,64 +217,24 @@ class QueryParser:
237 217 return False
238 218 return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token))
239 219  
240   - @staticmethod
241   - def _extract_latin_tokens(text: str) -> List[str]:
242   - """Extract latin word tokens from query text."""
243   - return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "")
244   -
245   - def _infer_supplemental_search_langs(
246   - self,
247   - query_text: str,
248   - detected_lang: str,
249   - index_langs: List[str],
250   - ) -> List[str]:
251   - """
252   - Infer extra languages to search when the query mixes scripts.
253   -
254   - Rules:
255   - - If any Chinese characters appear, include `zh` when available.
256   - - If the query contains meaningful latin tokens, include `en` when available.
257   - "Meaningful" means either:
258   - 1) at least 2 latin tokens with length >= 4, or
259   - 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars.
260   - """
261   - supplemental: List[str] = []
262   - normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs}
263   - normalized_detected = str(detected_lang or "").strip().lower()
264   - query_text = str(query_text or "")
265   -
266   - if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh":
267   - supplemental.append("zh")
268   -
269   - latin_tokens = self._extract_latin_tokens(query_text)
270   - significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4]
271   - latin_chars = sum(len(tok) for tok in latin_tokens)
272   - non_space_chars = len(re.sub(r"\s+", "", query_text))
273   - latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0
274   - has_meaningful_english = (
275   - len(significant_latin_tokens) >= 2 or
276   - (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2)
277   - )
278   -
279   - if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en":
280   - supplemental.append("en")
281   -
282   - return supplemental
283   -
284 220 def parse(
285 221 self,
286 222 query: str,
287 223 tenant_id: Optional[str] = None,
288 224 generate_vector: bool = True,
289   - context: Optional[Any] = None
  225 + context: Optional[Any] = None,
  226 + target_languages: Optional[List[str]] = None,
290 227 ) -> ParsedQuery:
291 228 """
292 229 Parse query through all processing stages.
293 230  
294 231 Args:
295 232 query: Raw query string
  233 + tenant_id: Deprecated and ignored by QueryParser. Kept temporarily
  234 + to avoid a wider refactor in this first step.
296 235 generate_vector: Whether to generate query embedding
297 236 context: Optional request context for tracking and logging
  237 + target_languages: Translation target languages decided by the caller
298 238  
299 239 Returns:
300 240 ParsedQuery object with all processing results
... ... @@ -325,15 +265,9 @@ class QueryParser:
325 265 if context:
326 266 context.store_intermediate_result('query_normalized', normalized)
327 267  
328   - # Extract domain if present (e.g., "brand:Nike" -> domain="brand", query="Nike")
329   - domain, query_text = self.normalizer.extract_domain_query(normalized)
330   - log_debug(f"Domain extraction | Domain: '{domain}', Query: '{query_text}'")
331   - if context:
332   - context.store_intermediate_result('extracted_domain', domain)
333   - context.store_intermediate_result('domain_query', query_text)
334   -
335 268 # Stage 2: Query rewriting
336   - rewritten = None
  269 + query_text = normalized
  270 + rewritten = normalized
337 271 if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists
338 272 rewritten = self.rewriter.rewrite(query_text)
339 273 if rewritten != query_text:
... ... @@ -351,43 +285,57 @@ class QueryParser:
351 285 log_info(f"Language detection | Detected language: {detected_lang}")
352 286 if context:
353 287 context.store_intermediate_result('detected_language', detected_lang)
  288 + # Stage 4: Query analysis (tokenization + script flags)
  289 + query_tokens = self._get_query_tokens(query_text)
  290 + contains_chinese = self._contains_cjk(query_text)
  291 + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)
  292 +
  293 + log_debug(
  294 + f"Query analysis | Query tokens: {query_tokens} | "
  295 + f"contains_chinese={contains_chinese} | contains_english={contains_english}"
  296 + )
  297 + if context:
  298 + context.store_intermediate_result('query_tokens', query_tokens)
  299 + context.store_intermediate_result('contains_chinese', contains_chinese)
  300 + context.store_intermediate_result('contains_english', contains_english)
354 301  
355   - # Stage 4: Translation — always submit to thread pool; results are collected together with
356   - # embedding in one wait() that uses a configurable budget (short vs long by source-in-index).
  302 + # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the
  303 + # caller decides translation targets and later search-field planning.
357 304 translations: Dict[str, str] = {}
358   - translation_futures: Dict[str, Any] = {}
359   - translation_executor: Optional[ThreadPoolExecutor] = None
360   - index_langs: List[str] = []
  305 + future_to_task: Dict[Any, Tuple[str, Optional[str]]] = {}
  306 + async_executor: Optional[ThreadPoolExecutor] = None
361 307 detected_norm = str(detected_lang or "").strip().lower()
  308 + normalized_targets = self._normalize_language_codes(target_languages)
  309 + translation_targets = [lang for lang in normalized_targets if lang != detected_norm]
  310 + source_language_in_index = bool(normalized_targets) and detected_norm in normalized_targets
  311 +
  312 + # Stage 6: Text embedding - async execution
  313 + query_vector = None
  314 + should_generate_embedding = (
  315 + generate_vector and
  316 + self.config.query_config.enable_text_embedding
  317 + )
  318 +
  319 + task_count = len(translation_targets) + (1 if should_generate_embedding else 0)
  320 + if task_count > 0:
  321 + async_executor = ThreadPoolExecutor(
  322 + max_workers=max(1, min(task_count, 4)),
  323 + thread_name_prefix="query-enrichment",
  324 + )
362 325  
363 326 try:
364   - # 根据租户配置的 index_languages 决定翻译目标语言
365   - from config.tenant_config_loader import get_tenant_config_loader
366   - tenant_loader = get_tenant_config_loader()
367   - tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default")
368   - raw_index_langs = tenant_cfg.get("index_languages") or []
369   - index_langs = []
370   - seen_langs = set()
371   - for lang in raw_index_langs:
372   - norm_lang = str(lang or "").strip().lower()
373   - if not norm_lang or norm_lang in seen_langs:
374   - continue
375   - seen_langs.add(norm_lang)
376   - index_langs.append(norm_lang)
377   -
378   - target_langs_for_translation = [lang for lang in index_langs if lang != detected_norm]
379   -
380   - if target_langs_for_translation:
381   - translation_executor = ThreadPoolExecutor(
382   - max_workers=max(1, min(len(target_langs_for_translation), 4)),
383   - thread_name_prefix="query-translation",
384   - )
385   - for lang in target_langs_for_translation:
386   - model_name = self._pick_query_translation_model(detected_lang, lang, self.config)
  327 + if async_executor is not None:
  328 + for lang in translation_targets:
  329 + model_name = self._pick_query_translation_model(
  330 + detected_lang,
  331 + lang,
  332 + self.config,
  333 + source_language_in_index,
  334 + )
387 335 log_debug(
388 336 f"Submitting query translation | source={detected_lang} target={lang} model={model_name}"
389 337 )
390   - translation_futures[lang] = translation_executor.submit(
  338 + future = async_executor.submit(
391 339 self.translator.translate,
392 340 query_text,
393 341 lang,
... ... @@ -395,107 +343,61 @@ class QueryParser:
395 343 "ecommerce_search_query",
396 344 model_name,
397 345 )
398   -
399   - if context:
400   - context.store_intermediate_result('translations', translations)
401   - for lang, translation in translations.items():
402   - if translation:
403   - context.store_intermediate_result(f'translation_{lang}', translation)
404   -
  346 + future_to_task[future] = ("translation", lang)
  347 +
  348 + if should_generate_embedding:
  349 + if self.text_encoder is None:
  350 + raise RuntimeError("Text embedding is enabled but text encoder is not initialized")
  351 + log_debug("Submitting query vector generation")
  352 +
  353 + def _encode_query_vector() -> Optional[np.ndarray]:
  354 + arr = self.text_encoder.encode([query_text], priority=1)
  355 + if arr is None or len(arr) == 0:
  356 + return None
  357 + vec = arr[0]
  358 + if vec is None:
  359 + return None
  360 + return np.asarray(vec, dtype=np.float32)
  361 +
  362 + future = async_executor.submit(_encode_query_vector)
  363 + future_to_task[future] = ("embedding", None)
405 364 except Exception as e:
406   - error_msg = f"Translation failed | Error: {str(e)}"
  365 + error_msg = f"Async query enrichment submission failed | Error: {str(e)}"
407 366 log_info(error_msg)
408 367 if context:
409 368 context.add_warning(error_msg)
  369 + if async_executor is not None:
  370 + async_executor.shutdown(wait=False)
  371 + async_executor = None
  372 + future_to_task.clear()
410 373  
411   - # Stage 5: Query analysis (keywords, token count, query_tokens)
412   - keywords = self._extract_keywords(query_text)
413   - query_tokens = self._get_query_tokens(query_text)
414   - token_count = len(query_tokens)
415   - contains_chinese = self._contains_cjk(query_text)
416   - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)
417   -
418   - log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | "
419   - f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | "
420   - f"contains_english={contains_english}")
421   - if context:
422   - context.store_intermediate_result('keywords', keywords)
423   - context.store_intermediate_result('token_count', token_count)
424   - context.store_intermediate_result('query_tokens', query_tokens)
425   - context.store_intermediate_result('contains_chinese', contains_chinese)
426   - context.store_intermediate_result('contains_english', contains_english)
427   -
428   - # Stage 6: Text embedding (only for non-short queries) - async execution
429   - query_vector = None
430   - embedding_future = None
431   - should_generate_embedding = (
432   - generate_vector and
433   - self.config.query_config.enable_text_embedding and
434   - domain == "default"
435   - )
436   -
437   - encoding_executor = None
438   - if should_generate_embedding:
439   - try:
440   - if self.text_encoder is None:
441   - raise RuntimeError("Text embedding is enabled but text encoder is not initialized")
442   - log_debug("Starting query vector generation (async)")
443   - # Submit encoding task to thread pool for async execution
444   - encoding_executor = ThreadPoolExecutor(max_workers=1)
445   - def _encode_query_vector() -> Optional[np.ndarray]:
446   - arr = self.text_encoder.encode([query_text], priority=1)
447   - if arr is None or len(arr) == 0:
448   - return None
449   - vec = arr[0]
450   - return vec if isinstance(vec, np.ndarray) else None
451   - embedding_future = encoding_executor.submit(
452   - _encode_query_vector
453   - )
454   - except Exception as e:
455   - error_msg = f"Query vector generation task submission failed | Error: {str(e)}"
456   - log_info(error_msg)
457   - if context:
458   - context.add_warning(error_msg)
459   - encoding_executor = None
460   - embedding_future = None
461   -
462   - # Wait for translation + embedding concurrently; shared budget (ms) depends on whether
463   - # the detected language is in tenant index_languages.
  374 + # Wait for translation + embedding concurrently; shared budget depends on whether
  375 + # the detected language belongs to caller-provided target_languages.
464 376 qc = self.config.query_config
465   - source_in_index_for_budget = detected_norm in index_langs
  377 + source_in_target_languages = bool(normalized_targets) and detected_norm in normalized_targets
466 378 budget_ms = (
467 379 qc.translation_embedding_wait_budget_ms_source_in_index
468   - if source_in_index_for_budget
  380 + if source_in_target_languages
469 381 else qc.translation_embedding_wait_budget_ms_source_not_in_index
470 382 )
471 383 budget_sec = max(0.0, float(budget_ms) / 1000.0)
472 384  
473   - if translation_futures:
  385 + if translation_targets:
474 386 log_info(
475 387 f"Translation+embedding shared wait budget | budget_ms={budget_ms} | "
476   - f"source_in_index_languages={source_in_index_for_budget} | "
477   - f"translation_targets={list(translation_futures.keys())}"
  388 + f"source_in_target_languages={source_in_target_languages} | "
  389 + f"translation_targets={translation_targets}"
478 390 )
479 391  
480   - if translation_futures or embedding_future:
  392 + if future_to_task:
481 393 log_debug(
482 394 f"Waiting for async tasks (translation+embedding) | budget_ms={budget_ms} | "
483   - f"source_in_index_languages={source_in_index_for_budget}"
  395 + f"source_in_target_languages={source_in_target_languages}"
484 396 )
485 397  
486   - all_futures: List[Any] = []
487   - future_to_lang: Dict[Any, tuple] = {}
488   - for lang, future in translation_futures.items():
489   - all_futures.append(future)
490   - future_to_lang[future] = ("translation", lang)
491   -
492   - if embedding_future:
493   - all_futures.append(embedding_future)
494   - future_to_lang[embedding_future] = ("embedding", None)
495   -
496   - done, not_done = wait(all_futures, timeout=budget_sec)
  398 + done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec)
497 399 for future in done:
498   - task_type, lang = future_to_lang[future]
  400 + task_type, lang = future_to_task[future]
499 401 try:
500 402 result = future.result()
501 403 if task_type == "translation":
... ... @@ -528,7 +430,7 @@ class QueryParser:
528 430  
529 431 if not_done:
530 432 for future in not_done:
531   - task_type, lang = future_to_lang[future]
  433 + task_type, lang = future_to_task[future]
532 434 if task_type == "translation":
533 435 timeout_msg = (
534 436 f"Translation timeout (>{budget_ms}ms) | Language: {lang} | "
... ... @@ -542,68 +444,21 @@ class QueryParser:
542 444 if context:
543 445 context.add_warning(timeout_msg)
544 446  
545   - if encoding_executor:
546   - encoding_executor.shutdown(wait=False)
547   - if translation_executor:
548   - translation_executor.shutdown(wait=False)
  447 + if async_executor:
  448 + async_executor.shutdown(wait=False)
549 449  
550 450 if translations and context:
551 451 context.store_intermediate_result("translations", translations)
552   -
553   - # Build language-scoped query plan: source language + available translations
554   - query_text_by_lang: Dict[str, str] = {}
555   - if query_text:
556   - query_text_by_lang[detected_lang] = query_text
557   - for lang, translated_text in (translations or {}).items():
558   - if translated_text and str(translated_text).strip():
559   - query_text_by_lang[str(lang).strip().lower()] = str(translated_text)
560   -
561   - supplemental_search_langs = self._infer_supplemental_search_langs(
562   - query_text=query_text,
563   - detected_lang=detected_lang,
564   - index_langs=index_langs,
565   - )
566   - for lang in supplemental_search_langs:
567   - if lang not in query_text_by_lang and query_text:
568   - # Use the original mixed-script query as a robust fallback probe for that language field set.
569   - query_text_by_lang[lang] = query_text
570   -
571   - source_in_index_languages = detected_norm in index_langs
572   - ordered_search_langs: List[str] = []
573   - seen_order = set()
574   - if detected_lang in query_text_by_lang:
575   - ordered_search_langs.append(detected_lang)
576   - seen_order.add(detected_lang)
577   - for lang in index_langs:
578   - if lang in query_text_by_lang and lang not in seen_order:
579   - ordered_search_langs.append(lang)
580   - seen_order.add(lang)
581   - for lang in query_text_by_lang.keys():
582   - if lang not in seen_order:
583   - ordered_search_langs.append(lang)
584   - seen_order.add(lang)
585   -
586   - if context:
587   - context.store_intermediate_result("search_langs", ordered_search_langs)
588   - context.store_intermediate_result("query_text_by_lang", query_text_by_lang)
589   - context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs)
590 452  
591 453 # Build result
592 454 result = ParsedQuery(
593 455 original_query=query,
594 456 query_normalized=normalized,
595   - rewritten_query=rewritten,
  457 + rewritten_query=query_text,
596 458 detected_language=detected_lang,
597 459 translations=translations,
598 460 query_vector=query_vector,
599   - domain=domain,
600   - keywords=keywords,
601   - token_count=token_count,
602 461 query_tokens=query_tokens,
603   - query_text_by_lang=query_text_by_lang,
604   - search_langs=ordered_search_langs,
605   - index_languages=index_langs,
606   - source_in_index_languages=source_in_index_languages,
607 462 contains_chinese=contains_chinese,
608 463 contains_english=contains_english,
609 464 )
... ... @@ -611,14 +466,13 @@ class QueryParser:
611 466 if context and hasattr(context, 'logger'):
612 467 context.logger.info(
613 468 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | "
614   - f"Language: {detected_lang} | Domain: {domain} | "
615 469 f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}",
616 470 extra={'reqid': context.reqid, 'uid': context.uid}
617 471 )
618 472 else:
619 473 logger.info(
620 474 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | "
621   - f"Language: {detected_lang} | Domain: {domain}"
  475 + f"Language: {detected_lang}"
622 476 )
623 477  
624 478 return result
... ...
requirements_hanlp.txt 0 → 100644
... ... @@ -0,0 +1,13 @@
  1 +# Optional: HanLP query tokenization for the main backend venv (QueryParser).
  2 +#
  3 +# Install:
  4 +# source activate.sh
  5 +# pip install -r requirements_hanlp.txt
  6 +#
  7 +# Why pin transformers<5:
  8 +# transformers 5.x no longer exposes `encode_plus` on `BertTokenizer`, but HanLP 2.1.x
  9 +# still calls it → AttributeError during `hanlp.load(...)`.
  10 +# Use transformers 4.44+ (4.x) which remains API-compatible with HanLP.
  11 +
  12 +hanlp>=2.1.0
  13 +transformers>=4.44,<5
... ...
scripts/eval_search_quality.py
... ... @@ -83,7 +83,6 @@ class RankedItem:
83 83 text_score: float | None
84 84 text_source_score: float | None
85 85 text_translation_score: float | None
86   - text_fallback_score: float | None
87 86 text_primary_score: float | None
88 87 text_support_score: float | None
89 88 knn_score: float | None
... ... @@ -146,7 +145,6 @@ def _evaluate_query(searcher, tenant_id: str, query: str) -> Dict[str, Any]:
146 145 text_score=_to_float(debug_item.get("text_score")),
147 146 text_source_score=_to_float(debug_item.get("text_source_score")),
148 147 text_translation_score=_to_float(debug_item.get("text_translation_score")),
149   - text_fallback_score=_to_float(debug_item.get("text_fallback_score")),
150 148 text_primary_score=_to_float(debug_item.get("text_primary_score")),
151 149 text_support_score=_to_float(debug_item.get("text_support_score")),
152 150 knn_score=_to_float(debug_item.get("knn_score")),
... ... @@ -185,12 +183,11 @@ def _render_markdown(report: Dict[str, Any]) -> str:
185 183 f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}"
186 184 )
187 185 lines.append(
188   - f"- detected_language={qa.get('detected_language')} search_langs={qa.get('search_langs')} supplemental_search_langs={qa.get('supplemental_search_langs')}"
  186 + f"- detected_language={qa.get('detected_language')} translations={qa.get('translations')}"
189 187 )
190   - lines.append(f"- query_text_by_lang={qa.get('query_text_by_lang')}")
191 188 lines.append("")
192   - lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | text_fb | knn | es | matched_queries |")
193   - lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |")
  189 + lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | knn | es | matched_queries |")
  190 + lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |")
194 191 for item in entry.get("top20", []):
195 192 title = str(item.get("title", "")).replace("|", "/")
196 193 matched = json.dumps(item.get("matched_queries"), ensure_ascii=False)
... ... @@ -199,7 +196,7 @@ def _render_markdown(report: Dict[str, Any]) -&gt; str:
199 196 f"| {item.get('rank')} | {item.get('spu_id')} | {title} | "
200 197 f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | "
201 198 f"{item.get('text_source_score')} | {item.get('text_translation_score')} | "
202   - f"{item.get('text_fallback_score')} | {item.get('knn_score')} | {item.get('es_score')} | {matched} |"
  199 + f"{item.get('knn_score')} | {item.get('es_score')} | {matched} |"
203 200 )
204 201 lines.append("")
205 202 return "\n".join(lines)
... ...
search/es_query_builder.py
... ... @@ -36,11 +36,12 @@ class ESQueryBuilder:
36 36 base_minimum_should_match: str = "70%",
37 37 translation_minimum_should_match: str = "70%",
38 38 translation_boost: float = 0.4,
39   - translation_boost_when_source_missing: float = 1.0,
40   - source_boost_when_missing: float = 0.6,
41   - original_query_fallback_boost_when_translation_missing: float = 0.2,
42 39 tie_breaker_base_query: float = 0.9,
43 40 mixed_script_merged_field_boost_scale: float = 0.6,
  41 + phrase_match_base_fields: Optional[Tuple[str, ...]] = None,
  42 + phrase_match_slop: int = 2,
  43 + phrase_match_tie_breaker: float = 0.4,
  44 + phrase_match_boost: float = 3.0,
44 45 ):
45 46 """
46 47 Initialize query builder.
... ... @@ -74,13 +75,12 @@ class ESQueryBuilder:
74 75 self.base_minimum_should_match = base_minimum_should_match
75 76 self.translation_minimum_should_match = translation_minimum_should_match
76 77 self.translation_boost = float(translation_boost)
77   - self.translation_boost_when_source_missing = float(translation_boost_when_source_missing)
78   - self.source_boost_when_missing = float(source_boost_when_missing)
79   - self.original_query_fallback_boost_when_translation_missing = float(
80   - original_query_fallback_boost_when_translation_missing
81   - )
82 78 self.tie_breaker_base_query = float(tie_breaker_base_query)
83 79 self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale)
  80 + self.phrase_match_base_fields = tuple(phrase_match_base_fields or ("title", "qanchors"))
  81 + self.phrase_match_slop = int(phrase_match_slop)
  82 + self.phrase_match_tie_breaker = float(phrase_match_tie_breaker)
  83 + self.phrase_match_boost = float(phrase_match_boost)
84 84  
85 85 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None:
86 86 """
... ... @@ -159,7 +159,8 @@ class ESQueryBuilder:
159 159 knn_k: int = 50,
160 160 knn_num_candidates: int = 200,
161 161 min_score: Optional[float] = None,
162   - parsed_query: Optional[Any] = None
  162 + parsed_query: Optional[Any] = None,
  163 + index_languages: Optional[List[str]] = None,
163 164 ) -> Dict[str, Any]:
164 165 """
165 166 Build complete ES query with post_filter support for multi-select faceting.
... ... @@ -167,7 +168,7 @@ class ESQueryBuilder:
167 168 结构:filters and (text_recall or embedding_recall) + post_filter
168 169 - conjunctive_filters: 应用在 query.bool.filter(影响结果和聚合)
169 170 - disjunctive_filters: 应用在 post_filter(只影响结果,不影响聚合)
170   - - text_recall: 文本相关性召回(按 search_langs 动态语言字段)
  171 + - text_recall: 文本相关性召回(按实际 clause 语言动态字段)
171 172 - embedding_recall: 向量召回(KNN)
172 173 - function_score: 包装召回部分,支持提权字段
173 174  
... ... @@ -202,7 +203,11 @@ class ESQueryBuilder:
202 203 # Text recall (always include if query_text exists)
203 204 if query_text:
204 205 # Unified text query strategy
205   - text_query = self._build_advanced_text_query(query_text, parsed_query)
  206 + text_query = self._build_advanced_text_query(
  207 + query_text,
  208 + parsed_query,
  209 + index_languages=index_languages,
  210 + )
206 211 recall_clauses.append(text_query)
207 212  
208 213 # Embedding recall (KNN - separate from query, handled below)
... ... @@ -456,6 +461,44 @@ class ESQueryBuilder:
456 461 """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``."""
457 462 return [self._format_field_with_boost(path, boost) for path, boost in specs]
458 463  
  464 + def _build_phrase_match_fields(self, language: str) -> List[str]:
  465 + """Fields for phrase multi_match: base names × ``.{lang}`` with ``field_boosts``."""
  466 + lang = (language or "").strip().lower()
  467 + if not lang:
  468 + return []
  469 + out: List[str] = []
  470 + for base in self.phrase_match_base_fields:
  471 + path = f"{base}.{lang}"
  472 + boost = self._get_field_boost(base, lang)
  473 + out.append(self._format_field_with_boost(path, boost))
  474 + return out
  475 +
  476 + def _append_phrase_should_clause(
  477 + self,
  478 + should_clauses: List[Dict[str, Any]],
  479 + lang: str,
  480 + lang_query: str,
  481 + clause_name: str
  482 + ) -> None:
  483 + text = (lang_query or "").strip()
  484 + if not text:
  485 + return
  486 + phrase_fields = self._build_phrase_match_fields(lang)
  487 + if not phrase_fields:
  488 + return
  489 + boost = self.phrase_match_boost
  490 + should_clauses.append({
  491 + "multi_match": {
  492 + "_name": f"{clause_name}_phrase",
  493 + "query": lang_query,
  494 + "type": "phrase",
  495 + "fields": phrase_fields,
  496 + "slop": self.phrase_match_slop,
  497 + "tie_breaker": self.phrase_match_tie_breaker,
  498 + "boost": boost,
  499 + }
  500 + })
  501 +
459 502 def _merge_supplemental_lang_field_specs(
460 503 self,
461 504 specs: List[MatchFieldSpec],
... ... @@ -479,6 +522,7 @@ class ESQueryBuilder:
479 522 contains_chinese: bool,
480 523 contains_english: bool,
481 524 index_languages: List[str],
  525 + is_source: bool = False
482 526 ) -> List[MatchFieldSpec]:
483 527 """
484 528 When the query mixes scripts, widen each clause to indexed fields for the other script
... ... @@ -492,10 +536,11 @@ class ESQueryBuilder:
492 536  
493 537 out = list(specs)
494 538 lnorm = (lang or "").strip().lower()
495   - if contains_english and lnorm != "en" and can_use("en"):
496   - out = self._merge_supplemental_lang_field_specs(out, "en")
497   - if contains_chinese and lnorm != "zh" and can_use("zh"):
498   - out = self._merge_supplemental_lang_field_specs(out, "zh")
  539 + if is_source:
  540 + if contains_english and lnorm != "en" and can_use("en"):
  541 + out = self._merge_supplemental_lang_field_specs(out, "en")
  542 + if contains_chinese and lnorm != "zh" and can_use("zh"):
  543 + out = self._merge_supplemental_lang_field_specs(out, "zh")
499 544 return out
500 545  
501 546 def _get_embedding_field(self, language: str) -> str:
... ... @@ -503,13 +548,31 @@ class ESQueryBuilder:
503 548 # Currently using unified embedding field
504 549 return self.text_embedding_field or "title_embedding"
505 550  
506   - def _build_advanced_text_query(self, query_text: str, parsed_query: Optional[Any] = None) -> Dict[str, Any]:
  551 + @staticmethod
  552 + def _normalize_language_list(languages: Optional[List[str]]) -> List[str]:
  553 + normalized: List[str] = []
  554 + seen = set()
  555 + for language in languages or []:
  556 + token = str(language or "").strip().lower()
  557 + if not token or token in seen:
  558 + continue
  559 + seen.add(token)
  560 + normalized.append(token)
  561 + return normalized
  562 +
  563 + def _build_advanced_text_query(
  564 + self,
  565 + query_text: str,
  566 + parsed_query: Optional[Any] = None,
  567 + *,
  568 + index_languages: Optional[List[str]] = None,
  569 + ) -> Dict[str, Any]:
507 570 """
508   - Build advanced text query using should clauses with primary and fallback lexical strategies.
  571 + Build advanced text query using base and translated lexical clauses.
509 572  
510 573 Unified implementation:
511 574 - base_query: source-language clause
512   - - translation queries: target-language clauses from search_langs/query_text_by_lang
  575 + - translation queries: target-language clauses from translations
513 576 - KNN query: added separately in build_query
514 577  
515 578 Args:
... ... @@ -520,66 +583,41 @@ class ESQueryBuilder:
520 583 ES bool query with should clauses
521 584 """
522 585 should_clauses = []
523   -
524   - # Get query analysis from parsed_query
525   - query_text_by_lang: Dict[str, str] = {}
526   - search_langs: List[str] = []
527 586 source_lang = self.default_language
528   - source_in_index_languages = True
529   - index_languages: List[str] = []
530   -
  587 + translations: Dict[str, str] = {}
531 588 contains_chinese = False
532 589 contains_english = False
  590 + normalized_index_languages = self._normalize_language_list(index_languages)
  591 +
533 592 if parsed_query:
534   - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {}
535   - search_langs = getattr(parsed_query, "search_langs", None) or []
536 593 detected_lang = getattr(parsed_query, "detected_language", None)
537 594 source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language
538   - source_in_index_languages = bool(
539   - getattr(parsed_query, "source_in_index_languages", True)
540   - )
541   - index_languages = getattr(parsed_query, "index_languages", None) or []
  595 + translations = getattr(parsed_query, "translations", None) or {}
542 596 contains_chinese = bool(getattr(parsed_query, "contains_chinese", False))
543 597 contains_english = bool(getattr(parsed_query, "contains_english", False))
544 598  
545   - if not query_text_by_lang:
546   - query_text_by_lang = {source_lang: query_text}
547   - if source_lang not in query_text_by_lang and query_text:
548   - query_text_by_lang[source_lang] = query_text
549   - if not search_langs:
550   - search_langs = list(query_text_by_lang.keys())
551   -
552   - # Base + translated clauses based on language plan.
553   - for lang in search_langs:
554   - lang_query = query_text_by_lang.get(lang)
555   - if not lang_query:
556   - continue
  599 + source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language
  600 + base_query_text = (
  601 + getattr(parsed_query, "rewritten_query", None) if parsed_query else None
  602 + ) or query_text
  603 +
  604 + def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None:
  605 + nonlocal should_clauses
557 606 all_specs, _ = self._build_match_field_specs(lang)
558 607 expanded_specs = self._expand_match_field_specs_for_mixed_script(
559 608 lang,
560 609 all_specs,
561 610 contains_chinese,
562 611 contains_english,
563   - index_languages,
  612 + normalized_index_languages,
  613 + is_source,
564 614 )
565 615 match_fields = self._format_match_field_specs(expanded_specs)
566 616 if not match_fields:
567   - continue
568   -
569   - is_source = (lang == source_lang)
570   - clause_boost = 1.0
571   - clause_name = "base_query" if is_source else f"base_query_trans_{lang}"
  617 + return
572 618 minimum_should_match = (
573 619 self.base_minimum_should_match if is_source else self.translation_minimum_should_match
574 620 )
575   - if is_source and not source_in_index_languages:
576   - clause_boost = self.source_boost_when_missing
577   - elif not is_source:
578   - clause_boost = (
579   - self.translation_boost
580   - if source_in_index_languages
581   - else self.translation_boost_when_source_missing
582   - )
583 621  
584 622 clause = {
585 623 "multi_match": {
... ... @@ -590,55 +628,34 @@ class ESQueryBuilder:
590 628 "tie_breaker": self.tie_breaker_base_query,
591 629 }
592 630 }
593   - if abs(clause_boost - 1.0) > 1e-9:
594   - clause["multi_match"]["boost"] = clause_boost
  631 + # base_query: never set multi_match.boost (ES default 1.0).
  632 + # Translation clauses: single knob from config — translation_boost.
  633 + if not is_source:
  634 + tb = float(self.translation_boost)
  635 + clause["multi_match"]["boost"] = tb
595 636 should_clauses.append({
596 637 "multi_match": clause["multi_match"]
597 638 })
  639 + self._append_phrase_should_clause(
  640 + should_clauses, lang, lang_query, clause_name
  641 + )
598 642  
599   - # Fallback: source language is not indexed and translation for some index languages is missing.
600   - # Use original query text on missing index-language fields with a low boost.
601   - if not source_in_index_languages and query_text and index_languages:
602   - normalized_index_langs: List[str] = []
603   - seen_langs = set()
604   - for lang in index_languages:
605   - norm_lang = str(lang or "").strip().lower()
606   - if not norm_lang or norm_lang in seen_langs:
607   - continue
608   - seen_langs.add(norm_lang)
609   - normalized_index_langs.append(norm_lang)
  643 + if base_query_text:
  644 + append_clause(source_lang, base_query_text, "base_query", True)
610 645  
611   - for lang in normalized_index_langs:
612   - if lang == source_lang:
613   - continue
614   - if lang in query_text_by_lang:
615   - continue
616   - fb_specs, _ = self._build_match_field_specs(lang)
617   - expanded_fb = self._expand_match_field_specs_for_mixed_script(
618   - lang,
619   - fb_specs,
620   - contains_chinese,
621   - contains_english,
622   - index_languages,
623   - )
624   - match_fields = self._format_match_field_specs(expanded_fb)
625   - if not match_fields:
626   - continue
627   - should_clauses.append({
628   - "multi_match": {
629   - "_name": f"fallback_original_query_{lang}",
630   - "query": query_text,
631   - "fields": match_fields,
632   - "minimum_should_match": self.translation_minimum_should_match,
633   - "tie_breaker": self.tie_breaker_base_query,
634   - "boost": self.original_query_fallback_boost_when_translation_missing,
635   - }
636   - })
  646 + for lang, translated_text in translations.items():
  647 + normalized_lang = str(lang or "").strip().lower()
  648 + normalized_text = str(translated_text or "").strip()
  649 + if not normalized_lang or not normalized_text:
  650 + continue
  651 + if normalized_lang == source_lang and normalized_text == base_query_text:
  652 + continue
  653 + append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False)
637 654  
638 655 # Fallback to a simple query when language fields cannot be resolved.
639 656 if not should_clauses:
640 657 fallback_fields = self.match_fields or ["title.en^1.0"]
641   - return {
  658 + fallback_lexical = {
642 659 "multi_match": {
643 660 "_name": "base_query_fallback",
644 661 "query": query_text,
... ... @@ -647,6 +664,21 @@ class ESQueryBuilder:
647 664 "tie_breaker": self.tie_breaker_base_query,
648 665 }
649 666 }
  667 + fb_should: List[Dict[str, Any]] = [fallback_lexical]
  668 + self._append_phrase_should_clause(
  669 + fb_should,
  670 + self.default_language,
  671 + query_text,
  672 + "base_query_fallback"
  673 + )
  674 + if len(fb_should) == 1:
  675 + return fallback_lexical
  676 + return {
  677 + "bool": {
  678 + "should": fb_should,
  679 + "minimum_should_match": 1,
  680 + }
  681 + }
650 682  
651 683 # Return bool query with should clauses
652 684 if len(should_clauses) == 1:
... ...
search/rerank_client.py
... ... @@ -116,7 +116,6 @@ def _extract_named_query_score(matched_queries: Any, name: str) -> float:
116 116 def _collect_text_score_components(matched_queries: Any, fallback_es_score: float) -> Dict[str, float]:
117 117 source_score = _extract_named_query_score(matched_queries, "base_query")
118 118 translation_score = 0.0
119   - fallback_score = 0.0
120 119  
121 120 if isinstance(matched_queries, dict):
122 121 for query_name, score in matched_queries.items():
... ... @@ -125,21 +124,16 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa
125 124 numeric_score = _to_score(score)
126 125 if query_name.startswith("base_query_trans_"):
127 126 translation_score = max(translation_score, numeric_score)
128   - elif query_name.startswith("fallback_original_query_"):
129   - fallback_score = max(fallback_score, numeric_score)
130 127 elif isinstance(matched_queries, list):
131 128 for query_name in matched_queries:
132 129 if not isinstance(query_name, str):
133 130 continue
134 131 if query_name.startswith("base_query_trans_"):
135 132 translation_score = 1.0
136   - elif query_name.startswith("fallback_original_query_"):
137   - fallback_score = 1.0
138 133  
139 134 weighted_source = source_score
140 135 weighted_translation = 0.8 * translation_score
141   - weighted_fallback = 0.55 * fallback_score
142   - weighted_components = [weighted_source, weighted_translation, weighted_fallback]
  136 + weighted_components = [weighted_source, weighted_translation]
143 137 primary_text_score = max(weighted_components)
144 138 support_text_score = sum(weighted_components) - primary_text_score
145 139 text_score = primary_text_score + 0.25 * support_text_score
... ... @@ -153,10 +147,8 @@ def _collect_text_score_components(matched_queries: Any, fallback_es_score: floa
153 147 return {
154 148 "source_score": source_score,
155 149 "translation_score": translation_score,
156   - "fallback_score": fallback_score,
157 150 "weighted_source_score": weighted_source,
158 151 "weighted_translation_score": weighted_translation,
159   - "weighted_fallback_score": weighted_fallback,
160 152 "primary_text_score": primary_text_score,
161 153 "support_text_score": support_text_score,
162 154 "text_score": text_score,
... ... @@ -219,7 +211,6 @@ def fuse_scores_and_resort(
219 211 hit["_knn_score"] = knn_score
220 212 hit["_text_source_score"] = text_components["source_score"]
221 213 hit["_text_translation_score"] = text_components["translation_score"]
222   - hit["_text_fallback_score"] = text_components["fallback_score"]
223 214 hit["_text_primary_score"] = text_components["primary_text_score"]
224 215 hit["_text_support_score"] = text_components["support_text_score"]
225 216 hit["_fused_score"] = fused
... ... @@ -231,7 +222,6 @@ def fuse_scores_and_resort(
231 222 "text_score": text_score,
232 223 "text_source_score": text_components["source_score"],
233 224 "text_translation_score": text_components["translation_score"],
234   - "text_fallback_score": text_components["fallback_score"],
235 225 "text_primary_score": text_components["primary_text_score"],
236 226 "text_support_score": text_components["support_text_score"],
237 227 "knn_score": knn_score,
... ...
search/searcher.py
... ... @@ -132,11 +132,6 @@ class Searcher:
132 132 base_minimum_should_match=self.config.query_config.base_minimum_should_match,
133 133 translation_minimum_should_match=self.config.query_config.translation_minimum_should_match,
134 134 translation_boost=self.config.query_config.translation_boost,
135   - translation_boost_when_source_missing=self.config.query_config.translation_boost_when_source_missing,
136   - source_boost_when_missing=self.config.query_config.source_boost_when_missing,
137   - original_query_fallback_boost_when_translation_missing=(
138   - self.config.query_config.original_query_fallback_boost_when_translation_missing
139   - ),
140 135 tie_breaker_base_query=self.config.query_config.tie_breaker_base_query,
141 136 )
142 137  
... ... @@ -267,13 +262,6 @@ class Searcher:
267 262 if normalized:
268 263 candidates.append(normalized)
269 264  
270   - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", {}) or {}
271   - if isinstance(query_text_by_lang, dict):
272   - for text in query_text_by_lang.values():
273   - normalized = self._normalize_sku_match_text(text)
274   - if normalized:
275   - candidates.append(normalized)
276   -
277 265 translations = getattr(parsed_query, "translations", {}) or {}
278 266 if isinstance(translations, dict):
279 267 for text in translations.values():
... ... @@ -516,10 +504,19 @@ class Searcher:
516 504 range_filters: Range filters for numeric fields
517 505 facets: Facet configurations for faceted search
518 506 min_score: Minimum score threshold
519   - context: Request context for tracking (created if not provided)
  507 + context: Request context for tracking (required)
520 508 sort_by: Field name for sorting
521 509 sort_order: Sort order: 'asc' or 'desc'
522 510 debug: Enable debug information output
  511 + language: Response / field selection language hint (e.g. zh, en)
  512 + sku_filter_dimension: SKU grouping dimensions for per-SPU variant pick
  513 + enable_rerank: If None, use ``config.rerank.enabled``; if set, overrides
  514 + whether the rerank provider is invoked (subject to rerank window).
  515 + rerank_query_template: Override for rerank query text template; None uses
  516 + ``config.rerank.rerank_query_template`` (e.g. ``"{query}"``).
  517 + rerank_doc_template: Override for per-hit document text passed to rerank;
  518 + None uses ``config.rerank.rerank_doc_template``. Placeholders are
  519 + resolved in ``search/rerank_client.py``.
523 520  
524 521 Returns:
525 522 SearchResult object with formatted results
... ... @@ -592,7 +589,8 @@ class Searcher:
592 589 query,
593 590 tenant_id=tenant_id,
594 591 generate_vector=enable_embedding,
595   - context=context
  592 + context=context,
  593 + target_languages=index_langs if enable_translation else [],
596 594 )
597 595 # Store query analysis results in context
598 596 context.store_query_analysis(
... ... @@ -602,7 +600,7 @@ class Searcher:
602 600 detected_language=parsed_query.detected_language,
603 601 translations=parsed_query.translations,
604 602 query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None,
605   - domain=parsed_query.domain,
  603 + domain="default",
606 604 is_simple_query=True
607 605 )
608 606  
... ... @@ -610,7 +608,6 @@ class Searcher:
610 608 f"查询解析完成 | 原查询: '{parsed_query.original_query}' | "
611 609 f"重写后: '{parsed_query.rewritten_query}' | "
612 610 f"语言: {parsed_query.detected_language} | "
613   - f"域: {parsed_query.domain} | "
614 611 f"向量: {'是' if parsed_query.query_vector is not None else '否'}",
615 612 extra={'reqid': context.reqid, 'uid': context.uid}
616 613 )
... ... @@ -643,7 +640,8 @@ class Searcher:
643 640 from_=es_fetch_from,
644 641 enable_knn=enable_embedding and parsed_query.query_vector is not None,
645 642 min_score=min_score,
646   - parsed_query=parsed_query
  643 + parsed_query=parsed_query,
  644 + index_languages=index_langs,
647 645 )
648 646  
649 647 # Add facets for faceted search
... ... @@ -933,7 +931,6 @@ class Searcher:
933 931 debug_entry["text_score"] = rerank_debug.get("text_score")
934 932 debug_entry["text_source_score"] = rerank_debug.get("text_source_score")
935 933 debug_entry["text_translation_score"] = rerank_debug.get("text_translation_score")
936   - debug_entry["text_fallback_score"] = rerank_debug.get("text_fallback_score")
937 934 debug_entry["text_primary_score"] = rerank_debug.get("text_primary_score")
938 935 debug_entry["text_support_score"] = rerank_debug.get("text_support_score")
939 936 debug_entry["knn_score"] = rerank_debug.get("knn_score")
... ... @@ -985,9 +982,6 @@ class Searcher:
985 982 "rewritten_query": context.query_analysis.rewritten_query,
986 983 "detected_language": context.query_analysis.detected_language,
987 984 "translations": context.query_analysis.translations,
988   - "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}),
989   - "search_langs": context.get_intermediate_result("search_langs", []),
990   - "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []),
991 985 "has_vector": context.query_analysis.query_vector is not None,
992 986 "is_simple_query": context.query_analysis.is_simple_query,
993 987 "domain": context.query_analysis.domain
... ...
suggestion/builder.py
... ... @@ -147,7 +147,7 @@ class SuggestionIndexBuilder:
147 147 raw = str(value).strip()
148 148 if not raw:
149 149 return []
150   - parts = re.split(r"[,;|/\n\t]+", raw)
  150 + parts = re.split(r"[,、,;|/\n\t]+", raw)
151 151 out = [p.strip() for p in parts if p and p.strip()]
152 152 if not out:
153 153 return [raw]
... ... @@ -162,7 +162,7 @@ class SuggestionIndexBuilder:
162 162 s = str(raw).strip()
163 163 if not s:
164 164 return []
165   - parts = re.split(r"[,;|/\n\t]+", s)
  165 + parts = re.split(r"[,、,;|/\n\t]+", s)
166 166 out = [p.strip() for p in parts if p and p.strip()]
167 167 return out if out else [s]
168 168  
... ...
tests/test_embedding_pipeline.py
... ... @@ -73,6 +73,10 @@ class _FakeQueryEncoder:
73 73 return np.array([np.array([0.11, 0.22, 0.33], dtype=np.float32) for _ in sentences], dtype=object)
74 74  
75 75  
  76 +def _tokenizer(text):
  77 + return str(text).split()
  78 +
  79 +
76 80 class _FakeEmbeddingCache:
77 81 def __init__(self):
78 82 self.store: Dict[str, np.ndarray] = {}
... ... @@ -210,6 +214,7 @@ def test_query_parser_generates_query_vector_with_encoder():
210 214 config=_build_test_config(),
211 215 text_encoder=encoder,
212 216 translator=_FakeTranslator(),
  217 + tokenizer=_tokenizer,
213 218 )
214 219  
215 220 parsed = parser.parse("red dress", tenant_id="162", generate_vector=True)
... ... @@ -224,6 +229,7 @@ def test_query_parser_skips_query_vector_when_disabled():
224 229 config=_build_test_config(),
225 230 text_encoder=_FakeQueryEncoder(),
226 231 translator=_FakeTranslator(),
  232 + tokenizer=_tokenizer,
227 233 )
228 234  
229 235 parsed = parser.parse("red dress", tenant_id="162", generate_vector=False)
... ...
tests/test_es_query_builder.py
1 1 from types import SimpleNamespace
  2 +from typing import Any, Dict
2 3  
3 4 import numpy as np
4 5  
... ... @@ -13,6 +14,21 @@ def _builder() -> ESQueryBuilder:
13 14 )
14 15  
15 16  
  17 +def _lexical_multi_match_fields(query_root: Dict[str, Any]) -> list:
  18 + """Fields from the non-phrase multi_match (bool.should or single clause)."""
  19 + if "multi_match" in query_root:
  20 + mm = query_root["multi_match"]
  21 + if mm.get("type") == "phrase":
  22 + raise AssertionError("root multi_match is phrase-only")
  23 + return mm["fields"]
  24 + for clause in query_root.get("bool", {}).get("should", []):
  25 + mm = clause.get("multi_match") or {}
  26 + if mm.get("type") == "phrase":
  27 + continue
  28 + return mm["fields"]
  29 + raise AssertionError("no lexical multi_match in query_root")
  30 +
  31 +
16 32 def test_knn_prefilter_includes_range_filters():
17 33 qb = _builder()
18 34 q = qb.build_query(
... ... @@ -65,21 +81,49 @@ def test_knn_prefilter_not_added_without_filters():
65 81 assert q["knn"]["_name"] == "knn_query"
66 82  
67 83  
68   -def test_text_query_contains_only_base_translation_and_fallback_named_queries():
  84 +def test_text_query_contains_only_base_and_translation_named_queries():
69 85 qb = _builder()
70 86 parsed_query = SimpleNamespace(
71   - query_text_by_lang={"en": "dress", "zh": "连衣裙"},
72   - search_langs=["en", "zh"],
  87 + rewritten_query="dress",
73 88 detected_language="en",
74   - source_in_index_languages=False,
75   - index_languages=["en", "zh", "fr"],
  89 + translations={"en": "dress", "zh": "连衣裙"},
76 90 )
77 91  
78   - q = qb.build_query(query_text="dress", parsed_query=parsed_query, enable_knn=False)
  92 + q = qb.build_query(
  93 + query_text="dress",
  94 + parsed_query=parsed_query,
  95 + enable_knn=False,
  96 + index_languages=["en", "zh", "fr"],
  97 + )
79 98 should = q["query"]["bool"]["should"]
80 99 names = [clause["multi_match"]["_name"] for clause in should]
81 100  
82   - assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"]
  101 + assert names == [
  102 + "base_query",
  103 + "base_query_phrase",
  104 + "base_query_trans_zh",
  105 + "base_query_trans_zh_phrase",
  106 + ]
  107 +
  108 +
def test_text_query_skips_duplicate_translation_same_as_base():
    qb = _builder()
    parsed_query = SimpleNamespace(
        rewritten_query="dress",
        detected_language="en",
        translations={"en": "dress"},
    )

    body = qb.build_query(
        query_text="dress",
        parsed_query=parsed_query,
        enable_knn=False,
        index_languages=["en", "zh"],
    )

    # The en->en "translation" duplicates the base clause and is dropped,
    # leaving only the base clause and its phrase companion up front.
    should = body["query"]["bool"]["should"]
    leading_names = [clause["multi_match"]["_name"] for clause in should[:2]]
    assert leading_names == ["base_query", "base_query_phrase"]
83 127  
84 128  
85 129 def test_mixed_script_merges_en_fields_into_zh_clause():
... ... @@ -91,22 +135,25 @@ def test_mixed_script_merges_en_fields_into_zh_clause():
91 135 default_language="en",
92 136 )
93 137 parsed_query = SimpleNamespace(
94   - query_text_by_lang={"zh": "法式 dress"},
95   - search_langs=["zh"],
  138 + rewritten_query="法式 dress",
96 139 detected_language="zh",
97   - source_in_index_languages=True,
98   - index_languages=["zh", "en"],
  140 + translations={},
99 141 contains_chinese=True,
100 142 contains_english=True,
101 143 )
102   - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
103   - fields = q["query"]["multi_match"]["fields"]
  144 + q = qb.build_query(
  145 + query_text="法式 dress",
  146 + parsed_query=parsed_query,
  147 + enable_knn=False,
  148 + index_languages=["zh", "en"],
  149 + )
  150 + fields = _lexical_multi_match_fields(q["query"])
104 151 bases = {f.split("^", 1)[0] for f in fields}
105 152 assert "title.zh" in bases and "title.en" in bases
106 153 assert "brief.zh" in bases and "brief.en" in bases
107   - # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8)
108   - assert "title.en^0.8" in fields
109   - assert "brief.en^0.8" in fields
  154 + # Merged supplemental language fields use boost * 0.6 by default.
  155 + assert "title.en^0.6" in fields
  156 + assert "brief.en^0.6" in fields
110 157  
111 158  
112 159 def test_mixed_script_merges_zh_fields_into_en_clause():
... ... @@ -118,19 +165,22 @@ def test_mixed_script_merges_zh_fields_into_en_clause():
118 165 default_language="en",
119 166 )
120 167 parsed_query = SimpleNamespace(
121   - query_text_by_lang={"en": "red 连衣裙"},
122   - search_langs=["en"],
  168 + rewritten_query="red 连衣裙",
123 169 detected_language="en",
124   - source_in_index_languages=True,
125   - index_languages=["zh", "en"],
  170 + translations={},
126 171 contains_chinese=True,
127 172 contains_english=True,
128 173 )
129   - q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False)
130   - fields = q["query"]["multi_match"]["fields"]
  174 + q = qb.build_query(
  175 + query_text="red 连衣裙",
  176 + parsed_query=parsed_query,
  177 + enable_knn=False,
  178 + index_languages=["zh", "en"],
  179 + )
  180 + fields = _lexical_multi_match_fields(q["query"])
131 181 bases = {f.split("^", 1)[0] for f in fields}
132 182 assert "title.en" in bases and "title.zh" in bases
133   - assert "title.zh^0.8" in fields
  183 + assert "title.zh^0.6" in fields
134 184  
135 185  
136 186 def test_mixed_script_merged_fields_scale_configured_boosts():
... ... @@ -143,18 +193,21 @@ def test_mixed_script_merged_fields_scale_configured_boosts():
143 193 default_language="en",
144 194 )
145 195 parsed_query = SimpleNamespace(
146   - query_text_by_lang={"zh": "法式 dress"},
147   - search_langs=["zh"],
  196 + rewritten_query="法式 dress",
148 197 detected_language="zh",
149   - source_in_index_languages=True,
150   - index_languages=["zh", "en"],
  198 + translations={},
151 199 contains_chinese=True,
152 200 contains_english=True,
153 201 )
154   - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
155   - fields = q["query"]["multi_match"]["fields"]
  202 + q = qb.build_query(
  203 + query_text="法式 dress",
  204 + parsed_query=parsed_query,
  205 + enable_knn=False,
  206 + index_languages=["zh", "en"],
  207 + )
  208 + fields = _lexical_multi_match_fields(q["query"])
156 209 assert "title.zh^5.0" in fields
157   - assert "title.en^8.0" in fields # 10.0 * 0.8
  210 + assert "title.en^6.0" in fields # 10.0 * 0.6
158 211  
159 212  
160 213 def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
... ... @@ -166,16 +219,19 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
166 219 default_language="zh",
167 220 )
168 221 parsed_query = SimpleNamespace(
169   - query_text_by_lang={"zh": "法式 dress"},
170   - search_langs=["zh"],
  222 + rewritten_query="法式 dress",
171 223 detected_language="zh",
172   - source_in_index_languages=True,
173   - index_languages=["zh"],
  224 + translations={},
174 225 contains_chinese=True,
175 226 contains_english=True,
176 227 )
177   - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
178   - fields = q["query"]["multi_match"]["fields"]
  228 + q = qb.build_query(
  229 + query_text="法式 dress",
  230 + parsed_query=parsed_query,
  231 + enable_knn=False,
  232 + index_languages=["zh"],
  233 + )
  234 + fields = _lexical_multi_match_fields(q["query"])
179 235 bases = {f.split("^", 1)[0] for f in fields}
180 236 assert "title.zh" in bases
181 237 assert "title.en" not in bases
... ...
tests/test_es_query_builder_text_recall_languages.py 0 → 100644
... ... @@ -0,0 +1,453 @@
  1 +"""
  2 +ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*.
  3 +
  4 +Covers combinations of query language vs tenant index_languages, translations,
  5 +and mixed Chinese/English queries. Asserts multi_match _name, query text, and
  6 +target language fields (title.{lang}).
  7 +"""
  8 +
  9 +from types import SimpleNamespace
  10 +from typing import Any, Dict, List
  11 +
  12 +import numpy as np
  13 +
  14 +from search.es_query_builder import ESQueryBuilder
  15 +
  16 +
def _builder_multilingual_title_only(
    *,
    default_language: str = "en",
    mixed_script_scale: float = 0.6,
) -> ESQueryBuilder:
    """Minimal builder: only title.{lang} for easy field assertions."""
    builder_kwargs = dict(
        match_fields=["title.en^1.0"],
        multilingual_fields=["title"],
        shared_fields=[],
        text_embedding_field="title_embedding",
        default_language=default_language,
        mixed_script_merged_field_boost_scale=mixed_script_scale,
        function_score_config=None,
    )
    return ESQueryBuilder(**builder_kwargs)
  32 +
  33 +
  34 +def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]:
  35 + """Navigate bool.must / function_score wrappers to the text recall root."""
  36 + q = es_body.get("query") or {}
  37 + if "bool" in q and "must" in q["bool"] and q["bool"]["must"]:
  38 + q = q["bool"]["must"][0]
  39 + if "function_score" in q:
  40 + q = q["function_score"]["query"]
  41 + return q
  42 +
  43 +
def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect every multi_match dict from the unwrapped text recall root."""
    root = _unwrap_inner_query(es_body)
    if "multi_match" in root:
        return [root["multi_match"]]
    should_clauses = (root.get("bool") or {}).get("should") or []
    collected = []
    for clause in should_clauses:
        if "multi_match" in clause:
            collected.append(clause["multi_match"])
    return collected
  50 +
  51 +
def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Map _name -> multi_match dict."""
    named: Dict[str, Dict[str, Any]] = {}
    for clause in _extract_multi_match_clauses(es_body):
        clause_name = clause.get("_name")
        # Unnamed clauses are ignored; only named recall clauses are indexed.
        if clause_name:
            named[str(clause_name)] = clause
    return named
  60 +
  61 +
  62 +def _with_phrase(lexical_names: set[str]) -> set[str]:
  63 + """Each lexical recall clause has a companion ``*_phrase`` multi_match."""
  64 + return lexical_names | {f"{n}_phrase" for n in lexical_names}
  65 +
  66 +
  67 +def _title_fields(mm: Dict[str, Any]) -> List[str]:
  68 + fields = mm.get("fields") or []
  69 + return [f for f in fields if str(f).startswith("title.")]
  70 +
  71 +
  72 +def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool:
  73 + """True if any field is title.{lang} with optional ^boost suffix."""
  74 + prefix = f"title.{lang}"
  75 + for f in mm.get("fields") or []:
  76 + s = str(f)
  77 + if s == prefix or s.startswith(prefix + "^"):
  78 + return True
  79 + return False
  80 +
  81 +
def _build(
    qb: ESQueryBuilder,
    *,
    query_text: str,
    rewritten: str,
    detected_language: str,
    translations: Dict[str, str],
    index_languages: List[str],
    contains_chinese: bool = False,
    contains_english: bool = False,
) -> Dict[str, Any]:
    """Build an ES body from a fake parsed query carrying the given language facts."""
    fake_parsed = SimpleNamespace(
        rewritten_query=rewritten,
        detected_language=detected_language,
        translations=dict(translations),
        contains_chinese=contains_chinese,
        contains_english=contains_english,
    )
    return qb.build_query(
        query_text=query_text,
        parsed_query=fake_parsed,
        enable_knn=False,
        index_languages=index_languages,
    )
  106 +
  107 +
  108 +# --- 检测语言在 index_languages 内:主召回 + 翻译补召回 ---
  109 +
  110 +
def test_zh_query_index_zh_en_includes_base_zh_and_trans_en():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="连衣裙",
        rewritten="连衣裙",
        detected_language="zh",
        translations={"en": "dress"},
        index_languages=["zh", "en"],
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_en"})
    base = clauses["base_query"]
    trans_en = clauses["base_query_trans_en"]
    assert base["query"] == "连衣裙"
    assert "title.zh" in _title_fields(base)
    assert trans_en["query"] == "dress"
    assert "title.en" in _title_fields(trans_en)


def test_en_query_index_zh_en_includes_base_en_and_trans_zh():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_zh"})
    base = clauses["base_query"]
    trans_zh = clauses["base_query_trans_zh"]
    assert base["query"] == "dress"
    assert "title.en" in _title_fields(base)
    assert trans_zh["query"] == "连衣裙"
    assert "title.zh" in _title_fields(trans_zh)


def test_de_query_index_de_en_fr_includes_base_and_two_translations():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="kleid",
        rewritten="kleid",
        detected_language="de",
        translations={"en": "dress", "fr": "robe"},
        index_languages=["de", "en", "fr"],
    )
    clauses = _clauses_index(es_body)
    expected_lexical = {"base_query", "base_query_trans_en", "base_query_trans_fr"}
    assert set(clauses) == _with_phrase(expected_lexical)
    assert clauses["base_query"]["query"] == "kleid"
    assert "title.de" in _title_fields(clauses["base_query"])
    assert clauses["base_query_trans_en"]["query"] == "dress"
    assert clauses["base_query_trans_fr"]["query"] == "robe"
  165 +
  166 +
  167 +# --- 检测语言不在 index_languages:仍有 base(弱)+ 翻译(强) ---
  168 +
  169 +
def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="schuh",
        rewritten="schuh",
        detected_language="de",
        translations={"en": "shoe", "zh": "鞋"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(es_body)
    expected_lexical = {"base_query", "base_query_trans_en", "base_query_trans_zh"}
    assert set(clauses) == _with_phrase(expected_lexical)
    base = clauses["base_query"]
    assert base["query"] == "schuh"
    assert "title.de" in _title_fields(base)
    # Source language not indexed: base stays unboosted, translations carry boost.
    assert "boost" not in base
    assert clauses["base_query_trans_en"]["query"] == "shoe"
    assert clauses["base_query_trans_en"]["boost"] == builder.translation_boost
    assert clauses["base_query_trans_zh"]["query"] == "鞋"
    assert clauses["base_query_trans_zh"]["boost"] == builder.translation_boost
  191 +
  192 +
  193 +# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 ---
  194 +
  195 +
def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="红色 dress",
        rewritten="红色 dress",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_en"})
    base = clauses["base_query"]
    assert base["query"] == "红色 dress"
    assert _has_title_lang(base, "zh")
    assert _has_title_lang(base, "en")
    assert clauses["base_query_trans_en"]["query"] == "red dress"
    assert _has_title_lang(clauses["base_query_trans_en"], "en")


def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="nike 运动鞋",
        rewritten="nike 运动鞋",
        detected_language="en",
        translations={"zh": "耐克运动鞋"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_zh"})
    base = clauses["base_query"]
    assert base["query"] == "nike 运动鞋"
    assert _has_title_lang(base, "en")
    assert _has_title_lang(base, "zh")
    assert clauses["base_query_trans_zh"]["query"] == "耐克运动鞋"


def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="法式 dress",
        rewritten="法式 dress",
        detected_language="zh",
        translations={},
        index_languages=["zh"],
        contains_chinese=True,
        contains_english=True,
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == _with_phrase({"base_query"})
    field_bases = {f.split("^", 1)[0] for f in _title_fields(clauses["base_query"])}
    assert field_bases == {"title.zh"}
  251 +
  252 +
  253 +# --- 去重:与 base 同语言同文本的翻译项跳过 ---
  254 +
  255 +
def test_skips_translation_when_same_lang_and_same_text_as_base():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"en": "NIKE", "zh": "耐克"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_zh"})


def test_keeps_translation_when_same_text_but_different_lang_than_base():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"zh": "NIKE"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_zh"})
    assert clauses["base_query_trans_zh"]["query"] == "NIKE"
  283 +
  284 +
  285 +# --- 翻译 key 规范化、空翻译跳过 ---
  286 +
  287 +
def test_translation_language_key_is_normalized_case_insensitive():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"ZH": "连衣裙"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(es_body)
    assert "base_query_trans_zh" in clauses
    assert clauses["base_query_trans_zh"]["query"] == "连衣裙"


def test_empty_translation_value_is_skipped():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": " ", "fr": "robe"},
        index_languages=["en", "zh", "fr"],
    )
    clauses = _clauses_index(es_body)
    assert "base_query_trans_zh" not in clauses
    assert "base_query_trans_fr" in clauses
  316 +
  317 +
  318 +# --- index_languages 为空:视为「未约束」source_in_index 为 True ---
  319 +
  320 +
def test_empty_index_languages_treats_source_as_in_index_boosts():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="x",
        rewritten="x",
        detected_language="de",
        translations={"en": "y"},
        index_languages=[],
    )
    clauses = _clauses_index(es_body)
    # Empty index_languages is treated as unconstrained: the base clause is
    # unboosted, translations and phrase companions keep their fixed boosts.
    assert "boost" not in clauses["base_query"]
    assert clauses["base_query_trans_en"]["boost"] == builder.translation_boost
    assert clauses["base_query_phrase"]["boost"] == builder.phrase_match_boost
    assert clauses["base_query_trans_en_phrase"]["boost"] == builder.phrase_match_boost
  336 +
  337 +
  338 +# --- 无翻译:仅 base_query ---
  339 +
  340 +
def test_no_translations_only_base_query():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="hello",
        rewritten="hello",
        detected_language="en",
        translations={},
        index_languages=["en", "zh"],
    )
    assert set(_clauses_index(es_body)) == _with_phrase({"base_query"})
  353 +
  354 +
  355 +# --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) ---
  356 +
  357 +
def test_text_clauses_present_alongside_knn():
    builder = _builder_multilingual_title_only(default_language="en")
    fake_parsed = SimpleNamespace(
        rewritten_query="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        contains_chinese=False,
        contains_english=True,
    )
    es_body = builder.build_query(
        query_text="dress",
        query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32),
        parsed_query=fake_parsed,
        enable_knn=True,
        index_languages=["en", "zh"],
    )
    # The top-level knn section must not disturb the text recall structure.
    assert "knn" in es_body
    clauses = _clauses_index(es_body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_zh"})
  377 +
  378 +
def test_detected_language_unknown_falls_back_to_default_language():
    """Aligned with QueryConfig.default_language when LanguageDetector fails."""
    builder = _builder_multilingual_title_only(default_language="en")
    fake_parsed = SimpleNamespace(
        rewritten_query="shirt",
        detected_language="unknown",
        translations={"zh": "衬衫"},
        contains_chinese=False,
        contains_english=True,
    )
    es_body = builder.build_query(
        query_text="shirt",
        parsed_query=fake_parsed,
        enable_knn=False,
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_zh"})
    assert clauses["base_query"]["query"] == "shirt"
    assert _has_title_lang(clauses["base_query"], "en")
  399 +
  400 +
def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="платье",
        rewritten="платье",
        detected_language="ru",
        translations={"en": "dress"},
        index_languages=["ru", "en"],
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == _with_phrase({"base_query", "base_query_trans_en"})
    assert clauses["base_query"]["query"] == "платье"
    assert _has_title_lang(clauses["base_query"], "ru")
    assert clauses["base_query_trans_en"]["query"] == "dress"
  416 +
  417 +
def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():
    """
    Current behavior: every non-empty translations entry produces a clause;
    index_languages only constrains mixed-script field merging and does not
    filter translation clauses.
    """
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙", "de": "Kleid"},
        index_languages=["en", "zh"],
    )
    clauses = _clauses_index(es_body)
    assert "base_query_trans_de" in clauses
    assert clauses["base_query_trans_de"]["query"] == "Kleid"
    assert _has_title_lang(clauses["base_query_trans_de"], "de")
  436 +
  437 +
def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base():
    """base_query always carries rewritten_query, never just query_text."""
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text=" 红色 ",
        rewritten="红色连衣裙",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=False,
    )
    clauses = _clauses_index(es_body)
    assert clauses["base_query"]["query"] == "红色连衣裙"
    assert clauses["base_query_trans_en"]["query"] == "red dress"
... ...
tests/test_query_parser_mixed_language.py
1   -from types import SimpleNamespace
2   -
3 1 from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
4 2 from query.query_parser import QueryParser
5 3  
... ... @@ -9,6 +7,10 @@ class _DummyTranslator:
9 7 return f"{text}-{target_lang}"
10 8  
11 9  
  10 +def _tokenizer(text):
  11 + return str(text).split()
  12 +
  13 +
12 14 def test_pure_english_word_token_length_and_script():
13 15 assert QueryParser._is_pure_english_word_token("ab") is False
14 16 assert QueryParser._is_pure_english_word_token("abc") is True
... ... @@ -35,59 +37,57 @@ def _build_config() -&gt; SearchConfig:
35 37  
36 38  
37 39 def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch):
38   - parser = QueryParser(_build_config(), translator=_DummyTranslator())
  40 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
39 41 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh")
40   - monkeypatch.setattr(
41   - "query.query_parser.get_tenant_config_loader",
42   - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}),
43   - raising=False,
44   - )
45 42  
46   - result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False)
  43 + result = parser.parse(
  44 + "法式 dress 连衣裙",
  45 + tenant_id="162",
  46 + generate_vector=False,
  47 + target_languages=["zh", "en"],
  48 + )
47 49  
48 50 assert result.detected_language == "zh"
49 51 assert result.contains_chinese is True
50 52 assert result.contains_english is True
51   - assert "en" in result.search_langs
52   - # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测)
53   - assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en"
54   - assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙"
  53 + assert result.translations == {"en": "法式 dress 连衣裙-en"}
  54 + assert result.query_tokens == ["法式", "dress", "连衣裙"]
  55 + assert not hasattr(result, "query_text_by_lang")
  56 + assert not hasattr(result, "search_langs")
55 57  
56 58  
57 59 def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
58   - parser = QueryParser(_build_config(), translator=_DummyTranslator())
  60 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
59 61 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
60   - monkeypatch.setattr(
61   - "query.query_parser.get_tenant_config_loader",
62   - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),
63   - raising=False,
64   - )
65 62  
66   - result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False)
  63 + result = parser.parse(
  64 + "red 连衣裙",
  65 + tenant_id="0",
  66 + generate_vector=False,
  67 + target_languages=["en", "zh"],
  68 + )
67 69  
68 70 assert result.detected_language == "en"
69 71 assert result.contains_chinese is True
70 72 assert result.contains_english is True
71   - assert "zh" in result.search_langs
72   - assert result.query_text_by_lang["zh"] == "red 连衣裙-zh"
73   - assert result.query_text_by_lang["en"] == "red 连衣裙"
  73 + assert result.translations == {"zh": "red 连衣裙-zh"}
  74 + assert result.query_tokens == ["red", "连衣裙"]
74 75  
75 76  
76 77 def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
77 78 """en 在 index_languages 内时仍应等待并采纳 en->zh 翻译结果(与向量共用预算)。"""
78   - parser = QueryParser(_build_config(), translator=_DummyTranslator())
  79 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
79 80 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
80   - monkeypatch.setattr(
81   - "query.query_parser.get_tenant_config_loader",
82   - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),
83   - raising=False,
84   - )
85 81  
86   - result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False)
  82 + result = parser.parse(
  83 + "off shoulder top",
  84 + tenant_id="0",
  85 + generate_vector=False,
  86 + target_languages=["en", "zh"],
  87 + )
87 88  
88 89 assert result.detected_language == "en"
89 90 assert result.contains_chinese is False
90 91 assert result.contains_english is True
91 92 assert result.translations.get("zh") == "off shoulder top-zh"
92   - assert result.query_text_by_lang.get("zh") == "off shoulder top-zh"
93   - assert result.source_in_index_languages is True
  93 + assert not hasattr(result, "source_in_index_languages")
... ...
tests/test_rerank_client.py
... ... @@ -11,7 +11,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
11 11 "matched_queries": {
12 12 "base_query": 2.4,
13 13 "base_query_trans_zh": 1.8,
14   - "fallback_original_query_zh": 1.2,
15 14 "knn_query": 0.8,
16 15 },
17 16 },
... ... @@ -27,7 +26,7 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
27 26  
28 27 debug = fuse_scores_and_resort(hits, [0.9, 0.7])
29 28  
30   - expected_text_1 = 2.4 + 0.25 * ((0.8 * 1.8) + (0.55 * 1.2))
  29 + expected_text_1 = 2.4 + 0.25 * (0.8 * 1.8)
31 30 expected_fused_1 = (0.9 + 0.00001) * ((expected_text_1 + 0.1) ** 0.35) * ((0.8 + 0.6) ** 0.2)
32 31 expected_fused_2 = (0.7 + 0.00001) * ((9.0 + 0.1) ** 0.35) * ((0.2 + 0.6) ** 0.2)
33 32  
... ... @@ -38,7 +37,6 @@ def test_fuse_scores_and_resort_aggregates_text_components_and_keeps_rerank_prim
38 37 assert isclose(by_id["2"]["_fused_score"], expected_fused_2, rel_tol=1e-9)
39 38 assert debug[0]["text_source_score"] == 2.4
40 39 assert debug[0]["text_translation_score"] == 1.8
41   - assert debug[0]["text_fallback_score"] == 1.2
42 40 assert debug[0]["knn_score"] == 0.8
43 41 assert [hit["_id"] for hit in hits] == ["2", "1"]
44 42  
... ...
tests/test_search_rerank_window.py
... ... @@ -43,7 +43,14 @@ class _FakeParsedQuery:
43 43  
44 44  
45 45 class _FakeQueryParser:
46   - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any):
  46 + def parse(
  47 + self,
  48 + query: str,
  49 + tenant_id: str,
  50 + generate_vector: bool,
  51 + context: Any,
  52 + target_languages: Any = None,
  53 + ):
47 54 return _FakeParsedQuery(
48 55 original_query=query,
49 56 query_normalized=query,
... ... @@ -191,6 +198,66 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path):
191 198 "field_boosts": {"title.en": 3.0},
192 199 "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}],
193 200 "query_config": {"supported_languages": ["en"], "default_language": "en"},
  201 + "services": {
  202 + "translation": {
  203 + "service_url": "http://localhost:6005",
  204 + "timeout_sec": 3.0,
  205 + "default_model": "dummy-model",
  206 + "default_scene": "general",
  207 + "cache": {
  208 + "ttl_seconds": 60,
  209 + "sliding_expiration": True,
  210 + },
  211 + "capabilities": {
  212 + "dummy-model": {
  213 + "enabled": True,
  214 + "backend": "llm",
  215 + "use_cache": True,
  216 + "model": "dummy-model",
  217 + "base_url": "http://localhost:6005/v1",
  218 + "timeout_sec": 3.0,
  219 + }
  220 + },
  221 + },
  222 + "embedding": {
  223 + "provider": "http",
  224 + "providers": {
  225 + "http": {
  226 + "text_base_url": "http://localhost:6005",
  227 + "image_base_url": "http://localhost:6008",
  228 + }
  229 + },
  230 + "backend": "tei",
  231 + "backends": {
  232 + "tei": {
  233 + "base_url": "http://localhost:8080",
  234 + "timeout_sec": 3.0,
  235 + "model_id": "dummy-embedding-model",
  236 + }
  237 + },
  238 + },
  239 + "rerank": {
  240 + "provider": "http",
  241 + "providers": {
  242 + "http": {
  243 + "base_url": "http://localhost:6007",
  244 + "service_url": "http://localhost:6007/rerank",
  245 + }
  246 + },
  247 + "backend": "bge",
  248 + "backends": {
  249 + "bge": {
  250 + "model_name": "dummy-rerank-model",
  251 + "device": "cpu",
  252 + "use_fp16": False,
  253 + "batch_size": 8,
  254 + "max_length": 128,
  255 + "cache_dir": "./model_cache",
  256 + "enable_warmup": False,
  257 + }
  258 + },
  259 + },
  260 + },
194 261 "spu_config": {"enabled": False},
195 262 "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []},
196 263 "rerank": {"rerank_window": 384},
... ... @@ -354,7 +421,14 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch
354 421 class _TranslatedQueryParser:
355 422 text_encoder = None
356 423  
357   - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any):
  424 + def parse(
  425 + self,
  426 + query: str,
  427 + tenant_id: str,
  428 + generate_vector: bool,
  429 + context: Any,
  430 + target_languages: Any = None,
  431 + ):
358 432 return _FakeParsedQuery(
359 433 original_query=query,
360 434 query_normalized=query,
... ... @@ -407,15 +481,22 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc
407 481 encoder = _FakeTextEncoder(
408 482 {
409 483 "linen summer dress": [0.8, 0.2],
410   - "color:Red": [1.0, 0.0],
411   - "color:Blue": [0.0, 1.0],
  484 + "color:red": [1.0, 0.0],
  485 + "color:blue": [0.0, 1.0],
412 486 }
413 487 )
414 488  
415 489 class _EmbeddingQueryParser:
416 490 text_encoder = encoder
417 491  
418   - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any):
  492 + def parse(
  493 + self,
  494 + query: str,
  495 + tenant_id: str,
  496 + generate_vector: bool,
  497 + context: Any,
  498 + target_languages: Any = None,
  499 + ):
419 500 return _FakeParsedQuery(
420 501 original_query=query,
421 502 query_normalized=query,
... ...
tests/test_translator_failure_semantics.py
1 1 import logging
2 2  
  3 +import pytest
  4 +
3 5 from translation.cache import TranslationCache
4 6 from translation.logging_utils import (
5 7 TranslationRequestFilter,
... ... @@ -7,6 +9,7 @@ from translation.logging_utils import (
7 9 reset_translation_request_id,
8 10 )
9 11 from translation.service import TranslationService
  12 +from translation.settings import build_translation_config, translation_cache_probe_models
10 13  
11 14  
12 15 class _FakeCache:
... ... @@ -16,7 +19,8 @@ class _FakeCache:
16 19 self.get_calls = []
17 20 self.set_calls = []
18 21  
19   - def get(self, *, model, target_lang, source_text):
  22 + def get(self, *, model, target_lang, source_text, log_lookup=True):
  23 + del log_lookup
20 24 self.get_calls.append((model, target_lang, source_text))
21 25 return self.storage.get((model, target_lang, source_text))
22 26  
... ... @@ -191,3 +195,262 @@ def test_translation_route_log_focuses_on_routing_decision(monkeypatch, caplog):
191 195 assert route_messages == [
192 196 "Translation route | backend=llm request_type=single use_cache=True cache_available=False"
193 197 ]
  198 +
  199 +
  200 +def test_translation_cache_probe_models_order():
  201 + cfg = {"cache": {"model_quality_tiers": {"low": 10, "high": 50, "mid": 30}}}
  202 + assert translation_cache_probe_models(cfg, "low") == ["high", "mid", "low"]
  203 + assert translation_cache_probe_models(cfg, "mid") == ["high", "mid"]
  204 + assert translation_cache_probe_models(cfg, "high") == ["high"]
  205 + assert translation_cache_probe_models(cfg, "unknown") == ["unknown"]
  206 +
  207 +
  208 +def test_translation_cache_probe_models_respects_enable_switch():
  209 + cfg = {
  210 + "cache": {
  211 + "enable_model_quality_tier_cache": False,
  212 + "model_quality_tiers": {"peer-a": 50, "peer-b": 50, "top": 100},
  213 + }
  214 + }
  215 + assert translation_cache_probe_models(cfg, "peer-a") == ["peer-a"]
  216 +
  217 +
  218 +def test_translation_cache_probe_models_same_tier_included():
  219 + """Same numeric tier: all peers are probed (higher tier first, then name order)."""
  220 + cfg = {"cache": {"model_quality_tiers": {"peer-a": 50, "peer-b": 50, "top": 100}}}
  221 + assert translation_cache_probe_models(cfg, "peer-a") == ["top", "peer-a", "peer-b"]
  222 + assert translation_cache_probe_models(cfg, "peer-b") == ["top", "peer-b", "peer-a"]
  223 +
  224 +
  225 +def test_model_quality_tiers_unknown_capability_raises():
  226 + with pytest.raises(ValueError, match="unknown capability"):
  227 + build_translation_config(
  228 + {
  229 + "service_url": "http://127.0.0.1:6006",
  230 + "timeout_sec": 10.0,
  231 + "default_model": "llm",
  232 + "default_scene": "general",
  233 + "cache": {
  234 + "ttl_seconds": 60,
  235 + "sliding_expiration": True,
  236 + "model_quality_tiers": {"ghost": 1},
  237 + },
  238 + "capabilities": {
  239 + "llm": {
  240 + "enabled": True,
  241 + "backend": "llm",
  242 + "model": "dummy-llm",
  243 + "base_url": "https://example.com",
  244 + "timeout_sec": 10.0,
  245 + "use_cache": True,
  246 + }
  247 + },
  248 + }
  249 + )
  250 +
  251 +
  252 +def test_tiered_cache_reuses_higher_tier_entry(monkeypatch):
  253 + monkeypatch.setattr(TranslationCache, "_init_redis_client", staticmethod(lambda: None))
  254 + translate_calls = []
  255 +
  256 + def _fake_create_backend(self, *, name, backend_type, cfg):
  257 + del self, backend_type, cfg
  258 +
  259 + class _Backend:
  260 + model = name
  261 +
  262 + @property
  263 + def supports_batch(self):
  264 + return True
  265 +
  266 + def translate(self, text, target_lang, source_lang=None, scene=None):
  267 + del target_lang, source_lang, scene
  268 + translate_calls.append((name, text))
  269 + if isinstance(text, list):
  270 + return [f"{name}:{item}" for item in text]
  271 + return f"{name}:{text}"
  272 +
  273 + return _Backend()
  274 +
  275 + monkeypatch.setattr(TranslationService, "_create_backend", _fake_create_backend)
  276 + config = {
  277 + "service_url": "http://127.0.0.1:6006",
  278 + "timeout_sec": 10.0,
  279 + "default_model": "opus-mt-zh-en",
  280 + "default_scene": "general",
  281 + "capabilities": {
  282 + "deepl": {
  283 + "enabled": True,
  284 + "backend": "deepl",
  285 + "api_url": "https://api.deepl.com/v2/translate",
  286 + "timeout_sec": 10.0,
  287 + "use_cache": True,
  288 + },
  289 + "opus-mt-zh-en": {
  290 + "enabled": True,
  291 + "backend": "local_marian",
  292 + "model_id": "dummy",
  293 + "model_dir": "dummy",
  294 + "device": "cpu",
  295 + "torch_dtype": "float32",
  296 + "batch_size": 8,
  297 + "max_input_length": 16,
  298 + "max_new_tokens": 16,
  299 + "num_beams": 1,
  300 + "use_cache": True,
  301 + },
  302 + },
  303 + "cache": {
  304 + "ttl_seconds": 60,
  305 + "sliding_expiration": True,
  306 + "model_quality_tiers": {"deepl": 100, "opus-mt-zh-en": 40},
  307 + },
  308 + }
  309 +
  310 + service = TranslationService(config)
  311 + fake_cache = _FakeCache()
  312 + fake_cache.storage[("deepl", "en", "商品标题")] = "from-deepl"
  313 + service._translation_cache = fake_cache
  314 +
  315 + out = service.translate("商品标题", target_lang="en", source_lang="zh", model="opus-mt-zh-en")
  316 + assert out == "from-deepl"
  317 + assert translate_calls == []
  318 + assert fake_cache.get_calls == [("deepl", "en", "商品标题")]
  319 +
  320 +
  321 +def test_tiered_cache_reuses_same_tier_peer(monkeypatch):
  322 + """Model A may use cache written under model B when both share the same tier."""
  323 + monkeypatch.setattr(TranslationCache, "_init_redis_client", staticmethod(lambda: None))
  324 + translate_calls = []
  325 +
  326 + def _fake_create_backend(self, *, name, backend_type, cfg):
  327 + del self, backend_type, cfg
  328 +
  329 + class _Backend:
  330 + model = name
  331 +
  332 + @property
  333 + def supports_batch(self):
  334 + return True
  335 +
  336 + def translate(self, text, target_lang, source_lang=None, scene=None):
  337 + del target_lang, source_lang, scene
  338 + translate_calls.append((name, text))
  339 + if isinstance(text, list):
  340 + return [f"{name}:{item}" for item in text]
  341 + return f"{name}:{text}"
  342 +
  343 + return _Backend()
  344 +
  345 + monkeypatch.setattr(TranslationService, "_create_backend", _fake_create_backend)
  346 + marian_cap = {
  347 + "enabled": True,
  348 + "backend": "local_marian",
  349 + "model_id": "dummy",
  350 + "model_dir": "dummy",
  351 + "device": "cpu",
  352 + "torch_dtype": "float32",
  353 + "batch_size": 8,
  354 + "max_input_length": 16,
  355 + "max_new_tokens": 16,
  356 + "num_beams": 1,
  357 + "use_cache": True,
  358 + }
  359 + config = {
  360 + "service_url": "http://127.0.0.1:6006",
  361 + "timeout_sec": 10.0,
  362 + "default_model": "opus-mt-en-zh",
  363 + "default_scene": "general",
  364 + "capabilities": {
  365 + "opus-mt-zh-en": dict(marian_cap),
  366 + "opus-mt-en-zh": dict(marian_cap),
  367 + },
  368 + "cache": {
  369 + "ttl_seconds": 60,
  370 + "sliding_expiration": True,
  371 + "model_quality_tiers": {"opus-mt-zh-en": 50, "opus-mt-en-zh": 50},
  372 + },
  373 + }
  374 +
  375 + service = TranslationService(config)
  376 + fake_cache = _FakeCache()
  377 + fake_cache.storage[("opus-mt-zh-en", "en", "hello")] = "from-zh-en"
  378 + service._translation_cache = fake_cache
  379 +
  380 + out = service.translate("hello", target_lang="en", source_lang="zh", model="opus-mt-en-zh")
  381 + assert out == "from-zh-en"
  382 + assert translate_calls == []
  383 + assert fake_cache.get_calls == [
  384 + ("opus-mt-en-zh", "en", "hello"),
  385 + ("opus-mt-zh-en", "en", "hello"),
  386 + ]
  387 +
  388 +
  389 +def test_tiered_cache_switch_off_uses_exact_model_only(monkeypatch):
  390 + monkeypatch.setattr(TranslationCache, "_init_redis_client", staticmethod(lambda: None))
  391 + translate_calls = []
  392 +
  393 + def _fake_create_backend(self, *, name, backend_type, cfg):
  394 + del self, backend_type, cfg
  395 +
  396 + class _Backend:
  397 + model = name
  398 +
  399 + @property
  400 + def supports_batch(self):
  401 + return True
  402 +
  403 + def translate(self, text, target_lang, source_lang=None, scene=None):
  404 + del target_lang, source_lang, scene
  405 + translate_calls.append((name, text))
  406 + if isinstance(text, list):
  407 + return [f"{name}:{item}" for item in text]
  408 + return f"{name}:{text}"
  409 +
  410 + return _Backend()
  411 +
  412 + monkeypatch.setattr(TranslationService, "_create_backend", _fake_create_backend)
  413 + config = {
  414 + "service_url": "http://127.0.0.1:6006",
  415 + "timeout_sec": 10.0,
  416 + "default_model": "opus-mt-zh-en",
  417 + "default_scene": "general",
  418 + "capabilities": {
  419 + "deepl": {
  420 + "enabled": True,
  421 + "backend": "deepl",
  422 + "api_url": "https://api.deepl.com/v2/translate",
  423 + "timeout_sec": 10.0,
  424 + "use_cache": True,
  425 + },
  426 + "opus-mt-zh-en": {
  427 + "enabled": True,
  428 + "backend": "local_marian",
  429 + "model_id": "dummy",
  430 + "model_dir": "dummy",
  431 + "device": "cpu",
  432 + "torch_dtype": "float32",
  433 + "batch_size": 8,
  434 + "max_input_length": 16,
  435 + "max_new_tokens": 16,
  436 + "num_beams": 1,
  437 + "use_cache": True,
  438 + },
  439 + },
  440 + "cache": {
  441 + "ttl_seconds": 60,
  442 + "sliding_expiration": True,
  443 + "enable_model_quality_tier_cache": False,
  444 + "model_quality_tiers": {"deepl": 100, "opus-mt-zh-en": 40},
  445 + },
  446 + }
  447 +
  448 + service = TranslationService(config)
  449 + fake_cache = _FakeCache()
  450 + fake_cache.storage[("deepl", "en", "商品标题")] = "from-deepl"
  451 + service._translation_cache = fake_cache
  452 +
  453 + out = service.translate("商品标题", target_lang="en", source_lang="zh", model="opus-mt-zh-en")
  454 + assert out == "opus-mt-zh-en:商品标题"
  455 + assert translate_calls == [("opus-mt-zh-en", "商品标题")]
  456 + assert fake_cache.get_calls == [("opus-mt-zh-en", "en", "商品标题")]
... ...
translation/cache.py
... ... @@ -36,7 +36,13 @@ class TranslationCache:
36 36 digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
37 37 return f"trans:{normalized_model}:{normalized_target_lang}:{text_prefix}{digest}"
38 38  
39   - def get(self, *, model: str, target_lang: str, source_text: str) -> Optional[str]:
  39 + def get(
  40 + self,
  41 + *,
  42 + model: str,
  43 + target_lang: str,
  44 + source_text: str
  45 + ) -> Optional[str]:
40 46 if self.redis_client is None:
41 47 return None
42 48 key = self.build_key(model=model, target_lang=target_lang, source_text=source_text)
... ...
translation/service.py
... ... @@ -3,7 +3,7 @@
3 3 from __future__ import annotations
4 4  
5 5 import logging
6   -from typing import Dict, List, Optional
  6 +from typing import Dict, List, Optional, Tuple
7 7  
8 8 from config.loader import get_app_config
9 9 from config.schema import AppConfig
... ... @@ -15,6 +15,7 @@ from translation.settings import (
15 15 get_translation_capability,
16 16 normalize_translation_model,
17 17 normalize_translation_scene,
  18 + translation_cache_probe_models,
18 19 )
19 20  
20 21 logger = logging.getLogger(__name__)
... ... @@ -247,7 +248,11 @@ class TranslationService:
247 248 ) -> Optional[str]:
248 249 if not text.strip():
249 250 return text
250   - cached = self._translation_cache.get(model=model, target_lang=target_lang, source_text=text)
  251 + cached, _served = self._tiered_cache_get(
  252 + request_model=model,
  253 + target_lang=target_lang,
  254 + source_text=text,
  255 + )
251 256 if cached is not None:
252 257 logger.info(
253 258 "Translation cache served | request_type=single text_len=%s",
... ... @@ -279,6 +284,30 @@ class TranslationService:
279 284 )
280 285 return translated
281 286  
  287 + def _tiered_cache_get(
  288 + self,
  289 + *,
  290 + request_model: str,
  291 + target_lang: str,
  292 + source_text: str,
  293 + ) -> Tuple[Optional[str], Optional[str]]:
  294 + """Redis lookup: cache from higher-tier or **same-tier** models may satisfy A.
  295 +
  296 + Lower-tier entries are never read. Returns ``(translated, served_model)``.
  297 + """
  298 + probe_models = translation_cache_probe_models(self.config, request_model)
  299 +
  300 + for probe_model in probe_models:
  301 + hit = self._translation_cache.get(
  302 + model=probe_model,
  303 + target_lang=target_lang,
  304 + source_text=source_text,
  305 + )
  306 + if hit is not None:
  307 + return hit, probe_model
  308 +
  309 + return None, None
  310 +
282 311 def _translate_batch_with_cache(
283 312 self,
284 313 *,
... ... @@ -300,8 +329,8 @@ class TranslationService:
300 329 if not normalized_text.strip():
301 330 results[idx] = normalized_text
302 331 continue
303   - cached = self._translation_cache.get(
304   - model=model,
  332 + cached, _served = self._tiered_cache_get(
  333 + request_model=model,
305 334 target_lang=target_lang,
306 335 source_text=normalized_text,
307 336 )
... ...
translation/settings.py
... ... @@ -2,7 +2,7 @@
2 2  
3 3 from __future__ import annotations
4 4  
5   -from typing import Any, Dict, List, Mapping, Optional
  5 +from typing import Any, Dict, List, Mapping, Optional, Tuple
6 6  
7 7 from translation.scenes import normalize_scene_name
8 8  
... ... @@ -38,6 +38,7 @@ def build_translation_config(raw_cfg: Mapping[str, Any]) -&gt; TranslationConfig:
38 38 if not get_enabled_translation_models(config):
39 39 raise ValueError("At least one translation capability must be enabled")
40 40  
  41 + _validate_model_quality_tiers(config)
41 42 return config
42 43  
43 44  
... ... @@ -86,18 +87,107 @@ def get_translation_cache(config: Mapping[str, Any]) -&gt; Dict[str, Any]:
86 87 return dict(cache)
87 88  
88 89  
  90 +def translation_cache_probe_models(config: Mapping[str, Any], request_model: str) -> List[str]:
  91 + """Redis cache key models to try.
  92 +
  93 + Sort order: (1) **tier** descending (higher quality first); (2) within the same tier,
  94 + the **request model** before other peers; (3) remaining ties by model name.
  95 +
  96 + For a request to model A with tier T, probes every configured model whose tier is
  97 + **greater than or equal to** T. Lower tiers are never used.
  98 +
  99 + When ``enable_model_quality_tier_cache`` is false, only the request model is probed.
  100 +
  101 + When ``model_quality_tiers`` is empty or ``request_model`` is not listed, only the
  102 + request model is probed (legacy exact-match behavior).
  103 + """
  104 + rm = str(request_model or "").strip().lower()
  105 + cache = config.get("cache")
  106 + if not isinstance(cache, Mapping):
  107 + return [rm]
  108 + if not bool(cache.get("enable_model_quality_tier_cache", True)):
  109 + return [rm]
  110 + tiers = cache.get("model_quality_tiers")
  111 + if not isinstance(tiers, Mapping) or not tiers:
  112 + return [rm]
  113 + if rm not in tiers:
  114 + return [rm]
  115 + threshold = int(tiers[rm])
  116 + scored: List[Tuple[int, str]] = []
  117 + for name, tier_val in tiers.items():
  118 + n = str(name).strip().lower()
  119 + t = int(tier_val)
  120 + if t >= threshold:
  121 + scored.append((t, n))
  122 + scored.sort(
  123 + key=lambda item: (
  124 + -item[0],
  125 + 0 if item[1] == rm else 1,
  126 + item[1],
  127 + )
  128 + )
  129 + out: List[str] = []
  130 + seen: set[str] = set()
  131 + for _t, n in scored:
  132 + if n not in seen:
  133 + seen.add(n)
  134 + out.append(n)
  135 + return out
  136 +
  137 +
89 138 def _build_cache_config(raw_cache: Any) -> Dict[str, Any]:
90 139 if not isinstance(raw_cache, Mapping):
91 140 raise ValueError("services.translation.cache must be a mapping")
  141 + if "enable_model_quality_tier_cache" in raw_cache:
  142 + enable_tier_cache = _require_bool(
  143 + raw_cache["enable_model_quality_tier_cache"],
  144 + "services.translation.cache.enable_model_quality_tier_cache",
  145 + )
  146 + else:
  147 + enable_tier_cache = True
92 148 return {
93 149 "ttl_seconds": _require_positive_int(raw_cache.get("ttl_seconds"), "services.translation.cache.ttl_seconds"),
94 150 "sliding_expiration": _require_bool(
95 151 raw_cache.get("sliding_expiration"),
96 152 "services.translation.cache.sliding_expiration",
97 153 ),
  154 + "enable_model_quality_tier_cache": enable_tier_cache,
  155 + "model_quality_tiers": _build_model_quality_tiers(raw_cache.get("model_quality_tiers")),
98 156 }
99 157  
100 158  
  159 +def _build_model_quality_tiers(raw: Any) -> Dict[str, int]:
  160 + if raw is None:
  161 + return {}
  162 + if not isinstance(raw, Mapping):
  163 + raise ValueError("services.translation.cache.model_quality_tiers must be a mapping")
  164 + resolved: Dict[str, int] = {}
  165 + for name, tier_val in raw.items():
  166 + cap = _require_string(name, "services.translation.cache.model_quality_tiers key").lower()
  167 + field = f"services.translation.cache.model_quality_tiers.{cap}"
  168 + resolved[cap] = _require_non_negative_int(tier_val, field)
  169 + return resolved
  170 +
  171 +
  172 +def _validate_model_quality_tiers(config: TranslationConfig) -> None:
  173 + tiers = config["cache"].get("model_quality_tiers")
  174 + if not isinstance(tiers, Mapping) or not tiers:
  175 + return
  176 + caps = config["capabilities"]
  177 + for name in tiers:
  178 + if name not in caps:
  179 + raise ValueError(
  180 + f"services.translation.cache.model_quality_tiers references unknown capability '{name}'"
  181 + )
  182 +
  183 +
  184 +def _require_non_negative_int(value: Any, field_name: str) -> int:
  185 + parsed = _require_int(value, field_name)
  186 + if parsed < 0:
  187 + raise ValueError(f"{field_name} must be >= 0")
  188 + return parsed
  189 +
  190 +
101 191 def _build_capabilities(raw_capabilities: Any) -> Dict[str, Dict[str, Any]]:
102 192 if not isinstance(raw_capabilities, Mapping):
103 193 raise ValueError("services.translation.capabilities must be a mapping")
... ...