Commit ef5baa866ae0cf2c061a83a9fc2aead25b1d098e

Authored by tangwang
1 parent fb973d19

混杂语言处理

config/config.yaml
@@ -120,7 +120,7 @@ query_config: @@ -120,7 +120,7 @@ query_config:
120 - skus 120 - skus
121 121
122 # KNN boost配置(向量召回的boost值) 122 # KNN boost配置(向量召回的boost值)
123 - knn_boost: 0.25 # Lower boost for embedding recall 123 + knn_boost: 2.0 # Lower boost for embedding recall
124 124
125 # Function Score配置(ES层打分规则) 125 # Function Score配置(ES层打分规则)
126 function_score: 126 function_score:
@@ -290,7 +290,7 @@ services: @@ -290,7 +290,7 @@ services:
290 engine: "vllm" 290 engine: "vllm"
291 max_model_len: 160 291 max_model_len: 160
292 tensor_parallel_size: 1 292 tensor_parallel_size: 1
293 - gpu_memory_utilization: 0.36 293 + gpu_memory_utilization: 0.20
294 dtype: "float16" 294 dtype: "float16"
295 enable_prefix_caching: true 295 enable_prefix_caching: true
296 enforce_eager: false 296 enforce_eager: false
docs/TODO-ES能力提升.md 0 → 100644
@@ -0,0 +1,69 @@ @@ -0,0 +1,69 @@
  1 +ES 付费版本 or 定制开发(建议先看下付费版本价格)
  2 +ES定制开发:
  3 +RRF / retrievers
  4 +
  5 +Elastic 的订阅矩阵里明确列了这些相关能力:Retrievers: linear, rule, RRF, text similarity re-ranker,以及 Reciprocal Rank Fusion (RRF) for hybrid search。
  6 +
  7 +这类能力最有价值的点是:
  8 +它们把混合检索从“自己拼 DSL 和手搓打分”变成了官方支持的多阶段检索框架。重排:text similarity re-ranker / Elastic Rerank. text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。
  9 +
  10 +{
  11 + "retriever": {
  12 + "rrf": {
  13 + "retrievers": [
  14 + { "standard": { "query": { ... } } },
  15 + { "knn": { ... } }
  16 + ]
  17 + }
  18 + }
  19 +}
  20 +
  21 +
  22 +加reranker:
  23 +text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。
  24 +
  25 +{
  26 + "retriever": {
  27 + "text_similarity_reranker": {
  28 + "retriever": {
  29 + "rrf": { ... }
  30 + },
  31 + ...
  32 + }
  33 + }
  34 +}
  35 +
  36 +{
  37 + "retriever": {
  38 + "text_similarity_reranker": {
  39 + "retriever": {
  40 + "rrf": {
  41 + "retrievers": [
  42 + {
  43 + "standard": {
  44 + "query": {
  45 + "...": "..."
  46 + }
  47 + }
  48 + },
  49 + {
  50 + "knn": {
  51 + "...": "..."
  52 + }
  53 + }
  54 + ],
  55 + "rank_window_size": 100,
  56 + "rank_constant": 20
  57 + }
  58 + },
  59 + "field": "your_rerank_text_field",
  60 + "inference_text": "白色 oversized T-shirt",
  61 + "inference_id": ".rerank-v1-elasticsearch",
  62 + "rank_window_size": 50
  63 + }
  64 + },
  65 + "size": 20
  66 +}
  67 +
  68 +
  69 +
1 1
  2 +把knn跟文本相关性的融合方式修改为 "rank": {"rrf": {} }需要license,可以帮我修改源码支持吗?
  3 +
  4 + knn_boost: 2.0
  5 +
  6 +
  7 +{
  8 + "query": { ...全文检索... },
  9 + "knn": { ...向量检索... },
  10 + "rank": {
  11 + "rrf": {}
  12 + }
  13 +}
  14 +
  15 +
  16 +
  17 +
  18 +"image_embedding": {
  19 + "type": "nested",
  20 + "properties": {
  21 + "vector": {
  22 + "type": "dense_vector",
  23 + "dims": 1024,
  24 + "index": true,
  25 + "similarity": "dot_product",
  26 + "element_type": "bfloat16"
  27 + },
  28 + "url": {
  29 + "type": "text"
  30 + }
  31 + }
  32 +},
  33 +去掉 image_embedding_512
  34 +image_embedding改为,一个spu有多个sku向量,每个向量内部properties:
  35 +除了vector url还应该包括
  36 +"image_embedding": {
  37 + "type": "nested",
  38 + "properties": {
  39 + "vector": {
  40 + "type": "dense_vector",
  41 + "dims": 1024,
  42 + "index": true,
  43 + "similarity": "dot_product",
  44 + "element_type": "bfloat16"
  45 + },
  46 + "url": {
  47 + "type": "text"
  48 + }
  49 + }
  50 +},
  51 +
  52 +
  53 +
  54 +
  55 +
  56 +外部需求:
  57 +1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内
  58 +2. ES支持reranker pipeline?
  59 +
2 60
3 @reranker/backends/qwen3_vllm.py 单次 generate 前有进程内锁,同一进程里不会并行多路 vLLM 推理,这个锁有必要吗?是否会影响性能?是否能够打开,使得性能更好?比如这个场景,我一次请求 400 条,分成每64个一个batch,基于我现在的gpu配置,可以再提高并发度吗? 61 @reranker/backends/qwen3_vllm.py 单次 generate 前有进程内锁,同一进程里不会并行多路 vLLM 推理,这个锁有必要吗?是否会影响性能?是否能够打开,使得性能更好?比如这个场景,我一次请求 400 条,分成每64个一个batch,基于我现在的gpu配置,可以再提高并发度吗?
4 测试了,让每个批次都并发地进行,耗时没有变化 62 测试了,让每个批次都并发地进行,耗时没有变化
@@ -383,6 +441,8 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men @@ -383,6 +441,8 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men
383 441
384 442
385 融合打分(已完成,2026-03) 443 融合打分(已完成,2026-03)
  444 +
  445 +以下已经完成:
386 1. `fuse_scores_and_resort` 已改为乘法融合,并通过 `matched_queries` 提取: 446 1. `fuse_scores_and_resort` 已改为乘法融合,并通过 `matched_queries` 提取:
387 - `base_query` 447 - `base_query`
388 - `base_query_trans_*` 448 - `base_query_trans_*`
@@ -397,7 +457,11 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men @@ -397,7 +457,11 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men
397 - `docs/搜索API对接指南.md` 457 - `docs/搜索API对接指南.md`
398 - `docs/Usage-Guide.md` 458 - `docs/Usage-Guide.md`
399 459
400 - 460 +未完成的:
  461 +(归一化、次序融合?还是乘法公式?)
  462 +RRF:先把多路召回稳妥融合
  463 +linear + minmax:让你能精调 knn 和文本的权重
  464 +reranker:对前面召回出来的 top-k 再做“最后一刀”
401 465
402 466
403 467
docs/搜索API对接指南-01-搜索接口.md
@@ -66,9 +66,11 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) @@ -66,9 +66,11 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
66 | `min_score` | float | N | null | 最小相关性分数阈值 | 66 | `min_score` | float | N | null | 最小相关性分数阈值 |
67 | `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) | 67 | `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) |
68 | `debug` | boolean | N | false | 是否返回调试信息 | 68 | `debug` | boolean | N | false | 是否返回调试信息 |
69 -| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`(默认开启)。开启后会先对 ES TopN(`rerank_window`)重排,再按分页截取;若 `from+size>1000`,则不重排,直接按分页从 ES 返回 |  
70 -| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端配置 |  
71 -| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}`;不传则使用服务端配置 | 69 +| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`。当有效开启且 `from + size <= rerank_window` 时:ES 先取前 `rerank_window` 条,重排后再按 `from`/`size` 截取当前页;若 `from + size > rerank_window`,则**不进行**窗口内重排,直接按请求的 `from`/`size` 查询 ES(`rerank_window` 见 `config.yaml` 的 `rerank.rerank_window`,仓库示例默认 400) |
  70 +| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端 `rerank.rerank_query_template` |
  71 +| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}` 等占位符(由 `search/rerank_client.py` 按语言字段拼装);不传则使用服务端 `rerank.rerank_doc_template` |
  72 +
  73 +**与后端代码的对应关系**(便于联调):HTTP `POST /search/` 请求体由 `api/models.py` 的 `SearchRequest` 校验;路由 `api/routes/search.py` 将字段原样传入 `Searcher.search(...)`(含上述三个重排相关字段)。CLI `python main.py search` 目前未暴露这些参数,走配置默认值。
72 | `user_id` | string | N | null | 用户ID(用于个性化,预留) | 74 | `user_id` | string | N | null | 用户ID(用于个性化,预留) |
73 | `session_id` | string | N | null | 会话ID(用于分析,预留) | 75 | `session_id` | string | N | null | 会话ID(用于分析,预留) |
74 76
docs/相关性检索优化说明.md
@@ -281,3 +281,24 @@ title.en: 2026 Korean-style High-waisted Slimming Corduroy Skirt with Slit, Mid- @@ -281,3 +281,24 @@ title.en: 2026 Korean-style High-waisted Slimming Corduroy Skirt with Slit, Mid-
281 Rerank score: 0.9643 281 Rerank score: 0.9643
282 title.en: Black Half-high Collar Base Shirt Women's Autumn and Winter fleece-lined Contrast Color Pure Desire Design Sense Horn Sleeve Ruffled Inner Top 282 title.en: Black Half-high Collar Base Shirt Women's Autumn and Winter fleece-lined Contrast Color Pure Desire Design Sense Horn Sleeve Ruffled Inner Top
283 title.zh: 黑色高领半高领女士秋冬内搭加绒拼色纯欲设计荷叶边袖内衬上衣 283 title.zh: 黑色高领半高领女士秋冬内搭加绒拼色纯欲设计荷叶边袖内衬上衣
  284 +
  285 +
  286 +
  287 +qwen3-0.6b的严重badcase:
  288 +q=牛仔裤
  289 +
  290 +Rerank score: 0.0002
  291 +title.en: Wrangler Womens Cowboy Cut Slim Fit Jean Bleach
  292 +title.zh: Wrangler 女士牛仔裤 牛仔剪裁 紧身版型 漂白色
  293 +
  294 +Rerank score: 0.0168
  295 +title.en: Fleece Lined Tights Sheer Women - Fake Translucent Warm Pantyhose Leggings Sheer Thick Tights for Winter
  296 +title.zh: 加绒透肤女士连裤袜 - 仿透视保暖长筒袜 冬季厚款透肤连裤袜
  297 +
  298 +Rerank score: 0.1366
  299 +title.en: Dockers Men's Classic Fit Workday Khaki Smart 360 FLEX Pants (Standard and Big & Tall)
  300 +title.zh: Dockers 男士经典版型工作日卡其色智能360度弹力裤(标准码与加大码)
  301 +
  302 +Rerank score: 0.0981
  303 +title.en: Lazy One Pajama Shorts for Men, Men's Pajama Bottoms, Sleepwear
  304 +title.zh: 懒人男士睡裤,男式家居裤,睡眠服饰
query/query_parser.py
1 """ 1 """
2 Query parser - main module for query processing. 2 Query parser - main module for query processing.
3 3
4 -Handles query rewriting, translation, and embedding generation. 4 +Responsibilities are intentionally narrow:
  5 +- normalize and rewrite the incoming query
  6 +- detect language and tokenize with HanLP
  7 +- run translation and embedding requests concurrently
  8 +- return parser facts, not Elasticsearch language-planning data
5 """ 9 """
6 10
7 -from typing import Dict, List, Optional, Any, Union, Tuple 11 +from dataclasses import dataclass, field
  12 +from typing import Any, Callable, Dict, List, Optional, Tuple
8 import numpy as np 13 import numpy as np
9 import logging 14 import logging
10 import re 15 import re
@@ -26,7 +31,7 @@ except Exception: # pragma: no cover @@ -26,7 +31,7 @@ except Exception: # pragma: no cover
26 31
27 def simple_tokenize_query(text: str) -> List[str]: 32 def simple_tokenize_query(text: str) -> List[str]:
28 """ 33 """
29 - Lightweight tokenizer for suggestion length / analysis (aligned with QueryParser fallback). 34 + Lightweight tokenizer for suggestion-side heuristics only.
30 35
31 - Consecutive CJK characters form one token 36 - Consecutive CJK characters form one token
32 - Latin / digit runs (with internal hyphens) form tokens 37 - Latin / digit runs (with internal hyphens) form tokens
@@ -37,63 +42,32 @@ def simple_tokenize_query(text: str) -&gt; List[str]: @@ -37,63 +42,32 @@ def simple_tokenize_query(text: str) -&gt; List[str]:
37 return pattern.findall(text) 42 return pattern.findall(text)
38 43
39 44
  45 +@dataclass(slots=True)
40 class ParsedQuery: 46 class ParsedQuery:
41 - """Container for parsed query results."""  
42 -  
43 - def __init__(  
44 - self,  
45 - original_query: str,  
46 - query_normalized: str,  
47 - rewritten_query: Optional[str] = None,  
48 - detected_language: Optional[str] = None,  
49 - translations: Dict[str, str] = None,  
50 - query_vector: Optional[np.ndarray] = None,  
51 - domain: str = "default",  
52 - keywords: str = "",  
53 - token_count: int = 0,  
54 - query_tokens: Optional[List[str]] = None,  
55 - query_text_by_lang: Optional[Dict[str, str]] = None,  
56 - search_langs: Optional[List[str]] = None,  
57 - index_languages: Optional[List[str]] = None,  
58 - source_in_index_languages: bool = True,  
59 - contains_chinese: bool = False,  
60 - contains_english: bool = False,  
61 - ):  
62 - self.original_query = original_query  
63 - self.query_normalized = query_normalized  
64 - self.rewritten_query = rewritten_query or query_normalized  
65 - self.detected_language = detected_language  
66 - self.translations = translations or {}  
67 - self.query_vector = query_vector  
68 - self.domain = domain  
69 - # Query analysis fields  
70 - self.keywords = keywords  
71 - self.token_count = token_count  
72 - self.query_tokens = query_tokens or []  
73 - self.query_text_by_lang = query_text_by_lang or {}  
74 - self.search_langs = search_langs or []  
75 - self.index_languages = index_languages or []  
76 - self.source_in_index_languages = bool(source_in_index_languages)  
77 - self.contains_chinese = bool(contains_chinese)  
78 - self.contains_english = bool(contains_english) 47 + """Container for query parser facts."""
  48 +
  49 + original_query: str
  50 + query_normalized: str
  51 + rewritten_query: str
  52 + detected_language: Optional[str] = None
  53 + translations: Dict[str, str] = field(default_factory=dict)
  54 + query_vector: Optional[np.ndarray] = None
  55 + query_tokens: List[str] = field(default_factory=list)
  56 + contains_chinese: bool = False
  57 + contains_english: bool = False
79 58
80 def to_dict(self) -> Dict[str, Any]: 59 def to_dict(self) -> Dict[str, Any]:
81 """Convert to dictionary representation.""" 60 """Convert to dictionary representation."""
82 - result = { 61 + return {
83 "original_query": self.original_query, 62 "original_query": self.original_query,
84 "query_normalized": self.query_normalized, 63 "query_normalized": self.query_normalized,
85 "rewritten_query": self.rewritten_query, 64 "rewritten_query": self.rewritten_query,
86 "detected_language": self.detected_language, 65 "detected_language": self.detected_language,
87 "translations": self.translations, 66 "translations": self.translations,
88 - "domain": self.domain 67 + "query_tokens": self.query_tokens,
  68 + "contains_chinese": self.contains_chinese,
  69 + "contains_english": self.contains_english,
89 } 70 }
90 - result["query_text_by_lang"] = self.query_text_by_lang  
91 - result["search_langs"] = self.search_langs  
92 - result["index_languages"] = self.index_languages  
93 - result["source_in_index_languages"] = self.source_in_index_languages  
94 - result["contains_chinese"] = self.contains_chinese  
95 - result["contains_english"] = self.contains_english  
96 - return result  
97 71
98 72
99 class QueryParser: 73 class QueryParser:
@@ -102,7 +76,7 @@ class QueryParser: @@ -102,7 +76,7 @@ class QueryParser:
102 1. Normalization 76 1. Normalization
103 2. Query rewriting (brand/category mappings, synonyms) 77 2. Query rewriting (brand/category mappings, synonyms)
104 3. Language detection 78 3. Language detection
105 - 4. Translation to target languages 79 + 4. Translation to caller-provided target languages
106 5. Text embedding generation (for semantic search) 80 5. Text embedding generation (for semantic search)
107 """ 81 """
108 82
@@ -110,7 +84,8 @@ class QueryParser: @@ -110,7 +84,8 @@ class QueryParser:
110 self, 84 self,
111 config: SearchConfig, 85 config: SearchConfig,
112 text_encoder: Optional[TextEmbeddingEncoder] = None, 86 text_encoder: Optional[TextEmbeddingEncoder] = None,
113 - translator: Optional[Any] = None 87 + translator: Optional[Any] = None,
  88 + tokenizer: Optional[Callable[[str], Any]] = None,
114 ): 89 ):
115 """ 90 """
116 Initialize query parser. 91 Initialize query parser.
@@ -128,23 +103,7 @@ class QueryParser: @@ -128,23 +103,7 @@ class QueryParser:
128 self.normalizer = QueryNormalizer() 103 self.normalizer = QueryNormalizer()
129 self.language_detector = LanguageDetector() 104 self.language_detector = LanguageDetector()
130 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) 105 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary)
131 -  
132 - # Optional HanLP components (heavy). If unavailable, fall back to a lightweight tokenizer.  
133 - self._tok = None  
134 - self._pos_tag = None  
135 - if hanlp is not None:  
136 - try:  
137 - logger.info("Initializing HanLP components...")  
138 - self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)  
139 - self._tok.config.output_spans = True  
140 - self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)  
141 - logger.info("HanLP components initialized")  
142 - except Exception as e:  
143 - logger.warning(f"HanLP init failed, falling back to simple tokenizer: {e}")  
144 - self._tok = None  
145 - self._pos_tag = None  
146 - else:  
147 - logger.info("HanLP not installed; using simple tokenizer") 106 + self._tokenizer = tokenizer or self._build_tokenizer()
148 107
149 # Eager initialization (startup-time failure visibility, no lazy init in request path) 108 # Eager initialization (startup-time failure visibility, no lazy init in request path)
150 if self.config.query_config.enable_text_embedding and self._text_encoder is None: 109 if self.config.query_config.enable_text_embedding and self._text_encoder is None:
@@ -170,6 +129,16 @@ class QueryParser: @@ -170,6 +129,16 @@ class QueryParser:
170 """Return pre-initialized translator.""" 129 """Return pre-initialized translator."""
171 return self._translator 130 return self._translator
172 131
  132 + def _build_tokenizer(self) -> Callable[[str], Any]:
  133 + """Build the tokenizer used by query parsing. No fallback path by design."""
  134 + if hanlp is None:
  135 + raise RuntimeError("HanLP is required for QueryParser tokenization")
  136 + logger.info("Initializing HanLP tokenizer...")
  137 + tokenizer = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)
  138 + tokenizer.config.output_spans = True
  139 + logger.info("HanLP tokenizer initialized")
  140 + return tokenizer
  141 +
173 @staticmethod 142 @staticmethod
174 def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str: 143 def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str:
175 """Pick the translation capability for query-time translation (configurable).""" 144 """Pick the translation capability for query-time translation (configurable)."""
@@ -186,41 +155,46 @@ class QueryParser: @@ -186,41 +155,46 @@ class QueryParser:
186 # By default this is `nllb-200-distilled-600m` (multi-lingual local model). 155 # By default this is `nllb-200-distilled-600m` (multi-lingual local model).
187 return config.query_config.default_translation_model 156 return config.query_config.default_translation_model
188 157
189 - def _simple_tokenize(self, text: str) -> List[str]:  
190 - return simple_tokenize_query(text)  
191 -  
192 - def _extract_keywords(self, query: str) -> str:  
193 - """Extract keywords (nouns with length > 1) from query."""  
194 - if self._tok is not None and self._pos_tag is not None:  
195 - tok_result = self._tok(query)  
196 - if not tok_result:  
197 - return ""  
198 - words = [x[0] for x in tok_result]  
199 - pos_tags = self._pos_tag(words)  
200 - keywords = []  
201 - for word, pos in zip(words, pos_tags):  
202 - if len(word) > 1 and isinstance(pos, str) and pos.startswith("N"):  
203 - keywords.append(word)  
204 - return " ".join(keywords)  
205 -  
206 - # Fallback: treat tokens with length > 1 as "keywords"  
207 - tokens = self._simple_tokenize(query)  
208 - keywords = [t for t in tokens if len(t) > 1]  
209 - return " ".join(keywords)  
210 -  
211 - def _get_token_count(self, query: str) -> int:  
212 - """Get token count (HanLP if available, otherwise simple)."""  
213 - if self._tok is not None:  
214 - tok_result = self._tok(query)  
215 - return len(tok_result) if tok_result else 0  
216 - return len(self._simple_tokenize(query)) 158 + @staticmethod
  159 + def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]:
  160 + normalized: List[str] = []
  161 + seen = set()
  162 + for language in languages or []:
  163 + token = str(language or "").strip().lower()
  164 + if not token or token in seen:
  165 + continue
  166 + seen.add(token)
  167 + normalized.append(token)
  168 + return normalized
  169 +
  170 + @staticmethod
  171 + def _extract_tokens(tokenizer_result: Any) -> List[str]:
  172 + """Normalize tokenizer output into a flat token string list."""
  173 + if not tokenizer_result:
  174 + return []
  175 + if isinstance(tokenizer_result, str):
  176 + token = tokenizer_result.strip()
  177 + return [token] if token else []
  178 +
  179 + tokens: List[str] = []
  180 + for item in tokenizer_result:
  181 + token: Optional[str] = None
  182 + if isinstance(item, str):
  183 + token = item
  184 + elif isinstance(item, (list, tuple)) and item:
  185 + token = str(item[0])
  186 + elif item is not None:
  187 + token = str(item)
  188 +
  189 + if token is None:
  190 + continue
  191 + token = token.strip()
  192 + if token:
  193 + tokens.append(token)
  194 + return tokens
217 195
218 def _get_query_tokens(self, query: str) -> List[str]: 196 def _get_query_tokens(self, query: str) -> List[str]:
219 - """Get token list (HanLP if available, otherwise simple)."""  
220 - if self._tok is not None:  
221 - tok_result = self._tok(query)  
222 - return [x[0] for x in tok_result] if tok_result else []  
223 - return self._simple_tokenize(query) 197 + return self._extract_tokens(self._tokenizer(query))
224 198
225 @staticmethod 199 @staticmethod
226 def _contains_cjk(text: str) -> bool: 200 def _contains_cjk(text: str) -> bool:
@@ -237,64 +211,24 @@ class QueryParser: @@ -237,64 +211,24 @@ class QueryParser:
237 return False 211 return False
238 return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token)) 212 return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token))
239 213
240 - @staticmethod  
241 - def _extract_latin_tokens(text: str) -> List[str]:  
242 - """Extract latin word tokens from query text."""  
243 - return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "")  
244 -  
245 - def _infer_supplemental_search_langs(  
246 - self,  
247 - query_text: str,  
248 - detected_lang: str,  
249 - index_langs: List[str],  
250 - ) -> List[str]:  
251 - """  
252 - Infer extra languages to search when the query mixes scripts.  
253 -  
254 - Rules:  
255 - - If any Chinese characters appear, include `zh` when available.  
256 - - If the query contains meaningful latin tokens, include `en` when available.  
257 - "Meaningful" means either:  
258 - 1) at least 2 latin tokens with length >= 4, or  
259 - 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars.  
260 - """  
261 - supplemental: List[str] = []  
262 - normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs}  
263 - normalized_detected = str(detected_lang or "").strip().lower()  
264 - query_text = str(query_text or "")  
265 -  
266 - if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh":  
267 - supplemental.append("zh")  
268 -  
269 - latin_tokens = self._extract_latin_tokens(query_text)  
270 - significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4]  
271 - latin_chars = sum(len(tok) for tok in latin_tokens)  
272 - non_space_chars = len(re.sub(r"\s+", "", query_text))  
273 - latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0  
274 - has_meaningful_english = (  
275 - len(significant_latin_tokens) >= 2 or  
276 - (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2)  
277 - )  
278 -  
279 - if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en":  
280 - supplemental.append("en")  
281 -  
282 - return supplemental  
283 -  
284 def parse( 214 def parse(
285 self, 215 self,
286 query: str, 216 query: str,
287 tenant_id: Optional[str] = None, 217 tenant_id: Optional[str] = None,
288 generate_vector: bool = True, 218 generate_vector: bool = True,
289 - context: Optional[Any] = None 219 + context: Optional[Any] = None,
  220 + target_languages: Optional[List[str]] = None,
290 ) -> ParsedQuery: 221 ) -> ParsedQuery:
291 """ 222 """
292 Parse query through all processing stages. 223 Parse query through all processing stages.
293 224
294 Args: 225 Args:
295 query: Raw query string 226 query: Raw query string
  227 + tenant_id: Deprecated and ignored by QueryParser. Kept temporarily
  228 + to avoid a wider refactor in this first step.
296 generate_vector: Whether to generate query embedding 229 generate_vector: Whether to generate query embedding
297 context: Optional request context for tracking and logging 230 context: Optional request context for tracking and logging
  231 + target_languages: Translation target languages decided by the caller
298 232
299 Returns: 233 Returns:
300 ParsedQuery object with all processing results 234 ParsedQuery object with all processing results
@@ -325,15 +259,9 @@ class QueryParser: @@ -325,15 +259,9 @@ class QueryParser:
325 if context: 259 if context:
326 context.store_intermediate_result('query_normalized', normalized) 260 context.store_intermediate_result('query_normalized', normalized)
327 261
328 - # Extract domain if present (e.g., "brand:Nike" -> domain="brand", query="Nike")  
329 - domain, query_text = self.normalizer.extract_domain_query(normalized)  
330 - log_debug(f"Domain extraction | Domain: '{domain}', Query: '{query_text}'")  
331 - if context:  
332 - context.store_intermediate_result('extracted_domain', domain)  
333 - context.store_intermediate_result('domain_query', query_text)  
334 -  
335 # Stage 2: Query rewriting 262 # Stage 2: Query rewriting
336 - rewritten = None 263 + query_text = normalized
  264 + rewritten = normalized
337 if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists 265 if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists
338 rewritten = self.rewriter.rewrite(query_text) 266 rewritten = self.rewriter.rewrite(query_text)
339 if rewritten != query_text: 267 if rewritten != query_text:
@@ -351,43 +279,51 @@ class QueryParser: @@ -351,43 +279,51 @@ class QueryParser:
351 log_info(f"Language detection | Detected language: {detected_lang}") 279 log_info(f"Language detection | Detected language: {detected_lang}")
352 if context: 280 if context:
353 context.store_intermediate_result('detected_language', detected_lang) 281 context.store_intermediate_result('detected_language', detected_lang)
  282 + # Stage 4: Query analysis (tokenization + script flags)
  283 + query_tokens = self._get_query_tokens(query_text)
  284 + contains_chinese = self._contains_cjk(query_text)
  285 + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)
  286 +
  287 + log_debug(
  288 + f"Query analysis | Query tokens: {query_tokens} | "
  289 + f"contains_chinese={contains_chinese} | contains_english={contains_english}"
  290 + )
  291 + if context:
  292 + context.store_intermediate_result('query_tokens', query_tokens)
  293 + context.store_intermediate_result('contains_chinese', contains_chinese)
  294 + context.store_intermediate_result('contains_english', contains_english)
354 295
355 - # Stage 4: Translation — always submit to thread pool; results are collected together with  
356 - # embedding in one wait() that uses a configurable budget (short vs long by source-in-index). 296 + # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the
  297 + # caller decides translation targets and later search-field planning.
357 translations: Dict[str, str] = {} 298 translations: Dict[str, str] = {}
358 - translation_futures: Dict[str, Any] = {}  
359 - translation_executor: Optional[ThreadPoolExecutor] = None  
360 - index_langs: List[str] = [] 299 + future_to_task: Dict[Any, Tuple[str, Optional[str]]] = {}
  300 + async_executor: Optional[ThreadPoolExecutor] = None
361 detected_norm = str(detected_lang or "").strip().lower() 301 detected_norm = str(detected_lang or "").strip().lower()
  302 + normalized_targets = self._normalize_language_codes(target_languages)
  303 + translation_targets = [lang for lang in normalized_targets if lang != detected_norm]
  304 +
  305 + # Stage 6: Text embedding - async execution
  306 + query_vector = None
  307 + should_generate_embedding = (
  308 + generate_vector and
  309 + self.config.query_config.enable_text_embedding
  310 + )
  311 +
  312 + task_count = len(translation_targets) + (1 if should_generate_embedding else 0)
  313 + if task_count > 0:
  314 + async_executor = ThreadPoolExecutor(
  315 + max_workers=max(1, min(task_count, 4)),
  316 + thread_name_prefix="query-enrichment",
  317 + )
362 318
363 try: 319 try:
364 - # 根据租户配置的 index_languages 决定翻译目标语言  
365 - from config.tenant_config_loader import get_tenant_config_loader  
366 - tenant_loader = get_tenant_config_loader()  
367 - tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default")  
368 - raw_index_langs = tenant_cfg.get("index_languages") or []  
369 - index_langs = []  
370 - seen_langs = set()  
371 - for lang in raw_index_langs:  
372 - norm_lang = str(lang or "").strip().lower()  
373 - if not norm_lang or norm_lang in seen_langs:  
374 - continue  
375 - seen_langs.add(norm_lang)  
376 - index_langs.append(norm_lang)  
377 -  
378 - target_langs_for_translation = [lang for lang in index_langs if lang != detected_norm]  
379 -  
380 - if target_langs_for_translation:  
381 - translation_executor = ThreadPoolExecutor(  
382 - max_workers=max(1, min(len(target_langs_for_translation), 4)),  
383 - thread_name_prefix="query-translation",  
384 - )  
385 - for lang in target_langs_for_translation: 320 + if async_executor is not None:
  321 + for lang in translation_targets:
386 model_name = self._pick_query_translation_model(detected_lang, lang, self.config) 322 model_name = self._pick_query_translation_model(detected_lang, lang, self.config)
387 log_debug( 323 log_debug(
388 f"Submitting query translation | source={detected_lang} target={lang} model={model_name}" 324 f"Submitting query translation | source={detected_lang} target={lang} model={model_name}"
389 ) 325 )
390 - translation_futures[lang] = translation_executor.submit( 326 + future = async_executor.submit(
391 self.translator.translate, 327 self.translator.translate,
392 query_text, 328 query_text,
393 lang, 329 lang,
@@ -395,107 +331,61 @@ class QueryParser: @@ -395,107 +331,61 @@ class QueryParser:
395 "ecommerce_search_query", 331 "ecommerce_search_query",
396 model_name, 332 model_name,
397 ) 333 )
398 -  
399 - if context:  
400 - context.store_intermediate_result('translations', translations)  
401 - for lang, translation in translations.items():  
402 - if translation:  
403 - context.store_intermediate_result(f'translation_{lang}', translation)  
404 - 334 + future_to_task[future] = ("translation", lang)
  335 +
  336 + if should_generate_embedding:
  337 + if self.text_encoder is None:
  338 + raise RuntimeError("Text embedding is enabled but text encoder is not initialized")
  339 + log_debug("Submitting query vector generation")
  340 +
  341 + def _encode_query_vector() -> Optional[np.ndarray]:
  342 + arr = self.text_encoder.encode([query_text], priority=1)
  343 + if arr is None or len(arr) == 0:
  344 + return None
  345 + vec = arr[0]
  346 + if vec is None:
  347 + return None
  348 + return np.asarray(vec, dtype=np.float32)
  349 +
  350 + future = async_executor.submit(_encode_query_vector)
  351 + future_to_task[future] = ("embedding", None)
405 except Exception as e: 352 except Exception as e:
406 - error_msg = f"Translation failed | Error: {str(e)}" 353 + error_msg = f"Async query enrichment submission failed | Error: {str(e)}"
407 log_info(error_msg) 354 log_info(error_msg)
408 if context: 355 if context:
409 context.add_warning(error_msg) 356 context.add_warning(error_msg)
  357 + if async_executor is not None:
  358 + async_executor.shutdown(wait=False)
  359 + async_executor = None
  360 + future_to_task.clear()
410 361
411 - # Stage 5: Query analysis (keywords, token count, query_tokens)  
412 - keywords = self._extract_keywords(query_text)  
413 - query_tokens = self._get_query_tokens(query_text)  
414 - token_count = len(query_tokens)  
415 - contains_chinese = self._contains_cjk(query_text)  
416 - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)  
417 -  
418 - log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | "  
419 - f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | "  
420 - f"contains_english={contains_english}")  
421 - if context:  
422 - context.store_intermediate_result('keywords', keywords)  
423 - context.store_intermediate_result('token_count', token_count)  
424 - context.store_intermediate_result('query_tokens', query_tokens)  
425 - context.store_intermediate_result('contains_chinese', contains_chinese)  
426 - context.store_intermediate_result('contains_english', contains_english)  
427 -  
428 - # Stage 6: Text embedding (only for non-short queries) - async execution  
429 - query_vector = None  
430 - embedding_future = None  
431 - should_generate_embedding = (  
432 - generate_vector and  
433 - self.config.query_config.enable_text_embedding and  
434 - domain == "default"  
435 - )  
436 -  
437 - encoding_executor = None  
438 - if should_generate_embedding:  
439 - try:  
440 - if self.text_encoder is None:  
441 - raise RuntimeError("Text embedding is enabled but text encoder is not initialized")  
442 - log_debug("Starting query vector generation (async)")  
443 - # Submit encoding task to thread pool for async execution  
444 - encoding_executor = ThreadPoolExecutor(max_workers=1)  
445 - def _encode_query_vector() -> Optional[np.ndarray]:  
446 - arr = self.text_encoder.encode([query_text], priority=1)  
447 - if arr is None or len(arr) == 0:  
448 - return None  
449 - vec = arr[0]  
450 - return vec if isinstance(vec, np.ndarray) else None  
451 - embedding_future = encoding_executor.submit(  
452 - _encode_query_vector  
453 - )  
454 - except Exception as e:  
455 - error_msg = f"Query vector generation task submission failed | Error: {str(e)}"  
456 - log_info(error_msg)  
457 - if context:  
458 - context.add_warning(error_msg)  
459 - encoding_executor = None  
460 - embedding_future = None  
461 -  
462 - # Wait for translation + embedding concurrently; shared budget (ms) depends on whether  
463 - # the detected language is in tenant index_languages. 362 + # Wait for translation + embedding concurrently; shared budget depends on whether
  363 + # the detected language belongs to caller-provided target_languages.
464 qc = self.config.query_config 364 qc = self.config.query_config
465 - source_in_index_for_budget = detected_norm in index_langs 365 + source_in_target_languages = bool(normalized_targets) and detected_norm in normalized_targets
466 budget_ms = ( 366 budget_ms = (
467 qc.translation_embedding_wait_budget_ms_source_in_index 367 qc.translation_embedding_wait_budget_ms_source_in_index
468 - if source_in_index_for_budget 368 + if source_in_target_languages
469 else qc.translation_embedding_wait_budget_ms_source_not_in_index 369 else qc.translation_embedding_wait_budget_ms_source_not_in_index
470 ) 370 )
471 budget_sec = max(0.0, float(budget_ms) / 1000.0) 371 budget_sec = max(0.0, float(budget_ms) / 1000.0)
472 372
473 - if translation_futures: 373 + if translation_targets:
474 log_info( 374 log_info(
475 f"Translation+embedding shared wait budget | budget_ms={budget_ms} | " 375 f"Translation+embedding shared wait budget | budget_ms={budget_ms} | "
476 - f"source_in_index_languages={source_in_index_for_budget} | "  
477 - f"translation_targets={list(translation_futures.keys())}" 376 + f"source_in_target_languages={source_in_target_languages} | "
  377 + f"translation_targets={translation_targets}"
478 ) 378 )
479 379
480 - if translation_futures or embedding_future: 380 + if future_to_task:
481 log_debug( 381 log_debug(
482 f"Waiting for async tasks (translation+embedding) | budget_ms={budget_ms} | " 382 f"Waiting for async tasks (translation+embedding) | budget_ms={budget_ms} | "
483 - f"source_in_index_languages={source_in_index_for_budget}" 383 + f"source_in_target_languages={source_in_target_languages}"
484 ) 384 )
485 385
486 - all_futures: List[Any] = []  
487 - future_to_lang: Dict[Any, tuple] = {}  
488 - for lang, future in translation_futures.items():  
489 - all_futures.append(future)  
490 - future_to_lang[future] = ("translation", lang)  
491 -  
492 - if embedding_future:  
493 - all_futures.append(embedding_future)  
494 - future_to_lang[embedding_future] = ("embedding", None)  
495 -  
496 - done, not_done = wait(all_futures, timeout=budget_sec) 386 + done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec)
497 for future in done: 387 for future in done:
498 - task_type, lang = future_to_lang[future] 388 + task_type, lang = future_to_task[future]
499 try: 389 try:
500 result = future.result() 390 result = future.result()
501 if task_type == "translation": 391 if task_type == "translation":
@@ -528,7 +418,7 @@ class QueryParser: @@ -528,7 +418,7 @@ class QueryParser:
528 418
529 if not_done: 419 if not_done:
530 for future in not_done: 420 for future in not_done:
531 - task_type, lang = future_to_lang[future] 421 + task_type, lang = future_to_task[future]
532 if task_type == "translation": 422 if task_type == "translation":
533 timeout_msg = ( 423 timeout_msg = (
534 f"Translation timeout (>{budget_ms}ms) | Language: {lang} | " 424 f"Translation timeout (>{budget_ms}ms) | Language: {lang} | "
@@ -542,68 +432,21 @@ class QueryParser: @@ -542,68 +432,21 @@ class QueryParser:
542 if context: 432 if context:
543 context.add_warning(timeout_msg) 433 context.add_warning(timeout_msg)
544 434
545 - if encoding_executor:  
546 - encoding_executor.shutdown(wait=False)  
547 - if translation_executor:  
548 - translation_executor.shutdown(wait=False) 435 + if async_executor:
  436 + async_executor.shutdown(wait=False)
549 437
550 if translations and context: 438 if translations and context:
551 context.store_intermediate_result("translations", translations) 439 context.store_intermediate_result("translations", translations)
552 -  
553 - # Build language-scoped query plan: source language + available translations  
554 - query_text_by_lang: Dict[str, str] = {}  
555 - if query_text:  
556 - query_text_by_lang[detected_lang] = query_text  
557 - for lang, translated_text in (translations or {}).items():  
558 - if translated_text and str(translated_text).strip():  
559 - query_text_by_lang[str(lang).strip().lower()] = str(translated_text)  
560 -  
561 - supplemental_search_langs = self._infer_supplemental_search_langs(  
562 - query_text=query_text,  
563 - detected_lang=detected_lang,  
564 - index_langs=index_langs,  
565 - )  
566 - for lang in supplemental_search_langs:  
567 - if lang not in query_text_by_lang and query_text:  
568 - # Use the original mixed-script query as a robust fallback probe for that language field set.  
569 - query_text_by_lang[lang] = query_text  
570 -  
571 - source_in_index_languages = detected_norm in index_langs  
572 - ordered_search_langs: List[str] = []  
573 - seen_order = set()  
574 - if detected_lang in query_text_by_lang:  
575 - ordered_search_langs.append(detected_lang)  
576 - seen_order.add(detected_lang)  
577 - for lang in index_langs:  
578 - if lang in query_text_by_lang and lang not in seen_order:  
579 - ordered_search_langs.append(lang)  
580 - seen_order.add(lang)  
581 - for lang in query_text_by_lang.keys():  
582 - if lang not in seen_order:  
583 - ordered_search_langs.append(lang)  
584 - seen_order.add(lang)  
585 -  
586 - if context:  
587 - context.store_intermediate_result("search_langs", ordered_search_langs)  
588 - context.store_intermediate_result("query_text_by_lang", query_text_by_lang)  
589 - context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs)  
590 440
591 # Build result 441 # Build result
592 result = ParsedQuery( 442 result = ParsedQuery(
593 original_query=query, 443 original_query=query,
594 query_normalized=normalized, 444 query_normalized=normalized,
595 - rewritten_query=rewritten, 445 + rewritten_query=query_text,
596 detected_language=detected_lang, 446 detected_language=detected_lang,
597 translations=translations, 447 translations=translations,
598 query_vector=query_vector, 448 query_vector=query_vector,
599 - domain=domain,  
600 - keywords=keywords,  
601 - token_count=token_count,  
602 query_tokens=query_tokens, 449 query_tokens=query_tokens,
603 - query_text_by_lang=query_text_by_lang,  
604 - search_langs=ordered_search_langs,  
605 - index_languages=index_langs,  
606 - source_in_index_languages=source_in_index_languages,  
607 contains_chinese=contains_chinese, 450 contains_chinese=contains_chinese,
608 contains_english=contains_english, 451 contains_english=contains_english,
609 ) 452 )
@@ -611,14 +454,13 @@ class QueryParser: @@ -611,14 +454,13 @@ class QueryParser:
611 if context and hasattr(context, 'logger'): 454 if context and hasattr(context, 'logger'):
612 context.logger.info( 455 context.logger.info(
613 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " 456 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | "
614 - f"Language: {detected_lang} | Domain: {domain} | "  
615 f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}", 457 f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}",
616 extra={'reqid': context.reqid, 'uid': context.uid} 458 extra={'reqid': context.reqid, 'uid': context.uid}
617 ) 459 )
618 else: 460 else:
619 logger.info( 461 logger.info(
620 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " 462 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | "
621 - f"Language: {detected_lang} | Domain: {domain}" 463 + f"Language: {detected_lang}"
622 ) 464 )
623 465
624 return result 466 return result
search/es_query_builder.py
@@ -159,7 +159,8 @@ class ESQueryBuilder: @@ -159,7 +159,8 @@ class ESQueryBuilder:
159 knn_k: int = 50, 159 knn_k: int = 50,
160 knn_num_candidates: int = 200, 160 knn_num_candidates: int = 200,
161 min_score: Optional[float] = None, 161 min_score: Optional[float] = None,
162 - parsed_query: Optional[Any] = None 162 + parsed_query: Optional[Any] = None,
  163 + index_languages: Optional[List[str]] = None,
163 ) -> Dict[str, Any]: 164 ) -> Dict[str, Any]:
164 """ 165 """
165 Build complete ES query with post_filter support for multi-select faceting. 166 Build complete ES query with post_filter support for multi-select faceting.
@@ -202,7 +203,11 @@ class ESQueryBuilder: @@ -202,7 +203,11 @@ class ESQueryBuilder:
202 # Text recall (always include if query_text exists) 203 # Text recall (always include if query_text exists)
203 if query_text: 204 if query_text:
204 # Unified text query strategy 205 # Unified text query strategy
205 - text_query = self._build_advanced_text_query(query_text, parsed_query) 206 + text_query = self._build_advanced_text_query(
  207 + query_text,
  208 + parsed_query,
  209 + index_languages=index_languages,
  210 + )
206 recall_clauses.append(text_query) 211 recall_clauses.append(text_query)
207 212
208 # Embedding recall (KNN - separate from query, handled below) 213 # Embedding recall (KNN - separate from query, handled below)
@@ -503,13 +508,31 @@ class ESQueryBuilder: @@ -503,13 +508,31 @@ class ESQueryBuilder:
503 # Currently using unified embedding field 508 # Currently using unified embedding field
504 return self.text_embedding_field or "title_embedding" 509 return self.text_embedding_field or "title_embedding"
505 510
506 - def _build_advanced_text_query(self, query_text: str, parsed_query: Optional[Any] = None) -> Dict[str, Any]: 511 + @staticmethod
  512 + def _normalize_language_list(languages: Optional[List[str]]) -> List[str]:
  513 + normalized: List[str] = []
  514 + seen = set()
  515 + for language in languages or []:
  516 + token = str(language or "").strip().lower()
  517 + if not token or token in seen:
  518 + continue
  519 + seen.add(token)
  520 + normalized.append(token)
  521 + return normalized
  522 +
  523 + def _build_advanced_text_query(
  524 + self,
  525 + query_text: str,
  526 + parsed_query: Optional[Any] = None,
  527 + *,
  528 + index_languages: Optional[List[str]] = None,
  529 + ) -> Dict[str, Any]:
507 """ 530 """
508 - Build advanced text query using should clauses with primary and fallback lexical strategies. 531 + Build advanced text query using base and translated lexical clauses.
509 532
510 Unified implementation: 533 Unified implementation:
511 - base_query: source-language clause 534 - base_query: source-language clause
512 - - translation queries: target-language clauses from search_langs/query_text_by_lang 535 + - translation queries: target-language clauses from translations
513 - KNN query: added separately in build_query 536 - KNN query: added separately in build_query
514 537
515 Args: 538 Args:
@@ -520,55 +543,42 @@ class ESQueryBuilder: @@ -520,55 +543,42 @@ class ESQueryBuilder:
520 ES bool query with should clauses 543 ES bool query with should clauses
521 """ 544 """
522 should_clauses = [] 545 should_clauses = []
523 -  
524 - # Get query analysis from parsed_query  
525 - query_text_by_lang: Dict[str, str] = {}  
526 - search_langs: List[str] = []  
527 source_lang = self.default_language 546 source_lang = self.default_language
528 - source_in_index_languages = True  
529 - index_languages: List[str] = []  
530 - 547 + translations: Dict[str, str] = {}
531 contains_chinese = False 548 contains_chinese = False
532 contains_english = False 549 contains_english = False
  550 + normalized_index_languages = self._normalize_language_list(index_languages)
  551 +
533 if parsed_query: 552 if parsed_query:
534 - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {}  
535 - search_langs = getattr(parsed_query, "search_langs", None) or []  
536 detected_lang = getattr(parsed_query, "detected_language", None) 553 detected_lang = getattr(parsed_query, "detected_language", None)
537 source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language 554 source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language
538 - source_in_index_languages = bool(  
539 - getattr(parsed_query, "source_in_index_languages", True)  
540 - )  
541 - index_languages = getattr(parsed_query, "index_languages", None) or [] 555 + translations = getattr(parsed_query, "translations", None) or {}
542 contains_chinese = bool(getattr(parsed_query, "contains_chinese", False)) 556 contains_chinese = bool(getattr(parsed_query, "contains_chinese", False))
543 contains_english = bool(getattr(parsed_query, "contains_english", False)) 557 contains_english = bool(getattr(parsed_query, "contains_english", False))
544 558
545 - if not query_text_by_lang:  
546 - query_text_by_lang = {source_lang: query_text}  
547 - if source_lang not in query_text_by_lang and query_text:  
548 - query_text_by_lang[source_lang] = query_text  
549 - if not search_langs:  
550 - search_langs = list(query_text_by_lang.keys())  
551 -  
552 - # Base + translated clauses based on language plan.  
553 - for lang in search_langs:  
554 - lang_query = query_text_by_lang.get(lang)  
555 - if not lang_query:  
556 - continue 559 + source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language
  560 + source_in_index_languages = (
  561 + True if not normalized_index_languages else source_lang in normalized_index_languages
  562 + )
  563 +
  564 + base_query_text = (
  565 + getattr(parsed_query, "rewritten_query", None) if parsed_query else None
  566 + ) or query_text
  567 +
  568 + def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None:
  569 + nonlocal should_clauses
557 all_specs, _ = self._build_match_field_specs(lang) 570 all_specs, _ = self._build_match_field_specs(lang)
558 expanded_specs = self._expand_match_field_specs_for_mixed_script( 571 expanded_specs = self._expand_match_field_specs_for_mixed_script(
559 lang, 572 lang,
560 all_specs, 573 all_specs,
561 contains_chinese, 574 contains_chinese,
562 contains_english, 575 contains_english,
563 - index_languages, 576 + normalized_index_languages,
564 ) 577 )
565 match_fields = self._format_match_field_specs(expanded_specs) 578 match_fields = self._format_match_field_specs(expanded_specs)
566 if not match_fields: 579 if not match_fields:
567 - continue  
568 -  
569 - is_source = (lang == source_lang) 580 + return
570 clause_boost = 1.0 581 clause_boost = 1.0
571 - clause_name = "base_query" if is_source else f"base_query_trans_{lang}"  
572 minimum_should_match = ( 582 minimum_should_match = (
573 self.base_minimum_should_match if is_source else self.translation_minimum_should_match 583 self.base_minimum_should_match if is_source else self.translation_minimum_should_match
574 ) 584 )
@@ -596,44 +606,17 @@ class ESQueryBuilder: @@ -596,44 +606,17 @@ class ESQueryBuilder:
596 "multi_match": clause["multi_match"] 606 "multi_match": clause["multi_match"]
597 }) 607 })
598 608
599 - # Fallback: source language is not indexed and translation for some index languages is missing.  
600 - # Use original query text on missing index-language fields with a low boost.  
601 - if not source_in_index_languages and query_text and index_languages:  
602 - normalized_index_langs: List[str] = []  
603 - seen_langs = set()  
604 - for lang in index_languages:  
605 - norm_lang = str(lang or "").strip().lower()  
606 - if not norm_lang or norm_lang in seen_langs:  
607 - continue  
608 - seen_langs.add(norm_lang)  
609 - normalized_index_langs.append(norm_lang) 609 + if base_query_text:
  610 + append_clause(source_lang, base_query_text, "base_query", True)
610 611
611 - for lang in normalized_index_langs:  
612 - if lang == source_lang:  
613 - continue  
614 - if lang in query_text_by_lang:  
615 - continue  
616 - fb_specs, _ = self._build_match_field_specs(lang)  
617 - expanded_fb = self._expand_match_field_specs_for_mixed_script(  
618 - lang,  
619 - fb_specs,  
620 - contains_chinese,  
621 - contains_english,  
622 - index_languages,  
623 - )  
624 - match_fields = self._format_match_field_specs(expanded_fb)  
625 - if not match_fields:  
626 - continue  
627 - should_clauses.append({  
628 - "multi_match": {  
629 - "_name": f"fallback_original_query_{lang}",  
630 - "query": query_text,  
631 - "fields": match_fields,  
632 - "minimum_should_match": self.translation_minimum_should_match,  
633 - "tie_breaker": self.tie_breaker_base_query,  
634 - "boost": self.original_query_fallback_boost_when_translation_missing,  
635 - }  
636 - }) 612 + for lang, translated_text in translations.items():
  613 + normalized_lang = str(lang or "").strip().lower()
  614 + normalized_text = str(translated_text or "").strip()
  615 + if not normalized_lang or not normalized_text:
  616 + continue
  617 + if normalized_lang == source_lang and normalized_text == base_query_text:
  618 + continue
  619 + append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False)
637 620
638 # Fallback to a simple query when language fields cannot be resolved. 621 # Fallback to a simple query when language fields cannot be resolved.
639 if not should_clauses: 622 if not should_clauses:
search/searcher.py
@@ -516,10 +516,19 @@ class Searcher: @@ -516,10 +516,19 @@ class Searcher:
516 range_filters: Range filters for numeric fields 516 range_filters: Range filters for numeric fields
517 facets: Facet configurations for faceted search 517 facets: Facet configurations for faceted search
518 min_score: Minimum score threshold 518 min_score: Minimum score threshold
519 - context: Request context for tracking (created if not provided) 519 + context: Request context for tracking (required)
520 sort_by: Field name for sorting 520 sort_by: Field name for sorting
521 sort_order: Sort order: 'asc' or 'desc' 521 sort_order: Sort order: 'asc' or 'desc'
522 debug: Enable debug information output 522 debug: Enable debug information output
  523 + language: Response / field selection language hint (e.g. zh, en)
  524 + sku_filter_dimension: SKU grouping dimensions for per-SPU variant pick
  525 + enable_rerank: If None, use ``config.rerank.enabled``; if set, overrides
  526 + whether the rerank provider is invoked (subject to rerank window).
  527 + rerank_query_template: Override for rerank query text template; None uses
  528 + ``config.rerank.rerank_query_template`` (e.g. ``"{query}"``).
  529 + rerank_doc_template: Override for per-hit document text passed to rerank;
  530 + None uses ``config.rerank.rerank_doc_template``. Placeholders are
  531 + resolved in ``search/rerank_client.py``.
523 532
524 Returns: 533 Returns:
525 SearchResult object with formatted results 534 SearchResult object with formatted results
@@ -592,7 +601,8 @@ class Searcher: @@ -592,7 +601,8 @@ class Searcher:
592 query, 601 query,
593 tenant_id=tenant_id, 602 tenant_id=tenant_id,
594 generate_vector=enable_embedding, 603 generate_vector=enable_embedding,
595 - context=context 604 + context=context,
  605 + target_languages=index_langs if enable_translation else [],
596 ) 606 )
597 # Store query analysis results in context 607 # Store query analysis results in context
598 context.store_query_analysis( 608 context.store_query_analysis(
@@ -602,7 +612,7 @@ class Searcher: @@ -602,7 +612,7 @@ class Searcher:
602 detected_language=parsed_query.detected_language, 612 detected_language=parsed_query.detected_language,
603 translations=parsed_query.translations, 613 translations=parsed_query.translations,
604 query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, 614 query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None,
605 - domain=parsed_query.domain, 615 + domain="default",
606 is_simple_query=True 616 is_simple_query=True
607 ) 617 )
608 618
@@ -610,7 +620,6 @@ class Searcher: @@ -610,7 +620,6 @@ class Searcher:
610 f"查询解析完成 | 原查询: '{parsed_query.original_query}' | " 620 f"查询解析完成 | 原查询: '{parsed_query.original_query}' | "
611 f"重写后: '{parsed_query.rewritten_query}' | " 621 f"重写后: '{parsed_query.rewritten_query}' | "
612 f"语言: {parsed_query.detected_language} | " 622 f"语言: {parsed_query.detected_language} | "
613 - f"域: {parsed_query.domain} | "  
614 f"向量: {'是' if parsed_query.query_vector is not None else '否'}", 623 f"向量: {'是' if parsed_query.query_vector is not None else '否'}",
615 extra={'reqid': context.reqid, 'uid': context.uid} 624 extra={'reqid': context.reqid, 'uid': context.uid}
616 ) 625 )
@@ -643,7 +652,8 @@ class Searcher: @@ -643,7 +652,8 @@ class Searcher:
643 from_=es_fetch_from, 652 from_=es_fetch_from,
644 enable_knn=enable_embedding and parsed_query.query_vector is not None, 653 enable_knn=enable_embedding and parsed_query.query_vector is not None,
645 min_score=min_score, 654 min_score=min_score,
646 - parsed_query=parsed_query 655 + parsed_query=parsed_query,
  656 + index_languages=index_langs,
647 ) 657 )
648 658
649 # Add facets for faceted search 659 # Add facets for faceted search
@@ -985,9 +995,6 @@ class Searcher: @@ -985,9 +995,6 @@ class Searcher:
985 "rewritten_query": context.query_analysis.rewritten_query, 995 "rewritten_query": context.query_analysis.rewritten_query,
986 "detected_language": context.query_analysis.detected_language, 996 "detected_language": context.query_analysis.detected_language,
987 "translations": context.query_analysis.translations, 997 "translations": context.query_analysis.translations,
988 - "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}),  
989 - "search_langs": context.get_intermediate_result("search_langs", []),  
990 - "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []),  
991 "has_vector": context.query_analysis.query_vector is not None, 998 "has_vector": context.query_analysis.query_vector is not None,
992 "is_simple_query": context.query_analysis.is_simple_query, 999 "is_simple_query": context.query_analysis.is_simple_query,
993 "domain": context.query_analysis.domain 1000 "domain": context.query_analysis.domain
tests/test_embedding_pipeline.py
@@ -73,6 +73,10 @@ class _FakeQueryEncoder: @@ -73,6 +73,10 @@ class _FakeQueryEncoder:
73 return np.array([np.array([0.11, 0.22, 0.33], dtype=np.float32) for _ in sentences], dtype=object) 73 return np.array([np.array([0.11, 0.22, 0.33], dtype=np.float32) for _ in sentences], dtype=object)
74 74
75 75
  76 +def _tokenizer(text):
  77 + return str(text).split()
  78 +
  79 +
76 class _FakeEmbeddingCache: 80 class _FakeEmbeddingCache:
77 def __init__(self): 81 def __init__(self):
78 self.store: Dict[str, np.ndarray] = {} 82 self.store: Dict[str, np.ndarray] = {}
@@ -210,6 +214,7 @@ def test_query_parser_generates_query_vector_with_encoder(): @@ -210,6 +214,7 @@ def test_query_parser_generates_query_vector_with_encoder():
210 config=_build_test_config(), 214 config=_build_test_config(),
211 text_encoder=encoder, 215 text_encoder=encoder,
212 translator=_FakeTranslator(), 216 translator=_FakeTranslator(),
  217 + tokenizer=_tokenizer,
213 ) 218 )
214 219
215 parsed = parser.parse("red dress", tenant_id="162", generate_vector=True) 220 parsed = parser.parse("red dress", tenant_id="162", generate_vector=True)
@@ -224,6 +229,7 @@ def test_query_parser_skips_query_vector_when_disabled(): @@ -224,6 +229,7 @@ def test_query_parser_skips_query_vector_when_disabled():
224 config=_build_test_config(), 229 config=_build_test_config(),
225 text_encoder=_FakeQueryEncoder(), 230 text_encoder=_FakeQueryEncoder(),
226 translator=_FakeTranslator(), 231 translator=_FakeTranslator(),
  232 + tokenizer=_tokenizer,
227 ) 233 )
228 234
229 parsed = parser.parse("red dress", tenant_id="162", generate_vector=False) 235 parsed = parser.parse("red dress", tenant_id="162", generate_vector=False)
tests/test_es_query_builder.py
@@ -65,21 +65,42 @@ def test_knn_prefilter_not_added_without_filters(): @@ -65,21 +65,42 @@ def test_knn_prefilter_not_added_without_filters():
65 assert q["knn"]["_name"] == "knn_query" 65 assert q["knn"]["_name"] == "knn_query"
66 66
67 67
68 -def test_text_query_contains_only_base_translation_and_fallback_named_queries(): 68 +def test_text_query_contains_only_base_and_translation_named_queries():
69 qb = _builder() 69 qb = _builder()
70 parsed_query = SimpleNamespace( 70 parsed_query = SimpleNamespace(
71 - query_text_by_lang={"en": "dress", "zh": "连衣裙"},  
72 - search_langs=["en", "zh"], 71 + rewritten_query="dress",
73 detected_language="en", 72 detected_language="en",
74 - source_in_index_languages=False,  
75 - index_languages=["en", "zh", "fr"], 73 + translations={"en": "dress", "zh": "连衣裙"},
76 ) 74 )
77 75
78 - q = qb.build_query(query_text="dress", parsed_query=parsed_query, enable_knn=False) 76 + q = qb.build_query(
  77 + query_text="dress",
  78 + parsed_query=parsed_query,
  79 + enable_knn=False,
  80 + index_languages=["en", "zh", "fr"],
  81 + )
79 should = q["query"]["bool"]["should"] 82 should = q["query"]["bool"]["should"]
80 names = [clause["multi_match"]["_name"] for clause in should] 83 names = [clause["multi_match"]["_name"] for clause in should]
81 84
82 - assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"] 85 + assert names == ["base_query", "base_query_trans_zh"]
  86 +
  87 +
  88 +def test_text_query_skips_duplicate_translation_same_as_base():
  89 + qb = _builder()
  90 + parsed_query = SimpleNamespace(
  91 + rewritten_query="dress",
  92 + detected_language="en",
  93 + translations={"en": "dress"},
  94 + )
  95 +
  96 + q = qb.build_query(
  97 + query_text="dress",
  98 + parsed_query=parsed_query,
  99 + enable_knn=False,
  100 + index_languages=["en", "zh"],
  101 + )
  102 +
  103 + assert q["query"]["multi_match"]["_name"] == "base_query"
83 104
84 105
85 def test_mixed_script_merges_en_fields_into_zh_clause(): 106 def test_mixed_script_merges_en_fields_into_zh_clause():
@@ -91,22 +112,25 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): @@ -91,22 +112,25 @@ def test_mixed_script_merges_en_fields_into_zh_clause():
91 default_language="en", 112 default_language="en",
92 ) 113 )
93 parsed_query = SimpleNamespace( 114 parsed_query = SimpleNamespace(
94 - query_text_by_lang={"zh": "法式 dress"},  
95 - search_langs=["zh"], 115 + rewritten_query="法式 dress",
96 detected_language="zh", 116 detected_language="zh",
97 - source_in_index_languages=True,  
98 - index_languages=["zh", "en"], 117 + translations={},
99 contains_chinese=True, 118 contains_chinese=True,
100 contains_english=True, 119 contains_english=True,
101 ) 120 )
102 - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) 121 + q = qb.build_query(
  122 + query_text="法式 dress",
  123 + parsed_query=parsed_query,
  124 + enable_knn=False,
  125 + index_languages=["zh", "en"],
  126 + )
103 fields = q["query"]["multi_match"]["fields"] 127 fields = q["query"]["multi_match"]["fields"]
104 bases = {f.split("^", 1)[0] for f in fields} 128 bases = {f.split("^", 1)[0] for f in fields}
105 assert "title.zh" in bases and "title.en" in bases 129 assert "title.zh" in bases and "title.en" in bases
106 assert "brief.zh" in bases and "brief.en" in bases 130 assert "brief.zh" in bases and "brief.en" in bases
107 - # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8)  
108 - assert "title.en^0.8" in fields  
109 - assert "brief.en^0.8" in fields 131 + # Merged supplemental language fields use boost * 0.6 by default.
  132 + assert "title.en^0.6" in fields
  133 + assert "brief.en^0.6" in fields
110 134
111 135
112 def test_mixed_script_merges_zh_fields_into_en_clause(): 136 def test_mixed_script_merges_zh_fields_into_en_clause():
@@ -118,19 +142,22 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): @@ -118,19 +142,22 @@ def test_mixed_script_merges_zh_fields_into_en_clause():
118 default_language="en", 142 default_language="en",
119 ) 143 )
120 parsed_query = SimpleNamespace( 144 parsed_query = SimpleNamespace(
121 - query_text_by_lang={"en": "red 连衣裙"},  
122 - search_langs=["en"], 145 + rewritten_query="red 连衣裙",
123 detected_language="en", 146 detected_language="en",
124 - source_in_index_languages=True,  
125 - index_languages=["zh", "en"], 147 + translations={},
126 contains_chinese=True, 148 contains_chinese=True,
127 contains_english=True, 149 contains_english=True,
128 ) 150 )
129 - q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False) 151 + q = qb.build_query(
  152 + query_text="red 连衣裙",
  153 + parsed_query=parsed_query,
  154 + enable_knn=False,
  155 + index_languages=["zh", "en"],
  156 + )
130 fields = q["query"]["multi_match"]["fields"] 157 fields = q["query"]["multi_match"]["fields"]
131 bases = {f.split("^", 1)[0] for f in fields} 158 bases = {f.split("^", 1)[0] for f in fields}
132 assert "title.en" in bases and "title.zh" in bases 159 assert "title.en" in bases and "title.zh" in bases
133 - assert "title.zh^0.8" in fields 160 + assert "title.zh^0.6" in fields
134 161
135 162
136 def test_mixed_script_merged_fields_scale_configured_boosts(): 163 def test_mixed_script_merged_fields_scale_configured_boosts():
@@ -143,18 +170,21 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): @@ -143,18 +170,21 @@ def test_mixed_script_merged_fields_scale_configured_boosts():
143 default_language="en", 170 default_language="en",
144 ) 171 )
145 parsed_query = SimpleNamespace( 172 parsed_query = SimpleNamespace(
146 - query_text_by_lang={"zh": "法式 dress"},  
147 - search_langs=["zh"], 173 + rewritten_query="法式 dress",
148 detected_language="zh", 174 detected_language="zh",
149 - source_in_index_languages=True,  
150 - index_languages=["zh", "en"], 175 + translations={},
151 contains_chinese=True, 176 contains_chinese=True,
152 contains_english=True, 177 contains_english=True,
153 ) 178 )
154 - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) 179 + q = qb.build_query(
  180 + query_text="法式 dress",
  181 + parsed_query=parsed_query,
  182 + enable_knn=False,
  183 + index_languages=["zh", "en"],
  184 + )
155 fields = q["query"]["multi_match"]["fields"] 185 fields = q["query"]["multi_match"]["fields"]
156 assert "title.zh^5.0" in fields 186 assert "title.zh^5.0" in fields
157 - assert "title.en^8.0" in fields # 10.0 * 0.8 187 + assert "title.en^6.0" in fields # 10.0 * 0.6
158 188
159 189
160 def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): 190 def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
@@ -166,15 +196,18 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): @@ -166,15 +196,18 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
166 default_language="zh", 196 default_language="zh",
167 ) 197 )
168 parsed_query = SimpleNamespace( 198 parsed_query = SimpleNamespace(
169 - query_text_by_lang={"zh": "法式 dress"},  
170 - search_langs=["zh"], 199 + rewritten_query="法式 dress",
171 detected_language="zh", 200 detected_language="zh",
172 - source_in_index_languages=True,  
173 - index_languages=["zh"], 201 + translations={},
174 contains_chinese=True, 202 contains_chinese=True,
175 contains_english=True, 203 contains_english=True,
176 ) 204 )
177 - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) 205 + q = qb.build_query(
  206 + query_text="法式 dress",
  207 + parsed_query=parsed_query,
  208 + enable_knn=False,
  209 + index_languages=["zh"],
  210 + )
178 fields = q["query"]["multi_match"]["fields"] 211 fields = q["query"]["multi_match"]["fields"]
179 bases = {f.split("^", 1)[0] for f in fields} 212 bases = {f.split("^", 1)[0] for f in fields}
180 assert "title.zh" in bases 213 assert "title.zh" in bases
tests/test_query_parser_mixed_language.py
1 -from types import SimpleNamespace  
2 -  
3 from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig 1 from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
4 from query.query_parser import QueryParser 2 from query.query_parser import QueryParser
5 3
@@ -9,6 +7,10 @@ class _DummyTranslator: @@ -9,6 +7,10 @@ class _DummyTranslator:
9 return f"{text}-{target_lang}" 7 return f"{text}-{target_lang}"
10 8
11 9
  10 +def _tokenizer(text):
  11 + return str(text).split()
  12 +
  13 +
12 def test_pure_english_word_token_length_and_script(): 14 def test_pure_english_word_token_length_and_script():
13 assert QueryParser._is_pure_english_word_token("ab") is False 15 assert QueryParser._is_pure_english_word_token("ab") is False
14 assert QueryParser._is_pure_english_word_token("abc") is True 16 assert QueryParser._is_pure_english_word_token("abc") is True
35 @@ -35,59 +37,57 @@ def _build_config() -> SearchConfig: @@ -35,59 +37,57 @@ def _build_config() -> SearchConfig:
35 37
36 38
37 def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): 39 def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch):
38 - parser = QueryParser(_build_config(), translator=_DummyTranslator()) 40 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
39 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") 41 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh")
40 - monkeypatch.setattr(  
41 - "query.query_parser.get_tenant_config_loader",  
42 - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}),  
43 - raising=False,  
44 - )  
45 42
46 - result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) 43 + result = parser.parse(
  44 + "法式 dress 连衣裙",
  45 + tenant_id="162",
  46 + generate_vector=False,
  47 + target_languages=["zh", "en"],
  48 + )
47 49
48 assert result.detected_language == "zh" 50 assert result.detected_language == "zh"
49 assert result.contains_chinese is True 51 assert result.contains_chinese is True
50 assert result.contains_english is True 52 assert result.contains_english is True
51 - assert "en" in result.search_langs  
52 - # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测)  
53 - assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en"  
54 - assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙" 53 + assert result.translations == {"en": "法式 dress 连衣裙-en"}
  54 + assert result.query_tokens == ["法式", "dress", "连衣裙"]
  55 + assert not hasattr(result, "query_text_by_lang")
  56 + assert not hasattr(result, "search_langs")
55 57
56 58
57 def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): 59 def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
58 - parser = QueryParser(_build_config(), translator=_DummyTranslator()) 60 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
59 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") 61 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
60 - monkeypatch.setattr(  
61 - "query.query_parser.get_tenant_config_loader",  
62 - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),  
63 - raising=False,  
64 - )  
65 62
66 - result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) 63 + result = parser.parse(
  64 + "red 连衣裙",
  65 + tenant_id="0",
  66 + generate_vector=False,
  67 + target_languages=["en", "zh"],
  68 + )
67 69
68 assert result.detected_language == "en" 70 assert result.detected_language == "en"
69 assert result.contains_chinese is True 71 assert result.contains_chinese is True
70 assert result.contains_english is True 72 assert result.contains_english is True
71 - assert "zh" in result.search_langs  
72 - assert result.query_text_by_lang["zh"] == "red 连衣裙-zh"  
73 - assert result.query_text_by_lang["en"] == "red 连衣裙" 73 + assert result.translations == {"zh": "red 连衣裙-zh"}
  74 + assert result.query_tokens == ["red", "连衣裙"]
74 75
75 76
76 def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch): 77 def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
77 """en 在 index_languages 内时仍应等待并采纳 en->zh 翻译结果(与向量共用预算)。""" 78 """en 在 index_languages 内时仍应等待并采纳 en->zh 翻译结果(与向量共用预算)。"""
78 - parser = QueryParser(_build_config(), translator=_DummyTranslator()) 79 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
79 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") 80 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
80 - monkeypatch.setattr(  
81 - "query.query_parser.get_tenant_config_loader",  
82 - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),  
83 - raising=False,  
84 - )  
85 81
86 - result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False) 82 + result = parser.parse(
  83 + "off shoulder top",
  84 + tenant_id="0",
  85 + generate_vector=False,
  86 + target_languages=["en", "zh"],
  87 + )
87 88
88 assert result.detected_language == "en" 89 assert result.detected_language == "en"
89 assert result.contains_chinese is False 90 assert result.contains_chinese is False
90 assert result.contains_english is True 91 assert result.contains_english is True
91 assert result.translations.get("zh") == "off shoulder top-zh" 92 assert result.translations.get("zh") == "off shoulder top-zh"
92 - assert result.query_text_by_lang.get("zh") == "off shoulder top-zh"  
93 - assert result.source_in_index_languages is True 93 + assert not hasattr(result, "source_in_index_languages")
tests/test_search_rerank_window.py
@@ -43,7 +43,14 @@ class _FakeParsedQuery: @@ -43,7 +43,14 @@ class _FakeParsedQuery:
43 43
44 44
45 class _FakeQueryParser: 45 class _FakeQueryParser:
46 - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): 46 + def parse(
  47 + self,
  48 + query: str,
  49 + tenant_id: str,
  50 + generate_vector: bool,
  51 + context: Any,
  52 + target_languages: Any = None,
  53 + ):
47 return _FakeParsedQuery( 54 return _FakeParsedQuery(
48 original_query=query, 55 original_query=query,
49 query_normalized=query, 56 query_normalized=query,
@@ -191,6 +198,66 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path): @@ -191,6 +198,66 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path):
191 "field_boosts": {"title.en": 3.0}, 198 "field_boosts": {"title.en": 3.0},
192 "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}], 199 "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}],
193 "query_config": {"supported_languages": ["en"], "default_language": "en"}, 200 "query_config": {"supported_languages": ["en"], "default_language": "en"},
  201 + "services": {
  202 + "translation": {
  203 + "service_url": "http://localhost:6005",
  204 + "timeout_sec": 3.0,
  205 + "default_model": "dummy-model",
  206 + "default_scene": "general",
  207 + "cache": {
  208 + "ttl_seconds": 60,
  209 + "sliding_expiration": True,
  210 + },
  211 + "capabilities": {
  212 + "dummy-model": {
  213 + "enabled": True,
  214 + "backend": "llm",
  215 + "use_cache": True,
  216 + "model": "dummy-model",
  217 + "base_url": "http://localhost:6005/v1",
  218 + "timeout_sec": 3.0,
  219 + }
  220 + },
  221 + },
  222 + "embedding": {
  223 + "provider": "http",
  224 + "providers": {
  225 + "http": {
  226 + "text_base_url": "http://localhost:6005",
  227 + "image_base_url": "http://localhost:6008",
  228 + }
  229 + },
  230 + "backend": "tei",
  231 + "backends": {
  232 + "tei": {
  233 + "base_url": "http://localhost:8080",
  234 + "timeout_sec": 3.0,
  235 + "model_id": "dummy-embedding-model",
  236 + }
  237 + },
  238 + },
  239 + "rerank": {
  240 + "provider": "http",
  241 + "providers": {
  242 + "http": {
  243 + "base_url": "http://localhost:6007",
  244 + "service_url": "http://localhost:6007/rerank",
  245 + }
  246 + },
  247 + "backend": "bge",
  248 + "backends": {
  249 + "bge": {
  250 + "model_name": "dummy-rerank-model",
  251 + "device": "cpu",
  252 + "use_fp16": False,
  253 + "batch_size": 8,
  254 + "max_length": 128,
  255 + "cache_dir": "./model_cache",
  256 + "enable_warmup": False,
  257 + }
  258 + },
  259 + },
  260 + },
194 "spu_config": {"enabled": False}, 261 "spu_config": {"enabled": False},
195 "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []}, 262 "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []},
196 "rerank": {"rerank_window": 384}, 263 "rerank": {"rerank_window": 384},
@@ -354,7 +421,14 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch @@ -354,7 +421,14 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch
354 class _TranslatedQueryParser: 421 class _TranslatedQueryParser:
355 text_encoder = None 422 text_encoder = None
356 423
357 - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): 424 + def parse(
  425 + self,
  426 + query: str,
  427 + tenant_id: str,
  428 + generate_vector: bool,
  429 + context: Any,
  430 + target_languages: Any = None,
  431 + ):
358 return _FakeParsedQuery( 432 return _FakeParsedQuery(
359 original_query=query, 433 original_query=query,
360 query_normalized=query, 434 query_normalized=query,
@@ -407,15 +481,22 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc @@ -407,15 +481,22 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc
407 encoder = _FakeTextEncoder( 481 encoder = _FakeTextEncoder(
408 { 482 {
409 "linen summer dress": [0.8, 0.2], 483 "linen summer dress": [0.8, 0.2],
410 - "color:Red": [1.0, 0.0],  
411 - "color:Blue": [0.0, 1.0], 484 + "color:red": [1.0, 0.0],
  485 + "color:blue": [0.0, 1.0],
412 } 486 }
413 ) 487 )
414 488
415 class _EmbeddingQueryParser: 489 class _EmbeddingQueryParser:
416 text_encoder = encoder 490 text_encoder = encoder
417 491
418 - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): 492 + def parse(
  493 + self,
  494 + query: str,
  495 + tenant_id: str,
  496 + generate_vector: bool,
  497 + context: Any,
  498 + target_languages: Any = None,
  499 + ):
419 return _FakeParsedQuery( 500 return _FakeParsedQuery(
420 original_query=query, 501 original_query=query,
421 query_normalized=query, 502 query_normalized=query,