Commit ef5baa866ae0cf2c061a83a9fc2aead25b1d098e

Authored by tangwang
1 parent fb973d19

混杂语言处理

config/config.yaml
... ... @@ -120,7 +120,7 @@ query_config:
120 120 - skus
121 121  
122 122 # KNN boost配置(向量召回的boost值)
123   - knn_boost: 0.25 # Lower boost for embedding recall
  123 + knn_boost: 2.0 # Higher boost for embedding recall (raised from 0.25)
124 124  
125 125 # Function Score配置(ES层打分规则)
126 126 function_score:
... ... @@ -290,7 +290,7 @@ services:
290 290 engine: "vllm"
291 291 max_model_len: 160
292 292 tensor_parallel_size: 1
293   - gpu_memory_utilization: 0.36
  293 + gpu_memory_utilization: 0.20
294 294 dtype: "float16"
295 295 enable_prefix_caching: true
296 296 enforce_eager: false
... ...
docs/TODO-ES能力提升.md 0 → 100644
... ... @@ -0,0 +1,69 @@
  1 +ES 付费版本 or 定制开发(建议先看下付费版本价格)
  2 +ES定制开发:
  3 +RRF / retrievers
  4 +
  5 +Elastic 的订阅矩阵里明确列了这些相关能力:Retrievers: linear, rule, RRF, text similarity re-ranker,以及 Reciprocal Rank Fusion (RRF) for hybrid search。
  6 +
  7 +这类能力最有价值的点是:
  8 +它们把混合检索从“自己拼 DSL 和手搓打分”变成了官方支持的多阶段检索框架。重排:text similarity re-ranker / Elastic Rerank. text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。
  9 +
  10 +{
  11 + "retriever": {
  12 + "rrf": {
  13 + "retrievers": [
  14 + { "standard": { "query": { ... } } },
  15 + { "knn": { ... } }
  16 + ]
  17 + }
  18 + }
  19 +}
  20 +
  21 +
  22 +加reranker:
  23 +text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。
  24 +
  25 +{
  26 + "retriever": {
  27 + "text_similarity_reranker": {
  28 + "retriever": {
  29 + "rrf": { ... }
  30 + },
  31 + ...
  32 + }
  33 + }
  34 +}
  35 +
  36 +{
  37 + "retriever": {
  38 + "text_similarity_reranker": {
  39 + "retriever": {
  40 + "rrf": {
  41 + "retrievers": [
  42 + {
  43 + "standard": {
  44 + "query": {
  45 + "...": "..."
  46 + }
  47 + }
  48 + },
  49 + {
  50 + "knn": {
  51 + "...": "..."
  52 + }
  53 + }
  54 + ],
  55 + "rank_window_size": 100,
  56 + "rank_constant": 20
  57 + }
  58 + },
  59 + "field": "your_rerank_text_field",
  60 + "inference_text": "白色 oversized T-shirt",
  61 + "inference_id": ".rerank-v1-elasticsearch",
  62 + "rank_window_size": 50
  63 + }
  64 + },
  65 + "size": 20
  66 +}
  67 +
  68 +
  69 +
... ...
docs/TODO.txt
1 1  
  2 +把knn跟文本相关性的融合方式修改为 "rank": {"rrf": {} }需要licence,可以帮我修改源码支持吗?
  3 +
  4 + knn_boost: 2.0
  5 +
  6 +
  7 +{
  8 + "query": { ...全文检索... },
  9 + "knn": { ...向量检索... },
  10 + "rank": {
  11 + "rrf": {}
  12 + }
  13 +}
  14 +
  15 +
  16 +
  17 +
  18 +"image_embedding": {
  19 + "type": "nested",
  20 + "properties": {
  21 + "vector": {
  22 + "type": "dense_vector",
  23 + "dims": 1024,
  24 + "index": true,
  25 + "similarity": "dot_product",
  26 + "element_type": "bfloat16"
  27 + },
  28 + "url": {
  29 + "type": "text"
  30 + }
  31 + }
  32 +},
  33 +去掉 image_embedding_512
  34 +image_embedding改为,一个spu有多个sku向量,每个向量内部properties:
  35 +除了vector url还应该包括
  36 +"image_embedding": {
  37 + "type": "nested",
  38 + "properties": {
  39 + "vector": {
  40 + "type": "dense_vector",
  41 + "dims": 1024,
  42 + "index": true,
  43 + "similarity": "dot_product",
  44 + "element_type": "bfloat16"
  45 + },
  46 + "url": {
  47 + "type": "text"
  48 + }
  49 + }
  50 +},
  51 +
  52 +
  53 +
  54 +
  55 +
  56 +外部需求:
  57 +1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内
  58 +2. ES支持reranker pipeline?
  59 +
2 60  
3 61 @reranker/backends/qwen3_vllm.py 单次 generate 前有进程内锁,同一进程里不会并行多路 vLLM 推理,这个锁有必要吗?是否会影响性能?是否能够打开,使得性能更好?比如这个场景,我一次请求 400 条,分成每64个一个batch,基于我现在的gpu配置,可以再提高并发度吗?
4 62 测试了,让每个批次都并发地进行,耗时没有变化
... ... @@ -383,6 +441,8 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men
383 441  
384 442  
385 443 融合打分(已完成,2026-03)
  444 +
  445 +以下已经完成:
386 446 1. `fuse_scores_and_resort` 已改为乘法融合,并通过 `matched_queries` 提取:
387 447 - `base_query`
388 448 - `base_query_trans_*`
... ... @@ -397,7 +457,11 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men
397 457 - `docs/搜索API对接指南.md`
398 458 - `docs/Usage-Guide.md`
399 459  
400   -
  460 +未完成的:
  461 +(归一化、次序融合?还是乘法公式?)
  462 +RRF:先把多路召回稳妥融合
  463 +linear + minmax:让你能精调 knn 和文本的权重
  464 +reranker:对前面召回出来的 top-k 再做“最后一刀”
401 465  
402 466  
403 467  
... ...
docs/搜索API对接指南-01-搜索接口.md
... ... @@ -66,9 +66,11 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
66 66 | `min_score` | float | N | null | 最小相关性分数阈值 |
67 67 | `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) |
68 68 | `debug` | boolean | N | false | 是否返回调试信息 |
69   -| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`(默认开启)。开启后会先对 ES TopN(`rerank_window`)重排,再按分页截取;若 `from+size>1000`,则不重排,直接按分页从 ES 返回 |
70   -| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端配置 |
71   -| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}`;不传则使用服务端配置 |
  69 +| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`。当有效开启且 `from + size <= rerank_window` 时:ES 先取前 `rerank_window` 条,重排后再按 `from`/`size` 截取当前页;若 `from + size > rerank_window`,则**不进行**窗口内重排,直接按请求的 `from`/`size` 查询 ES(`rerank_window` 见 `config.yaml` 的 `rerank.rerank_window`,仓库示例默认 400) |
  70 +| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端 `rerank.rerank_query_template` |
  71 +| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}` 等占位符(由 `search/rerank_client.py` 按语言字段拼装);不传则使用服务端 `rerank.rerank_doc_template` |
  72 +
  73 +**与后端代码的对应关系**(便于联调):HTTP `POST /search/` 请求体由 `api/models.py` 的 `SearchRequest` 校验;路由 `api/routes/search.py` 将字段原样传入 `Searcher.search(...)`(含上述三个重排相关字段)。CLI `python main.py search` 目前未暴露这些参数,走配置默认值。
72 74 | `user_id` | string | N | null | 用户ID(用于个性化,预留) |
73 75 | `session_id` | string | N | null | 会话ID(用于分析,预留) |
74 76  
... ...
docs/相关性检索优化说明.md
... ... @@ -281,3 +281,24 @@ title.en: 2026 Korean-style High-waisted Slimming Corduroy Skirt with Slit, Mid-
281 281 Rerank score: 0.9643
282 282 title.en: Black Half-high Collar Base Shirt Women's Autumn and Winter fleece-lined Contrast Color Pure Desire Design Sense Horn Sleeve Ruffled Inner Top
283 283 title.zh: 黑色高领半高领女士秋冬内搭加绒拼色纯欲设计荷叶边袖内衬上衣
  284 +
  285 +
  286 +
  287 +qwen3-0.6b的严重badcase:
  288 +q=牛仔裤
  289 +
  290 +Rerank score: 0.0002
  291 +title.en: Wrangler Womens Cowboy Cut Slim Fit Jean Bleach
  292 +title.zh: Wrangler 女士牛仔裤 牛仔剪裁 紧身版型 漂白色
  293 +
  294 +Rerank score: 0.0168
  295 +title.en: Fleece Lined Tights Sheer Women - Fake Translucent Warm Pantyhose Leggings Sheer Thick Tights for Winter
  296 +title.zh: 加绒透肤女士连裤袜 - 仿透视保暖长筒袜 冬季厚款透肤连裤袜
  297 +
  298 +Rerank score: 0.1366
  299 +title.en: Dockers Men's Classic Fit Workday Khaki Smart 360 FLEX Pants (Standard and Big & Tall)
  300 +title.zh: Dockers 男士经典版型工作日卡其色智能360度弹力裤(标准码与加大码)
  301 +
  302 +Rerank score: 0.0981
  303 +title.en: Lazy One Pajama Shorts for Men, Men's Pajama Bottoms, Sleepwear
  304 +title.zh: 懒人男士睡裤,男式家居裤,睡眠服饰
... ...
query/query_parser.py
1 1 """
2 2 Query parser - main module for query processing.
3 3  
4   -Handles query rewriting, translation, and embedding generation.
  4 +Responsibilities are intentionally narrow:
  5 +- normalize and rewrite the incoming query
  6 +- detect language and tokenize with HanLP
  7 +- run translation and embedding requests concurrently
  8 +- return parser facts, not Elasticsearch language-planning data
5 9 """
6 10  
7   -from typing import Dict, List, Optional, Any, Union, Tuple
  11 +from dataclasses import dataclass, field
  12 +from typing import Any, Callable, Dict, List, Optional, Tuple
8 13 import numpy as np
9 14 import logging
10 15 import re
... ... @@ -26,7 +31,7 @@ except Exception: # pragma: no cover
26 31  
27 32 def simple_tokenize_query(text: str) -> List[str]:
28 33 """
29   - Lightweight tokenizer for suggestion length / analysis (aligned with QueryParser fallback).
  34 + Lightweight tokenizer for suggestion-side heuristics only.
30 35  
31 36 - Consecutive CJK characters form one token
32 37 - Latin / digit runs (with internal hyphens) form tokens
... ... @@ -37,63 +42,32 @@ def simple_tokenize_query(text: str) -&gt; List[str]:
37 42 return pattern.findall(text)
38 43  
39 44  
  45 +@dataclass(slots=True)
40 46 class ParsedQuery:
41   - """Container for parsed query results."""
42   -
43   - def __init__(
44   - self,
45   - original_query: str,
46   - query_normalized: str,
47   - rewritten_query: Optional[str] = None,
48   - detected_language: Optional[str] = None,
49   - translations: Dict[str, str] = None,
50   - query_vector: Optional[np.ndarray] = None,
51   - domain: str = "default",
52   - keywords: str = "",
53   - token_count: int = 0,
54   - query_tokens: Optional[List[str]] = None,
55   - query_text_by_lang: Optional[Dict[str, str]] = None,
56   - search_langs: Optional[List[str]] = None,
57   - index_languages: Optional[List[str]] = None,
58   - source_in_index_languages: bool = True,
59   - contains_chinese: bool = False,
60   - contains_english: bool = False,
61   - ):
62   - self.original_query = original_query
63   - self.query_normalized = query_normalized
64   - self.rewritten_query = rewritten_query or query_normalized
65   - self.detected_language = detected_language
66   - self.translations = translations or {}
67   - self.query_vector = query_vector
68   - self.domain = domain
69   - # Query analysis fields
70   - self.keywords = keywords
71   - self.token_count = token_count
72   - self.query_tokens = query_tokens or []
73   - self.query_text_by_lang = query_text_by_lang or {}
74   - self.search_langs = search_langs or []
75   - self.index_languages = index_languages or []
76   - self.source_in_index_languages = bool(source_in_index_languages)
77   - self.contains_chinese = bool(contains_chinese)
78   - self.contains_english = bool(contains_english)
  47 + """Container for query parser facts."""
  48 +
  49 + original_query: str
  50 + query_normalized: str
  51 + rewritten_query: str
  52 + detected_language: Optional[str] = None
  53 + translations: Dict[str, str] = field(default_factory=dict)
  54 + query_vector: Optional[np.ndarray] = None
  55 + query_tokens: List[str] = field(default_factory=list)
  56 + contains_chinese: bool = False
  57 + contains_english: bool = False
79 58  
80 59 def to_dict(self) -> Dict[str, Any]:
81 60 """Convert to dictionary representation."""
82   - result = {
  61 + return {
83 62 "original_query": self.original_query,
84 63 "query_normalized": self.query_normalized,
85 64 "rewritten_query": self.rewritten_query,
86 65 "detected_language": self.detected_language,
87 66 "translations": self.translations,
88   - "domain": self.domain
  67 + "query_tokens": self.query_tokens,
  68 + "contains_chinese": self.contains_chinese,
  69 + "contains_english": self.contains_english,
89 70 }
90   - result["query_text_by_lang"] = self.query_text_by_lang
91   - result["search_langs"] = self.search_langs
92   - result["index_languages"] = self.index_languages
93   - result["source_in_index_languages"] = self.source_in_index_languages
94   - result["contains_chinese"] = self.contains_chinese
95   - result["contains_english"] = self.contains_english
96   - return result
97 71  
98 72  
99 73 class QueryParser:
... ... @@ -102,7 +76,7 @@ class QueryParser:
102 76 1. Normalization
103 77 2. Query rewriting (brand/category mappings, synonyms)
104 78 3. Language detection
105   - 4. Translation to target languages
  79 + 4. Translation to caller-provided target languages
106 80 5. Text embedding generation (for semantic search)
107 81 """
108 82  
... ... @@ -110,7 +84,8 @@ class QueryParser:
110 84 self,
111 85 config: SearchConfig,
112 86 text_encoder: Optional[TextEmbeddingEncoder] = None,
113   - translator: Optional[Any] = None
  87 + translator: Optional[Any] = None,
  88 + tokenizer: Optional[Callable[[str], Any]] = None,
114 89 ):
115 90 """
116 91 Initialize query parser.
... ... @@ -128,23 +103,7 @@ class QueryParser:
128 103 self.normalizer = QueryNormalizer()
129 104 self.language_detector = LanguageDetector()
130 105 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary)
131   -
132   - # Optional HanLP components (heavy). If unavailable, fall back to a lightweight tokenizer.
133   - self._tok = None
134   - self._pos_tag = None
135   - if hanlp is not None:
136   - try:
137   - logger.info("Initializing HanLP components...")
138   - self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)
139   - self._tok.config.output_spans = True
140   - self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
141   - logger.info("HanLP components initialized")
142   - except Exception as e:
143   - logger.warning(f"HanLP init failed, falling back to simple tokenizer: {e}")
144   - self._tok = None
145   - self._pos_tag = None
146   - else:
147   - logger.info("HanLP not installed; using simple tokenizer")
  106 + self._tokenizer = tokenizer or self._build_tokenizer()
148 107  
149 108 # Eager initialization (startup-time failure visibility, no lazy init in request path)
150 109 if self.config.query_config.enable_text_embedding and self._text_encoder is None:
... ... @@ -170,6 +129,16 @@ class QueryParser:
170 129 """Return pre-initialized translator."""
171 130 return self._translator
172 131  
  132 + def _build_tokenizer(self) -> Callable[[str], Any]:
  133 + """Build the tokenizer used by query parsing. No fallback path by design."""
  134 + if hanlp is None:
  135 + raise RuntimeError("HanLP is required for QueryParser tokenization")
  136 + logger.info("Initializing HanLP tokenizer...")
  137 + tokenizer = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)
  138 + tokenizer.config.output_spans = True
  139 + logger.info("HanLP tokenizer initialized")
  140 + return tokenizer
  141 +
173 142 @staticmethod
174 143 def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str:
175 144 """Pick the translation capability for query-time translation (configurable)."""
... ... @@ -186,41 +155,46 @@ class QueryParser:
186 155 # By default this is `nllb-200-distilled-600m` (multi-lingual local model).
187 156 return config.query_config.default_translation_model
188 157  
189   - def _simple_tokenize(self, text: str) -> List[str]:
190   - return simple_tokenize_query(text)
191   -
192   - def _extract_keywords(self, query: str) -> str:
193   - """Extract keywords (nouns with length > 1) from query."""
194   - if self._tok is not None and self._pos_tag is not None:
195   - tok_result = self._tok(query)
196   - if not tok_result:
197   - return ""
198   - words = [x[0] for x in tok_result]
199   - pos_tags = self._pos_tag(words)
200   - keywords = []
201   - for word, pos in zip(words, pos_tags):
202   - if len(word) > 1 and isinstance(pos, str) and pos.startswith("N"):
203   - keywords.append(word)
204   - return " ".join(keywords)
205   -
206   - # Fallback: treat tokens with length > 1 as "keywords"
207   - tokens = self._simple_tokenize(query)
208   - keywords = [t for t in tokens if len(t) > 1]
209   - return " ".join(keywords)
210   -
211   - def _get_token_count(self, query: str) -> int:
212   - """Get token count (HanLP if available, otherwise simple)."""
213   - if self._tok is not None:
214   - tok_result = self._tok(query)
215   - return len(tok_result) if tok_result else 0
216   - return len(self._simple_tokenize(query))
  158 + @staticmethod
  159 + def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]:
  160 + normalized: List[str] = []
  161 + seen = set()
  162 + for language in languages or []:
  163 + token = str(language or "").strip().lower()
  164 + if not token or token in seen:
  165 + continue
  166 + seen.add(token)
  167 + normalized.append(token)
  168 + return normalized
  169 +
  170 + @staticmethod
  171 + def _extract_tokens(tokenizer_result: Any) -> List[str]:
  172 + """Normalize tokenizer output into a flat token string list."""
  173 + if not tokenizer_result:
  174 + return []
  175 + if isinstance(tokenizer_result, str):
  176 + token = tokenizer_result.strip()
  177 + return [token] if token else []
  178 +
  179 + tokens: List[str] = []
  180 + for item in tokenizer_result:
  181 + token: Optional[str] = None
  182 + if isinstance(item, str):
  183 + token = item
  184 + elif isinstance(item, (list, tuple)) and item:
  185 + token = str(item[0])
  186 + elif item is not None:
  187 + token = str(item)
  188 +
  189 + if token is None:
  190 + continue
  191 + token = token.strip()
  192 + if token:
  193 + tokens.append(token)
  194 + return tokens
217 195  
218 196 def _get_query_tokens(self, query: str) -> List[str]:
219   - """Get token list (HanLP if available, otherwise simple)."""
220   - if self._tok is not None:
221   - tok_result = self._tok(query)
222   - return [x[0] for x in tok_result] if tok_result else []
223   - return self._simple_tokenize(query)
  197 + return self._extract_tokens(self._tokenizer(query))
224 198  
225 199 @staticmethod
226 200 def _contains_cjk(text: str) -> bool:
... ... @@ -237,64 +211,24 @@ class QueryParser:
237 211 return False
238 212 return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token))
239 213  
240   - @staticmethod
241   - def _extract_latin_tokens(text: str) -> List[str]:
242   - """Extract latin word tokens from query text."""
243   - return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "")
244   -
245   - def _infer_supplemental_search_langs(
246   - self,
247   - query_text: str,
248   - detected_lang: str,
249   - index_langs: List[str],
250   - ) -> List[str]:
251   - """
252   - Infer extra languages to search when the query mixes scripts.
253   -
254   - Rules:
255   - - If any Chinese characters appear, include `zh` when available.
256   - - If the query contains meaningful latin tokens, include `en` when available.
257   - "Meaningful" means either:
258   - 1) at least 2 latin tokens with length >= 4, or
259   - 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars.
260   - """
261   - supplemental: List[str] = []
262   - normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs}
263   - normalized_detected = str(detected_lang or "").strip().lower()
264   - query_text = str(query_text or "")
265   -
266   - if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh":
267   - supplemental.append("zh")
268   -
269   - latin_tokens = self._extract_latin_tokens(query_text)
270   - significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4]
271   - latin_chars = sum(len(tok) for tok in latin_tokens)
272   - non_space_chars = len(re.sub(r"\s+", "", query_text))
273   - latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0
274   - has_meaningful_english = (
275   - len(significant_latin_tokens) >= 2 or
276   - (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2)
277   - )
278   -
279   - if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en":
280   - supplemental.append("en")
281   -
282   - return supplemental
283   -
284 214 def parse(
285 215 self,
286 216 query: str,
287 217 tenant_id: Optional[str] = None,
288 218 generate_vector: bool = True,
289   - context: Optional[Any] = None
  219 + context: Optional[Any] = None,
  220 + target_languages: Optional[List[str]] = None,
290 221 ) -> ParsedQuery:
291 222 """
292 223 Parse query through all processing stages.
293 224  
294 225 Args:
295 226 query: Raw query string
  227 + tenant_id: Deprecated and ignored by QueryParser. Kept temporarily
  228 + to avoid a wider refactor in this first step.
296 229 generate_vector: Whether to generate query embedding
297 230 context: Optional request context for tracking and logging
  231 + target_languages: Translation target languages decided by the caller
298 232  
299 233 Returns:
300 234 ParsedQuery object with all processing results
... ... @@ -325,15 +259,9 @@ class QueryParser:
325 259 if context:
326 260 context.store_intermediate_result('query_normalized', normalized)
327 261  
328   - # Extract domain if present (e.g., "brand:Nike" -> domain="brand", query="Nike")
329   - domain, query_text = self.normalizer.extract_domain_query(normalized)
330   - log_debug(f"Domain extraction | Domain: '{domain}', Query: '{query_text}'")
331   - if context:
332   - context.store_intermediate_result('extracted_domain', domain)
333   - context.store_intermediate_result('domain_query', query_text)
334   -
335 262 # Stage 2: Query rewriting
336   - rewritten = None
  263 + query_text = normalized
  264 + rewritten = normalized
337 265 if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists
338 266 rewritten = self.rewriter.rewrite(query_text)
339 267 if rewritten != query_text:
... ... @@ -351,43 +279,51 @@ class QueryParser:
351 279 log_info(f"Language detection | Detected language: {detected_lang}")
352 280 if context:
353 281 context.store_intermediate_result('detected_language', detected_lang)
  282 + # Stage 4: Query analysis (tokenization + script flags)
  283 + query_tokens = self._get_query_tokens(query_text)
  284 + contains_chinese = self._contains_cjk(query_text)
  285 + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)
  286 +
  287 + log_debug(
  288 + f"Query analysis | Query tokens: {query_tokens} | "
  289 + f"contains_chinese={contains_chinese} | contains_english={contains_english}"
  290 + )
  291 + if context:
  292 + context.store_intermediate_result('query_tokens', query_tokens)
  293 + context.store_intermediate_result('contains_chinese', contains_chinese)
  294 + context.store_intermediate_result('contains_english', contains_english)
354 295  
355   - # Stage 4: Translation — always submit to thread pool; results are collected together with
356   - # embedding in one wait() that uses a configurable budget (short vs long by source-in-index).
  296 + # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the
  297 + # caller decides translation targets and later search-field planning.
357 298 translations: Dict[str, str] = {}
358   - translation_futures: Dict[str, Any] = {}
359   - translation_executor: Optional[ThreadPoolExecutor] = None
360   - index_langs: List[str] = []
  299 + future_to_task: Dict[Any, Tuple[str, Optional[str]]] = {}
  300 + async_executor: Optional[ThreadPoolExecutor] = None
361 301 detected_norm = str(detected_lang or "").strip().lower()
  302 + normalized_targets = self._normalize_language_codes(target_languages)
  303 + translation_targets = [lang for lang in normalized_targets if lang != detected_norm]
  304 +
  305 + # Stage 6: Text embedding - async execution
  306 + query_vector = None
  307 + should_generate_embedding = (
  308 + generate_vector and
  309 + self.config.query_config.enable_text_embedding
  310 + )
  311 +
  312 + task_count = len(translation_targets) + (1 if should_generate_embedding else 0)
  313 + if task_count > 0:
  314 + async_executor = ThreadPoolExecutor(
  315 + max_workers=max(1, min(task_count, 4)),
  316 + thread_name_prefix="query-enrichment",
  317 + )
362 318  
363 319 try:
364   - # 根据租户配置的 index_languages 决定翻译目标语言
365   - from config.tenant_config_loader import get_tenant_config_loader
366   - tenant_loader = get_tenant_config_loader()
367   - tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default")
368   - raw_index_langs = tenant_cfg.get("index_languages") or []
369   - index_langs = []
370   - seen_langs = set()
371   - for lang in raw_index_langs:
372   - norm_lang = str(lang or "").strip().lower()
373   - if not norm_lang or norm_lang in seen_langs:
374   - continue
375   - seen_langs.add(norm_lang)
376   - index_langs.append(norm_lang)
377   -
378   - target_langs_for_translation = [lang for lang in index_langs if lang != detected_norm]
379   -
380   - if target_langs_for_translation:
381   - translation_executor = ThreadPoolExecutor(
382   - max_workers=max(1, min(len(target_langs_for_translation), 4)),
383   - thread_name_prefix="query-translation",
384   - )
385   - for lang in target_langs_for_translation:
  320 + if async_executor is not None:
  321 + for lang in translation_targets:
386 322 model_name = self._pick_query_translation_model(detected_lang, lang, self.config)
387 323 log_debug(
388 324 f"Submitting query translation | source={detected_lang} target={lang} model={model_name}"
389 325 )
390   - translation_futures[lang] = translation_executor.submit(
  326 + future = async_executor.submit(
391 327 self.translator.translate,
392 328 query_text,
393 329 lang,
... ... @@ -395,107 +331,61 @@ class QueryParser:
395 331 "ecommerce_search_query",
396 332 model_name,
397 333 )
398   -
399   - if context:
400   - context.store_intermediate_result('translations', translations)
401   - for lang, translation in translations.items():
402   - if translation:
403   - context.store_intermediate_result(f'translation_{lang}', translation)
404   -
  334 + future_to_task[future] = ("translation", lang)
  335 +
  336 + if should_generate_embedding:
  337 + if self.text_encoder is None:
  338 + raise RuntimeError("Text embedding is enabled but text encoder is not initialized")
  339 + log_debug("Submitting query vector generation")
  340 +
  341 + def _encode_query_vector() -> Optional[np.ndarray]:
  342 + arr = self.text_encoder.encode([query_text], priority=1)
  343 + if arr is None or len(arr) == 0:
  344 + return None
  345 + vec = arr[0]
  346 + if vec is None:
  347 + return None
  348 + return np.asarray(vec, dtype=np.float32)
  349 +
  350 + future = async_executor.submit(_encode_query_vector)
  351 + future_to_task[future] = ("embedding", None)
405 352 except Exception as e:
406   - error_msg = f"Translation failed | Error: {str(e)}"
  353 + error_msg = f"Async query enrichment submission failed | Error: {str(e)}"
407 354 log_info(error_msg)
408 355 if context:
409 356 context.add_warning(error_msg)
  357 + if async_executor is not None:
  358 + async_executor.shutdown(wait=False)
  359 + async_executor = None
  360 + future_to_task.clear()
410 361  
411   - # Stage 5: Query analysis (keywords, token count, query_tokens)
412   - keywords = self._extract_keywords(query_text)
413   - query_tokens = self._get_query_tokens(query_text)
414   - token_count = len(query_tokens)
415   - contains_chinese = self._contains_cjk(query_text)
416   - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)
417   -
418   - log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | "
419   - f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | "
420   - f"contains_english={contains_english}")
421   - if context:
422   - context.store_intermediate_result('keywords', keywords)
423   - context.store_intermediate_result('token_count', token_count)
424   - context.store_intermediate_result('query_tokens', query_tokens)
425   - context.store_intermediate_result('contains_chinese', contains_chinese)
426   - context.store_intermediate_result('contains_english', contains_english)
427   -
428   - # Stage 6: Text embedding (only for non-short queries) - async execution
429   - query_vector = None
430   - embedding_future = None
431   - should_generate_embedding = (
432   - generate_vector and
433   - self.config.query_config.enable_text_embedding and
434   - domain == "default"
435   - )
436   -
437   - encoding_executor = None
438   - if should_generate_embedding:
439   - try:
440   - if self.text_encoder is None:
441   - raise RuntimeError("Text embedding is enabled but text encoder is not initialized")
442   - log_debug("Starting query vector generation (async)")
443   - # Submit encoding task to thread pool for async execution
444   - encoding_executor = ThreadPoolExecutor(max_workers=1)
445   - def _encode_query_vector() -> Optional[np.ndarray]:
446   - arr = self.text_encoder.encode([query_text], priority=1)
447   - if arr is None or len(arr) == 0:
448   - return None
449   - vec = arr[0]
450   - return vec if isinstance(vec, np.ndarray) else None
451   - embedding_future = encoding_executor.submit(
452   - _encode_query_vector
453   - )
454   - except Exception as e:
455   - error_msg = f"Query vector generation task submission failed | Error: {str(e)}"
456   - log_info(error_msg)
457   - if context:
458   - context.add_warning(error_msg)
459   - encoding_executor = None
460   - embedding_future = None
461   -
462   - # Wait for translation + embedding concurrently; shared budget (ms) depends on whether
463   - # the detected language is in tenant index_languages.
  362 + # Wait for translation + embedding concurrently; shared budget depends on whether
  363 + # the detected language belongs to caller-provided target_languages.
464 364 qc = self.config.query_config
465   - source_in_index_for_budget = detected_norm in index_langs
  365 + source_in_target_languages = bool(normalized_targets) and detected_norm in normalized_targets
466 366 budget_ms = (
467 367 qc.translation_embedding_wait_budget_ms_source_in_index
468   - if source_in_index_for_budget
  368 + if source_in_target_languages
469 369 else qc.translation_embedding_wait_budget_ms_source_not_in_index
470 370 )
471 371 budget_sec = max(0.0, float(budget_ms) / 1000.0)
472 372  
473   - if translation_futures:
  373 + if translation_targets:
474 374 log_info(
475 375 f"Translation+embedding shared wait budget | budget_ms={budget_ms} | "
476   - f"source_in_index_languages={source_in_index_for_budget} | "
477   - f"translation_targets={list(translation_futures.keys())}"
  376 + f"source_in_target_languages={source_in_target_languages} | "
  377 + f"translation_targets={translation_targets}"
478 378 )
479 379  
480   - if translation_futures or embedding_future:
  380 + if future_to_task:
481 381 log_debug(
482 382 f"Waiting for async tasks (translation+embedding) | budget_ms={budget_ms} | "
483   - f"source_in_index_languages={source_in_index_for_budget}"
  383 + f"source_in_target_languages={source_in_target_languages}"
484 384 )
485 385  
486   - all_futures: List[Any] = []
487   - future_to_lang: Dict[Any, tuple] = {}
488   - for lang, future in translation_futures.items():
489   - all_futures.append(future)
490   - future_to_lang[future] = ("translation", lang)
491   -
492   - if embedding_future:
493   - all_futures.append(embedding_future)
494   - future_to_lang[embedding_future] = ("embedding", None)
495   -
496   - done, not_done = wait(all_futures, timeout=budget_sec)
  386 + done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec)
497 387 for future in done:
498   - task_type, lang = future_to_lang[future]
  388 + task_type, lang = future_to_task[future]
499 389 try:
500 390 result = future.result()
501 391 if task_type == "translation":
... ... @@ -528,7 +418,7 @@ class QueryParser:
528 418  
529 419 if not_done:
530 420 for future in not_done:
531   - task_type, lang = future_to_lang[future]
  421 + task_type, lang = future_to_task[future]
532 422 if task_type == "translation":
533 423 timeout_msg = (
534 424 f"Translation timeout (>{budget_ms}ms) | Language: {lang} | "
... ... @@ -542,68 +432,21 @@ class QueryParser:
542 432 if context:
543 433 context.add_warning(timeout_msg)
544 434  
545   - if encoding_executor:
546   - encoding_executor.shutdown(wait=False)
547   - if translation_executor:
548   - translation_executor.shutdown(wait=False)
  435 + if async_executor:
  436 + async_executor.shutdown(wait=False)
549 437  
550 438 if translations and context:
551 439 context.store_intermediate_result("translations", translations)
552   -
553   - # Build language-scoped query plan: source language + available translations
554   - query_text_by_lang: Dict[str, str] = {}
555   - if query_text:
556   - query_text_by_lang[detected_lang] = query_text
557   - for lang, translated_text in (translations or {}).items():
558   - if translated_text and str(translated_text).strip():
559   - query_text_by_lang[str(lang).strip().lower()] = str(translated_text)
560   -
561   - supplemental_search_langs = self._infer_supplemental_search_langs(
562   - query_text=query_text,
563   - detected_lang=detected_lang,
564   - index_langs=index_langs,
565   - )
566   - for lang in supplemental_search_langs:
567   - if lang not in query_text_by_lang and query_text:
568   - # Use the original mixed-script query as a robust fallback probe for that language field set.
569   - query_text_by_lang[lang] = query_text
570   -
571   - source_in_index_languages = detected_norm in index_langs
572   - ordered_search_langs: List[str] = []
573   - seen_order = set()
574   - if detected_lang in query_text_by_lang:
575   - ordered_search_langs.append(detected_lang)
576   - seen_order.add(detected_lang)
577   - for lang in index_langs:
578   - if lang in query_text_by_lang and lang not in seen_order:
579   - ordered_search_langs.append(lang)
580   - seen_order.add(lang)
581   - for lang in query_text_by_lang.keys():
582   - if lang not in seen_order:
583   - ordered_search_langs.append(lang)
584   - seen_order.add(lang)
585   -
586   - if context:
587   - context.store_intermediate_result("search_langs", ordered_search_langs)
588   - context.store_intermediate_result("query_text_by_lang", query_text_by_lang)
589   - context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs)
590 440  
591 441 # Build result
592 442 result = ParsedQuery(
593 443 original_query=query,
594 444 query_normalized=normalized,
595   - rewritten_query=rewritten,
  445 + rewritten_query=query_text,
596 446 detected_language=detected_lang,
597 447 translations=translations,
598 448 query_vector=query_vector,
599   - domain=domain,
600   - keywords=keywords,
601   - token_count=token_count,
602 449 query_tokens=query_tokens,
603   - query_text_by_lang=query_text_by_lang,
604   - search_langs=ordered_search_langs,
605   - index_languages=index_langs,
606   - source_in_index_languages=source_in_index_languages,
607 450 contains_chinese=contains_chinese,
608 451 contains_english=contains_english,
609 452 )
... ... @@ -611,14 +454,13 @@ class QueryParser:
611 454 if context and hasattr(context, 'logger'):
612 455 context.logger.info(
613 456 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | "
614   - f"Language: {detected_lang} | Domain: {domain} | "
615 457 f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}",
616 458 extra={'reqid': context.reqid, 'uid': context.uid}
617 459 )
618 460 else:
619 461 logger.info(
620 462 f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | "
621   - f"Language: {detected_lang} | Domain: {domain}"
  463 + f"Language: {detected_lang}"
622 464 )
623 465  
624 466 return result
... ...
search/es_query_builder.py
... ... @@ -159,7 +159,8 @@ class ESQueryBuilder:
159 159 knn_k: int = 50,
160 160 knn_num_candidates: int = 200,
161 161 min_score: Optional[float] = None,
162   - parsed_query: Optional[Any] = None
  162 + parsed_query: Optional[Any] = None,
  163 + index_languages: Optional[List[str]] = None,
163 164 ) -> Dict[str, Any]:
164 165 """
165 166 Build complete ES query with post_filter support for multi-select faceting.
... ... @@ -202,7 +203,11 @@ class ESQueryBuilder:
202 203 # Text recall (always include if query_text exists)
203 204 if query_text:
204 205 # Unified text query strategy
205   - text_query = self._build_advanced_text_query(query_text, parsed_query)
  206 + text_query = self._build_advanced_text_query(
  207 + query_text,
  208 + parsed_query,
  209 + index_languages=index_languages,
  210 + )
206 211 recall_clauses.append(text_query)
207 212  
208 213 # Embedding recall (KNN - separate from query, handled below)
... ... @@ -503,13 +508,31 @@ class ESQueryBuilder:
503 508 # Currently using unified embedding field
504 509 return self.text_embedding_field or "title_embedding"
505 510  
506   - def _build_advanced_text_query(self, query_text: str, parsed_query: Optional[Any] = None) -> Dict[str, Any]:
  511 + @staticmethod
  512 + def _normalize_language_list(languages: Optional[List[str]]) -> List[str]:
  513 + normalized: List[str] = []
  514 + seen = set()
  515 + for language in languages or []:
  516 + token = str(language or "").strip().lower()
  517 + if not token or token in seen:
  518 + continue
  519 + seen.add(token)
  520 + normalized.append(token)
  521 + return normalized
  522 +
  523 + def _build_advanced_text_query(
  524 + self,
  525 + query_text: str,
  526 + parsed_query: Optional[Any] = None,
  527 + *,
  528 + index_languages: Optional[List[str]] = None,
  529 + ) -> Dict[str, Any]:
507 530 """
508   - Build advanced text query using should clauses with primary and fallback lexical strategies.
  531 + Build advanced text query using base and translated lexical clauses.
509 532  
510 533 Unified implementation:
511 534 - base_query: source-language clause
512   - - translation queries: target-language clauses from search_langs/query_text_by_lang
  535 + - translation queries: target-language clauses from translations
513 536 - KNN query: added separately in build_query
514 537  
515 538 Args:
... ... @@ -520,55 +543,42 @@ class ESQueryBuilder:
520 543 ES bool query with should clauses
521 544 """
522 545 should_clauses = []
523   -
524   - # Get query analysis from parsed_query
525   - query_text_by_lang: Dict[str, str] = {}
526   - search_langs: List[str] = []
527 546 source_lang = self.default_language
528   - source_in_index_languages = True
529   - index_languages: List[str] = []
530   -
  547 + translations: Dict[str, str] = {}
531 548 contains_chinese = False
532 549 contains_english = False
  550 + normalized_index_languages = self._normalize_language_list(index_languages)
  551 +
533 552 if parsed_query:
534   - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {}
535   - search_langs = getattr(parsed_query, "search_langs", None) or []
536 553 detected_lang = getattr(parsed_query, "detected_language", None)
537 554 source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language
538   - source_in_index_languages = bool(
539   - getattr(parsed_query, "source_in_index_languages", True)
540   - )
541   - index_languages = getattr(parsed_query, "index_languages", None) or []
  555 + translations = getattr(parsed_query, "translations", None) or {}
542 556 contains_chinese = bool(getattr(parsed_query, "contains_chinese", False))
543 557 contains_english = bool(getattr(parsed_query, "contains_english", False))
544 558  
545   - if not query_text_by_lang:
546   - query_text_by_lang = {source_lang: query_text}
547   - if source_lang not in query_text_by_lang and query_text:
548   - query_text_by_lang[source_lang] = query_text
549   - if not search_langs:
550   - search_langs = list(query_text_by_lang.keys())
551   -
552   - # Base + translated clauses based on language plan.
553   - for lang in search_langs:
554   - lang_query = query_text_by_lang.get(lang)
555   - if not lang_query:
556   - continue
  559 + source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language
  560 + source_in_index_languages = (
  561 + True if not normalized_index_languages else source_lang in normalized_index_languages
  562 + )
  563 +
  564 + base_query_text = (
  565 + getattr(parsed_query, "rewritten_query", None) if parsed_query else None
  566 + ) or query_text
  567 +
  568 + def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None:
  569 + nonlocal should_clauses
557 570 all_specs, _ = self._build_match_field_specs(lang)
558 571 expanded_specs = self._expand_match_field_specs_for_mixed_script(
559 572 lang,
560 573 all_specs,
561 574 contains_chinese,
562 575 contains_english,
563   - index_languages,
  576 + normalized_index_languages,
564 577 )
565 578 match_fields = self._format_match_field_specs(expanded_specs)
566 579 if not match_fields:
567   - continue
568   -
569   - is_source = (lang == source_lang)
  580 + return
570 581 clause_boost = 1.0
571   - clause_name = "base_query" if is_source else f"base_query_trans_{lang}"
572 582 minimum_should_match = (
573 583 self.base_minimum_should_match if is_source else self.translation_minimum_should_match
574 584 )
... ... @@ -596,44 +606,17 @@ class ESQueryBuilder:
596 606 "multi_match": clause["multi_match"]
597 607 })
598 608  
599   - # Fallback: source language is not indexed and translation for some index languages is missing.
600   - # Use original query text on missing index-language fields with a low boost.
601   - if not source_in_index_languages and query_text and index_languages:
602   - normalized_index_langs: List[str] = []
603   - seen_langs = set()
604   - for lang in index_languages:
605   - norm_lang = str(lang or "").strip().lower()
606   - if not norm_lang or norm_lang in seen_langs:
607   - continue
608   - seen_langs.add(norm_lang)
609   - normalized_index_langs.append(norm_lang)
  609 + if base_query_text:
  610 + append_clause(source_lang, base_query_text, "base_query", True)
610 611  
611   - for lang in normalized_index_langs:
612   - if lang == source_lang:
613   - continue
614   - if lang in query_text_by_lang:
615   - continue
616   - fb_specs, _ = self._build_match_field_specs(lang)
617   - expanded_fb = self._expand_match_field_specs_for_mixed_script(
618   - lang,
619   - fb_specs,
620   - contains_chinese,
621   - contains_english,
622   - index_languages,
623   - )
624   - match_fields = self._format_match_field_specs(expanded_fb)
625   - if not match_fields:
626   - continue
627   - should_clauses.append({
628   - "multi_match": {
629   - "_name": f"fallback_original_query_{lang}",
630   - "query": query_text,
631   - "fields": match_fields,
632   - "minimum_should_match": self.translation_minimum_should_match,
633   - "tie_breaker": self.tie_breaker_base_query,
634   - "boost": self.original_query_fallback_boost_when_translation_missing,
635   - }
636   - })
  612 + for lang, translated_text in translations.items():
  613 + normalized_lang = str(lang or "").strip().lower()
  614 + normalized_text = str(translated_text or "").strip()
  615 + if not normalized_lang or not normalized_text:
  616 + continue
  617 + if normalized_lang == source_lang and normalized_text == base_query_text:
  618 + continue
  619 + append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False)
637 620  
638 621 # Fallback to a simple query when language fields cannot be resolved.
639 622 if not should_clauses:
... ...
search/searcher.py
... ... @@ -516,10 +516,19 @@ class Searcher:
516 516 range_filters: Range filters for numeric fields
517 517 facets: Facet configurations for faceted search
518 518 min_score: Minimum score threshold
519   - context: Request context for tracking (created if not provided)
  519 + context: Request context for tracking (required)
520 520 sort_by: Field name for sorting
521 521 sort_order: Sort order: 'asc' or 'desc'
522 522 debug: Enable debug information output
  523 + language: Response / field selection language hint (e.g. zh, en)
  524 + sku_filter_dimension: SKU grouping dimensions for per-SPU variant pick
  525 + enable_rerank: If None, use ``config.rerank.enabled``; if set, overrides
  526 + whether the rerank provider is invoked (subject to rerank window).
  527 + rerank_query_template: Override for rerank query text template; None uses
  528 + ``config.rerank.rerank_query_template`` (e.g. ``"{query}"``).
  529 + rerank_doc_template: Override for per-hit document text passed to rerank;
  530 + None uses ``config.rerank.rerank_doc_template``. Placeholders are
  531 + resolved in ``search/rerank_client.py``.
523 532  
524 533 Returns:
525 534 SearchResult object with formatted results
... ... @@ -592,7 +601,8 @@ class Searcher:
592 601 query,
593 602 tenant_id=tenant_id,
594 603 generate_vector=enable_embedding,
595   - context=context
  604 + context=context,
  605 + target_languages=index_langs if enable_translation else [],
596 606 )
597 607 # Store query analysis results in context
598 608 context.store_query_analysis(
... ... @@ -602,7 +612,7 @@ class Searcher:
602 612 detected_language=parsed_query.detected_language,
603 613 translations=parsed_query.translations,
604 614 query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None,
605   - domain=parsed_query.domain,
  615 + domain="default",
606 616 is_simple_query=True
607 617 )
608 618  
... ... @@ -610,7 +620,6 @@ class Searcher:
610 620 f"查询解析完成 | 原查询: '{parsed_query.original_query}' | "
611 621 f"重写后: '{parsed_query.rewritten_query}' | "
612 622 f"语言: {parsed_query.detected_language} | "
613   - f"域: {parsed_query.domain} | "
614 623 f"向量: {'是' if parsed_query.query_vector is not None else '否'}",
615 624 extra={'reqid': context.reqid, 'uid': context.uid}
616 625 )
... ... @@ -643,7 +652,8 @@ class Searcher:
643 652 from_=es_fetch_from,
644 653 enable_knn=enable_embedding and parsed_query.query_vector is not None,
645 654 min_score=min_score,
646   - parsed_query=parsed_query
  655 + parsed_query=parsed_query,
  656 + index_languages=index_langs,
647 657 )
648 658  
649 659 # Add facets for faceted search
... ... @@ -985,9 +995,6 @@ class Searcher:
985 995 "rewritten_query": context.query_analysis.rewritten_query,
986 996 "detected_language": context.query_analysis.detected_language,
987 997 "translations": context.query_analysis.translations,
988   - "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}),
989   - "search_langs": context.get_intermediate_result("search_langs", []),
990   - "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []),
991 998 "has_vector": context.query_analysis.query_vector is not None,
992 999 "is_simple_query": context.query_analysis.is_simple_query,
993 1000 "domain": context.query_analysis.domain
... ...
tests/test_embedding_pipeline.py
... ... @@ -73,6 +73,10 @@ class _FakeQueryEncoder:
73 73 return np.array([np.array([0.11, 0.22, 0.33], dtype=np.float32) for _ in sentences], dtype=object)
74 74  
75 75  
  76 +def _tokenizer(text):
  77 + return str(text).split()
  78 +
  79 +
76 80 class _FakeEmbeddingCache:
77 81 def __init__(self):
78 82 self.store: Dict[str, np.ndarray] = {}
... ... @@ -210,6 +214,7 @@ def test_query_parser_generates_query_vector_with_encoder():
210 214 config=_build_test_config(),
211 215 text_encoder=encoder,
212 216 translator=_FakeTranslator(),
  217 + tokenizer=_tokenizer,
213 218 )
214 219  
215 220 parsed = parser.parse("red dress", tenant_id="162", generate_vector=True)
... ... @@ -224,6 +229,7 @@ def test_query_parser_skips_query_vector_when_disabled():
224 229 config=_build_test_config(),
225 230 text_encoder=_FakeQueryEncoder(),
226 231 translator=_FakeTranslator(),
  232 + tokenizer=_tokenizer,
227 233 )
228 234  
229 235 parsed = parser.parse("red dress", tenant_id="162", generate_vector=False)
... ...
tests/test_es_query_builder.py
... ... @@ -65,21 +65,42 @@ def test_knn_prefilter_not_added_without_filters():
65 65 assert q["knn"]["_name"] == "knn_query"
66 66  
67 67  
68   -def test_text_query_contains_only_base_translation_and_fallback_named_queries():
  68 +def test_text_query_contains_only_base_and_translation_named_queries():
69 69 qb = _builder()
70 70 parsed_query = SimpleNamespace(
71   - query_text_by_lang={"en": "dress", "zh": "连衣裙"},
72   - search_langs=["en", "zh"],
  71 + rewritten_query="dress",
73 72 detected_language="en",
74   - source_in_index_languages=False,
75   - index_languages=["en", "zh", "fr"],
  73 + translations={"en": "dress", "zh": "连衣裙"},
76 74 )
77 75  
78   - q = qb.build_query(query_text="dress", parsed_query=parsed_query, enable_knn=False)
  76 + q = qb.build_query(
  77 + query_text="dress",
  78 + parsed_query=parsed_query,
  79 + enable_knn=False,
  80 + index_languages=["en", "zh", "fr"],
  81 + )
79 82 should = q["query"]["bool"]["should"]
80 83 names = [clause["multi_match"]["_name"] for clause in should]
81 84  
82   - assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"]
  85 + assert names == ["base_query", "base_query_trans_zh"]
  86 +
  87 +
  88 +def test_text_query_skips_duplicate_translation_same_as_base():
  89 + qb = _builder()
  90 + parsed_query = SimpleNamespace(
  91 + rewritten_query="dress",
  92 + detected_language="en",
  93 + translations={"en": "dress"},
  94 + )
  95 +
  96 + q = qb.build_query(
  97 + query_text="dress",
  98 + parsed_query=parsed_query,
  99 + enable_knn=False,
  100 + index_languages=["en", "zh"],
  101 + )
  102 +
  103 + assert q["query"]["multi_match"]["_name"] == "base_query"
83 104  
84 105  
85 106 def test_mixed_script_merges_en_fields_into_zh_clause():
... ... @@ -91,22 +112,25 @@ def test_mixed_script_merges_en_fields_into_zh_clause():
91 112 default_language="en",
92 113 )
93 114 parsed_query = SimpleNamespace(
94   - query_text_by_lang={"zh": "法式 dress"},
95   - search_langs=["zh"],
  115 + rewritten_query="法式 dress",
96 116 detected_language="zh",
97   - source_in_index_languages=True,
98   - index_languages=["zh", "en"],
  117 + translations={},
99 118 contains_chinese=True,
100 119 contains_english=True,
101 120 )
102   - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  121 + q = qb.build_query(
  122 + query_text="法式 dress",
  123 + parsed_query=parsed_query,
  124 + enable_knn=False,
  125 + index_languages=["zh", "en"],
  126 + )
103 127 fields = q["query"]["multi_match"]["fields"]
104 128 bases = {f.split("^", 1)[0] for f in fields}
105 129 assert "title.zh" in bases and "title.en" in bases
106 130 assert "brief.zh" in bases and "brief.en" in bases
107   - # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8)
108   - assert "title.en^0.8" in fields
109   - assert "brief.en^0.8" in fields
  131 + # Merged supplemental language fields use boost * 0.6 by default.
  132 + assert "title.en^0.6" in fields
  133 + assert "brief.en^0.6" in fields
110 134  
111 135  
112 136 def test_mixed_script_merges_zh_fields_into_en_clause():
... ... @@ -118,19 +142,22 @@ def test_mixed_script_merges_zh_fields_into_en_clause():
118 142 default_language="en",
119 143 )
120 144 parsed_query = SimpleNamespace(
121   - query_text_by_lang={"en": "red 连衣裙"},
122   - search_langs=["en"],
  145 + rewritten_query="red 连衣裙",
123 146 detected_language="en",
124   - source_in_index_languages=True,
125   - index_languages=["zh", "en"],
  147 + translations={},
126 148 contains_chinese=True,
127 149 contains_english=True,
128 150 )
129   - q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False)
  151 + q = qb.build_query(
  152 + query_text="red 连衣裙",
  153 + parsed_query=parsed_query,
  154 + enable_knn=False,
  155 + index_languages=["zh", "en"],
  156 + )
130 157 fields = q["query"]["multi_match"]["fields"]
131 158 bases = {f.split("^", 1)[0] for f in fields}
132 159 assert "title.en" in bases and "title.zh" in bases
133   - assert "title.zh^0.8" in fields
  160 + assert "title.zh^0.6" in fields
134 161  
135 162  
136 163 def test_mixed_script_merged_fields_scale_configured_boosts():
... ... @@ -143,18 +170,21 @@ def test_mixed_script_merged_fields_scale_configured_boosts():
143 170 default_language="en",
144 171 )
145 172 parsed_query = SimpleNamespace(
146   - query_text_by_lang={"zh": "法式 dress"},
147   - search_langs=["zh"],
  173 + rewritten_query="法式 dress",
148 174 detected_language="zh",
149   - source_in_index_languages=True,
150   - index_languages=["zh", "en"],
  175 + translations={},
151 176 contains_chinese=True,
152 177 contains_english=True,
153 178 )
154   - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  179 + q = qb.build_query(
  180 + query_text="法式 dress",
  181 + parsed_query=parsed_query,
  182 + enable_knn=False,
  183 + index_languages=["zh", "en"],
  184 + )
155 185 fields = q["query"]["multi_match"]["fields"]
156 186 assert "title.zh^5.0" in fields
157   - assert "title.en^8.0" in fields # 10.0 * 0.8
  187 + assert "title.en^6.0" in fields # 10.0 * 0.6
158 188  
159 189  
160 190 def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
... ... @@ -166,15 +196,18 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
166 196 default_language="zh",
167 197 )
168 198 parsed_query = SimpleNamespace(
169   - query_text_by_lang={"zh": "法式 dress"},
170   - search_langs=["zh"],
  199 + rewritten_query="法式 dress",
171 200 detected_language="zh",
172   - source_in_index_languages=True,
173   - index_languages=["zh"],
  201 + translations={},
174 202 contains_chinese=True,
175 203 contains_english=True,
176 204 )
177   - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  205 + q = qb.build_query(
  206 + query_text="法式 dress",
  207 + parsed_query=parsed_query,
  208 + enable_knn=False,
  209 + index_languages=["zh"],
  210 + )
178 211 fields = q["query"]["multi_match"]["fields"]
179 212 bases = {f.split("^", 1)[0] for f in fields}
180 213 assert "title.zh" in bases
... ...
tests/test_query_parser_mixed_language.py
1   -from types import SimpleNamespace
2   -
3 1 from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
4 2 from query.query_parser import QueryParser
5 3  
... ... @@ -9,6 +7,10 @@ class _DummyTranslator:
9 7 return f"{text}-{target_lang}"
10 8  
11 9  
  10 +def _tokenizer(text):
  11 + return str(text).split()
  12 +
  13 +
12 14 def test_pure_english_word_token_length_and_script():
13 15 assert QueryParser._is_pure_english_word_token("ab") is False
14 16 assert QueryParser._is_pure_english_word_token("abc") is True
... ... @@ -35,59 +37,57 @@ def _build_config() -&gt; SearchConfig:
35 37  
36 38  
37 39 def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch):
38   - parser = QueryParser(_build_config(), translator=_DummyTranslator())
  40 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
39 41 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh")
40   - monkeypatch.setattr(
41   - "query.query_parser.get_tenant_config_loader",
42   - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}),
43   - raising=False,
44   - )
45 42  
46   - result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False)
  43 + result = parser.parse(
  44 + "法式 dress 连衣裙",
  45 + tenant_id="162",
  46 + generate_vector=False,
  47 + target_languages=["zh", "en"],
  48 + )
47 49  
48 50 assert result.detected_language == "zh"
49 51 assert result.contains_chinese is True
50 52 assert result.contains_english is True
51   - assert "en" in result.search_langs
52   - # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测)
53   - assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en"
54   - assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙"
  53 + assert result.translations == {"en": "法式 dress 连衣裙-en"}
  54 + assert result.query_tokens == ["法式", "dress", "连衣裙"]
  55 + assert not hasattr(result, "query_text_by_lang")
  56 + assert not hasattr(result, "search_langs")
55 57  
56 58  
57 59 def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
58   - parser = QueryParser(_build_config(), translator=_DummyTranslator())
  60 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
59 61 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
60   - monkeypatch.setattr(
61   - "query.query_parser.get_tenant_config_loader",
62   - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),
63   - raising=False,
64   - )
65 62  
66   - result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False)
  63 + result = parser.parse(
  64 + "red 连衣裙",
  65 + tenant_id="0",
  66 + generate_vector=False,
  67 + target_languages=["en", "zh"],
  68 + )
67 69  
68 70 assert result.detected_language == "en"
69 71 assert result.contains_chinese is True
70 72 assert result.contains_english is True
71   - assert "zh" in result.search_langs
72   - assert result.query_text_by_lang["zh"] == "red 连衣裙-zh"
73   - assert result.query_text_by_lang["en"] == "red 连衣裙"
  73 + assert result.translations == {"zh": "red 连衣裙-zh"}
  74 + assert result.query_tokens == ["red", "连衣裙"]
74 75  
75 76  
76 77 def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
77 78 """en 在 index_languages 内时仍应等待并采纳 en->zh 翻译结果(与向量共用预算)。"""
78   - parser = QueryParser(_build_config(), translator=_DummyTranslator())
  79 + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
79 80 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
80   - monkeypatch.setattr(
81   - "query.query_parser.get_tenant_config_loader",
82   - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),
83   - raising=False,
84   - )
85 81  
86   - result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False)
  82 + result = parser.parse(
  83 + "off shoulder top",
  84 + tenant_id="0",
  85 + generate_vector=False,
  86 + target_languages=["en", "zh"],
  87 + )
87 88  
88 89 assert result.detected_language == "en"
89 90 assert result.contains_chinese is False
90 91 assert result.contains_english is True
91 92 assert result.translations.get("zh") == "off shoulder top-zh"
92   - assert result.query_text_by_lang.get("zh") == "off shoulder top-zh"
93   - assert result.source_in_index_languages is True
  93 + assert not hasattr(result, "source_in_index_languages")
... ...
tests/test_search_rerank_window.py
... ... @@ -43,7 +43,14 @@ class _FakeParsedQuery:
43 43  
44 44  
45 45 class _FakeQueryParser:
46   - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any):
  46 + def parse(
  47 + self,
  48 + query: str,
  49 + tenant_id: str,
  50 + generate_vector: bool,
  51 + context: Any,
  52 + target_languages: Any = None,
  53 + ):
47 54 return _FakeParsedQuery(
48 55 original_query=query,
49 56 query_normalized=query,
... ... @@ -191,6 +198,66 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path):
191 198 "field_boosts": {"title.en": 3.0},
192 199 "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}],
193 200 "query_config": {"supported_languages": ["en"], "default_language": "en"},
  201 + "services": {
  202 + "translation": {
  203 + "service_url": "http://localhost:6005",
  204 + "timeout_sec": 3.0,
  205 + "default_model": "dummy-model",
  206 + "default_scene": "general",
  207 + "cache": {
  208 + "ttl_seconds": 60,
  209 + "sliding_expiration": True,
  210 + },
  211 + "capabilities": {
  212 + "dummy-model": {
  213 + "enabled": True,
  214 + "backend": "llm",
  215 + "use_cache": True,
  216 + "model": "dummy-model",
  217 + "base_url": "http://localhost:6005/v1",
  218 + "timeout_sec": 3.0,
  219 + }
  220 + },
  221 + },
  222 + "embedding": {
  223 + "provider": "http",
  224 + "providers": {
  225 + "http": {
  226 + "text_base_url": "http://localhost:6005",
  227 + "image_base_url": "http://localhost:6008",
  228 + }
  229 + },
  230 + "backend": "tei",
  231 + "backends": {
  232 + "tei": {
  233 + "base_url": "http://localhost:8080",
  234 + "timeout_sec": 3.0,
  235 + "model_id": "dummy-embedding-model",
  236 + }
  237 + },
  238 + },
  239 + "rerank": {
  240 + "provider": "http",
  241 + "providers": {
  242 + "http": {
  243 + "base_url": "http://localhost:6007",
  244 + "service_url": "http://localhost:6007/rerank",
  245 + }
  246 + },
  247 + "backend": "bge",
  248 + "backends": {
  249 + "bge": {
  250 + "model_name": "dummy-rerank-model",
  251 + "device": "cpu",
  252 + "use_fp16": False,
  253 + "batch_size": 8,
  254 + "max_length": 128,
  255 + "cache_dir": "./model_cache",
  256 + "enable_warmup": False,
  257 + }
  258 + },
  259 + },
  260 + },
194 261 "spu_config": {"enabled": False},
195 262 "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []},
196 263 "rerank": {"rerank_window": 384},
... ... @@ -354,7 +421,14 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch
354 421 class _TranslatedQueryParser:
355 422 text_encoder = None
356 423  
357   - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any):
  424 + def parse(
  425 + self,
  426 + query: str,
  427 + tenant_id: str,
  428 + generate_vector: bool,
  429 + context: Any,
  430 + target_languages: Any = None,
  431 + ):
358 432 return _FakeParsedQuery(
359 433 original_query=query,
360 434 query_normalized=query,
... ... @@ -407,15 +481,22 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc
407 481 encoder = _FakeTextEncoder(
408 482 {
409 483 "linen summer dress": [0.8, 0.2],
410   - "color:Red": [1.0, 0.0],
411   - "color:Blue": [0.0, 1.0],
  484 + "color:red": [1.0, 0.0],
  485 + "color:blue": [0.0, 1.0],
412 486 }
413 487 )
414 488  
415 489 class _EmbeddingQueryParser:
416 490 text_encoder = encoder
417 491  
418   - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any):
  492 + def parse(
  493 + self,
  494 + query: str,
  495 + tenant_id: str,
  496 + generate_vector: bool,
  497 + context: Any,
  498 + target_languages: Any = None,
  499 + ):
419 500 return _FakeParsedQuery(
420 501 original_query=query,
421 502 query_normalized=query,
... ...