Commit ef5baa866ae0cf2c061a83a9fc2aead25b1d098e
1 parent
fb973d19
混杂语言处理
Showing
12 changed files
with
579 additions
and
471 deletions
Show diff stats
config/config.yaml
| ... | ... | @@ -120,7 +120,7 @@ query_config: |
| 120 | 120 | - skus |
| 121 | 121 | |
| 122 | 122 | # KNN boost配置(向量召回的boost值) |
| 123 | - knn_boost: 0.25 # Lower boost for embedding recall | |
| 123 | + knn_boost: 2.0 # Higher boost for embedding recall | |
| 124 | 124 | |
| 125 | 125 | # Function Score配置(ES层打分规则) |
| 126 | 126 | function_score: |
| ... | ... | @@ -290,7 +290,7 @@ services: |
| 290 | 290 | engine: "vllm" |
| 291 | 291 | max_model_len: 160 |
| 292 | 292 | tensor_parallel_size: 1 |
| 293 | - gpu_memory_utilization: 0.36 | |
| 293 | + gpu_memory_utilization: 0.20 | |
| 294 | 294 | dtype: "float16" |
| 295 | 295 | enable_prefix_caching: true |
| 296 | 296 | enforce_eager: false | ... | ... |
| ... | ... | @@ -0,0 +1,69 @@ |
| 1 | +ES 付费版本 or 定制开发(建议先看下付费版本价格) | |
| 2 | +ES定制开发: | |
| 3 | +RRF / retrievers | |
| 4 | + | |
| 5 | +Elastic 的订阅矩阵里明确列了这些相关能力:Retrievers: linear, rule, RRF, text similarity re-ranker,以及 Reciprocal Rank Fusion (RRF) for hybrid search。 | |
| 6 | + | |
| 7 | +这类能力最有价值的点是: | |
| 8 | +它们把混合检索从“自己拼 DSL 和手搓打分”变成了官方支持的多阶段检索框架。重排:text similarity re-ranker / Elastic Rerank. text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。 | |
| 9 | + | |
| 10 | +{ | |
| 11 | + "retriever": { | |
| 12 | + "rrf": { | |
| 13 | + "retrievers": [ | |
| 14 | + { "standard": { "query": { ... } } }, | |
| 15 | + { "knn": { ... } } | |
| 16 | + ] | |
| 17 | + } | |
| 18 | + } | |
| 19 | +} | |
| 20 | + | |
| 21 | + | |
| 22 | +加reranker: | |
| 23 | +text_similarity_reranker 用 NLP 模型对 top-k 结果按语义相似度重新排序;它可以用内置的 Elastic Rerank,也可以接 Cohere、Vertex AI,或者你自己上传的 text similarity 模型。 | |
| 24 | + | |
| 25 | +{ | |
| 26 | + "retriever": { | |
| 27 | + "text_similarity_reranker": { | |
| 28 | + "retriever": { | |
| 29 | + "rrf": { ... } | |
| 30 | + }, | |
| 31 | + ... | |
| 32 | + } | |
| 33 | + } | |
| 34 | +} | |
| 35 | + | |
| 36 | +{ | |
| 37 | + "retriever": { | |
| 38 | + "text_similarity_reranker": { | |
| 39 | + "retriever": { | |
| 40 | + "rrf": { | |
| 41 | + "retrievers": [ | |
| 42 | + { | |
| 43 | + "standard": { | |
| 44 | + "query": { | |
| 45 | + "...": "..." | |
| 46 | + } | |
| 47 | + } | |
| 48 | + }, | |
| 49 | + { | |
| 50 | + "knn": { | |
| 51 | + "...": "..." | |
| 52 | + } | |
| 53 | + } | |
| 54 | + ], | |
| 55 | + "rank_window_size": 100, | |
| 56 | + "rank_constant": 20 | |
| 57 | + } | |
| 58 | + }, | |
| 59 | + "field": "your_rerank_text_field", | |
| 60 | + "inference_text": "白色 oversized T-shirt", | |
| 61 | + "inference_id": ".rerank-v1-elasticsearch", | |
| 62 | + "rank_window_size": 50 | |
| 63 | + } | |
| 64 | + }, | |
| 65 | + "size": 20 | |
| 66 | +} | |
| 67 | + | |
| 68 | + | |
| 69 | + | ... | ... |
docs/TODO.txt
| 1 | 1 | |
| 2 | +把knn跟文本相关性的融合方式修改为 "rank": {"rrf": {} }需要license,可以帮我修改源码支持吗? | |
| 3 | + | |
| 4 | + knn_boost: 2.0 | |
| 5 | + | |
| 6 | + | |
| 7 | +{ | |
| 8 | + "query": { ...全文检索... }, | |
| 9 | + "knn": { ...向量检索... }, | |
| 10 | + "rank": { | |
| 11 | + "rrf": {} | |
| 12 | + } | |
| 13 | +} | |
| 14 | + | |
| 15 | + | |
| 16 | + | |
| 17 | + | |
| 18 | +"image_embedding": { | |
| 19 | + "type": "nested", | |
| 20 | + "properties": { | |
| 21 | + "vector": { | |
| 22 | + "type": "dense_vector", | |
| 23 | + "dims": 1024, | |
| 24 | + "index": true, | |
| 25 | + "similarity": "dot_product", | |
| 26 | + "element_type": "bfloat16" | |
| 27 | + }, | |
| 28 | + "url": { | |
| 29 | + "type": "text" | |
| 30 | + } | |
| 31 | + } | |
| 32 | +}, | |
| 33 | +去掉 image_embedding_512 | |
| 34 | +image_embedding改为,一个spu有多个sku向量,每个向量内部properties: | |
| 35 | +除了vector url还应该包括 | |
| 36 | +"image_embedding": { | |
| 37 | + "type": "nested", | |
| 38 | + "properties": { | |
| 39 | + "vector": { | |
| 40 | + "type": "dense_vector", | |
| 41 | + "dims": 1024, | |
| 42 | + "index": true, | |
| 43 | + "similarity": "dot_product", | |
| 44 | + "element_type": "bfloat16" | |
| 45 | + }, | |
| 46 | + "url": { | |
| 47 | + "type": "text" | |
| 48 | + } | |
| 49 | + } | |
| 50 | +}, | |
| 51 | + | |
| 52 | + | |
| 53 | + | |
| 54 | + | |
| 55 | + | |
| 56 | +外部需求: | |
| 57 | +1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 | |
| 58 | +2. ES支持reranker pipeline? | |
| 59 | + | |
| 2 | 60 | |
| 3 | 61 | @reranker/backends/qwen3_vllm.py 单次 generate 前有进程内锁,同一进程里不会并行多路 vLLM 推理,这个锁有必要吗?是否会影响性能?是否能够打开,使得性能更好?比如这个场景,我一次请求 400 条,分成每64个一个batch,基于我现在的gpu配置,可以再提高并发度吗? |
| 4 | 62 | 测试了,让每个批次都并发地进行,耗时没有变化 |
| ... | ... | @@ -383,6 +441,8 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men |
| 383 | 441 | |
| 384 | 442 | |
| 385 | 443 | 融合打分(已完成,2026-03) |
| 444 | + | |
| 445 | +以下已经完成: | |
| 386 | 446 | 1. `fuse_scores_and_resort` 已改为乘法融合,并通过 `matched_queries` 提取: |
| 387 | 447 | - `base_query` |
| 388 | 448 | - `base_query_trans_*` |
| ... | ... | @@ -397,7 +457,11 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men |
| 397 | 457 | - `docs/搜索API对接指南.md` |
| 398 | 458 | - `docs/Usage-Guide.md` |
| 399 | 459 | |
| 400 | - | |
| 460 | +未完成的: | |
| 461 | +(归一化、次序融合?还是乘法公式?) | |
| 462 | +RRF:先把多路召回稳妥融合 | |
| 463 | +linear + minmax:让你能精调 knn 和文本的权重 | |
| 464 | +reranker:对前面召回出来的 top-k 再做“最后一刀” | |
| 401 | 465 | |
| 402 | 466 | |
| 403 | 467 | ... | ... |
docs/搜索API对接指南-01-搜索接口.md
| ... | ... | @@ -66,9 +66,11 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"}) |
| 66 | 66 | | `min_score` | float | N | null | 最小相关性分数阈值 | |
| 67 | 67 | | `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) | |
| 68 | 68 | | `debug` | boolean | N | false | 是否返回调试信息 | |
| 69 | -| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`(默认开启)。开启后会先对 ES TopN(`rerank_window`)重排,再按分页截取;若 `from+size>1000`,则不重排,直接按分页从 ES 返回 | | |
| 70 | -| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端配置 | | |
| 71 | -| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}`;不传则使用服务端配置 | | |
| 69 | +| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`。当有效开启且 `from + size <= rerank_window` 时:ES 先取前 `rerank_window` 条,重排后再按 `from`/`size` 截取当前页;若 `from + size > rerank_window`,则**不进行**窗口内重排,直接按请求的 `from`/`size` 查询 ES(`rerank_window` 见 `config.yaml` 的 `rerank.rerank_window`,仓库示例默认 400) | | |
| 70 | +| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端 `rerank.rerank_query_template` | | |
| 71 | +| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}` 等占位符(由 `search/rerank_client.py` 按语言字段拼装);不传则使用服务端 `rerank.rerank_doc_template` | | |
| 72 | + | |
| 73 | +**与后端代码的对应关系**(便于联调):HTTP `POST /search/` 请求体由 `api/models.py` 的 `SearchRequest` 校验;路由 `api/routes/search.py` 将字段原样传入 `Searcher.search(...)`(含上述三个重排相关字段)。CLI `python main.py search` 目前未暴露这些参数,走配置默认值。 | |
| 72 | 74 | | `user_id` | string | N | null | 用户ID(用于个性化,预留) | |
| 73 | 75 | | `session_id` | string | N | null | 会话ID(用于分析,预留) | |
| 74 | 76 | ... | ... |
docs/相关性检索优化说明.md
| ... | ... | @@ -281,3 +281,24 @@ title.en: 2026 Korean-style High-waisted Slimming Corduroy Skirt with Slit, Mid- |
| 281 | 281 | Rerank score: 0.9643 |
| 282 | 282 | title.en: Black Half-high Collar Base Shirt Women's Autumn and Winter fleece-lined Contrast Color Pure Desire Design Sense Horn Sleeve Ruffled Inner Top |
| 283 | 283 | title.zh: 黑色高领半高领女士秋冬内搭加绒拼色纯欲设计荷叶边袖内衬上衣 |
| 284 | + | |
| 285 | + | |
| 286 | + | |
| 287 | +qwen3-0.6b的严重badcase: | |
| 288 | +q=牛仔裤 | |
| 289 | + | |
| 290 | +Rerank score: 0.0002 | |
| 291 | +title.en: Wrangler Womens Cowboy Cut Slim Fit Jean Bleach | |
| 292 | +title.zh: Wrangler 女士牛仔裤 牛仔剪裁 紧身版型 漂白色 | |
| 293 | + | |
| 294 | +Rerank score: 0.0168 | |
| 295 | +title.en: Fleece Lined Tights Sheer Women - Fake Translucent Warm Pantyhose Leggings Sheer Thick Tights for Winter | |
| 296 | +title.zh: 加绒透肤女士连裤袜 - 仿透视保暖长筒袜 冬季厚款透肤连裤袜 | |
| 297 | + | |
| 298 | +Rerank score: 0.1366 | |
| 299 | +title.en: Dockers Men's Classic Fit Workday Khaki Smart 360 FLEX Pants (Standard and Big & Tall) | |
| 300 | +title.zh: Dockers 男士经典版型工作日卡其色智能360度弹力裤(标准码与加大码) | |
| 301 | + | |
| 302 | +Rerank score: 0.0981 | |
| 303 | +title.en: Lazy One Pajama Shorts for Men, Men's Pajama Bottoms, Sleepwear | |
| 304 | +title.zh: 懒人男士睡裤,男式家居裤,睡眠服饰 | ... | ... |
query/query_parser.py
| 1 | 1 | """ |
| 2 | 2 | Query parser - main module for query processing. |
| 3 | 3 | |
| 4 | -Handles query rewriting, translation, and embedding generation. | |
| 4 | +Responsibilities are intentionally narrow: | |
| 5 | +- normalize and rewrite the incoming query | |
| 6 | +- detect language and tokenize with HanLP | |
| 7 | +- run translation and embedding requests concurrently | |
| 8 | +- return parser facts, not Elasticsearch language-planning data | |
| 5 | 9 | """ |
| 6 | 10 | |
| 7 | -from typing import Dict, List, Optional, Any, Union, Tuple | |
| 11 | +from dataclasses import dataclass, field | |
| 12 | +from typing import Any, Callable, Dict, List, Optional, Tuple | |
| 8 | 13 | import numpy as np |
| 9 | 14 | import logging |
| 10 | 15 | import re |
| ... | ... | @@ -26,7 +31,7 @@ except Exception: # pragma: no cover |
| 26 | 31 | |
| 27 | 32 | def simple_tokenize_query(text: str) -> List[str]: |
| 28 | 33 | """ |
| 29 | - Lightweight tokenizer for suggestion length / analysis (aligned with QueryParser fallback). | |
| 34 | + Lightweight tokenizer for suggestion-side heuristics only. | |
| 30 | 35 | |
| 31 | 36 | - Consecutive CJK characters form one token |
| 32 | 37 | - Latin / digit runs (with internal hyphens) form tokens |
| ... | ... | @@ -37,63 +42,32 @@ def simple_tokenize_query(text: str) -> List[str]: |
| 37 | 42 | return pattern.findall(text) |
| 38 | 43 | |
| 39 | 44 | |
| 45 | +@dataclass(slots=True) | |
| 40 | 46 | class ParsedQuery: |
| 41 | - """Container for parsed query results.""" | |
| 42 | - | |
| 43 | - def __init__( | |
| 44 | - self, | |
| 45 | - original_query: str, | |
| 46 | - query_normalized: str, | |
| 47 | - rewritten_query: Optional[str] = None, | |
| 48 | - detected_language: Optional[str] = None, | |
| 49 | - translations: Dict[str, str] = None, | |
| 50 | - query_vector: Optional[np.ndarray] = None, | |
| 51 | - domain: str = "default", | |
| 52 | - keywords: str = "", | |
| 53 | - token_count: int = 0, | |
| 54 | - query_tokens: Optional[List[str]] = None, | |
| 55 | - query_text_by_lang: Optional[Dict[str, str]] = None, | |
| 56 | - search_langs: Optional[List[str]] = None, | |
| 57 | - index_languages: Optional[List[str]] = None, | |
| 58 | - source_in_index_languages: bool = True, | |
| 59 | - contains_chinese: bool = False, | |
| 60 | - contains_english: bool = False, | |
| 61 | - ): | |
| 62 | - self.original_query = original_query | |
| 63 | - self.query_normalized = query_normalized | |
| 64 | - self.rewritten_query = rewritten_query or query_normalized | |
| 65 | - self.detected_language = detected_language | |
| 66 | - self.translations = translations or {} | |
| 67 | - self.query_vector = query_vector | |
| 68 | - self.domain = domain | |
| 69 | - # Query analysis fields | |
| 70 | - self.keywords = keywords | |
| 71 | - self.token_count = token_count | |
| 72 | - self.query_tokens = query_tokens or [] | |
| 73 | - self.query_text_by_lang = query_text_by_lang or {} | |
| 74 | - self.search_langs = search_langs or [] | |
| 75 | - self.index_languages = index_languages or [] | |
| 76 | - self.source_in_index_languages = bool(source_in_index_languages) | |
| 77 | - self.contains_chinese = bool(contains_chinese) | |
| 78 | - self.contains_english = bool(contains_english) | |
| 47 | + """Container for query parser facts.""" | |
| 48 | + | |
| 49 | + original_query: str | |
| 50 | + query_normalized: str | |
| 51 | + rewritten_query: str | |
| 52 | + detected_language: Optional[str] = None | |
| 53 | + translations: Dict[str, str] = field(default_factory=dict) | |
| 54 | + query_vector: Optional[np.ndarray] = None | |
| 55 | + query_tokens: List[str] = field(default_factory=list) | |
| 56 | + contains_chinese: bool = False | |
| 57 | + contains_english: bool = False | |
| 79 | 58 | |
| 80 | 59 | def to_dict(self) -> Dict[str, Any]: |
| 81 | 60 | """Convert to dictionary representation.""" |
| 82 | - result = { | |
| 61 | + return { | |
| 83 | 62 | "original_query": self.original_query, |
| 84 | 63 | "query_normalized": self.query_normalized, |
| 85 | 64 | "rewritten_query": self.rewritten_query, |
| 86 | 65 | "detected_language": self.detected_language, |
| 87 | 66 | "translations": self.translations, |
| 88 | - "domain": self.domain | |
| 67 | + "query_tokens": self.query_tokens, | |
| 68 | + "contains_chinese": self.contains_chinese, | |
| 69 | + "contains_english": self.contains_english, | |
| 89 | 70 | } |
| 90 | - result["query_text_by_lang"] = self.query_text_by_lang | |
| 91 | - result["search_langs"] = self.search_langs | |
| 92 | - result["index_languages"] = self.index_languages | |
| 93 | - result["source_in_index_languages"] = self.source_in_index_languages | |
| 94 | - result["contains_chinese"] = self.contains_chinese | |
| 95 | - result["contains_english"] = self.contains_english | |
| 96 | - return result | |
| 97 | 71 | |
| 98 | 72 | |
| 99 | 73 | class QueryParser: |
| ... | ... | @@ -102,7 +76,7 @@ class QueryParser: |
| 102 | 76 | 1. Normalization |
| 103 | 77 | 2. Query rewriting (brand/category mappings, synonyms) |
| 104 | 78 | 3. Language detection |
| 105 | - 4. Translation to target languages | |
| 79 | + 4. Translation to caller-provided target languages | |
| 106 | 80 | 5. Text embedding generation (for semantic search) |
| 107 | 81 | """ |
| 108 | 82 | |
| ... | ... | @@ -110,7 +84,8 @@ class QueryParser: |
| 110 | 84 | self, |
| 111 | 85 | config: SearchConfig, |
| 112 | 86 | text_encoder: Optional[TextEmbeddingEncoder] = None, |
| 113 | - translator: Optional[Any] = None | |
| 87 | + translator: Optional[Any] = None, | |
| 88 | + tokenizer: Optional[Callable[[str], Any]] = None, | |
| 114 | 89 | ): |
| 115 | 90 | """ |
| 116 | 91 | Initialize query parser. |
| ... | ... | @@ -128,23 +103,7 @@ class QueryParser: |
| 128 | 103 | self.normalizer = QueryNormalizer() |
| 129 | 104 | self.language_detector = LanguageDetector() |
| 130 | 105 | self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) |
| 131 | - | |
| 132 | - # Optional HanLP components (heavy). If unavailable, fall back to a lightweight tokenizer. | |
| 133 | - self._tok = None | |
| 134 | - self._pos_tag = None | |
| 135 | - if hanlp is not None: | |
| 136 | - try: | |
| 137 | - logger.info("Initializing HanLP components...") | |
| 138 | - self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) | |
| 139 | - self._tok.config.output_spans = True | |
| 140 | - self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) | |
| 141 | - logger.info("HanLP components initialized") | |
| 142 | - except Exception as e: | |
| 143 | - logger.warning(f"HanLP init failed, falling back to simple tokenizer: {e}") | |
| 144 | - self._tok = None | |
| 145 | - self._pos_tag = None | |
| 146 | - else: | |
| 147 | - logger.info("HanLP not installed; using simple tokenizer") | |
| 106 | + self._tokenizer = tokenizer or self._build_tokenizer() | |
| 148 | 107 | |
| 149 | 108 | # Eager initialization (startup-time failure visibility, no lazy init in request path) |
| 150 | 109 | if self.config.query_config.enable_text_embedding and self._text_encoder is None: |
| ... | ... | @@ -170,6 +129,16 @@ class QueryParser: |
| 170 | 129 | """Return pre-initialized translator.""" |
| 171 | 130 | return self._translator |
| 172 | 131 | |
| 132 | + def _build_tokenizer(self) -> Callable[[str], Any]: | |
| 133 | + """Build the tokenizer used by query parsing. No fallback path by design.""" | |
| 134 | + if hanlp is None: | |
| 135 | + raise RuntimeError("HanLP is required for QueryParser tokenization") | |
| 136 | + logger.info("Initializing HanLP tokenizer...") | |
| 137 | + tokenizer = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) | |
| 138 | + tokenizer.config.output_spans = True | |
| 139 | + logger.info("HanLP tokenizer initialized") | |
| 140 | + return tokenizer | |
| 141 | + | |
| 173 | 142 | @staticmethod |
| 174 | 143 | def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str: |
| 175 | 144 | """Pick the translation capability for query-time translation (configurable).""" |
| ... | ... | @@ -186,41 +155,46 @@ class QueryParser: |
| 186 | 155 | # By default this is `nllb-200-distilled-600m` (multi-lingual local model). |
| 187 | 156 | return config.query_config.default_translation_model |
| 188 | 157 | |
| 189 | - def _simple_tokenize(self, text: str) -> List[str]: | |
| 190 | - return simple_tokenize_query(text) | |
| 191 | - | |
| 192 | - def _extract_keywords(self, query: str) -> str: | |
| 193 | - """Extract keywords (nouns with length > 1) from query.""" | |
| 194 | - if self._tok is not None and self._pos_tag is not None: | |
| 195 | - tok_result = self._tok(query) | |
| 196 | - if not tok_result: | |
| 197 | - return "" | |
| 198 | - words = [x[0] for x in tok_result] | |
| 199 | - pos_tags = self._pos_tag(words) | |
| 200 | - keywords = [] | |
| 201 | - for word, pos in zip(words, pos_tags): | |
| 202 | - if len(word) > 1 and isinstance(pos, str) and pos.startswith("N"): | |
| 203 | - keywords.append(word) | |
| 204 | - return " ".join(keywords) | |
| 205 | - | |
| 206 | - # Fallback: treat tokens with length > 1 as "keywords" | |
| 207 | - tokens = self._simple_tokenize(query) | |
| 208 | - keywords = [t for t in tokens if len(t) > 1] | |
| 209 | - return " ".join(keywords) | |
| 210 | - | |
| 211 | - def _get_token_count(self, query: str) -> int: | |
| 212 | - """Get token count (HanLP if available, otherwise simple).""" | |
| 213 | - if self._tok is not None: | |
| 214 | - tok_result = self._tok(query) | |
| 215 | - return len(tok_result) if tok_result else 0 | |
| 216 | - return len(self._simple_tokenize(query)) | |
| 158 | + @staticmethod | |
| 159 | + def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]: | |
| 160 | + normalized: List[str] = [] | |
| 161 | + seen = set() | |
| 162 | + for language in languages or []: | |
| 163 | + token = str(language or "").strip().lower() | |
| 164 | + if not token or token in seen: | |
| 165 | + continue | |
| 166 | + seen.add(token) | |
| 167 | + normalized.append(token) | |
| 168 | + return normalized | |
| 169 | + | |
| 170 | + @staticmethod | |
| 171 | + def _extract_tokens(tokenizer_result: Any) -> List[str]: | |
| 172 | + """Normalize tokenizer output into a flat token string list.""" | |
| 173 | + if not tokenizer_result: | |
| 174 | + return [] | |
| 175 | + if isinstance(tokenizer_result, str): | |
| 176 | + token = tokenizer_result.strip() | |
| 177 | + return [token] if token else [] | |
| 178 | + | |
| 179 | + tokens: List[str] = [] | |
| 180 | + for item in tokenizer_result: | |
| 181 | + token: Optional[str] = None | |
| 182 | + if isinstance(item, str): | |
| 183 | + token = item | |
| 184 | + elif isinstance(item, (list, tuple)) and item: | |
| 185 | + token = str(item[0]) | |
| 186 | + elif item is not None: | |
| 187 | + token = str(item) | |
| 188 | + | |
| 189 | + if token is None: | |
| 190 | + continue | |
| 191 | + token = token.strip() | |
| 192 | + if token: | |
| 193 | + tokens.append(token) | |
| 194 | + return tokens | |
| 217 | 195 | |
| 218 | 196 | def _get_query_tokens(self, query: str) -> List[str]: |
| 219 | - """Get token list (HanLP if available, otherwise simple).""" | |
| 220 | - if self._tok is not None: | |
| 221 | - tok_result = self._tok(query) | |
| 222 | - return [x[0] for x in tok_result] if tok_result else [] | |
| 223 | - return self._simple_tokenize(query) | |
| 197 | + return self._extract_tokens(self._tokenizer(query)) | |
| 224 | 198 | |
| 225 | 199 | @staticmethod |
| 226 | 200 | def _contains_cjk(text: str) -> bool: |
| ... | ... | @@ -237,64 +211,24 @@ class QueryParser: |
| 237 | 211 | return False |
| 238 | 212 | return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token)) |
| 239 | 213 | |
| 240 | - @staticmethod | |
| 241 | - def _extract_latin_tokens(text: str) -> List[str]: | |
| 242 | - """Extract latin word tokens from query text.""" | |
| 243 | - return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "") | |
| 244 | - | |
| 245 | - def _infer_supplemental_search_langs( | |
| 246 | - self, | |
| 247 | - query_text: str, | |
| 248 | - detected_lang: str, | |
| 249 | - index_langs: List[str], | |
| 250 | - ) -> List[str]: | |
| 251 | - """ | |
| 252 | - Infer extra languages to search when the query mixes scripts. | |
| 253 | - | |
| 254 | - Rules: | |
| 255 | - - If any Chinese characters appear, include `zh` when available. | |
| 256 | - - If the query contains meaningful latin tokens, include `en` when available. | |
| 257 | - "Meaningful" means either: | |
| 258 | - 1) at least 2 latin tokens with length >= 4, or | |
| 259 | - 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars. | |
| 260 | - """ | |
| 261 | - supplemental: List[str] = [] | |
| 262 | - normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs} | |
| 263 | - normalized_detected = str(detected_lang or "").strip().lower() | |
| 264 | - query_text = str(query_text or "") | |
| 265 | - | |
| 266 | - if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh": | |
| 267 | - supplemental.append("zh") | |
| 268 | - | |
| 269 | - latin_tokens = self._extract_latin_tokens(query_text) | |
| 270 | - significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4] | |
| 271 | - latin_chars = sum(len(tok) for tok in latin_tokens) | |
| 272 | - non_space_chars = len(re.sub(r"\s+", "", query_text)) | |
| 273 | - latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0 | |
| 274 | - has_meaningful_english = ( | |
| 275 | - len(significant_latin_tokens) >= 2 or | |
| 276 | - (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2) | |
| 277 | - ) | |
| 278 | - | |
| 279 | - if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en": | |
| 280 | - supplemental.append("en") | |
| 281 | - | |
| 282 | - return supplemental | |
| 283 | - | |
| 284 | 214 | def parse( |
| 285 | 215 | self, |
| 286 | 216 | query: str, |
| 287 | 217 | tenant_id: Optional[str] = None, |
| 288 | 218 | generate_vector: bool = True, |
| 289 | - context: Optional[Any] = None | |
| 219 | + context: Optional[Any] = None, | |
| 220 | + target_languages: Optional[List[str]] = None, | |
| 290 | 221 | ) -> ParsedQuery: |
| 291 | 222 | """ |
| 292 | 223 | Parse query through all processing stages. |
| 293 | 224 | |
| 294 | 225 | Args: |
| 295 | 226 | query: Raw query string |
| 227 | + tenant_id: Deprecated and ignored by QueryParser. Kept temporarily | |
| 228 | + to avoid a wider refactor in this first step. | |
| 296 | 229 | generate_vector: Whether to generate query embedding |
| 297 | 230 | context: Optional request context for tracking and logging |
| 231 | + target_languages: Translation target languages decided by the caller | |
| 298 | 232 | |
| 299 | 233 | Returns: |
| 300 | 234 | ParsedQuery object with all processing results |
| ... | ... | @@ -325,15 +259,9 @@ class QueryParser: |
| 325 | 259 | if context: |
| 326 | 260 | context.store_intermediate_result('query_normalized', normalized) |
| 327 | 261 | |
| 328 | - # Extract domain if present (e.g., "brand:Nike" -> domain="brand", query="Nike") | |
| 329 | - domain, query_text = self.normalizer.extract_domain_query(normalized) | |
| 330 | - log_debug(f"Domain extraction | Domain: '{domain}', Query: '{query_text}'") | |
| 331 | - if context: | |
| 332 | - context.store_intermediate_result('extracted_domain', domain) | |
| 333 | - context.store_intermediate_result('domain_query', query_text) | |
| 334 | - | |
| 335 | 262 | # Stage 2: Query rewriting |
| 336 | - rewritten = None | |
| 263 | + query_text = normalized | |
| 264 | + rewritten = normalized | |
| 337 | 265 | if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists |
| 338 | 266 | rewritten = self.rewriter.rewrite(query_text) |
| 339 | 267 | if rewritten != query_text: |
| ... | ... | @@ -351,43 +279,51 @@ class QueryParser: |
| 351 | 279 | log_info(f"Language detection | Detected language: {detected_lang}") |
| 352 | 280 | if context: |
| 353 | 281 | context.store_intermediate_result('detected_language', detected_lang) |
| 282 | + # Stage 4: Query analysis (tokenization + script flags) | |
| 283 | + query_tokens = self._get_query_tokens(query_text) | |
| 284 | + contains_chinese = self._contains_cjk(query_text) | |
| 285 | + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) | |
| 286 | + | |
| 287 | + log_debug( | |
| 288 | + f"Query analysis | Query tokens: {query_tokens} | " | |
| 289 | + f"contains_chinese={contains_chinese} | contains_english={contains_english}" | |
| 290 | + ) | |
| 291 | + if context: | |
| 292 | + context.store_intermediate_result('query_tokens', query_tokens) | |
| 293 | + context.store_intermediate_result('contains_chinese', contains_chinese) | |
| 294 | + context.store_intermediate_result('contains_english', contains_english) | |
| 354 | 295 | |
| 355 | - # Stage 4: Translation — always submit to thread pool; results are collected together with | |
| 356 | - # embedding in one wait() that uses a configurable budget (short vs long by source-in-index). | |
| 296 | + # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the | |
| 297 | + # caller decides translation targets and later search-field planning. | |
| 357 | 298 | translations: Dict[str, str] = {} |
| 358 | - translation_futures: Dict[str, Any] = {} | |
| 359 | - translation_executor: Optional[ThreadPoolExecutor] = None | |
| 360 | - index_langs: List[str] = [] | |
| 299 | + future_to_task: Dict[Any, Tuple[str, Optional[str]]] = {} | |
| 300 | + async_executor: Optional[ThreadPoolExecutor] = None | |
| 361 | 301 | detected_norm = str(detected_lang or "").strip().lower() |
| 302 | + normalized_targets = self._normalize_language_codes(target_languages) | |
| 303 | + translation_targets = [lang for lang in normalized_targets if lang != detected_norm] | |
| 304 | + | |
| 305 | + # Stage 6: Text embedding - async execution | |
| 306 | + query_vector = None | |
| 307 | + should_generate_embedding = ( | |
| 308 | + generate_vector and | |
| 309 | + self.config.query_config.enable_text_embedding | |
| 310 | + ) | |
| 311 | + | |
| 312 | + task_count = len(translation_targets) + (1 if should_generate_embedding else 0) | |
| 313 | + if task_count > 0: | |
| 314 | + async_executor = ThreadPoolExecutor( | |
| 315 | + max_workers=max(1, min(task_count, 4)), | |
| 316 | + thread_name_prefix="query-enrichment", | |
| 317 | + ) | |
| 362 | 318 | |
| 363 | 319 | try: |
| 364 | - # 根据租户配置的 index_languages 决定翻译目标语言 | |
| 365 | - from config.tenant_config_loader import get_tenant_config_loader | |
| 366 | - tenant_loader = get_tenant_config_loader() | |
| 367 | - tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default") | |
| 368 | - raw_index_langs = tenant_cfg.get("index_languages") or [] | |
| 369 | - index_langs = [] | |
| 370 | - seen_langs = set() | |
| 371 | - for lang in raw_index_langs: | |
| 372 | - norm_lang = str(lang or "").strip().lower() | |
| 373 | - if not norm_lang or norm_lang in seen_langs: | |
| 374 | - continue | |
| 375 | - seen_langs.add(norm_lang) | |
| 376 | - index_langs.append(norm_lang) | |
| 377 | - | |
| 378 | - target_langs_for_translation = [lang for lang in index_langs if lang != detected_norm] | |
| 379 | - | |
| 380 | - if target_langs_for_translation: | |
| 381 | - translation_executor = ThreadPoolExecutor( | |
| 382 | - max_workers=max(1, min(len(target_langs_for_translation), 4)), | |
| 383 | - thread_name_prefix="query-translation", | |
| 384 | - ) | |
| 385 | - for lang in target_langs_for_translation: | |
| 320 | + if async_executor is not None: | |
| 321 | + for lang in translation_targets: | |
| 386 | 322 | model_name = self._pick_query_translation_model(detected_lang, lang, self.config) |
| 387 | 323 | log_debug( |
| 388 | 324 | f"Submitting query translation | source={detected_lang} target={lang} model={model_name}" |
| 389 | 325 | ) |
| 390 | - translation_futures[lang] = translation_executor.submit( | |
| 326 | + future = async_executor.submit( | |
| 391 | 327 | self.translator.translate, |
| 392 | 328 | query_text, |
| 393 | 329 | lang, |
| ... | ... | @@ -395,107 +331,61 @@ class QueryParser: |
| 395 | 331 | "ecommerce_search_query", |
| 396 | 332 | model_name, |
| 397 | 333 | ) |
| 398 | - | |
| 399 | - if context: | |
| 400 | - context.store_intermediate_result('translations', translations) | |
| 401 | - for lang, translation in translations.items(): | |
| 402 | - if translation: | |
| 403 | - context.store_intermediate_result(f'translation_{lang}', translation) | |
| 404 | - | |
| 334 | + future_to_task[future] = ("translation", lang) | |
| 335 | + | |
| 336 | + if should_generate_embedding: | |
| 337 | + if self.text_encoder is None: | |
| 338 | + raise RuntimeError("Text embedding is enabled but text encoder is not initialized") | |
| 339 | + log_debug("Submitting query vector generation") | |
| 340 | + | |
| 341 | + def _encode_query_vector() -> Optional[np.ndarray]: | |
| 342 | + arr = self.text_encoder.encode([query_text], priority=1) | |
| 343 | + if arr is None or len(arr) == 0: | |
| 344 | + return None | |
| 345 | + vec = arr[0] | |
| 346 | + if vec is None: | |
| 347 | + return None | |
| 348 | + return np.asarray(vec, dtype=np.float32) | |
| 349 | + | |
| 350 | + future = async_executor.submit(_encode_query_vector) | |
| 351 | + future_to_task[future] = ("embedding", None) | |
| 405 | 352 | except Exception as e: |
| 406 | - error_msg = f"Translation failed | Error: {str(e)}" | |
| 353 | + error_msg = f"Async query enrichment submission failed | Error: {str(e)}" | |
| 407 | 354 | log_info(error_msg) |
| 408 | 355 | if context: |
| 409 | 356 | context.add_warning(error_msg) |
| 357 | + if async_executor is not None: | |
| 358 | + async_executor.shutdown(wait=False) | |
| 359 | + async_executor = None | |
| 360 | + future_to_task.clear() | |
| 410 | 361 | |
| 411 | - # Stage 5: Query analysis (keywords, token count, query_tokens) | |
| 412 | - keywords = self._extract_keywords(query_text) | |
| 413 | - query_tokens = self._get_query_tokens(query_text) | |
| 414 | - token_count = len(query_tokens) | |
| 415 | - contains_chinese = self._contains_cjk(query_text) | |
| 416 | - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) | |
| 417 | - | |
| 418 | - log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " | |
| 419 | - f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | " | |
| 420 | - f"contains_english={contains_english}") | |
| 421 | - if context: | |
| 422 | - context.store_intermediate_result('keywords', keywords) | |
| 423 | - context.store_intermediate_result('token_count', token_count) | |
| 424 | - context.store_intermediate_result('query_tokens', query_tokens) | |
| 425 | - context.store_intermediate_result('contains_chinese', contains_chinese) | |
| 426 | - context.store_intermediate_result('contains_english', contains_english) | |
| 427 | - | |
| 428 | - # Stage 6: Text embedding (only for non-short queries) - async execution | |
| 429 | - query_vector = None | |
| 430 | - embedding_future = None | |
| 431 | - should_generate_embedding = ( | |
| 432 | - generate_vector and | |
| 433 | - self.config.query_config.enable_text_embedding and | |
| 434 | - domain == "default" | |
| 435 | - ) | |
| 436 | - | |
| 437 | - encoding_executor = None | |
| 438 | - if should_generate_embedding: | |
| 439 | - try: | |
| 440 | - if self.text_encoder is None: | |
| 441 | - raise RuntimeError("Text embedding is enabled but text encoder is not initialized") | |
| 442 | - log_debug("Starting query vector generation (async)") | |
| 443 | - # Submit encoding task to thread pool for async execution | |
| 444 | - encoding_executor = ThreadPoolExecutor(max_workers=1) | |
| 445 | - def _encode_query_vector() -> Optional[np.ndarray]: | |
| 446 | - arr = self.text_encoder.encode([query_text], priority=1) | |
| 447 | - if arr is None or len(arr) == 0: | |
| 448 | - return None | |
| 449 | - vec = arr[0] | |
| 450 | - return vec if isinstance(vec, np.ndarray) else None | |
| 451 | - embedding_future = encoding_executor.submit( | |
| 452 | - _encode_query_vector | |
| 453 | - ) | |
| 454 | - except Exception as e: | |
| 455 | - error_msg = f"Query vector generation task submission failed | Error: {str(e)}" | |
| 456 | - log_info(error_msg) | |
| 457 | - if context: | |
| 458 | - context.add_warning(error_msg) | |
| 459 | - encoding_executor = None | |
| 460 | - embedding_future = None | |
| 461 | - | |
| 462 | - # Wait for translation + embedding concurrently; shared budget (ms) depends on whether | |
| 463 | - # the detected language is in tenant index_languages. | |
| 362 | + # Wait for translation + embedding concurrently; shared budget depends on whether | |
| 363 | + # the detected language belongs to caller-provided target_languages. | |
| 464 | 364 | qc = self.config.query_config |
| 465 | - source_in_index_for_budget = detected_norm in index_langs | |
| 365 | + source_in_target_languages = bool(normalized_targets) and detected_norm in normalized_targets | |
| 466 | 366 | budget_ms = ( |
| 467 | 367 | qc.translation_embedding_wait_budget_ms_source_in_index |
| 468 | - if source_in_index_for_budget | |
| 368 | + if source_in_target_languages | |
| 469 | 369 | else qc.translation_embedding_wait_budget_ms_source_not_in_index |
| 470 | 370 | ) |
| 471 | 371 | budget_sec = max(0.0, float(budget_ms) / 1000.0) |
| 472 | 372 | |
| 473 | - if translation_futures: | |
| 373 | + if translation_targets: | |
| 474 | 374 | log_info( |
| 475 | 375 | f"Translation+embedding shared wait budget | budget_ms={budget_ms} | " |
| 476 | - f"source_in_index_languages={source_in_index_for_budget} | " | |
| 477 | - f"translation_targets={list(translation_futures.keys())}" | |
| 376 | + f"source_in_target_languages={source_in_target_languages} | " | |
| 377 | + f"translation_targets={translation_targets}" | |
| 478 | 378 | ) |
| 479 | 379 | |
| 480 | - if translation_futures or embedding_future: | |
| 380 | + if future_to_task: | |
| 481 | 381 | log_debug( |
| 482 | 382 | f"Waiting for async tasks (translation+embedding) | budget_ms={budget_ms} | " |
| 483 | - f"source_in_index_languages={source_in_index_for_budget}" | |
| 383 | + f"source_in_target_languages={source_in_target_languages}" | |
| 484 | 384 | ) |
| 485 | 385 | |
| 486 | - all_futures: List[Any] = [] | |
| 487 | - future_to_lang: Dict[Any, tuple] = {} | |
| 488 | - for lang, future in translation_futures.items(): | |
| 489 | - all_futures.append(future) | |
| 490 | - future_to_lang[future] = ("translation", lang) | |
| 491 | - | |
| 492 | - if embedding_future: | |
| 493 | - all_futures.append(embedding_future) | |
| 494 | - future_to_lang[embedding_future] = ("embedding", None) | |
| 495 | - | |
| 496 | - done, not_done = wait(all_futures, timeout=budget_sec) | |
| 386 | + done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec) | |
| 497 | 387 | for future in done: |
| 498 | - task_type, lang = future_to_lang[future] | |
| 388 | + task_type, lang = future_to_task[future] | |
| 499 | 389 | try: |
| 500 | 390 | result = future.result() |
| 501 | 391 | if task_type == "translation": |
| ... | ... | @@ -528,7 +418,7 @@ class QueryParser: |
| 528 | 418 | |
| 529 | 419 | if not_done: |
| 530 | 420 | for future in not_done: |
| 531 | - task_type, lang = future_to_lang[future] | |
| 421 | + task_type, lang = future_to_task[future] | |
| 532 | 422 | if task_type == "translation": |
| 533 | 423 | timeout_msg = ( |
| 534 | 424 | f"Translation timeout (>{budget_ms}ms) | Language: {lang} | " |
| ... | ... | @@ -542,68 +432,21 @@ class QueryParser: |
| 542 | 432 | if context: |
| 543 | 433 | context.add_warning(timeout_msg) |
| 544 | 434 | |
| 545 | - if encoding_executor: | |
| 546 | - encoding_executor.shutdown(wait=False) | |
| 547 | - if translation_executor: | |
| 548 | - translation_executor.shutdown(wait=False) | |
| 435 | + if async_executor: | |
| 436 | + async_executor.shutdown(wait=False) | |
| 549 | 437 | |
| 550 | 438 | if translations and context: |
| 551 | 439 | context.store_intermediate_result("translations", translations) |
| 552 | - | |
| 553 | - # Build language-scoped query plan: source language + available translations | |
| 554 | - query_text_by_lang: Dict[str, str] = {} | |
| 555 | - if query_text: | |
| 556 | - query_text_by_lang[detected_lang] = query_text | |
| 557 | - for lang, translated_text in (translations or {}).items(): | |
| 558 | - if translated_text and str(translated_text).strip(): | |
| 559 | - query_text_by_lang[str(lang).strip().lower()] = str(translated_text) | |
| 560 | - | |
| 561 | - supplemental_search_langs = self._infer_supplemental_search_langs( | |
| 562 | - query_text=query_text, | |
| 563 | - detected_lang=detected_lang, | |
| 564 | - index_langs=index_langs, | |
| 565 | - ) | |
| 566 | - for lang in supplemental_search_langs: | |
| 567 | - if lang not in query_text_by_lang and query_text: | |
| 568 | - # Use the original mixed-script query as a robust fallback probe for that language field set. | |
| 569 | - query_text_by_lang[lang] = query_text | |
| 570 | - | |
| 571 | - source_in_index_languages = detected_norm in index_langs | |
| 572 | - ordered_search_langs: List[str] = [] | |
| 573 | - seen_order = set() | |
| 574 | - if detected_lang in query_text_by_lang: | |
| 575 | - ordered_search_langs.append(detected_lang) | |
| 576 | - seen_order.add(detected_lang) | |
| 577 | - for lang in index_langs: | |
| 578 | - if lang in query_text_by_lang and lang not in seen_order: | |
| 579 | - ordered_search_langs.append(lang) | |
| 580 | - seen_order.add(lang) | |
| 581 | - for lang in query_text_by_lang.keys(): | |
| 582 | - if lang not in seen_order: | |
| 583 | - ordered_search_langs.append(lang) | |
| 584 | - seen_order.add(lang) | |
| 585 | - | |
| 586 | - if context: | |
| 587 | - context.store_intermediate_result("search_langs", ordered_search_langs) | |
| 588 | - context.store_intermediate_result("query_text_by_lang", query_text_by_lang) | |
| 589 | - context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs) | |
| 590 | 440 | |
| 591 | 441 | # Build result |
| 592 | 442 | result = ParsedQuery( |
| 593 | 443 | original_query=query, |
| 594 | 444 | query_normalized=normalized, |
| 595 | - rewritten_query=rewritten, | |
| 445 | + rewritten_query=query_text, | |
| 596 | 446 | detected_language=detected_lang, |
| 597 | 447 | translations=translations, |
| 598 | 448 | query_vector=query_vector, |
| 599 | - domain=domain, | |
| 600 | - keywords=keywords, | |
| 601 | - token_count=token_count, | |
| 602 | 449 | query_tokens=query_tokens, |
| 603 | - query_text_by_lang=query_text_by_lang, | |
| 604 | - search_langs=ordered_search_langs, | |
| 605 | - index_languages=index_langs, | |
| 606 | - source_in_index_languages=source_in_index_languages, | |
| 607 | 450 | contains_chinese=contains_chinese, |
| 608 | 451 | contains_english=contains_english, |
| 609 | 452 | ) |
| ... | ... | @@ -611,14 +454,13 @@ class QueryParser: |
| 611 | 454 | if context and hasattr(context, 'logger'): |
| 612 | 455 | context.logger.info( |
| 613 | 456 | f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " |
| 614 | - f"Language: {detected_lang} | Domain: {domain} | " | |
| 615 | 457 | f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}", |
| 616 | 458 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 617 | 459 | ) |
| 618 | 460 | else: |
| 619 | 461 | logger.info( |
| 620 | 462 | f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " |
| 621 | - f"Language: {detected_lang} | Domain: {domain}" | |
| 463 | + f"Language: {detected_lang}" | |
| 622 | 464 | ) |
| 623 | 465 | |
| 624 | 466 | return result | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -159,7 +159,8 @@ class ESQueryBuilder: |
| 159 | 159 | knn_k: int = 50, |
| 160 | 160 | knn_num_candidates: int = 200, |
| 161 | 161 | min_score: Optional[float] = None, |
| 162 | - parsed_query: Optional[Any] = None | |
| 162 | + parsed_query: Optional[Any] = None, | |
| 163 | + index_languages: Optional[List[str]] = None, | |
| 163 | 164 | ) -> Dict[str, Any]: |
| 164 | 165 | """ |
| 165 | 166 | Build complete ES query with post_filter support for multi-select faceting. |
| ... | ... | @@ -202,7 +203,11 @@ class ESQueryBuilder: |
| 202 | 203 | # Text recall (always include if query_text exists) |
| 203 | 204 | if query_text: |
| 204 | 205 | # Unified text query strategy |
| 205 | - text_query = self._build_advanced_text_query(query_text, parsed_query) | |
| 206 | + text_query = self._build_advanced_text_query( | |
| 207 | + query_text, | |
| 208 | + parsed_query, | |
| 209 | + index_languages=index_languages, | |
| 210 | + ) | |
| 206 | 211 | recall_clauses.append(text_query) |
| 207 | 212 | |
| 208 | 213 | # Embedding recall (KNN - separate from query, handled below) |
| ... | ... | @@ -503,13 +508,31 @@ class ESQueryBuilder: |
| 503 | 508 | # Currently using unified embedding field |
| 504 | 509 | return self.text_embedding_field or "title_embedding" |
| 505 | 510 | |
| 506 | - def _build_advanced_text_query(self, query_text: str, parsed_query: Optional[Any] = None) -> Dict[str, Any]: | |
| 511 | + @staticmethod | |
| 512 | + def _normalize_language_list(languages: Optional[List[str]]) -> List[str]: | |
| 513 | + normalized: List[str] = [] | |
| 514 | + seen = set() | |
| 515 | + for language in languages or []: | |
| 516 | + token = str(language or "").strip().lower() | |
| 517 | + if not token or token in seen: | |
| 518 | + continue | |
| 519 | + seen.add(token) | |
| 520 | + normalized.append(token) | |
| 521 | + return normalized | |
| 522 | + | |
| 523 | + def _build_advanced_text_query( | |
| 524 | + self, | |
| 525 | + query_text: str, | |
| 526 | + parsed_query: Optional[Any] = None, | |
| 527 | + *, | |
| 528 | + index_languages: Optional[List[str]] = None, | |
| 529 | + ) -> Dict[str, Any]: | |
| 507 | 530 | """ |
| 508 | - Build advanced text query using should clauses with primary and fallback lexical strategies. | |
| 531 | + Build advanced text query using base and translated lexical clauses. | |
| 509 | 532 | |
| 510 | 533 | Unified implementation: |
| 511 | 534 | - base_query: source-language clause |
| 512 | - - translation queries: target-language clauses from search_langs/query_text_by_lang | |
| 535 | + - translation queries: target-language clauses from translations | |
| 513 | 536 | - KNN query: added separately in build_query |
| 514 | 537 | |
| 515 | 538 | Args: |
| ... | ... | @@ -520,55 +543,42 @@ class ESQueryBuilder: |
| 520 | 543 | ES bool query with should clauses |
| 521 | 544 | """ |
| 522 | 545 | should_clauses = [] |
| 523 | - | |
| 524 | - # Get query analysis from parsed_query | |
| 525 | - query_text_by_lang: Dict[str, str] = {} | |
| 526 | - search_langs: List[str] = [] | |
| 527 | 546 | source_lang = self.default_language |
| 528 | - source_in_index_languages = True | |
| 529 | - index_languages: List[str] = [] | |
| 530 | - | |
| 547 | + translations: Dict[str, str] = {} | |
| 531 | 548 | contains_chinese = False |
| 532 | 549 | contains_english = False |
| 550 | + normalized_index_languages = self._normalize_language_list(index_languages) | |
| 551 | + | |
| 533 | 552 | if parsed_query: |
| 534 | - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {} | |
| 535 | - search_langs = getattr(parsed_query, "search_langs", None) or [] | |
| 536 | 553 | detected_lang = getattr(parsed_query, "detected_language", None) |
| 537 | 554 | source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language |
| 538 | - source_in_index_languages = bool( | |
| 539 | - getattr(parsed_query, "source_in_index_languages", True) | |
| 540 | - ) | |
| 541 | - index_languages = getattr(parsed_query, "index_languages", None) or [] | |
| 555 | + translations = getattr(parsed_query, "translations", None) or {} | |
| 542 | 556 | contains_chinese = bool(getattr(parsed_query, "contains_chinese", False)) |
| 543 | 557 | contains_english = bool(getattr(parsed_query, "contains_english", False)) |
| 544 | 558 | |
| 545 | - if not query_text_by_lang: | |
| 546 | - query_text_by_lang = {source_lang: query_text} | |
| 547 | - if source_lang not in query_text_by_lang and query_text: | |
| 548 | - query_text_by_lang[source_lang] = query_text | |
| 549 | - if not search_langs: | |
| 550 | - search_langs = list(query_text_by_lang.keys()) | |
| 551 | - | |
| 552 | - # Base + translated clauses based on language plan. | |
| 553 | - for lang in search_langs: | |
| 554 | - lang_query = query_text_by_lang.get(lang) | |
| 555 | - if not lang_query: | |
| 556 | - continue | |
| 559 | + source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language | |
| 560 | + source_in_index_languages = ( | |
| 561 | + True if not normalized_index_languages else source_lang in normalized_index_languages | |
| 562 | + ) | |
| 563 | + | |
| 564 | + base_query_text = ( | |
| 565 | + getattr(parsed_query, "rewritten_query", None) if parsed_query else None | |
| 566 | + ) or query_text | |
| 567 | + | |
| 568 | + def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None: | |
| 569 | + nonlocal should_clauses | |
| 557 | 570 | all_specs, _ = self._build_match_field_specs(lang) |
| 558 | 571 | expanded_specs = self._expand_match_field_specs_for_mixed_script( |
| 559 | 572 | lang, |
| 560 | 573 | all_specs, |
| 561 | 574 | contains_chinese, |
| 562 | 575 | contains_english, |
| 563 | - index_languages, | |
| 576 | + normalized_index_languages, | |
| 564 | 577 | ) |
| 565 | 578 | match_fields = self._format_match_field_specs(expanded_specs) |
| 566 | 579 | if not match_fields: |
| 567 | - continue | |
| 568 | - | |
| 569 | - is_source = (lang == source_lang) | |
| 580 | + return | |
| 570 | 581 | clause_boost = 1.0 |
| 571 | - clause_name = "base_query" if is_source else f"base_query_trans_{lang}" | |
| 572 | 582 | minimum_should_match = ( |
| 573 | 583 | self.base_minimum_should_match if is_source else self.translation_minimum_should_match |
| 574 | 584 | ) |
| ... | ... | @@ -596,44 +606,17 @@ class ESQueryBuilder: |
| 596 | 606 | "multi_match": clause["multi_match"] |
| 597 | 607 | }) |
| 598 | 608 | |
| 599 | - # Fallback: source language is not indexed and translation for some index languages is missing. | |
| 600 | - # Use original query text on missing index-language fields with a low boost. | |
| 601 | - if not source_in_index_languages and query_text and index_languages: | |
| 602 | - normalized_index_langs: List[str] = [] | |
| 603 | - seen_langs = set() | |
| 604 | - for lang in index_languages: | |
| 605 | - norm_lang = str(lang or "").strip().lower() | |
| 606 | - if not norm_lang or norm_lang in seen_langs: | |
| 607 | - continue | |
| 608 | - seen_langs.add(norm_lang) | |
| 609 | - normalized_index_langs.append(norm_lang) | |
| 609 | + if base_query_text: | |
| 610 | + append_clause(source_lang, base_query_text, "base_query", True) | |
| 610 | 611 | |
| 611 | - for lang in normalized_index_langs: | |
| 612 | - if lang == source_lang: | |
| 613 | - continue | |
| 614 | - if lang in query_text_by_lang: | |
| 615 | - continue | |
| 616 | - fb_specs, _ = self._build_match_field_specs(lang) | |
| 617 | - expanded_fb = self._expand_match_field_specs_for_mixed_script( | |
| 618 | - lang, | |
| 619 | - fb_specs, | |
| 620 | - contains_chinese, | |
| 621 | - contains_english, | |
| 622 | - index_languages, | |
| 623 | - ) | |
| 624 | - match_fields = self._format_match_field_specs(expanded_fb) | |
| 625 | - if not match_fields: | |
| 626 | - continue | |
| 627 | - should_clauses.append({ | |
| 628 | - "multi_match": { | |
| 629 | - "_name": f"fallback_original_query_{lang}", | |
| 630 | - "query": query_text, | |
| 631 | - "fields": match_fields, | |
| 632 | - "minimum_should_match": self.translation_minimum_should_match, | |
| 633 | - "tie_breaker": self.tie_breaker_base_query, | |
| 634 | - "boost": self.original_query_fallback_boost_when_translation_missing, | |
| 635 | - } | |
| 636 | - }) | |
| 612 | + for lang, translated_text in translations.items(): | |
| 613 | + normalized_lang = str(lang or "").strip().lower() | |
| 614 | + normalized_text = str(translated_text or "").strip() | |
| 615 | + if not normalized_lang or not normalized_text: | |
| 616 | + continue | |
| 617 | + if normalized_lang == source_lang and normalized_text == base_query_text: | |
| 618 | + continue | |
| 619 | + append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False) | |
| 637 | 620 | |
| 638 | 621 | # Fallback to a simple query when language fields cannot be resolved. |
| 639 | 622 | if not should_clauses: | ... | ... |
search/searcher.py
| ... | ... | @@ -516,10 +516,19 @@ class Searcher: |
| 516 | 516 | range_filters: Range filters for numeric fields |
| 517 | 517 | facets: Facet configurations for faceted search |
| 518 | 518 | min_score: Minimum score threshold |
| 519 | - context: Request context for tracking (created if not provided) | |
| 519 | + context: Request context for tracking (required) | |
| 520 | 520 | sort_by: Field name for sorting |
| 521 | 521 | sort_order: Sort order: 'asc' or 'desc' |
| 522 | 522 | debug: Enable debug information output |
| 523 | + language: Response/field-selection language hint (e.g. ``zh``, ``en``) | 
| 524 | + sku_filter_dimension: SKU grouping dimensions for per-SPU variant pick | |
| 525 | + enable_rerank: If None, use ``config.rerank.enabled``; if set, overrides | |
| 526 | + whether the rerank provider is invoked (subject to rerank window). | |
| 527 | + rerank_query_template: Override for rerank query text template; None uses | |
| 528 | + ``config.rerank.rerank_query_template`` (e.g. ``"{query}"``). | |
| 529 | + rerank_doc_template: Override for per-hit document text passed to rerank; | |
| 530 | + None uses ``config.rerank.rerank_doc_template``. Placeholders are | |
| 531 | + resolved in ``search/rerank_client.py``. | |
| 523 | 532 | |
| 524 | 533 | Returns: |
| 525 | 534 | SearchResult object with formatted results |
| ... | ... | @@ -592,7 +601,8 @@ class Searcher: |
| 592 | 601 | query, |
| 593 | 602 | tenant_id=tenant_id, |
| 594 | 603 | generate_vector=enable_embedding, |
| 595 | - context=context | |
| 604 | + context=context, | |
| 605 | + target_languages=index_langs if enable_translation else [], | |
| 596 | 606 | ) |
| 597 | 607 | # Store query analysis results in context |
| 598 | 608 | context.store_query_analysis( |
| ... | ... | @@ -602,7 +612,7 @@ class Searcher: |
| 602 | 612 | detected_language=parsed_query.detected_language, |
| 603 | 613 | translations=parsed_query.translations, |
| 604 | 614 | query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, |
| 605 | - domain=parsed_query.domain, | |
| 615 | + domain="default", | |
| 606 | 616 | is_simple_query=True |
| 607 | 617 | ) |
| 608 | 618 | |
| ... | ... | @@ -610,7 +620,6 @@ class Searcher: |
| 610 | 620 | f"查询解析完成 | 原查询: '{parsed_query.original_query}' | " |
| 611 | 621 | f"重写后: '{parsed_query.rewritten_query}' | " |
| 612 | 622 | f"语言: {parsed_query.detected_language} | " |
| 613 | - f"域: {parsed_query.domain} | " | |
| 614 | 623 | f"向量: {'是' if parsed_query.query_vector is not None else '否'}", |
| 615 | 624 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 616 | 625 | ) |
| ... | ... | @@ -643,7 +652,8 @@ class Searcher: |
| 643 | 652 | from_=es_fetch_from, |
| 644 | 653 | enable_knn=enable_embedding and parsed_query.query_vector is not None, |
| 645 | 654 | min_score=min_score, |
| 646 | - parsed_query=parsed_query | |
| 655 | + parsed_query=parsed_query, | |
| 656 | + index_languages=index_langs, | |
| 647 | 657 | ) |
| 648 | 658 | |
| 649 | 659 | # Add facets for faceted search |
| ... | ... | @@ -985,9 +995,6 @@ class Searcher: |
| 985 | 995 | "rewritten_query": context.query_analysis.rewritten_query, |
| 986 | 996 | "detected_language": context.query_analysis.detected_language, |
| 987 | 997 | "translations": context.query_analysis.translations, |
| 988 | - "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}), | |
| 989 | - "search_langs": context.get_intermediate_result("search_langs", []), | |
| 990 | - "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []), | |
| 991 | 998 | "has_vector": context.query_analysis.query_vector is not None, |
| 992 | 999 | "is_simple_query": context.query_analysis.is_simple_query, |
| 993 | 1000 | "domain": context.query_analysis.domain | ... | ... |
tests/test_embedding_pipeline.py
| ... | ... | @@ -73,6 +73,10 @@ class _FakeQueryEncoder: |
| 73 | 73 | return np.array([np.array([0.11, 0.22, 0.33], dtype=np.float32) for _ in sentences], dtype=object) |
| 74 | 74 | |
| 75 | 75 | |
| 76 | +def _tokenizer(text): | |
| 77 | + return str(text).split() | |
| 78 | + | |
| 79 | + | |
| 76 | 80 | class _FakeEmbeddingCache: |
| 77 | 81 | def __init__(self): |
| 78 | 82 | self.store: Dict[str, np.ndarray] = {} |
| ... | ... | @@ -210,6 +214,7 @@ def test_query_parser_generates_query_vector_with_encoder(): |
| 210 | 214 | config=_build_test_config(), |
| 211 | 215 | text_encoder=encoder, |
| 212 | 216 | translator=_FakeTranslator(), |
| 217 | + tokenizer=_tokenizer, | |
| 213 | 218 | ) |
| 214 | 219 | |
| 215 | 220 | parsed = parser.parse("red dress", tenant_id="162", generate_vector=True) |
| ... | ... | @@ -224,6 +229,7 @@ def test_query_parser_skips_query_vector_when_disabled(): |
| 224 | 229 | config=_build_test_config(), |
| 225 | 230 | text_encoder=_FakeQueryEncoder(), |
| 226 | 231 | translator=_FakeTranslator(), |
| 232 | + tokenizer=_tokenizer, | |
| 227 | 233 | ) |
| 228 | 234 | |
| 229 | 235 | parsed = parser.parse("red dress", tenant_id="162", generate_vector=False) | ... | ... |
tests/test_es_query_builder.py
| ... | ... | @@ -65,21 +65,42 @@ def test_knn_prefilter_not_added_without_filters(): |
| 65 | 65 | assert q["knn"]["_name"] == "knn_query" |
| 66 | 66 | |
| 67 | 67 | |
| 68 | -def test_text_query_contains_only_base_translation_and_fallback_named_queries(): | |
| 68 | +def test_text_query_contains_only_base_and_translation_named_queries(): | |
| 69 | 69 | qb = _builder() |
| 70 | 70 | parsed_query = SimpleNamespace( |
| 71 | - query_text_by_lang={"en": "dress", "zh": "连衣裙"}, | |
| 72 | - search_langs=["en", "zh"], | |
| 71 | + rewritten_query="dress", | |
| 73 | 72 | detected_language="en", |
| 74 | - source_in_index_languages=False, | |
| 75 | - index_languages=["en", "zh", "fr"], | |
| 73 | + translations={"en": "dress", "zh": "连衣裙"}, | |
| 76 | 74 | ) |
| 77 | 75 | |
| 78 | - q = qb.build_query(query_text="dress", parsed_query=parsed_query, enable_knn=False) | |
| 76 | + q = qb.build_query( | |
| 77 | + query_text="dress", | |
| 78 | + parsed_query=parsed_query, | |
| 79 | + enable_knn=False, | |
| 80 | + index_languages=["en", "zh", "fr"], | |
| 81 | + ) | |
| 79 | 82 | should = q["query"]["bool"]["should"] |
| 80 | 83 | names = [clause["multi_match"]["_name"] for clause in should] |
| 81 | 84 | |
| 82 | - assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"] | |
| 85 | + assert names == ["base_query", "base_query_trans_zh"] | |
| 86 | + | |
| 87 | + | |
| 88 | +def test_text_query_skips_duplicate_translation_same_as_base(): | |
| 89 | + qb = _builder() | |
| 90 | + parsed_query = SimpleNamespace( | |
| 91 | + rewritten_query="dress", | |
| 92 | + detected_language="en", | |
| 93 | + translations={"en": "dress"}, | |
| 94 | + ) | |
| 95 | + | |
| 96 | + q = qb.build_query( | |
| 97 | + query_text="dress", | |
| 98 | + parsed_query=parsed_query, | |
| 99 | + enable_knn=False, | |
| 100 | + index_languages=["en", "zh"], | |
| 101 | + ) | |
| 102 | + | |
| 103 | + assert q["query"]["multi_match"]["_name"] == "base_query" | |
| 83 | 104 | |
| 84 | 105 | |
| 85 | 106 | def test_mixed_script_merges_en_fields_into_zh_clause(): |
| ... | ... | @@ -91,22 +112,25 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): |
| 91 | 112 | default_language="en", |
| 92 | 113 | ) |
| 93 | 114 | parsed_query = SimpleNamespace( |
| 94 | - query_text_by_lang={"zh": "法式 dress"}, | |
| 95 | - search_langs=["zh"], | |
| 115 | + rewritten_query="法式 dress", | |
| 96 | 116 | detected_language="zh", |
| 97 | - source_in_index_languages=True, | |
| 98 | - index_languages=["zh", "en"], | |
| 117 | + translations={}, | |
| 99 | 118 | contains_chinese=True, |
| 100 | 119 | contains_english=True, |
| 101 | 120 | ) |
| 102 | - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) | |
| 121 | + q = qb.build_query( | |
| 122 | + query_text="法式 dress", | |
| 123 | + parsed_query=parsed_query, | |
| 124 | + enable_knn=False, | |
| 125 | + index_languages=["zh", "en"], | |
| 126 | + ) | |
| 103 | 127 | fields = q["query"]["multi_match"]["fields"] |
| 104 | 128 | bases = {f.split("^", 1)[0] for f in fields} |
| 105 | 129 | assert "title.zh" in bases and "title.en" in bases |
| 106 | 130 | assert "brief.zh" in bases and "brief.en" in bases |
| 107 | - # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8) | |
| 108 | - assert "title.en^0.8" in fields | |
| 109 | - assert "brief.en^0.8" in fields | |
| 131 | + # Merged supplemental language fields use boost * 0.6 by default. | |
| 132 | + assert "title.en^0.6" in fields | |
| 133 | + assert "brief.en^0.6" in fields | |
| 110 | 134 | |
| 111 | 135 | |
| 112 | 136 | def test_mixed_script_merges_zh_fields_into_en_clause(): |
| ... | ... | @@ -118,19 +142,22 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): |
| 118 | 142 | default_language="en", |
| 119 | 143 | ) |
| 120 | 144 | parsed_query = SimpleNamespace( |
| 121 | - query_text_by_lang={"en": "red 连衣裙"}, | |
| 122 | - search_langs=["en"], | |
| 145 | + rewritten_query="red 连衣裙", | |
| 123 | 146 | detected_language="en", |
| 124 | - source_in_index_languages=True, | |
| 125 | - index_languages=["zh", "en"], | |
| 147 | + translations={}, | |
| 126 | 148 | contains_chinese=True, |
| 127 | 149 | contains_english=True, |
| 128 | 150 | ) |
| 129 | - q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False) | |
| 151 | + q = qb.build_query( | |
| 152 | + query_text="red 连衣裙", | |
| 153 | + parsed_query=parsed_query, | |
| 154 | + enable_knn=False, | |
| 155 | + index_languages=["zh", "en"], | |
| 156 | + ) | |
| 130 | 157 | fields = q["query"]["multi_match"]["fields"] |
| 131 | 158 | bases = {f.split("^", 1)[0] for f in fields} |
| 132 | 159 | assert "title.en" in bases and "title.zh" in bases |
| 133 | - assert "title.zh^0.8" in fields | |
| 160 | + assert "title.zh^0.6" in fields | |
| 134 | 161 | |
| 135 | 162 | |
| 136 | 163 | def test_mixed_script_merged_fields_scale_configured_boosts(): |
| ... | ... | @@ -143,18 +170,21 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): |
| 143 | 170 | default_language="en", |
| 144 | 171 | ) |
| 145 | 172 | parsed_query = SimpleNamespace( |
| 146 | - query_text_by_lang={"zh": "法式 dress"}, | |
| 147 | - search_langs=["zh"], | |
| 173 | + rewritten_query="法式 dress", | |
| 148 | 174 | detected_language="zh", |
| 149 | - source_in_index_languages=True, | |
| 150 | - index_languages=["zh", "en"], | |
| 175 | + translations={}, | |
| 151 | 176 | contains_chinese=True, |
| 152 | 177 | contains_english=True, |
| 153 | 178 | ) |
| 154 | - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) | |
| 179 | + q = qb.build_query( | |
| 180 | + query_text="法式 dress", | |
| 181 | + parsed_query=parsed_query, | |
| 182 | + enable_knn=False, | |
| 183 | + index_languages=["zh", "en"], | |
| 184 | + ) | |
| 155 | 185 | fields = q["query"]["multi_match"]["fields"] |
| 156 | 186 | assert "title.zh^5.0" in fields |
| 157 | - assert "title.en^8.0" in fields # 10.0 * 0.8 | |
| 187 | + assert "title.en^6.0" in fields # 10.0 * 0.6 | |
| 158 | 188 | |
| 159 | 189 | |
| 160 | 190 | def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): |
| ... | ... | @@ -166,15 +196,18 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): |
| 166 | 196 | default_language="zh", |
| 167 | 197 | ) |
| 168 | 198 | parsed_query = SimpleNamespace( |
| 169 | - query_text_by_lang={"zh": "法式 dress"}, | |
| 170 | - search_langs=["zh"], | |
| 199 | + rewritten_query="法式 dress", | |
| 171 | 200 | detected_language="zh", |
| 172 | - source_in_index_languages=True, | |
| 173 | - index_languages=["zh"], | |
| 201 | + translations={}, | |
| 174 | 202 | contains_chinese=True, |
| 175 | 203 | contains_english=True, |
| 176 | 204 | ) |
| 177 | - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) | |
| 205 | + q = qb.build_query( | |
| 206 | + query_text="法式 dress", | |
| 207 | + parsed_query=parsed_query, | |
| 208 | + enable_knn=False, | |
| 209 | + index_languages=["zh"], | |
| 210 | + ) | |
| 178 | 211 | fields = q["query"]["multi_match"]["fields"] |
| 179 | 212 | bases = {f.split("^", 1)[0] for f in fields} |
| 180 | 213 | assert "title.zh" in bases | ... | ... |
tests/test_query_parser_mixed_language.py
| 1 | -from types import SimpleNamespace | |
| 2 | - | |
| 3 | 1 | from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig |
| 4 | 2 | from query.query_parser import QueryParser |
| 5 | 3 | |
| ... | ... | @@ -9,6 +7,10 @@ class _DummyTranslator: |
| 9 | 7 | return f"{text}-{target_lang}" |
| 10 | 8 | |
| 11 | 9 | |
| 10 | +def _tokenizer(text): | |
| 11 | + return str(text).split() | |
| 12 | + | |
| 13 | + | |
| 12 | 14 | def test_pure_english_word_token_length_and_script(): |
| 13 | 15 | assert QueryParser._is_pure_english_word_token("ab") is False |
| 14 | 16 | assert QueryParser._is_pure_english_word_token("abc") is True |
| ... | ... | @@ -35,59 +37,57 @@ def _build_config() -> SearchConfig: |
| 35 | 37 | |
| 36 | 38 | |
| 37 | 39 | def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): |
| 38 | - parser = QueryParser(_build_config(), translator=_DummyTranslator()) | |
| 40 | + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) | |
| 39 | 41 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") |
| 40 | - monkeypatch.setattr( | |
| 41 | - "query.query_parser.get_tenant_config_loader", | |
| 42 | - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}), | |
| 43 | - raising=False, | |
| 44 | - ) | |
| 45 | 42 | |
| 46 | - result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) | |
| 43 | + result = parser.parse( | |
| 44 | + "法式 dress 连衣裙", | |
| 45 | + tenant_id="162", | |
| 46 | + generate_vector=False, | |
| 47 | + target_languages=["zh", "en"], | |
| 48 | + ) | |
| 47 | 49 | |
| 48 | 50 | assert result.detected_language == "zh" |
| 49 | 51 | assert result.contains_chinese is True |
| 50 | 52 | assert result.contains_english is True |
| 51 | - assert "en" in result.search_langs | |
| 52 | - # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测) | |
| 53 | - assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en" | |
| 54 | - assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙" | |
| 53 | + assert result.translations == {"en": "法式 dress 连衣裙-en"} | |
| 54 | + assert result.query_tokens == ["法式", "dress", "连衣裙"] | |
| 55 | + assert not hasattr(result, "query_text_by_lang") | |
| 56 | + assert not hasattr(result, "search_langs") | |
| 55 | 57 | |
| 56 | 58 | |
| 57 | 59 | def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): |
| 58 | - parser = QueryParser(_build_config(), translator=_DummyTranslator()) | |
| 60 | + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) | |
| 59 | 61 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") |
| 60 | - monkeypatch.setattr( | |
| 61 | - "query.query_parser.get_tenant_config_loader", | |
| 62 | - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), | |
| 63 | - raising=False, | |
| 64 | - ) | |
| 65 | 62 | |
| 66 | - result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) | |
| 63 | + result = parser.parse( | |
| 64 | + "red 连衣裙", | |
| 65 | + tenant_id="0", | |
| 66 | + generate_vector=False, | |
| 67 | + target_languages=["en", "zh"], | |
| 68 | + ) | |
| 67 | 69 | |
| 68 | 70 | assert result.detected_language == "en" |
| 69 | 71 | assert result.contains_chinese is True |
| 70 | 72 | assert result.contains_english is True |
| 71 | - assert "zh" in result.search_langs | |
| 72 | - assert result.query_text_by_lang["zh"] == "red 连衣裙-zh" | |
| 73 | - assert result.query_text_by_lang["en"] == "red 连衣裙" | |
| 73 | + assert result.translations == {"zh": "red 连衣裙-zh"} | |
| 74 | + assert result.query_tokens == ["red", "连衣裙"] | |
| 74 | 75 | |
| 75 | 76 | |
| 76 | 77 | def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch): |
| 77 | 78 | """en 在 index_languages 内时仍应等待并采纳 en->zh 翻译结果(与向量共用预算)。""" |
| 78 | - parser = QueryParser(_build_config(), translator=_DummyTranslator()) | |
| 79 | + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) | |
| 79 | 80 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") |
| 80 | - monkeypatch.setattr( | |
| 81 | - "query.query_parser.get_tenant_config_loader", | |
| 82 | - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), | |
| 83 | - raising=False, | |
| 84 | - ) | |
| 85 | 81 | |
| 86 | - result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False) | |
| 82 | + result = parser.parse( | |
| 83 | + "off shoulder top", | |
| 84 | + tenant_id="0", | |
| 85 | + generate_vector=False, | |
| 86 | + target_languages=["en", "zh"], | |
| 87 | + ) | |
| 87 | 88 | |
| 88 | 89 | assert result.detected_language == "en" |
| 89 | 90 | assert result.contains_chinese is False |
| 90 | 91 | assert result.contains_english is True |
| 91 | 92 | assert result.translations.get("zh") == "off shoulder top-zh" |
| 92 | - assert result.query_text_by_lang.get("zh") == "off shoulder top-zh" | |
| 93 | - assert result.source_in_index_languages is True | |
| 93 | + assert not hasattr(result, "source_in_index_languages") | ... | ... |
tests/test_search_rerank_window.py
| ... | ... | @@ -43,7 +43,14 @@ class _FakeParsedQuery: |
| 43 | 43 | |
| 44 | 44 | |
| 45 | 45 | class _FakeQueryParser: |
| 46 | - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): | |
| 46 | + def parse( | |
| 47 | + self, | |
| 48 | + query: str, | |
| 49 | + tenant_id: str, | |
| 50 | + generate_vector: bool, | |
| 51 | + context: Any, | |
| 52 | + target_languages: Any = None, | |
| 53 | + ): | |
| 47 | 54 | return _FakeParsedQuery( |
| 48 | 55 | original_query=query, |
| 49 | 56 | query_normalized=query, |
| ... | ... | @@ -191,6 +198,66 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path): |
| 191 | 198 | "field_boosts": {"title.en": 3.0}, |
| 192 | 199 | "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}], |
| 193 | 200 | "query_config": {"supported_languages": ["en"], "default_language": "en"}, |
| 201 | + "services": { | |
| 202 | + "translation": { | |
| 203 | + "service_url": "http://localhost:6005", | |
| 204 | + "timeout_sec": 3.0, | |
| 205 | + "default_model": "dummy-model", | |
| 206 | + "default_scene": "general", | |
| 207 | + "cache": { | |
| 208 | + "ttl_seconds": 60, | |
| 209 | + "sliding_expiration": True, | |
| 210 | + }, | |
| 211 | + "capabilities": { | |
| 212 | + "dummy-model": { | |
| 213 | + "enabled": True, | |
| 214 | + "backend": "llm", | |
| 215 | + "use_cache": True, | |
| 216 | + "model": "dummy-model", | |
| 217 | + "base_url": "http://localhost:6005/v1", | |
| 218 | + "timeout_sec": 3.0, | |
| 219 | + } | |
| 220 | + }, | |
| 221 | + }, | |
| 222 | + "embedding": { | |
| 223 | + "provider": "http", | |
| 224 | + "providers": { | |
| 225 | + "http": { | |
| 226 | + "text_base_url": "http://localhost:6005", | |
| 227 | + "image_base_url": "http://localhost:6008", | |
| 228 | + } | |
| 229 | + }, | |
| 230 | + "backend": "tei", | |
| 231 | + "backends": { | |
| 232 | + "tei": { | |
| 233 | + "base_url": "http://localhost:8080", | |
| 234 | + "timeout_sec": 3.0, | |
| 235 | + "model_id": "dummy-embedding-model", | |
| 236 | + } | |
| 237 | + }, | |
| 238 | + }, | |
| 239 | + "rerank": { | |
| 240 | + "provider": "http", | |
| 241 | + "providers": { | |
| 242 | + "http": { | |
| 243 | + "base_url": "http://localhost:6007", | |
| 244 | + "service_url": "http://localhost:6007/rerank", | |
| 245 | + } | |
| 246 | + }, | |
| 247 | + "backend": "bge", | |
| 248 | + "backends": { | |
| 249 | + "bge": { | |
| 250 | + "model_name": "dummy-rerank-model", | |
| 251 | + "device": "cpu", | |
| 252 | + "use_fp16": False, | |
| 253 | + "batch_size": 8, | |
| 254 | + "max_length": 128, | |
| 255 | + "cache_dir": "./model_cache", | |
| 256 | + "enable_warmup": False, | |
| 257 | + } | |
| 258 | + }, | |
| 259 | + }, | |
| 260 | + }, | |
| 194 | 261 | "spu_config": {"enabled": False}, |
| 195 | 262 | "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []}, |
| 196 | 263 | "rerank": {"rerank_window": 384}, |
| ... | ... | @@ -354,7 +421,14 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch |
| 354 | 421 | class _TranslatedQueryParser: |
| 355 | 422 | text_encoder = None |
| 356 | 423 | |
| 357 | - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): | |
| 424 | + def parse( | |
| 425 | + self, | |
| 426 | + query: str, | |
| 427 | + tenant_id: str, | |
| 428 | + generate_vector: bool, | |
| 429 | + context: Any, | |
| 430 | + target_languages: Any = None, | |
| 431 | + ): | |
| 358 | 432 | return _FakeParsedQuery( |
| 359 | 433 | original_query=query, |
| 360 | 434 | query_normalized=query, |
| ... | ... | @@ -407,15 +481,22 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc |
| 407 | 481 | encoder = _FakeTextEncoder( |
| 408 | 482 | { |
| 409 | 483 | "linen summer dress": [0.8, 0.2], |
| 410 | - "color:Red": [1.0, 0.0], | |
| 411 | - "color:Blue": [0.0, 1.0], | |
| 484 | + "color:red": [1.0, 0.0], | |
| 485 | + "color:blue": [0.0, 1.0], | |
| 412 | 486 | } |
| 413 | 487 | ) |
| 414 | 488 | |
| 415 | 489 | class _EmbeddingQueryParser: |
| 416 | 490 | text_encoder = encoder |
| 417 | 491 | |
| 418 | - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): | |
| 492 | + def parse( | |
| 493 | + self, | |
| 494 | + query: str, | |
| 495 | + tenant_id: str, | |
| 496 | + generate_vector: bool, | |
| 497 | + context: Any, | |
| 498 | + target_languages: Any = None, | |
| 499 | + ): | |
| 419 | 500 | return _FakeParsedQuery( |
| 420 | 501 | original_query=query, |
| 421 | 502 | query_normalized=query, | ... | ... |