From ef5baa866ae0cf2c061a83a9fc2aead25b1d098e Mon Sep 17 00:00:00 2001
From: tangwang
Date: Sun, 22 Mar 2026 14:16:39 +0800
Subject: [PATCH] Mixed-language query handling

---
 config/config.yaml                        |   4 +-
 docs/TODO-ES能力提升.md                   |  69 ++++
 docs/TODO.txt                             |  66 +++-
 docs/搜索API对接指南-01-搜索接口.md       |   8 +-
 docs/相关性检索优化说明.md                |  21 ++
 query/query_parser.py                     | 474 ++++++++++++++++-----------------
 search/es_query_builder.py                | 127 +++++-----
 search/searcher.py                        |  23 +-
 tests/test_embedding_pipeline.py          |   6 +
 tests/test_es_query_builder.py            |  97 ++++---
 tests/test_query_parser_mixed_language.py |  64 ++---
 tests/test_search_rerank_window.py        |  91 +++++-
 12 files changed, 579 insertions(+), 471 deletions(-)
 create mode 100644 docs/TODO-ES能力提升.md

diff --git a/config/config.yaml b/config/config.yaml
index 865bf6f..debc773 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -120,7 +120,7 @@ query_config:
     - skus

   # KNN boost配置(向量召回的boost值)
-  knn_boost: 0.25  # Lower boost for embedding recall
+  knn_boost: 2.0  # Boost for embedding recall (raised from 0.25)

   # Function Score配置(ES层打分规则)
   function_score:
@@ -290,7 +290,7 @@ services:
     engine: "vllm"
     max_model_len: 160
     tensor_parallel_size: 1
-    gpu_memory_utilization: 0.36
+    gpu_memory_utilization: 0.20
     dtype: "float16"
     enable_prefix_caching: true
     enforce_eager: false
diff --git a/docs/TODO-ES能力提升.md b/docs/TODO-ES能力提升.md
new file mode 100644
index 0000000..7e20c4c
--- /dev/null
+++ b/docs/TODO-ES能力提升.md
@@ -0,0 +1,69 @@
ES paid tier, or custom development (check the paid-tier pricing first)
ES custom development:
RRF / retrievers

Elastic's subscription matrix explicitly lists the relevant capabilities: "Retrievers: linear, rule, RRF, text similarity re-ranker" and "Reciprocal Rank Fusion (RRF) for hybrid search".

What makes these capabilities most valuable:
they turn hybrid retrieval from "hand-assembling DSL and hand-rolling scores" into an officially supported multi-stage retrieval framework, with re-ranking via the text similarity re-ranker / Elastic Rerank.

{
  "retriever": {
    "rrf": {
      "retrievers": [
        { "standard": { "query": { ... } } },
        { "knn": { ... } }
      ]
    }
  }
}
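Calling this from our Python stack is straightforward once the cluster exposes retrievers. A minimal sketch, assuming an ES >= 8.14 cluster with the required license tier and a matching elasticsearch-py client; the index name "products", the field names, and the zero vector are placeholders, not our real mapping:

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

resp = es.search(
    index="products",  # placeholder index name
    retriever={
        "rrf": {
            "retrievers": [
                # lexical channel
                {"standard": {"query": {"multi_match": {
                    "query": "白色 oversized T-shirt",
                    "fields": ["title.zh^5", "title.en"],
                }}}},
                # vector channel; pass the real query embedding here
                {"knn": {
                    "field": "title_embedding",
                    "query_vector": [0.0] * 1024,
                    "k": 50,
                    "num_candidates": 200,
                }},
            ],
            "rank_window_size": 100,
            "rank_constant": 20,
        }
    },
    size=20,
)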
Adding a reranker:
text_similarity_reranker re-sorts the top-k results by semantic similarity using an NLP model; it can use the built-in Elastic Rerank, or plug in Cohere, Vertex AI, or a text similarity model you upload yourself.

{
  "retriever": {
    "text_similarity_reranker": {
      "retriever": {
        "rrf": { ... }
      },
      ...
    }
  }
}

{
  "retriever": {
    "text_similarity_reranker": {
      "retriever": {
        "rrf": {
          "retrievers": [
            {
              "standard": {
                "query": {
                  "...": "..."
                }
              }
            },
            {
              "knn": {
                "...": "..."
              }
            }
          ],
          "rank_window_size": 100,
          "rank_constant": 20
        }
      },
      "field": "your_rerank_text_field",
      "inference_text": "白色 oversized T-shirt",
      "inference_id": ".rerank-v1-elasticsearch",
      "rank_window_size": 50
    }
  },
  "size": 20
}

diff --git a/docs/TODO.txt b/docs/TODO.txt
index a022950..ae809a1 100644
--- a/docs/TODO.txt
+++ b/docs/TODO.txt
@@ -1,4 +1,62 @@
+Switching the knn + text-relevance fusion to "rank": {"rrf": {}} requires a license — can we patch the ES source to support it?
+
+ knn_boost: 2.0
+
+
+{
+  "query": { ...full-text query... },
+  "knn": { ...vector query... },
+  "rank": {
+    "rrf": {}
+  }
+}
+
+
+
+
+"image_embedding": {
+    "type": "nested",
+    "properties": {
+        "vector": {
+            "type": "dense_vector",
+            "dims": 1024,
+            "index": true,
+            "similarity": "dot_product",
+            "element_type": "bfloat16"
+        },
+        "url": {
+            "type": "text"
+        }
+    }
+},
+Drop image_embedding_512.
+Change image_embedding so that one SPU carries multiple SKU vectors; besides vector and url, each entry's properties should also include:
+"image_embedding": {
+    "type": "nested",
+    "properties": {
+        "vector": {
+            "type": "dense_vector",
+            "dims": 1024,
+            "index": true,
+            "similarity": "dot_product",
+            "element_type": "bfloat16"
+        },
+        "url": {
+            "type": "text"
+        }
+    }
+},
+
+
+
+
+
+External requirements:
+1. An LLM API with very low demands on reasoning ability but very tight latency demands (or a locally deployed 7B Q4-quantized model); prompts are roughly 30-50 tokens, and first-token latency must stay under 500ms.
+2. Does ES support a reranker pipeline?
+
 @reranker/backends/qwen3_vllm.py 单次 generate 前有进程内锁,同一进程里不会并行多路 vLLM 推理,这个锁有必要吗?是否会影响性能?是否能够打开,使得性能更好?比如这个场景,我一次请求 400 条,分成每64个一个batch,基于我现在的gpu配置,可以再提高并发度吗?

 测试了,让每个批次都并发地进行,耗时没有变化
@@ -383,6 +441,8 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men
 融合打分(已完成,2026-03)
+
+Already completed:
 1. `fuse_scores_and_resort` 已改为乘法融合,并通过 `matched_queries` 提取:
    - `base_query`
    - `base_query_trans_*`
@@ -397,7 +457,11 @@ https://help.aliyun.com/zh/model-studio/partial-mode?spm=a2c4g.11186623.help-men
    - `docs/搜索API对接指南.md`
    - `docs/Usage-Guide.md`
-
+Still open:
+(Normalize scores? Rank-based fusion? Or keep the multiplicative formula?)
+RRF: first fuse the multi-channel recall robustly (see the client-side sketch below)
+linear + minmax: lets you fine-tune the relative weights of knn and text
+reranker: one final pass over the top-k recalled upstream
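+A minimal client-side RRF sketch for the interim, while the licensed "rank": {"rrf": {}} is unavailable — fuse the BM25 and kNN hit lists in the application layer (rrf_fuse and its inputs are illustrative, not existing project code):
+
+def rrf_fuse(rankings, k=60):
+    """rankings: one ranked list of doc ids per recall channel."""
+    scores = {}
+    for ranking in rankings:
+        for rank, doc_id in enumerate(ranking, start=1):
+            # standard RRF contribution: 1 / (k + rank) per channel
+            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
+    return sorted(scores, key=scores.get, reverse=True)
+
+# e.g. fused_ids = rrf_fuse([bm25_doc_ids, knn_doc_ids])[:20]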
diff --git a/docs/搜索API对接指南-01-搜索接口.md b/docs/搜索API对接指南-01-搜索接口.md
index b3cded4..34fc597 100644
--- a/docs/搜索API对接指南-01-搜索接口.md
+++ b/docs/搜索API对接指南-01-搜索接口.md
@@ -66,9 +66,11 @@ response = requests.post(url, headers=headers, json={"query": "芭比娃娃"})
 | `min_score` | float | N | null | 最小相关性分数阈值 |
 | `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) |
 | `debug` | boolean | N | false | 是否返回调试信息 |
-| `enable_rerank` | boolean/null | N | null | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。不传/传 null 使用服务端 `rerank.enabled`(默认开启)。开启后会先对 ES TopN(`rerank_window`)重排,再按分页截取;若 `from+size>1000`,则不重排,直接按分页从 ES 返回 |
-| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端配置 |
-| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}`;不传则使用服务端配置 |
+| `enable_rerank` | boolean/null | N | null | Whether to enable reranking (calls the external rerank service to re-sort ES results). Omit or pass null to use the server-side `rerank.enabled`. When effectively enabled and `from + size <= rerank_window`: ES first returns the top `rerank_window` hits, which are reranked before the current page is sliced by `from`/`size`; when `from + size > rerank_window`, windowed reranking is **skipped** and ES is queried directly with the requested `from`/`size` (`rerank_window` comes from `rerank.rerank_window` in `config.yaml`; the repo example defaults to 400) |
+| `rerank_query_template` | string | N | null | Rerank query template (optional). Supports the `{query}` placeholder; omit to use the server-side `rerank.rerank_query_template` |
+| `rerank_doc_template` | string | N | null | Rerank doc template (optional). Supports placeholders such as `{title} {brief} {vendor} {description} {category_path}` (assembled per language field by `search/rerank_client.py`); omit to use the server-side `rerank.rerank_doc_template` |
 | `user_id` | string | N | null | 用户ID(用于个性化,预留) |
 | `session_id` | string | N | null | 会话ID(用于分析,预留) |
+
+**Mapping to backend code** (for integration debugging): the HTTP `POST /search/` request body is validated by `SearchRequest` in `api/models.py`; the route `api/routes/search.py` passes the fields straight through to `Searcher.search(...)` (including the three rerank fields above). The CLI `python main.py search` does not currently expose these parameters and falls back to the config defaults.
diff --git a/docs/相关性检索优化说明.md b/docs/相关性检索优化说明.md
index ab4d70a..6cf890f 100644
--- a/docs/相关性检索优化说明.md
+++ b/docs/相关性检索优化说明.md
@@ -281,3 +281,24 @@ title.en: 2026 Korean-style High-waisted Slimming Corduroy Skirt with Slit, Mid-
 Rerank score: 0.9643
 title.en: Black Half-high Collar Base Shirt Women's Autumn and Winter fleece-lined Contrast Color Pure Desire Design Sense Horn Sleeve Ruffled Inner Top
 title.zh: 黑色高领半高领女士秋冬内搭加绒拼色纯欲设计荷叶边袖内衬上衣
+
+
+
+Severe qwen3-0.6b bad cases:
+q=牛仔裤 (denim jeans)
+
+Rerank score: 0.0002
+title.en: Wrangler Womens Cowboy Cut Slim Fit Jean Bleach
+title.zh: Wrangler 女士牛仔裤 牛仔剪裁 紧身版型 漂白色
+
+Rerank score: 0.0168
+title.en: Fleece Lined Tights Sheer Women - Fake Translucent Warm Pantyhose Leggings Sheer Thick Tights for Winter
+title.zh: 加绒透肤女士连裤袜 - 仿透视保暖长筒袜 冬季厚款透肤连裤袜
+
+Rerank score: 0.1366
+title.en: Dockers Men's Classic Fit Workday Khaki Smart 360 FLEX Pants (Standard and Big & Tall)
+title.zh: Dockers 男士经典版型工作日卡其色智能360度弹力裤(标准码与加大码)
+
+Rerank score: 0.0981
+title.en: Lazy One Pajama Shorts for Men, Men's Pajama Bottoms, Sleepwear
+title.zh: 懒人男士睡裤,男式家居裤,睡眠服饰
diff --git a/query/query_parser.py b/query/query_parser.py
index dfcdeda..5294427 100644
--- a/query/query_parser.py
+++ b/query/query_parser.py
@@ -1,10 +1,15 @@
 """
 Query parser - main module for query processing.
-Handles query rewriting, translation, and embedding generation.
+Responsibilities are intentionally narrow: +- normalize and rewrite the incoming query +- detect language and tokenize with HanLP +- run translation and embedding requests concurrently +- return parser facts, not Elasticsearch language-planning data """ -from typing import Dict, List, Optional, Any, Union, Tuple +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Tuple import numpy as np import logging import re @@ -26,7 +31,7 @@ except Exception: # pragma: no cover def simple_tokenize_query(text: str) -> List[str]: """ - Lightweight tokenizer for suggestion length / analysis (aligned with QueryParser fallback). + Lightweight tokenizer for suggestion-side heuristics only. - Consecutive CJK characters form one token - Latin / digit runs (with internal hyphens) form tokens @@ -37,63 +42,32 @@ def simple_tokenize_query(text: str) -> List[str]: return pattern.findall(text) +@dataclass(slots=True) class ParsedQuery: - """Container for parsed query results.""" - - def __init__( - self, - original_query: str, - query_normalized: str, - rewritten_query: Optional[str] = None, - detected_language: Optional[str] = None, - translations: Dict[str, str] = None, - query_vector: Optional[np.ndarray] = None, - domain: str = "default", - keywords: str = "", - token_count: int = 0, - query_tokens: Optional[List[str]] = None, - query_text_by_lang: Optional[Dict[str, str]] = None, - search_langs: Optional[List[str]] = None, - index_languages: Optional[List[str]] = None, - source_in_index_languages: bool = True, - contains_chinese: bool = False, - contains_english: bool = False, - ): - self.original_query = original_query - self.query_normalized = query_normalized - self.rewritten_query = rewritten_query or query_normalized - self.detected_language = detected_language - self.translations = translations or {} - self.query_vector = query_vector - self.domain = domain - # Query analysis fields - self.keywords = keywords - self.token_count = token_count - self.query_tokens = query_tokens or [] - self.query_text_by_lang = query_text_by_lang or {} - self.search_langs = search_langs or [] - self.index_languages = index_languages or [] - self.source_in_index_languages = bool(source_in_index_languages) - self.contains_chinese = bool(contains_chinese) - self.contains_english = bool(contains_english) + """Container for query parser facts.""" + + original_query: str + query_normalized: str + rewritten_query: str + detected_language: Optional[str] = None + translations: Dict[str, str] = field(default_factory=dict) + query_vector: Optional[np.ndarray] = None + query_tokens: List[str] = field(default_factory=list) + contains_chinese: bool = False + contains_english: bool = False def to_dict(self) -> Dict[str, Any]: """Convert to dictionary representation.""" - result = { + return { "original_query": self.original_query, "query_normalized": self.query_normalized, "rewritten_query": self.rewritten_query, "detected_language": self.detected_language, "translations": self.translations, - "domain": self.domain + "query_tokens": self.query_tokens, + "contains_chinese": self.contains_chinese, + "contains_english": self.contains_english, } - result["query_text_by_lang"] = self.query_text_by_lang - result["search_langs"] = self.search_langs - result["index_languages"] = self.index_languages - result["source_in_index_languages"] = self.source_in_index_languages - result["contains_chinese"] = self.contains_chinese - result["contains_english"] = self.contains_english - return result class 
QueryParser: @@ -102,7 +76,7 @@ class QueryParser: 1. Normalization 2. Query rewriting (brand/category mappings, synonyms) 3. Language detection - 4. Translation to target languages + 4. Translation to caller-provided target languages 5. Text embedding generation (for semantic search) """ @@ -110,7 +84,8 @@ class QueryParser: self, config: SearchConfig, text_encoder: Optional[TextEmbeddingEncoder] = None, - translator: Optional[Any] = None + translator: Optional[Any] = None, + tokenizer: Optional[Callable[[str], Any]] = None, ): """ Initialize query parser. @@ -128,23 +103,7 @@ class QueryParser: self.normalizer = QueryNormalizer() self.language_detector = LanguageDetector() self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) - - # Optional HanLP components (heavy). If unavailable, fall back to a lightweight tokenizer. - self._tok = None - self._pos_tag = None - if hanlp is not None: - try: - logger.info("Initializing HanLP components...") - self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) - self._tok.config.output_spans = True - self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) - logger.info("HanLP components initialized") - except Exception as e: - logger.warning(f"HanLP init failed, falling back to simple tokenizer: {e}") - self._tok = None - self._pos_tag = None - else: - logger.info("HanLP not installed; using simple tokenizer") + self._tokenizer = tokenizer or self._build_tokenizer() # Eager initialization (startup-time failure visibility, no lazy init in request path) if self.config.query_config.enable_text_embedding and self._text_encoder is None: @@ -170,6 +129,16 @@ class QueryParser: """Return pre-initialized translator.""" return self._translator + def _build_tokenizer(self) -> Callable[[str], Any]: + """Build the tokenizer used by query parsing. No fallback path by design.""" + if hanlp is None: + raise RuntimeError("HanLP is required for QueryParser tokenization") + logger.info("Initializing HanLP tokenizer...") + tokenizer = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) + tokenizer.config.output_spans = True + logger.info("HanLP tokenizer initialized") + return tokenizer + @staticmethod def _pick_query_translation_model(source_lang: str, target_lang: str, config: SearchConfig) -> str: """Pick the translation capability for query-time translation (configurable).""" @@ -186,41 +155,46 @@ class QueryParser: # By default this is `nllb-200-distilled-600m` (multi-lingual local model). 
return config.query_config.default_translation_model - def _simple_tokenize(self, text: str) -> List[str]: - return simple_tokenize_query(text) - - def _extract_keywords(self, query: str) -> str: - """Extract keywords (nouns with length > 1) from query.""" - if self._tok is not None and self._pos_tag is not None: - tok_result = self._tok(query) - if not tok_result: - return "" - words = [x[0] for x in tok_result] - pos_tags = self._pos_tag(words) - keywords = [] - for word, pos in zip(words, pos_tags): - if len(word) > 1 and isinstance(pos, str) and pos.startswith("N"): - keywords.append(word) - return " ".join(keywords) - - # Fallback: treat tokens with length > 1 as "keywords" - tokens = self._simple_tokenize(query) - keywords = [t for t in tokens if len(t) > 1] - return " ".join(keywords) - - def _get_token_count(self, query: str) -> int: - """Get token count (HanLP if available, otherwise simple).""" - if self._tok is not None: - tok_result = self._tok(query) - return len(tok_result) if tok_result else 0 - return len(self._simple_tokenize(query)) + @staticmethod + def _normalize_language_codes(languages: Optional[List[str]]) -> List[str]: + normalized: List[str] = [] + seen = set() + for language in languages or []: + token = str(language or "").strip().lower() + if not token or token in seen: + continue + seen.add(token) + normalized.append(token) + return normalized + + @staticmethod + def _extract_tokens(tokenizer_result: Any) -> List[str]: + """Normalize tokenizer output into a flat token string list.""" + if not tokenizer_result: + return [] + if isinstance(tokenizer_result, str): + token = tokenizer_result.strip() + return [token] if token else [] + + tokens: List[str] = [] + for item in tokenizer_result: + token: Optional[str] = None + if isinstance(item, str): + token = item + elif isinstance(item, (list, tuple)) and item: + token = str(item[0]) + elif item is not None: + token = str(item) + + if token is None: + continue + token = token.strip() + if token: + tokens.append(token) + return tokens def _get_query_tokens(self, query: str) -> List[str]: - """Get token list (HanLP if available, otherwise simple).""" - if self._tok is not None: - tok_result = self._tok(query) - return [x[0] for x in tok_result] if tok_result else [] - return self._simple_tokenize(query) + return self._extract_tokens(self._tokenizer(query)) @staticmethod def _contains_cjk(text: str) -> bool: @@ -237,64 +211,24 @@ class QueryParser: return False return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token)) - @staticmethod - def _extract_latin_tokens(text: str) -> List[str]: - """Extract latin word tokens from query text.""" - return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "") - - def _infer_supplemental_search_langs( - self, - query_text: str, - detected_lang: str, - index_langs: List[str], - ) -> List[str]: - """ - Infer extra languages to search when the query mixes scripts. - - Rules: - - If any Chinese characters appear, include `zh` when available. - - If the query contains meaningful latin tokens, include `en` when available. - "Meaningful" means either: - 1) at least 2 latin tokens with length >= 4, or - 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars. 
- """ - supplemental: List[str] = [] - normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs} - normalized_detected = str(detected_lang or "").strip().lower() - query_text = str(query_text or "") - - if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh": - supplemental.append("zh") - - latin_tokens = self._extract_latin_tokens(query_text) - significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4] - latin_chars = sum(len(tok) for tok in latin_tokens) - non_space_chars = len(re.sub(r"\s+", "", query_text)) - latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0 - has_meaningful_english = ( - len(significant_latin_tokens) >= 2 or - (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2) - ) - - if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en": - supplemental.append("en") - - return supplemental - def parse( self, query: str, tenant_id: Optional[str] = None, generate_vector: bool = True, - context: Optional[Any] = None + context: Optional[Any] = None, + target_languages: Optional[List[str]] = None, ) -> ParsedQuery: """ Parse query through all processing stages. Args: query: Raw query string + tenant_id: Deprecated and ignored by QueryParser. Kept temporarily + to avoid a wider refactor in this first step. generate_vector: Whether to generate query embedding context: Optional request context for tracking and logging + target_languages: Translation target languages decided by the caller Returns: ParsedQuery object with all processing results @@ -325,15 +259,9 @@ class QueryParser: if context: context.store_intermediate_result('query_normalized', normalized) - # Extract domain if present (e.g., "brand:Nike" -> domain="brand", query="Nike") - domain, query_text = self.normalizer.extract_domain_query(normalized) - log_debug(f"Domain extraction | Domain: '{domain}', Query: '{query_text}'") - if context: - context.store_intermediate_result('extracted_domain', domain) - context.store_intermediate_result('domain_query', query_text) - # Stage 2: Query rewriting - rewritten = None + query_text = normalized + rewritten = normalized if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists rewritten = self.rewriter.rewrite(query_text) if rewritten != query_text: @@ -351,43 +279,51 @@ class QueryParser: log_info(f"Language detection | Detected language: {detected_lang}") if context: context.store_intermediate_result('detected_language', detected_lang) + # Stage 4: Query analysis (tokenization + script flags) + query_tokens = self._get_query_tokens(query_text) + contains_chinese = self._contains_cjk(query_text) + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) + + log_debug( + f"Query analysis | Query tokens: {query_tokens} | " + f"contains_chinese={contains_chinese} | contains_english={contains_english}" + ) + if context: + context.store_intermediate_result('query_tokens', query_tokens) + context.store_intermediate_result('contains_chinese', contains_chinese) + context.store_intermediate_result('contains_english', contains_english) - # Stage 4: Translation — always submit to thread pool; results are collected together with - # embedding in one wait() that uses a configurable budget (short vs long by source-in-index). + # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the + # caller decides translation targets and later search-field planning. 
translations: Dict[str, str] = {} - translation_futures: Dict[str, Any] = {} - translation_executor: Optional[ThreadPoolExecutor] = None - index_langs: List[str] = [] + future_to_task: Dict[Any, Tuple[str, Optional[str]]] = {} + async_executor: Optional[ThreadPoolExecutor] = None detected_norm = str(detected_lang or "").strip().lower() + normalized_targets = self._normalize_language_codes(target_languages) + translation_targets = [lang for lang in normalized_targets if lang != detected_norm] + + # Stage 6: Text embedding - async execution + query_vector = None + should_generate_embedding = ( + generate_vector and + self.config.query_config.enable_text_embedding + ) + + task_count = len(translation_targets) + (1 if should_generate_embedding else 0) + if task_count > 0: + async_executor = ThreadPoolExecutor( + max_workers=max(1, min(task_count, 4)), + thread_name_prefix="query-enrichment", + ) try: - # 根据租户配置的 index_languages 决定翻译目标语言 - from config.tenant_config_loader import get_tenant_config_loader - tenant_loader = get_tenant_config_loader() - tenant_cfg = tenant_loader.get_tenant_config(tenant_id or "default") - raw_index_langs = tenant_cfg.get("index_languages") or [] - index_langs = [] - seen_langs = set() - for lang in raw_index_langs: - norm_lang = str(lang or "").strip().lower() - if not norm_lang or norm_lang in seen_langs: - continue - seen_langs.add(norm_lang) - index_langs.append(norm_lang) - - target_langs_for_translation = [lang for lang in index_langs if lang != detected_norm] - - if target_langs_for_translation: - translation_executor = ThreadPoolExecutor( - max_workers=max(1, min(len(target_langs_for_translation), 4)), - thread_name_prefix="query-translation", - ) - for lang in target_langs_for_translation: + if async_executor is not None: + for lang in translation_targets: model_name = self._pick_query_translation_model(detected_lang, lang, self.config) log_debug( f"Submitting query translation | source={detected_lang} target={lang} model={model_name}" ) - translation_futures[lang] = translation_executor.submit( + future = async_executor.submit( self.translator.translate, query_text, lang, @@ -395,107 +331,61 @@ class QueryParser: "ecommerce_search_query", model_name, ) - - if context: - context.store_intermediate_result('translations', translations) - for lang, translation in translations.items(): - if translation: - context.store_intermediate_result(f'translation_{lang}', translation) - + future_to_task[future] = ("translation", lang) + + if should_generate_embedding: + if self.text_encoder is None: + raise RuntimeError("Text embedding is enabled but text encoder is not initialized") + log_debug("Submitting query vector generation") + + def _encode_query_vector() -> Optional[np.ndarray]: + arr = self.text_encoder.encode([query_text], priority=1) + if arr is None or len(arr) == 0: + return None + vec = arr[0] + if vec is None: + return None + return np.asarray(vec, dtype=np.float32) + + future = async_executor.submit(_encode_query_vector) + future_to_task[future] = ("embedding", None) except Exception as e: - error_msg = f"Translation failed | Error: {str(e)}" + error_msg = f"Async query enrichment submission failed | Error: {str(e)}" log_info(error_msg) if context: context.add_warning(error_msg) + if async_executor is not None: + async_executor.shutdown(wait=False) + async_executor = None + future_to_task.clear() - # Stage 5: Query analysis (keywords, token count, query_tokens) - keywords = self._extract_keywords(query_text) - query_tokens = 
self._get_query_tokens(query_text) - token_count = len(query_tokens) - contains_chinese = self._contains_cjk(query_text) - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) - - log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " - f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | " - f"contains_english={contains_english}") - if context: - context.store_intermediate_result('keywords', keywords) - context.store_intermediate_result('token_count', token_count) - context.store_intermediate_result('query_tokens', query_tokens) - context.store_intermediate_result('contains_chinese', contains_chinese) - context.store_intermediate_result('contains_english', contains_english) - - # Stage 6: Text embedding (only for non-short queries) - async execution - query_vector = None - embedding_future = None - should_generate_embedding = ( - generate_vector and - self.config.query_config.enable_text_embedding and - domain == "default" - ) - - encoding_executor = None - if should_generate_embedding: - try: - if self.text_encoder is None: - raise RuntimeError("Text embedding is enabled but text encoder is not initialized") - log_debug("Starting query vector generation (async)") - # Submit encoding task to thread pool for async execution - encoding_executor = ThreadPoolExecutor(max_workers=1) - def _encode_query_vector() -> Optional[np.ndarray]: - arr = self.text_encoder.encode([query_text], priority=1) - if arr is None or len(arr) == 0: - return None - vec = arr[0] - return vec if isinstance(vec, np.ndarray) else None - embedding_future = encoding_executor.submit( - _encode_query_vector - ) - except Exception as e: - error_msg = f"Query vector generation task submission failed | Error: {str(e)}" - log_info(error_msg) - if context: - context.add_warning(error_msg) - encoding_executor = None - embedding_future = None - - # Wait for translation + embedding concurrently; shared budget (ms) depends on whether - # the detected language is in tenant index_languages. + # Wait for translation + embedding concurrently; shared budget depends on whether + # the detected language belongs to caller-provided target_languages. 
qc = self.config.query_config - source_in_index_for_budget = detected_norm in index_langs + source_in_target_languages = bool(normalized_targets) and detected_norm in normalized_targets budget_ms = ( qc.translation_embedding_wait_budget_ms_source_in_index - if source_in_index_for_budget + if source_in_target_languages else qc.translation_embedding_wait_budget_ms_source_not_in_index ) budget_sec = max(0.0, float(budget_ms) / 1000.0) - if translation_futures: + if translation_targets: log_info( f"Translation+embedding shared wait budget | budget_ms={budget_ms} | " - f"source_in_index_languages={source_in_index_for_budget} | " - f"translation_targets={list(translation_futures.keys())}" + f"source_in_target_languages={source_in_target_languages} | " + f"translation_targets={translation_targets}" ) - if translation_futures or embedding_future: + if future_to_task: log_debug( f"Waiting for async tasks (translation+embedding) | budget_ms={budget_ms} | " - f"source_in_index_languages={source_in_index_for_budget}" + f"source_in_target_languages={source_in_target_languages}" ) - all_futures: List[Any] = [] - future_to_lang: Dict[Any, tuple] = {} - for lang, future in translation_futures.items(): - all_futures.append(future) - future_to_lang[future] = ("translation", lang) - - if embedding_future: - all_futures.append(embedding_future) - future_to_lang[embedding_future] = ("embedding", None) - - done, not_done = wait(all_futures, timeout=budget_sec) + done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec) for future in done: - task_type, lang = future_to_lang[future] + task_type, lang = future_to_task[future] try: result = future.result() if task_type == "translation": @@ -528,7 +418,7 @@ class QueryParser: if not_done: for future in not_done: - task_type, lang = future_to_lang[future] + task_type, lang = future_to_task[future] if task_type == "translation": timeout_msg = ( f"Translation timeout (>{budget_ms}ms) | Language: {lang} | " @@ -542,68 +432,21 @@ class QueryParser: if context: context.add_warning(timeout_msg) - if encoding_executor: - encoding_executor.shutdown(wait=False) - if translation_executor: - translation_executor.shutdown(wait=False) + if async_executor: + async_executor.shutdown(wait=False) if translations and context: context.store_intermediate_result("translations", translations) - - # Build language-scoped query plan: source language + available translations - query_text_by_lang: Dict[str, str] = {} - if query_text: - query_text_by_lang[detected_lang] = query_text - for lang, translated_text in (translations or {}).items(): - if translated_text and str(translated_text).strip(): - query_text_by_lang[str(lang).strip().lower()] = str(translated_text) - - supplemental_search_langs = self._infer_supplemental_search_langs( - query_text=query_text, - detected_lang=detected_lang, - index_langs=index_langs, - ) - for lang in supplemental_search_langs: - if lang not in query_text_by_lang and query_text: - # Use the original mixed-script query as a robust fallback probe for that language field set. 
- query_text_by_lang[lang] = query_text - - source_in_index_languages = detected_norm in index_langs - ordered_search_langs: List[str] = [] - seen_order = set() - if detected_lang in query_text_by_lang: - ordered_search_langs.append(detected_lang) - seen_order.add(detected_lang) - for lang in index_langs: - if lang in query_text_by_lang and lang not in seen_order: - ordered_search_langs.append(lang) - seen_order.add(lang) - for lang in query_text_by_lang.keys(): - if lang not in seen_order: - ordered_search_langs.append(lang) - seen_order.add(lang) - - if context: - context.store_intermediate_result("search_langs", ordered_search_langs) - context.store_intermediate_result("query_text_by_lang", query_text_by_lang) - context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs) # Build result result = ParsedQuery( original_query=query, query_normalized=normalized, - rewritten_query=rewritten, + rewritten_query=query_text, detected_language=detected_lang, translations=translations, query_vector=query_vector, - domain=domain, - keywords=keywords, - token_count=token_count, query_tokens=query_tokens, - query_text_by_lang=query_text_by_lang, - search_langs=ordered_search_langs, - index_languages=index_langs, - source_in_index_languages=source_in_index_languages, contains_chinese=contains_chinese, contains_english=contains_english, ) @@ -611,14 +454,13 @@ class QueryParser: if context and hasattr(context, 'logger'): context.logger.info( f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " - f"Language: {detected_lang} | Domain: {domain} | " f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}", extra={'reqid': context.reqid, 'uid': context.uid} ) else: logger.info( f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " - f"Language: {detected_lang} | Domain: {domain}" + f"Language: {detected_lang}" ) return result diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 9dc25ad..32ad3b9 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -159,7 +159,8 @@ class ESQueryBuilder: knn_k: int = 50, knn_num_candidates: int = 200, min_score: Optional[float] = None, - parsed_query: Optional[Any] = None + parsed_query: Optional[Any] = None, + index_languages: Optional[List[str]] = None, ) -> Dict[str, Any]: """ Build complete ES query with post_filter support for multi-select faceting. 
@@ -202,7 +203,11 @@ class ESQueryBuilder: # Text recall (always include if query_text exists) if query_text: # Unified text query strategy - text_query = self._build_advanced_text_query(query_text, parsed_query) + text_query = self._build_advanced_text_query( + query_text, + parsed_query, + index_languages=index_languages, + ) recall_clauses.append(text_query) # Embedding recall (KNN - separate from query, handled below) @@ -503,13 +508,31 @@ class ESQueryBuilder: # Currently using unified embedding field return self.text_embedding_field or "title_embedding" - def _build_advanced_text_query(self, query_text: str, parsed_query: Optional[Any] = None) -> Dict[str, Any]: + @staticmethod + def _normalize_language_list(languages: Optional[List[str]]) -> List[str]: + normalized: List[str] = [] + seen = set() + for language in languages or []: + token = str(language or "").strip().lower() + if not token or token in seen: + continue + seen.add(token) + normalized.append(token) + return normalized + + def _build_advanced_text_query( + self, + query_text: str, + parsed_query: Optional[Any] = None, + *, + index_languages: Optional[List[str]] = None, + ) -> Dict[str, Any]: """ - Build advanced text query using should clauses with primary and fallback lexical strategies. + Build advanced text query using base and translated lexical clauses. Unified implementation: - base_query: source-language clause - - translation queries: target-language clauses from search_langs/query_text_by_lang + - translation queries: target-language clauses from translations - KNN query: added separately in build_query Args: @@ -520,55 +543,42 @@ class ESQueryBuilder: ES bool query with should clauses """ should_clauses = [] - - # Get query analysis from parsed_query - query_text_by_lang: Dict[str, str] = {} - search_langs: List[str] = [] source_lang = self.default_language - source_in_index_languages = True - index_languages: List[str] = [] - + translations: Dict[str, str] = {} contains_chinese = False contains_english = False + normalized_index_languages = self._normalize_language_list(index_languages) + if parsed_query: - query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {} - search_langs = getattr(parsed_query, "search_langs", None) or [] detected_lang = getattr(parsed_query, "detected_language", None) source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language - source_in_index_languages = bool( - getattr(parsed_query, "source_in_index_languages", True) - ) - index_languages = getattr(parsed_query, "index_languages", None) or [] + translations = getattr(parsed_query, "translations", None) or {} contains_chinese = bool(getattr(parsed_query, "contains_chinese", False)) contains_english = bool(getattr(parsed_query, "contains_english", False)) - if not query_text_by_lang: - query_text_by_lang = {source_lang: query_text} - if source_lang not in query_text_by_lang and query_text: - query_text_by_lang[source_lang] = query_text - if not search_langs: - search_langs = list(query_text_by_lang.keys()) - - # Base + translated clauses based on language plan. 
- for lang in search_langs: - lang_query = query_text_by_lang.get(lang) - if not lang_query: - continue + source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language + source_in_index_languages = ( + True if not normalized_index_languages else source_lang in normalized_index_languages + ) + + base_query_text = ( + getattr(parsed_query, "rewritten_query", None) if parsed_query else None + ) or query_text + + def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None: + nonlocal should_clauses all_specs, _ = self._build_match_field_specs(lang) expanded_specs = self._expand_match_field_specs_for_mixed_script( lang, all_specs, contains_chinese, contains_english, - index_languages, + normalized_index_languages, ) match_fields = self._format_match_field_specs(expanded_specs) if not match_fields: - continue - - is_source = (lang == source_lang) + return clause_boost = 1.0 - clause_name = "base_query" if is_source else f"base_query_trans_{lang}" minimum_should_match = ( self.base_minimum_should_match if is_source else self.translation_minimum_should_match ) @@ -596,44 +606,17 @@ class ESQueryBuilder: "multi_match": clause["multi_match"] }) - # Fallback: source language is not indexed and translation for some index languages is missing. - # Use original query text on missing index-language fields with a low boost. - if not source_in_index_languages and query_text and index_languages: - normalized_index_langs: List[str] = [] - seen_langs = set() - for lang in index_languages: - norm_lang = str(lang or "").strip().lower() - if not norm_lang or norm_lang in seen_langs: - continue - seen_langs.add(norm_lang) - normalized_index_langs.append(norm_lang) + if base_query_text: + append_clause(source_lang, base_query_text, "base_query", True) - for lang in normalized_index_langs: - if lang == source_lang: - continue - if lang in query_text_by_lang: - continue - fb_specs, _ = self._build_match_field_specs(lang) - expanded_fb = self._expand_match_field_specs_for_mixed_script( - lang, - fb_specs, - contains_chinese, - contains_english, - index_languages, - ) - match_fields = self._format_match_field_specs(expanded_fb) - if not match_fields: - continue - should_clauses.append({ - "multi_match": { - "_name": f"fallback_original_query_{lang}", - "query": query_text, - "fields": match_fields, - "minimum_should_match": self.translation_minimum_should_match, - "tie_breaker": self.tie_breaker_base_query, - "boost": self.original_query_fallback_boost_when_translation_missing, - } - }) + for lang, translated_text in translations.items(): + normalized_lang = str(lang or "").strip().lower() + normalized_text = str(translated_text or "").strip() + if not normalized_lang or not normalized_text: + continue + if normalized_lang == source_lang and normalized_text == base_query_text: + continue + append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False) # Fallback to a simple query when language fields cannot be resolved. 
if not should_clauses: diff --git a/search/searcher.py b/search/searcher.py index e6649f0..29e22bb 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -516,10 +516,19 @@ class Searcher: range_filters: Range filters for numeric fields facets: Facet configurations for faceted search min_score: Minimum score threshold - context: Request context for tracking (created if not provided) + context: Request context for tracking (required) sort_by: Field name for sorting sort_order: Sort order: 'asc' or 'desc' debug: Enable debug information output + language: Response / field selection language hint (e.g. zh, en) + sku_filter_dimension: SKU grouping dimensions for per-SPU variant pick + enable_rerank: If None, use ``config.rerank.enabled``; if set, overrides + whether the rerank provider is invoked (subject to rerank window). + rerank_query_template: Override for rerank query text template; None uses + ``config.rerank.rerank_query_template`` (e.g. ``"{query}"``). + rerank_doc_template: Override for per-hit document text passed to rerank; + None uses ``config.rerank.rerank_doc_template``. Placeholders are + resolved in ``search/rerank_client.py``. Returns: SearchResult object with formatted results @@ -592,7 +601,8 @@ class Searcher: query, tenant_id=tenant_id, generate_vector=enable_embedding, - context=context + context=context, + target_languages=index_langs if enable_translation else [], ) # Store query analysis results in context context.store_query_analysis( @@ -602,7 +612,7 @@ class Searcher: detected_language=parsed_query.detected_language, translations=parsed_query.translations, query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, - domain=parsed_query.domain, + domain="default", is_simple_query=True ) @@ -610,7 +620,6 @@ class Searcher: f"查询解析完成 | 原查询: '{parsed_query.original_query}' | " f"重写后: '{parsed_query.rewritten_query}' | " f"语言: {parsed_query.detected_language} | " - f"域: {parsed_query.domain} | " f"向量: {'是' if parsed_query.query_vector is not None else '否'}", extra={'reqid': context.reqid, 'uid': context.uid} ) @@ -643,7 +652,8 @@ class Searcher: from_=es_fetch_from, enable_knn=enable_embedding and parsed_query.query_vector is not None, min_score=min_score, - parsed_query=parsed_query + parsed_query=parsed_query, + index_languages=index_langs, ) # Add facets for faceted search @@ -985,9 +995,6 @@ class Searcher: "rewritten_query": context.query_analysis.rewritten_query, "detected_language": context.query_analysis.detected_language, "translations": context.query_analysis.translations, - "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}), - "search_langs": context.get_intermediate_result("search_langs", []), - "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []), "has_vector": context.query_analysis.query_vector is not None, "is_simple_query": context.query_analysis.is_simple_query, "domain": context.query_analysis.domain diff --git a/tests/test_embedding_pipeline.py b/tests/test_embedding_pipeline.py index 704910a..8670718 100644 --- a/tests/test_embedding_pipeline.py +++ b/tests/test_embedding_pipeline.py @@ -73,6 +73,10 @@ class _FakeQueryEncoder: return np.array([np.array([0.11, 0.22, 0.33], dtype=np.float32) for _ in sentences], dtype=object) +def _tokenizer(text): + return str(text).split() + + class _FakeEmbeddingCache: def __init__(self): self.store: Dict[str, np.ndarray] = {} @@ -210,6 +214,7 @@ def 
test_query_parser_generates_query_vector_with_encoder(): config=_build_test_config(), text_encoder=encoder, translator=_FakeTranslator(), + tokenizer=_tokenizer, ) parsed = parser.parse("red dress", tenant_id="162", generate_vector=True) @@ -224,6 +229,7 @@ def test_query_parser_skips_query_vector_when_disabled(): config=_build_test_config(), text_encoder=_FakeQueryEncoder(), translator=_FakeTranslator(), + tokenizer=_tokenizer, ) parsed = parser.parse("red dress", tenant_id="162", generate_vector=False) diff --git a/tests/test_es_query_builder.py b/tests/test_es_query_builder.py index 71a1f07..ad9b1d9 100644 --- a/tests/test_es_query_builder.py +++ b/tests/test_es_query_builder.py @@ -65,21 +65,42 @@ def test_knn_prefilter_not_added_without_filters(): assert q["knn"]["_name"] == "knn_query" -def test_text_query_contains_only_base_translation_and_fallback_named_queries(): +def test_text_query_contains_only_base_and_translation_named_queries(): qb = _builder() parsed_query = SimpleNamespace( - query_text_by_lang={"en": "dress", "zh": "连衣裙"}, - search_langs=["en", "zh"], + rewritten_query="dress", detected_language="en", - source_in_index_languages=False, - index_languages=["en", "zh", "fr"], + translations={"en": "dress", "zh": "连衣裙"}, ) - q = qb.build_query(query_text="dress", parsed_query=parsed_query, enable_knn=False) + q = qb.build_query( + query_text="dress", + parsed_query=parsed_query, + enable_knn=False, + index_languages=["en", "zh", "fr"], + ) should = q["query"]["bool"]["should"] names = [clause["multi_match"]["_name"] for clause in should] - assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"] + assert names == ["base_query", "base_query_trans_zh"] + + +def test_text_query_skips_duplicate_translation_same_as_base(): + qb = _builder() + parsed_query = SimpleNamespace( + rewritten_query="dress", + detected_language="en", + translations={"en": "dress"}, + ) + + q = qb.build_query( + query_text="dress", + parsed_query=parsed_query, + enable_knn=False, + index_languages=["en", "zh"], + ) + + assert q["query"]["multi_match"]["_name"] == "base_query" def test_mixed_script_merges_en_fields_into_zh_clause(): @@ -91,22 +112,25 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): default_language="en", ) parsed_query = SimpleNamespace( - query_text_by_lang={"zh": "法式 dress"}, - search_langs=["zh"], + rewritten_query="法式 dress", detected_language="zh", - source_in_index_languages=True, - index_languages=["zh", "en"], + translations={}, contains_chinese=True, contains_english=True, ) - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) + q = qb.build_query( + query_text="法式 dress", + parsed_query=parsed_query, + enable_knn=False, + index_languages=["zh", "en"], + ) fields = q["query"]["multi_match"]["fields"] bases = {f.split("^", 1)[0] for f in fields} assert "title.zh" in bases and "title.en" in bases assert "brief.zh" in bases and "brief.en" in bases - # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8) - assert "title.en^0.8" in fields - assert "brief.en^0.8" in fields + # Merged supplemental language fields use boost * 0.6 by default. 
+ assert "title.en^0.6" in fields + assert "brief.en^0.6" in fields def test_mixed_script_merges_zh_fields_into_en_clause(): @@ -118,19 +142,22 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): default_language="en", ) parsed_query = SimpleNamespace( - query_text_by_lang={"en": "red 连衣裙"}, - search_langs=["en"], + rewritten_query="red 连衣裙", detected_language="en", - source_in_index_languages=True, - index_languages=["zh", "en"], + translations={}, contains_chinese=True, contains_english=True, ) - q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False) + q = qb.build_query( + query_text="red 连衣裙", + parsed_query=parsed_query, + enable_knn=False, + index_languages=["zh", "en"], + ) fields = q["query"]["multi_match"]["fields"] bases = {f.split("^", 1)[0] for f in fields} assert "title.en" in bases and "title.zh" in bases - assert "title.zh^0.8" in fields + assert "title.zh^0.6" in fields def test_mixed_script_merged_fields_scale_configured_boosts(): @@ -143,18 +170,21 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): default_language="en", ) parsed_query = SimpleNamespace( - query_text_by_lang={"zh": "法式 dress"}, - search_langs=["zh"], + rewritten_query="法式 dress", detected_language="zh", - source_in_index_languages=True, - index_languages=["zh", "en"], + translations={}, contains_chinese=True, contains_english=True, ) - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) + q = qb.build_query( + query_text="法式 dress", + parsed_query=parsed_query, + enable_knn=False, + index_languages=["zh", "en"], + ) fields = q["query"]["multi_match"]["fields"] assert "title.zh^5.0" in fields - assert "title.en^8.0" in fields # 10.0 * 0.8 + assert "title.en^6.0" in fields # 10.0 * 0.6 def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): @@ -166,15 +196,18 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): default_language="zh", ) parsed_query = SimpleNamespace( - query_text_by_lang={"zh": "法式 dress"}, - search_langs=["zh"], + rewritten_query="法式 dress", detected_language="zh", - source_in_index_languages=True, - index_languages=["zh"], + translations={}, contains_chinese=True, contains_english=True, ) - q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) + q = qb.build_query( + query_text="法式 dress", + parsed_query=parsed_query, + enable_knn=False, + index_languages=["zh"], + ) fields = q["query"]["multi_match"]["fields"] bases = {f.split("^", 1)[0] for f in fields} assert "title.zh" in bases diff --git a/tests/test_query_parser_mixed_language.py b/tests/test_query_parser_mixed_language.py index 218de59..70d2502 100644 --- a/tests/test_query_parser_mixed_language.py +++ b/tests/test_query_parser_mixed_language.py @@ -1,5 +1,3 @@ -from types import SimpleNamespace - from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig from query.query_parser import QueryParser @@ -9,6 +7,10 @@ class _DummyTranslator: return f"{text}-{target_lang}" +def _tokenizer(text): + return str(text).split() + + def test_pure_english_word_token_length_and_script(): assert QueryParser._is_pure_english_word_token("ab") is False assert QueryParser._is_pure_english_word_token("abc") is True @@ -35,59 +37,57 @@ def _build_config() -> SearchConfig: def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): - parser = QueryParser(_build_config(), translator=_DummyTranslator()) + parser = 
QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") - monkeypatch.setattr( - "query.query_parser.get_tenant_config_loader", - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}), - raising=False, - ) - result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) + result = parser.parse( + "法式 dress 连衣裙", + tenant_id="162", + generate_vector=False, + target_languages=["zh", "en"], + ) assert result.detected_language == "zh" assert result.contains_chinese is True assert result.contains_english is True - assert "en" in result.search_langs - # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测) - assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en" - assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙" + assert result.translations == {"en": "法式 dress 连衣裙-en"} + assert result.query_tokens == ["法式", "dress", "连衣裙"] + assert not hasattr(result, "query_text_by_lang") + assert not hasattr(result, "search_langs") def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): - parser = QueryParser(_build_config(), translator=_DummyTranslator()) + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") - monkeypatch.setattr( - "query.query_parser.get_tenant_config_loader", - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), - raising=False, - ) - result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) + result = parser.parse( + "red 连衣裙", + tenant_id="0", + generate_vector=False, + target_languages=["en", "zh"], + ) assert result.detected_language == "en" assert result.contains_chinese is True assert result.contains_english is True - assert "zh" in result.search_langs - assert result.query_text_by_lang["zh"] == "red 连衣裙-zh" - assert result.query_text_by_lang["en"] == "red 连衣裙" + assert result.translations == {"zh": "red 连衣裙-zh"} + assert result.query_tokens == ["red", "连衣裙"] def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch): """en 在 index_languages 内时仍应等待并采纳 en->zh 翻译结果(与向量共用预算)。""" - parser = QueryParser(_build_config(), translator=_DummyTranslator()) + parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") - monkeypatch.setattr( - "query.query_parser.get_tenant_config_loader", - lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), - raising=False, - ) - result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False) + result = parser.parse( + "off shoulder top", + tenant_id="0", + generate_vector=False, + target_languages=["en", "zh"], + ) assert result.detected_language == "en" assert result.contains_chinese is False assert result.contains_english is True assert result.translations.get("zh") == "off shoulder top-zh" - assert result.query_text_by_lang.get("zh") == "off shoulder top-zh" - assert result.source_in_index_languages is True + assert not hasattr(result, "source_in_index_languages") diff --git a/tests/test_search_rerank_window.py b/tests/test_search_rerank_window.py index af5ab44..d90c8f0 100644 --- a/tests/test_search_rerank_window.py +++ b/tests/test_search_rerank_window.py @@ -43,7 +43,14 @@ class _FakeParsedQuery: class _FakeQueryParser: - 
def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): + def parse( + self, + query: str, + tenant_id: str, + generate_vector: bool, + context: Any, + target_languages: Any = None, + ): return _FakeParsedQuery( original_query=query, query_normalized=query, @@ -191,6 +198,66 @@ def test_config_loader_rerank_enabled_defaults_true(tmp_path: Path): "field_boosts": {"title.en": 3.0}, "indexes": [{"name": "default", "label": "default", "fields": ["title.en"]}], "query_config": {"supported_languages": ["en"], "default_language": "en"}, + "services": { + "translation": { + "service_url": "http://localhost:6005", + "timeout_sec": 3.0, + "default_model": "dummy-model", + "default_scene": "general", + "cache": { + "ttl_seconds": 60, + "sliding_expiration": True, + }, + "capabilities": { + "dummy-model": { + "enabled": True, + "backend": "llm", + "use_cache": True, + "model": "dummy-model", + "base_url": "http://localhost:6005/v1", + "timeout_sec": 3.0, + } + }, + }, + "embedding": { + "provider": "http", + "providers": { + "http": { + "text_base_url": "http://localhost:6005", + "image_base_url": "http://localhost:6008", + } + }, + "backend": "tei", + "backends": { + "tei": { + "base_url": "http://localhost:8080", + "timeout_sec": 3.0, + "model_id": "dummy-embedding-model", + } + }, + }, + "rerank": { + "provider": "http", + "providers": { + "http": { + "base_url": "http://localhost:6007", + "service_url": "http://localhost:6007/rerank", + } + }, + "backend": "bge", + "backends": { + "bge": { + "model_name": "dummy-rerank-model", + "device": "cpu", + "use_fp16": False, + "batch_size": 8, + "max_length": 128, + "cache_dir": "./model_cache", + "enable_warmup": False, + } + }, + }, + }, "spu_config": {"enabled": False}, "function_score": {"score_mode": "sum", "boost_mode": "multiply", "functions": []}, "rerank": {"rerank_window": 384}, @@ -354,7 +421,14 @@ def test_searcher_promotes_sku_when_option1_matches_translated_query(monkeypatch class _TranslatedQueryParser: text_encoder = None - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): + def parse( + self, + query: str, + tenant_id: str, + generate_vector: bool, + context: Any, + target_languages: Any = None, + ): return _FakeParsedQuery( original_query=query, query_normalized=query, @@ -407,15 +481,22 @@ def test_searcher_promotes_sku_by_embedding_when_query_has_no_direct_option_matc encoder = _FakeTextEncoder( { "linen summer dress": [0.8, 0.2], - "color:Red": [1.0, 0.0], - "color:Blue": [0.0, 1.0], + "color:red": [1.0, 0.0], + "color:blue": [0.0, 1.0], } ) class _EmbeddingQueryParser: text_encoder = encoder - def parse(self, query: str, tenant_id: str, generate_vector: bool, context: Any): + def parse( + self, + query: str, + tenant_id: str, + generate_vector: bool, + context: Any, + target_languages: Any = None, + ): return _FakeParsedQuery( original_query=query, query_normalized=query, -- libgit2 0.21.2