Commit 35da381333a86fda7b179f767b6f5a5dcd157a65

Authored by tangwang
1 parent 445496cd

中英混写query的优化逻辑,不适合新的combined_fields+best_fields+phrase查询方式,带来的复杂度较多,清理该部分逻辑

docs/相关性检索优化说明.md
@@ -17,9 +17,9 @@ @@ -17,9 +17,9 @@
17 查询链路(文本相关): 17 查询链路(文本相关):
18 18
19 1. `QueryParser.parse()` 19 1. `QueryParser.parse()`
20 - 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english` 20 + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`
21 2. `Searcher.search()` 21 2. `Searcher.search()`
22 - 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束 22 + 负责读取租户 `index_languages`,并将其传给 `QueryParser` 作为 `target_languages`(控制翻译目标语种);`ESQueryBuilder` 仅根据 `detected_language` 与各条译文构建子句字段,不再接收 `index_languages`
23 2. `ESQueryBuilder._build_advanced_text_query()` 23 2. `ESQueryBuilder._build_advanced_text_query()`
24 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。 24 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。
25 3. `build_query()` 25 3. `build_query()`
@@ -76,9 +76,6 @@ @@ -76,9 +76,6 @@
76 76
77 最终按 `bool.should` 组合,`minimum_should_match: 1`。 77 最终按 `bool.should` 组合,`minimum_should_match: 1`。
78 78
79 -> **附 — 混写辅助召回**  
80 -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。  
81 -  
82 ## 5. 关键配置项(文本策略) 79 ## 5. 关键配置项(文本策略)
83 80
84 `query_config` 下与解析等待相关的项: 81 `query_config` 下与解析等待相关的项:
@@ -147,11 +144,9 @@ @@ -147,11 +144,9 @@
147 - `translations` 144 - `translations`
148 - `query_vector` 145 - `query_vector`
149 - `query_tokens` 146 - `query_tokens`
150 - - `contains_chinese` / `contains_english`  
151 - `Searcher` 负责“租户语境”: 147 - `Searcher` 负责“租户语境”:
152 - `index_languages` 148 - `index_languages`
153 - 将其传给 parser 作为 `target_languages` 149 - 将其传给 parser 作为 `target_languages`
154 - - 将其传给 builder 作为字段展开约束  
155 - `ESQueryBuilder` 负责“表达式展开”: 150 - `ESQueryBuilder` 负责“表达式展开”:
156 - 动态字段组装 151 - 动态字段组装
157 - 子句权重分配 152 - 子句权重分配
query/query_parser.py
@@ -50,8 +50,6 @@ class ParsedQuery: @@ -50,8 +50,6 @@ class ParsedQuery:
50 translations: Dict[str, str] = field(default_factory=dict) 50 translations: Dict[str, str] = field(default_factory=dict)
51 query_vector: Optional[np.ndarray] = None 51 query_vector: Optional[np.ndarray] = None
52 query_tokens: List[str] = field(default_factory=list) 52 query_tokens: List[str] = field(default_factory=list)
53 - contains_chinese: bool = False  
54 - contains_english: bool = False  
55 53
56 def to_dict(self) -> Dict[str, Any]: 54 def to_dict(self) -> Dict[str, Any]:
57 """Convert to dictionary representation.""" 55 """Convert to dictionary representation."""
@@ -62,8 +60,6 @@ class ParsedQuery: @@ -62,8 +60,6 @@ class ParsedQuery:
62 "detected_language": self.detected_language, 60 "detected_language": self.detected_language,
63 "translations": self.translations, 61 "translations": self.translations,
64 "query_tokens": self.query_tokens, 62 "query_tokens": self.query_tokens,
65 - "contains_chinese": self.contains_chinese,  
66 - "contains_english": self.contains_english,  
67 } 63 }
68 64
69 65
@@ -202,21 +198,6 @@ class QueryParser: @@ -202,21 +198,6 @@ class QueryParser:
202 def _get_query_tokens(self, query: str) -> List[str]: 198 def _get_query_tokens(self, query: str) -> List[str]:
203 return self._extract_tokens(self._tokenizer(query)) 199 return self._extract_tokens(self._tokenizer(query))
204 200
205 - @staticmethod  
206 - def _contains_cjk(text: str) -> bool:  
207 - """Whether query contains any CJK ideograph."""  
208 - return bool(re.search(r"[\u4e00-\u9fff]", text or ""))  
209 -  
210 - @staticmethod  
211 - def _is_pure_english_word_token(token: str) -> bool:  
212 - """  
213 - A tokenizer token counts as English iff it is letters only (optional internal hyphens)  
214 - and length >= 3.  
215 - """  
216 - if not token or len(token) < 3:  
217 - return False  
218 - return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token))  
219 -  
220 def parse( 201 def parse(
221 self, 202 self,
222 query: str, 203 query: str,
@@ -285,19 +266,12 @@ class QueryParser: @@ -285,19 +266,12 @@ class QueryParser:
285 log_info(f"Language detection | Detected language: {detected_lang}") 266 log_info(f"Language detection | Detected language: {detected_lang}")
286 if context: 267 if context:
287 context.store_intermediate_result('detected_language', detected_lang) 268 context.store_intermediate_result('detected_language', detected_lang)
288 - # Stage 4: Query analysis (tokenization + script flags) 269 + # Stage 4: Query analysis (tokenization)
289 query_tokens = self._get_query_tokens(query_text) 270 query_tokens = self._get_query_tokens(query_text)
290 - contains_chinese = self._contains_cjk(query_text)  
291 - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)  
292 271
293 - log_debug(  
294 - f"Query analysis | Query tokens: {query_tokens} | "  
295 - f"contains_chinese={contains_chinese} | contains_english={contains_english}"  
296 - ) 272 + log_debug(f"Query analysis | Query tokens: {query_tokens}")
297 if context: 273 if context:
298 context.store_intermediate_result('query_tokens', query_tokens) 274 context.store_intermediate_result('query_tokens', query_tokens)
299 - context.store_intermediate_result('contains_chinese', contains_chinese)  
300 - context.store_intermediate_result('contains_english', contains_english)  
301 275
302 # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the 276 # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the
303 # caller decides translation targets and later search-field planning. 277 # caller decides translation targets and later search-field planning.
@@ -459,8 +433,6 @@ class QueryParser: @@ -459,8 +433,6 @@ class QueryParser:
459 translations=translations, 433 translations=translations,
460 query_vector=query_vector, 434 query_vector=query_vector,
461 query_tokens=query_tokens, 435 query_tokens=query_tokens,
462 - contains_chinese=contains_chinese,  
463 - contains_english=contains_english,  
464 ) 436 )
465 437
466 if context and hasattr(context, 'logger'): 438 if context and hasattr(context, 'logger'):
search/es_query_builder.py
@@ -8,14 +8,11 @@ Simplified architecture: @@ -8,14 +8,11 @@ Simplified architecture:
8 - function_score wrapper for boosting fields 8 - function_score wrapper for boosting fields
9 """ 9 """
10 10
11 -from typing import Dict, Any, List, Optional, Union, Tuple 11 +from typing import Dict, Any, List, Optional, Tuple
12 12
13 import numpy as np 13 import numpy as np
14 from config import FunctionScoreConfig 14 from config import FunctionScoreConfig
15 15
16 -# (Elasticsearch field path, boost before formatting as "path^boost")  
17 -MatchFieldSpec = Tuple[str, float]  
18 -  
19 16
20 class ESQueryBuilder: 17 class ESQueryBuilder:
21 """Builds Elasticsearch DSL queries.""" 18 """Builds Elasticsearch DSL queries."""
@@ -39,7 +36,6 @@ class ESQueryBuilder: @@ -39,7 +36,6 @@ class ESQueryBuilder:
39 tie_breaker_base_query: float = 0.9, 36 tie_breaker_base_query: float = 0.9,
40 best_fields_boosts: Optional[Dict[str, float]] = None, 37 best_fields_boosts: Optional[Dict[str, float]] = None,
41 best_fields_clause_boost: float = 2.0, 38 best_fields_clause_boost: float = 2.0,
42 - mixed_script_merged_field_boost_scale: float = 0.6,  
43 phrase_field_boosts: Optional[Dict[str, float]] = None, 39 phrase_field_boosts: Optional[Dict[str, float]] = None,
44 phrase_match_base_fields: Optional[Tuple[str, ...]] = None, 40 phrase_match_base_fields: Optional[Tuple[str, ...]] = None,
45 phrase_match_slop: int = 0, 41 phrase_match_slop: int = 0,
@@ -60,7 +56,6 @@ class ESQueryBuilder: @@ -60,7 +56,6 @@ class ESQueryBuilder:
60 function_score_config: Function score configuration 56 function_score_config: Function score configuration
61 default_language: Default language to use when detection fails or returns "unknown" 57 default_language: Default language to use when detection fails or returns "unknown"
62 knn_boost: Boost value for KNN (embedding recall) 58 knn_boost: Boost value for KNN (embedding recall)
63 - mixed_script_merged_field_boost_scale: Multiply per-field ^boost for cross-script merged fields  
64 """ 59 """
65 self.match_fields = match_fields 60 self.match_fields = match_fields
66 self.field_boosts = field_boosts or {} 61 self.field_boosts = field_boosts or {}
@@ -77,7 +72,6 @@ class ESQueryBuilder: @@ -77,7 +72,6 @@ class ESQueryBuilder:
77 self.translation_minimum_should_match = translation_minimum_should_match 72 self.translation_minimum_should_match = translation_minimum_should_match
78 self.translation_boost = float(translation_boost) 73 self.translation_boost = float(translation_boost)
79 self.tie_breaker_base_query = float(tie_breaker_base_query) 74 self.tie_breaker_base_query = float(tie_breaker_base_query)
80 - self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale)  
81 default_best_fields = { 75 default_best_fields = {
82 base: self._get_field_boost(base) 76 base: self._get_field_boost(base)
83 for base in self.core_multilingual_fields 77 for base in self.core_multilingual_fields
@@ -180,7 +174,6 @@ class ESQueryBuilder: @@ -180,7 +174,6 @@ class ESQueryBuilder:
180 knn_num_candidates: int = 200, 174 knn_num_candidates: int = 200,
181 min_score: Optional[float] = None, 175 min_score: Optional[float] = None,
182 parsed_query: Optional[Any] = None, 176 parsed_query: Optional[Any] = None,
183 - index_languages: Optional[List[str]] = None,  
184 ) -> Dict[str, Any]: 177 ) -> Dict[str, Any]:
185 """ 178 """
186 Build complete ES query with post_filter support for multi-select faceting. 179 Build complete ES query with post_filter support for multi-select faceting.
@@ -223,11 +216,7 @@ class ESQueryBuilder: @@ -223,11 +216,7 @@ class ESQueryBuilder:
223 # Text recall (always include if query_text exists) 216 # Text recall (always include if query_text exists)
224 if query_text: 217 if query_text:
225 # Unified text query strategy 218 # Unified text query strategy
226 - text_query = self._build_advanced_text_query(  
227 - query_text,  
228 - parsed_query,  
229 - index_languages=index_languages,  
230 - ) 219 + text_query = self._build_advanced_text_query(query_text, parsed_query)
231 recall_clauses.append(text_query) 220 recall_clauses.append(text_query)
232 221
233 # Embedding recall (KNN - separate from query, handled below) 222 # Embedding recall (KNN - separate from query, handled below)
@@ -434,90 +423,36 @@ class ESQueryBuilder: @@ -434,90 +423,36 @@ class ESQueryBuilder:
434 return float(self.field_boosts[base_field]) 423 return float(self.field_boosts[base_field])
435 return 1.0 424 return 1.0
436 425
437 - def _build_match_field_specs( 426 + def _match_field_strings(
438 self, 427 self,
439 language: str, 428 language: str,
440 *, 429 *,
441 multilingual_fields: Optional[List[str]] = None, 430 multilingual_fields: Optional[List[str]] = None,
442 shared_fields: Optional[List[str]] = None, 431 shared_fields: Optional[List[str]] = None,
443 boost_overrides: Optional[Dict[str, float]] = None, 432 boost_overrides: Optional[Dict[str, float]] = None,
444 - ) -> List[MatchFieldSpec]:  
445 - """  
446 - Per-language match targets as (field_path, boost). Single source of truth before  
447 - formatting as Elasticsearch ``fields`` strings.  
448 - """ 433 + ) -> List[str]:
  434 + """Build ``multi_match`` / ``combined_fields`` field entries for one language code."""
449 lang = (language or "").strip().lower() 435 lang = (language or "").strip().lower()
450 - specs: List[MatchFieldSpec] = []  
451 - text_fields = multilingual_fields if multilingual_fields is not None else self.multilingual_fields 436 + text_bases = multilingual_fields if multilingual_fields is not None else self.multilingual_fields
452 term_fields = shared_fields if shared_fields is not None else self.shared_fields 437 term_fields = shared_fields if shared_fields is not None else self.shared_fields
453 overrides = boost_overrides or {} 438 overrides = boost_overrides or {}
454 -  
455 - for base in text_fields:  
456 - field = f"{base}.{lang}" 439 + out: List[str] = []
  440 + for base in text_bases:
  441 + path = f"{base}.{lang}"
457 boost = float(overrides.get(base, self._get_field_boost(base, lang))) 442 boost = float(overrides.get(base, self._get_field_boost(base, lang)))
458 - specs.append((field, boost))  
459 - 443 + out.append(self._format_field_with_boost(path, boost))
460 for shared in term_fields: 444 for shared in term_fields:
461 boost = float(overrides.get(shared, self._get_field_boost(shared, None))) 445 boost = float(overrides.get(shared, self._get_field_boost(shared, None)))
462 - specs.append((shared, boost))  
463 - return specs  
464 -  
465 - def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]:  
466 - """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``."""  
467 - return [self._format_field_with_boost(path, boost) for path, boost in specs]  
468 -  
469 - def _merge_supplemental_lang_field_specs(  
470 - self,  
471 - specs: List[MatchFieldSpec],  
472 - supplemental_lang: str,  
473 - ) -> List[MatchFieldSpec]:  
474 - """Append supplemental-language columns; boosts multiplied by mixed_script scale."""  
475 - scale = float(self.mixed_script_merged_field_boost_scale)  
476 - extra_all = self._build_match_field_specs(supplemental_lang)  
477 - seen = {path for path, _ in specs}  
478 - out = list(specs)  
479 - for path, boost in extra_all:  
480 - if path not in seen:  
481 - out.append((path, boost * scale))  
482 - seen.add(path)  
483 - return out  
484 -  
485 - def _expand_match_field_specs_for_mixed_script(  
486 - self,  
487 - lang: str,  
488 - specs: List[MatchFieldSpec],  
489 - contains_chinese: bool,  
490 - contains_english: bool,  
491 - index_languages: List[str],  
492 - is_source: bool = False  
493 - ) -> List[MatchFieldSpec]:  
494 - """  
495 - When the query mixes scripts, widen each clause to indexed fields for the other script  
496 - (e.g. zh clause also searches title.en when the query contains an English word token).  
497 - """  
498 - norm = {str(x or "").strip().lower() for x in (index_languages or []) if str(x or "").strip()}  
499 - allow = norm or {"zh", "en"}  
500 -  
501 - def can_use(lcode: str) -> bool:  
502 - return lcode in allow if norm else True  
503 -  
504 - out = list(specs)  
505 - lnorm = (lang or "").strip().lower()  
506 - if is_source:  
507 - if contains_english and lnorm != "en" and can_use("en"):  
508 - out = self._merge_supplemental_lang_field_specs(out, "en")  
509 - if contains_chinese and lnorm != "zh" and can_use("zh"):  
510 - out = self._merge_supplemental_lang_field_specs(out, "zh") 446 + out.append(self._format_field_with_boost(shared, boost))
511 return out 447 return out
512 448
513 def _build_best_fields_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: 449 def _build_best_fields_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]:
514 - specs = self._build_match_field_specs( 450 + fields = self._match_field_strings(
515 language, 451 language,
516 multilingual_fields=list(self.best_fields_boosts), 452 multilingual_fields=list(self.best_fields_boosts),
517 shared_fields=[], 453 shared_fields=[],
518 boost_overrides=self.best_fields_boosts, 454 boost_overrides=self.best_fields_boosts,
519 ) 455 )
520 - fields = self._format_match_field_specs(specs)  
521 if not fields: 456 if not fields:
522 return None 457 return None
523 return { 458 return {
@@ -530,13 +465,12 @@ class ESQueryBuilder: @@ -530,13 +465,12 @@ class ESQueryBuilder:
530 } 465 }
531 466
532 def _build_phrase_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: 467 def _build_phrase_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]:
533 - specs = self._build_match_field_specs( 468 + fields = self._match_field_strings(
534 language, 469 language,
535 multilingual_fields=list(self.phrase_field_boosts), 470 multilingual_fields=list(self.phrase_field_boosts),
536 shared_fields=[], 471 shared_fields=[],
537 boost_overrides=self.phrase_field_boosts, 472 boost_overrides=self.phrase_field_boosts,
538 ) 473 )
539 - fields = self._format_match_field_specs(specs)  
540 if not fields: 474 if not fields:
541 return None 475 return None
542 clause: Dict[str, Any] = { 476 clause: Dict[str, Any] = {
@@ -560,20 +494,8 @@ class ESQueryBuilder: @@ -560,20 +494,8 @@ class ESQueryBuilder:
560 clause_name: str, 494 clause_name: str,
561 *, 495 *,
562 is_source: bool, 496 is_source: bool,
563 - contains_chinese: bool,  
564 - contains_english: bool,  
565 - index_languages: List[str],  
566 ) -> Optional[Dict[str, Any]]: 497 ) -> Optional[Dict[str, Any]]:
567 - all_specs = self._build_match_field_specs(lang)  
568 - expanded_specs = self._expand_match_field_specs_for_mixed_script(  
569 - lang,  
570 - all_specs,  
571 - contains_chinese,  
572 - contains_english,  
573 - index_languages,  
574 - is_source,  
575 - )  
576 - combined_fields = self._format_match_field_specs(expanded_specs) 498 + combined_fields = self._match_field_strings(lang)
577 if not combined_fields: 499 if not combined_fields:
578 return None 500 return None
579 minimum_should_match = ( 501 minimum_should_match = (
@@ -607,29 +529,10 @@ class ESQueryBuilder: @@ -607,29 +529,10 @@ class ESQueryBuilder:
607 clause["bool"]["boost"] = float(self.translation_boost) 529 clause["bool"]["boost"] = float(self.translation_boost)
608 return clause 530 return clause
609 531
610 - def _get_embedding_field(self, language: str) -> str:  
611 - """Get embedding field name for a language."""  
612 - # Currently using unified embedding field  
613 - return self.text_embedding_field or "title_embedding"  
614 -  
615 - @staticmethod  
616 - def _normalize_language_list(languages: Optional[List[str]]) -> List[str]:  
617 - normalized: List[str] = []  
618 - seen = set()  
619 - for language in languages or []:  
620 - token = str(language or "").strip().lower()  
621 - if not token or token in seen:  
622 - continue  
623 - seen.add(token)  
624 - normalized.append(token)  
625 - return normalized  
626 -  
627 def _build_advanced_text_query( 532 def _build_advanced_text_query(
628 self, 533 self,
629 query_text: str, 534 query_text: str,
630 parsed_query: Optional[Any] = None, 535 parsed_query: Optional[Any] = None,
631 - *,  
632 - index_languages: Optional[List[str]] = None,  
633 ) -> Dict[str, Any]: 536 ) -> Dict[str, Any]:
634 """ 537 """
635 Build advanced text query using base and translated lexical clauses. 538 Build advanced text query using base and translated lexical clauses.
@@ -649,39 +552,26 @@ class ESQueryBuilder: @@ -649,39 +552,26 @@ class ESQueryBuilder:
649 should_clauses = [] 552 should_clauses = []
650 source_lang = self.default_language 553 source_lang = self.default_language
651 translations: Dict[str, str] = {} 554 translations: Dict[str, str] = {}
652 - contains_chinese = False  
653 - contains_english = False  
654 - normalized_index_languages = self._normalize_language_list(index_languages)  
655 555
656 if parsed_query: 556 if parsed_query:
657 detected_lang = getattr(parsed_query, "detected_language", None) 557 detected_lang = getattr(parsed_query, "detected_language", None)
658 source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language 558 source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language
659 translations = getattr(parsed_query, "translations", None) or {} 559 translations = getattr(parsed_query, "translations", None) or {}
660 - contains_chinese = bool(getattr(parsed_query, "contains_chinese", False))  
661 - contains_english = bool(getattr(parsed_query, "contains_english", False))  
662 560
663 source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language 561 source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language
664 base_query_text = ( 562 base_query_text = (
665 getattr(parsed_query, "rewritten_query", None) if parsed_query else None 563 getattr(parsed_query, "rewritten_query", None) if parsed_query else None
666 ) or query_text 564 ) or query_text
667 565
668 - def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None:  
669 - nonlocal should_clauses  
670 - clause = self._build_lexical_language_clause(  
671 - lang,  
672 - lang_query,  
673 - clause_name,  
674 - is_source=is_source,  
675 - contains_chinese=contains_chinese,  
676 - contains_english=contains_english,  
677 - index_languages=normalized_index_languages,  
678 - )  
679 - if not clause:  
680 - return  
681 - should_clauses.append(clause)  
682 -  
683 if base_query_text: 566 if base_query_text:
684 - append_clause(source_lang, base_query_text, "base_query", True) 567 + base_clause = self._build_lexical_language_clause(
  568 + source_lang,
  569 + base_query_text,
  570 + "base_query",
  571 + is_source=True,
  572 + )
  573 + if base_clause:
  574 + should_clauses.append(base_clause)
685 575
686 for lang, translated_text in translations.items(): 576 for lang, translated_text in translations.items():
687 normalized_lang = str(lang or "").strip().lower() 577 normalized_lang = str(lang or "").strip().lower()
@@ -690,7 +580,14 @@ class ESQueryBuilder: @@ -690,7 +580,14 @@ class ESQueryBuilder:
690 continue 580 continue
691 if normalized_lang == source_lang and normalized_text == base_query_text: 581 if normalized_lang == source_lang and normalized_text == base_query_text:
692 continue 582 continue
693 - append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False) 583 + trans_clause = self._build_lexical_language_clause(
  584 + normalized_lang,
  585 + normalized_text,
  586 + f"base_query_trans_{normalized_lang}",
  587 + is_source=False,
  588 + )
  589 + if trans_clause:
  590 + should_clauses.append(trans_clause)
694 591
695 # Fallback to a simple query when language fields cannot be resolved. 592 # Fallback to a simple query when language fields cannot be resolved.
696 if not should_clauses: 593 if not should_clauses:
search/searcher.py
@@ -645,7 +645,6 @@ class Searcher: @@ -645,7 +645,6 @@ class Searcher:
645 enable_knn=enable_embedding and parsed_query.query_vector is not None, 645 enable_knn=enable_embedding and parsed_query.query_vector is not None,
646 min_score=min_score, 646 min_score=min_score,
647 parsed_query=parsed_query, 647 parsed_query=parsed_query,
648 - index_languages=index_langs,  
649 ) 648 )
650 649
651 # Add facets for faceted search 650 # Add facets for faceted search
tests/test_es_query_builder.py
@@ -9,6 +9,9 @@ from search.es_query_builder import ESQueryBuilder @@ -9,6 +9,9 @@ from search.es_query_builder import ESQueryBuilder
9 def _builder() -> ESQueryBuilder: 9 def _builder() -> ESQueryBuilder:
10 return ESQueryBuilder( 10 return ESQueryBuilder(
11 match_fields=["title.en^3.0", "brief.en^1.0"], 11 match_fields=["title.en^3.0", "brief.en^1.0"],
  12 + multilingual_fields=["title", "brief"],
  13 + core_multilingual_fields=["title", "brief"],
  14 + shared_fields=[],
12 text_embedding_field="title_embedding", 15 text_embedding_field="title_embedding",
13 default_language="en", 16 default_language="en",
14 ) 17 )
@@ -25,10 +28,6 @@ def _lexical_clause(query_root: Dict[str, Any]) -> Dict[str, Any]: @@ -25,10 +28,6 @@ def _lexical_clause(query_root: Dict[str, Any]) -> Dict[str, Any]:
25 raise AssertionError("no lexical bool clause in query_root") 28 raise AssertionError("no lexical bool clause in query_root")
26 29
27 30
28 -def _lexical_combined_fields(query_root: Dict[str, Any]) -> list:  
29 - return _lexical_clause(query_root)["must"][0]["combined_fields"]["fields"]  
30 -  
31 -  
32 def test_knn_prefilter_includes_range_filters(): 31 def test_knn_prefilter_includes_range_filters():
33 qb = _builder() 32 qb = _builder()
34 q = qb.build_query( 33 q = qb.build_query(
@@ -93,7 +92,6 @@ def test_text_query_contains_only_base_and_translation_named_queries(): @@ -93,7 +92,6 @@ def test_text_query_contains_only_base_and_translation_named_queries():
93 query_text="dress", 92 query_text="dress",
94 parsed_query=parsed_query, 93 parsed_query=parsed_query,
95 enable_knn=False, 94 enable_knn=False,
96 - index_languages=["en", "zh", "fr"],  
97 ) 95 )
98 should = q["query"]["bool"]["should"] 96 should = q["query"]["bool"]["should"]
99 names = [clause["bool"]["_name"] for clause in should] 97 names = [clause["bool"]["_name"] for clause in should]
@@ -115,120 +113,8 @@ def test_text_query_skips_duplicate_translation_same_as_base(): @@ -115,120 +113,8 @@ def test_text_query_skips_duplicate_translation_same_as_base():
115 query_text="dress", 113 query_text="dress",
116 parsed_query=parsed_query, 114 parsed_query=parsed_query,
117 enable_knn=False, 115 enable_knn=False,
118 - index_languages=["en", "zh"],  
119 ) 116 )
120 117
121 root = q["query"] 118 root = q["query"]
122 assert root["bool"]["_name"] == "base_query" 119 assert root["bool"]["_name"] == "base_query"
123 assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] 120 assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"]
124 -  
125 -  
126 -def test_mixed_script_merges_en_fields_into_zh_clause():  
127 - qb = ESQueryBuilder(  
128 - match_fields=["title.en^3.0"],  
129 - multilingual_fields=["title", "brief"],  
130 - shared_fields=[],  
131 - text_embedding_field="title_embedding",  
132 - default_language="en",  
133 - )  
134 - parsed_query = SimpleNamespace(  
135 - rewritten_query="法式 dress",  
136 - detected_language="zh",  
137 - translations={},  
138 - contains_chinese=True,  
139 - contains_english=True,  
140 - )  
141 - q = qb.build_query(  
142 - query_text="法式 dress",  
143 - parsed_query=parsed_query,  
144 - enable_knn=False,  
145 - index_languages=["zh", "en"],  
146 - )  
147 - fields = _lexical_combined_fields(q["query"])  
148 - bases = {f.split("^", 1)[0] for f in fields}  
149 - assert "title.zh" in bases and "title.en" in bases  
150 - assert "brief.zh" in bases and "brief.en" in bases  
151 - # Merged supplemental language fields use boost * 0.6 by default.  
152 - assert "title.en^0.6" in fields  
153 - assert "brief.en^0.6" in fields  
154 -  
155 -  
156 -def test_mixed_script_merges_zh_fields_into_en_clause():  
157 - qb = ESQueryBuilder(  
158 - match_fields=["title.en^3.0"],  
159 - multilingual_fields=["title"],  
160 - shared_fields=[],  
161 - text_embedding_field="title_embedding",  
162 - default_language="en",  
163 - )  
164 - parsed_query = SimpleNamespace(  
165 - rewritten_query="red 连衣裙",  
166 - detected_language="en",  
167 - translations={},  
168 - contains_chinese=True,  
169 - contains_english=True,  
170 - )  
171 - q = qb.build_query(  
172 - query_text="red 连衣裙",  
173 - parsed_query=parsed_query,  
174 - enable_knn=False,  
175 - index_languages=["zh", "en"],  
176 - )  
177 - fields = _lexical_combined_fields(q["query"])  
178 - bases = {f.split("^", 1)[0] for f in fields}  
179 - assert "title.en" in bases and "title.zh" in bases  
180 - assert "title.zh^0.6" in fields  
181 -  
182 -  
183 -def test_mixed_script_merged_fields_scale_configured_boosts():  
184 - qb = ESQueryBuilder(  
185 - match_fields=["title.en^3.0"],  
186 - multilingual_fields=["title"],  
187 - shared_fields=[],  
188 - field_boosts={"title.zh": 5.0, "title.en": 10.0},  
189 - text_embedding_field="title_embedding",  
190 - default_language="en",  
191 - )  
192 - parsed_query = SimpleNamespace(  
193 - rewritten_query="法式 dress",  
194 - detected_language="zh",  
195 - translations={},  
196 - contains_chinese=True,  
197 - contains_english=True,  
198 - )  
199 - q = qb.build_query(  
200 - query_text="法式 dress",  
201 - parsed_query=parsed_query,  
202 - enable_knn=False,  
203 - index_languages=["zh", "en"],  
204 - )  
205 - fields = _lexical_combined_fields(q["query"])  
206 - assert "title.zh^5.0" in fields  
207 - assert "title.en^6.0" in fields # 10.0 * 0.6  
208 -  
209 -  
210 -def test_mixed_script_does_not_merge_en_when_not_in_index_languages():  
211 - qb = ESQueryBuilder(  
212 - match_fields=["title.zh^3.0"],  
213 - multilingual_fields=["title"],  
214 - shared_fields=[],  
215 - text_embedding_field="title_embedding",  
216 - default_language="zh",  
217 - )  
218 - parsed_query = SimpleNamespace(  
219 - rewritten_query="法式 dress",  
220 - detected_language="zh",  
221 - translations={},  
222 - contains_chinese=True,  
223 - contains_english=True,  
224 - )  
225 - q = qb.build_query(  
226 - query_text="法式 dress",  
227 - parsed_query=parsed_query,  
228 - enable_knn=False,  
229 - index_languages=["zh"],  
230 - )  
231 - fields = _lexical_combined_fields(q["query"])  
232 - bases = {f.split("^", 1)[0] for f in fields}  
233 - assert "title.zh" in bases  
234 - assert "title.en" not in bases  
tests/test_es_query_builder_text_recall_languages.py
1 """ 1 """
2 ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. 2 ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*.
3 3
4 -Covers combinations of query language vs tenant index_languages, translations,  
5 -and mixed Chinese/English queries. Asserts named lexical clause boundaries,  
6 -combined_fields payloads, and per-language target fields (title.{lang}). 4 +Covers translation routing, mixed-script queries (per-clause language fields only),
  5 +and clause naming. Asserts named lexical clause boundaries, combined_fields payloads,
  6 +and per-language target fields (title.{lang}).
7 """ 7 """
8 8
9 from types import SimpleNamespace 9 from types import SimpleNamespace
@@ -14,11 +14,7 @@ import numpy as np @@ -14,11 +14,7 @@ import numpy as np
14 from search.es_query_builder import ESQueryBuilder 14 from search.es_query_builder import ESQueryBuilder
15 15
16 16
17 -def _builder_multilingual_title_only(  
18 - *,  
19 - default_language: str = "en",  
20 - mixed_script_scale: float = 0.6,  
21 -) -> ESQueryBuilder: 17 +def _builder_multilingual_title_only(*, default_language: str = "en") -> ESQueryBuilder:
22 """Minimal builder: only title.{lang} for easy field assertions.""" 18 """Minimal builder: only title.{lang} for easy field assertions."""
23 return ESQueryBuilder( 19 return ESQueryBuilder(
24 match_fields=["title.en^1.0"], 20 match_fields=["title.en^1.0"],
@@ -26,7 +22,6 @@ def _builder_multilingual_title_only( @@ -26,7 +22,6 @@ def _builder_multilingual_title_only(
26 shared_fields=[], 22 shared_fields=[],
27 text_embedding_field="title_embedding", 23 text_embedding_field="title_embedding",
28 default_language=default_language, 24 default_language=default_language,
29 - mixed_script_merged_field_boost_scale=mixed_script_scale,  
30 function_score_config=None, 25 function_score_config=None,
31 ) 26 )
32 27
@@ -101,22 +96,16 @@ def _build( @@ -101,22 +96,16 @@ def _build(
101 rewritten: str, 96 rewritten: str,
102 detected_language: str, 97 detected_language: str,
103 translations: Dict[str, str], 98 translations: Dict[str, str],
104 - index_languages: List[str],  
105 - contains_chinese: bool = False,  
106 - contains_english: bool = False,  
107 ) -> Dict[str, Any]: 99 ) -> Dict[str, Any]:
108 parsed = SimpleNamespace( 100 parsed = SimpleNamespace(
109 rewritten_query=rewritten, 101 rewritten_query=rewritten,
110 detected_language=detected_language, 102 detected_language=detected_language,
111 translations=dict(translations), 103 translations=dict(translations),
112 - contains_chinese=contains_chinese,  
113 - contains_english=contains_english,  
114 ) 104 )
115 return qb.build_query( 105 return qb.build_query(
116 query_text=query_text, 106 query_text=query_text,
117 parsed_query=parsed, 107 parsed_query=parsed,
118 enable_knn=False, 108 enable_knn=False,
119 - index_languages=index_languages,  
120 ) 109 )
121 110
122 111
@@ -131,7 +120,6 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): @@ -131,7 +120,6 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en():
131 rewritten="连衣裙", 120 rewritten="连衣裙",
132 detected_language="zh", 121 detected_language="zh",
133 translations={"en": "dress"}, 122 translations={"en": "dress"},
134 - index_languages=["zh", "en"],  
135 ) 123 )
136 idx = _clauses_index(q) 124 idx = _clauses_index(q)
137 assert set(idx) == {"base_query", "base_query_trans_en"} 125 assert set(idx) == {"base_query", "base_query_trans_en"}
@@ -149,7 +137,6 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): @@ -149,7 +137,6 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh():
149 rewritten="dress", 137 rewritten="dress",
150 detected_language="en", 138 detected_language="en",
151 translations={"zh": "连衣裙"}, 139 translations={"zh": "连衣裙"},
152 - index_languages=["en", "zh"],  
153 ) 140 )
154 idx = _clauses_index(q) 141 idx = _clauses_index(q)
155 assert set(idx) == {"base_query", "base_query_trans_zh"} 142 assert set(idx) == {"base_query", "base_query_trans_zh"}
@@ -167,7 +154,6 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): @@ -167,7 +154,6 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations():
167 rewritten="kleid", 154 rewritten="kleid",
168 detected_language="de", 155 detected_language="de",
169 translations={"en": "dress", "fr": "robe"}, 156 translations={"en": "dress", "fr": "robe"},
170 - index_languages=["de", "en", "fr"],  
171 ) 157 )
172 idx = _clauses_index(q) 158 idx = _clauses_index(q)
173 assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} 159 assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"}
@@ -188,7 +174,6 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): @@ -188,7 +174,6 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
188 rewritten="schuh", 174 rewritten="schuh",
189 detected_language="de", 175 detected_language="de",
190 translations={"en": "shoe", "zh": "鞋"}, 176 translations={"en": "shoe", "zh": "鞋"},
191 - index_languages=["en", "zh"],  
192 ) 177 )
193 idx = _clauses_index(q) 178 idx = _clauses_index(q)
194 assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} 179 assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"}
@@ -201,10 +186,10 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): @@ -201,10 +186,10 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
201 assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost 186 assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost
202 187
203 188
204 -# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 --- 189 +# --- 中英混写:base 打在检测语种字段;翻译子句打在译文语种字段 ---
205 190
206 191
207 -def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): 192 +def test_mixed_zh_detected_base_clause_zh_fields_only_with_en_translation():
208 qb = _builder_multilingual_title_only(default_language="en") 193 qb = _builder_multilingual_title_only(default_language="en")
209 q = _build( 194 q = _build(
210 qb, 195 qb,
@@ -212,19 +197,16 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): @@ -212,19 +197,16 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
212 rewritten="红色 dress", 197 rewritten="红色 dress",
213 detected_language="zh", 198 detected_language="zh",
214 translations={"en": "red dress"}, 199 translations={"en": "red dress"},
215 - index_languages=["zh", "en"],  
216 - contains_chinese=True,  
217 - contains_english=True,  
218 ) 200 )
219 idx = _clauses_index(q) 201 idx = _clauses_index(q)
220 assert set(idx) == {"base_query", "base_query_trans_en"} 202 assert set(idx) == {"base_query", "base_query_trans_en"}
221 assert _combined_fields_clause(idx["base_query"])["query"] == "红色 dress" 203 assert _combined_fields_clause(idx["base_query"])["query"] == "红色 dress"
222 - assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") 204 + assert _has_title_lang(idx["base_query"], "zh") and not _has_title_lang(idx["base_query"], "en")
223 assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress" 205 assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress"
224 assert _has_title_lang(idx["base_query_trans_en"], "en") 206 assert _has_title_lang(idx["base_query_trans_en"], "en")
225 207
226 208
227 -def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): 209 +def test_mixed_en_detected_base_clause_en_fields_only_with_zh_translation():
228 qb = _builder_multilingual_title_only(default_language="en") 210 qb = _builder_multilingual_title_only(default_language="en")
229 q = _build( 211 q = _build(
230 qb, 212 qb,
@@ -232,18 +214,15 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): @@ -232,18 +214,15 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
232 rewritten="nike 运动鞋", 214 rewritten="nike 运动鞋",
233 detected_language="en", 215 detected_language="en",
234 translations={"zh": "耐克运动鞋"}, 216 translations={"zh": "耐克运动鞋"},
235 - index_languages=["zh", "en"],  
236 - contains_chinese=True,  
237 - contains_english=True,  
238 ) 217 )
239 idx = _clauses_index(q) 218 idx = _clauses_index(q)
240 assert set(idx) == {"base_query", "base_query_trans_zh"} 219 assert set(idx) == {"base_query", "base_query_trans_zh"}
241 assert _combined_fields_clause(idx["base_query"])["query"] == "nike 运动鞋" 220 assert _combined_fields_clause(idx["base_query"])["query"] == "nike 运动鞋"
242 - assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") 221 + assert _has_title_lang(idx["base_query"], "en") and not _has_title_lang(idx["base_query"], "zh")
243 assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "耐克运动鞋" 222 assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "耐克运动鞋"
244 223
245 224
246 -def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): 225 +def test_zh_query_no_translations_only_zh_fields():
247 qb = _builder_multilingual_title_only(default_language="en") 226 qb = _builder_multilingual_title_only(default_language="en")
248 q = _build( 227 q = _build(
249 qb, 228 qb,
@@ -251,9 +230,6 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): @@ -251,9 +230,6 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
251 rewritten="法式 dress", 230 rewritten="法式 dress",
252 detected_language="zh", 231 detected_language="zh",
253 translations={}, 232 translations={},
254 - index_languages=["zh"],  
255 - contains_chinese=True,  
256 - contains_english=True,  
257 ) 233 )
258 idx = _clauses_index(q) 234 idx = _clauses_index(q)
259 assert set(idx) == {"base_query"} 235 assert set(idx) == {"base_query"}
@@ -272,7 +248,6 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): @@ -272,7 +248,6 @@ def test_skips_translation_when_same_lang_and_same_text_as_base():
272 rewritten="NIKE", 248 rewritten="NIKE",
273 detected_language="en", 249 detected_language="en",
274 translations={"en": "NIKE", "zh": "耐克"}, 250 translations={"en": "NIKE", "zh": "耐克"},
275 - index_languages=["en", "zh"],  
276 ) 251 )
277 idx = _clauses_index(q) 252 idx = _clauses_index(q)
278 assert set(idx) == {"base_query", "base_query_trans_zh"} 253 assert set(idx) == {"base_query", "base_query_trans_zh"}
@@ -286,7 +261,6 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): @@ -286,7 +261,6 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base():
286 rewritten="NIKE", 261 rewritten="NIKE",
287 detected_language="en", 262 detected_language="en",
288 translations={"zh": "NIKE"}, 263 translations={"zh": "NIKE"},
289 - index_languages=["en", "zh"],  
290 ) 264 )
291 idx = _clauses_index(q) 265 idx = _clauses_index(q)
292 assert set(idx) == {"base_query", "base_query_trans_zh"} 266 assert set(idx) == {"base_query", "base_query_trans_zh"}
@@ -304,7 +278,6 @@ def test_translation_language_key_is_normalized_case_insensitive(): @@ -304,7 +278,6 @@ def test_translation_language_key_is_normalized_case_insensitive():
304 rewritten="dress", 278 rewritten="dress",
305 detected_language="en", 279 detected_language="en",
306 translations={"ZH": "连衣裙"}, 280 translations={"ZH": "连衣裙"},
307 - index_languages=["en", "zh"],  
308 ) 281 )
309 idx = _clauses_index(q) 282 idx = _clauses_index(q)
310 assert "base_query_trans_zh" in idx 283 assert "base_query_trans_zh" in idx
@@ -319,17 +292,16 @@ def test_empty_translation_value_is_skipped(): @@ -319,17 +292,16 @@ def test_empty_translation_value_is_skipped():
319 rewritten="dress", 292 rewritten="dress",
320 detected_language="en", 293 detected_language="en",
321 translations={"zh": " ", "fr": "robe"}, 294 translations={"zh": " ", "fr": "robe"},
322 - index_languages=["en", "zh", "fr"],  
323 ) 295 )
324 idx = _clauses_index(q) 296 idx = _clauses_index(q)
325 assert "base_query_trans_zh" not in idx 297 assert "base_query_trans_zh" not in idx
326 assert "base_query_trans_fr" in idx 298 assert "base_query_trans_fr" in idx
327 299
328 300
329 -# --- index_languages 为空:视为「未约束」source_in_index 为 True --- 301 +# --- base 子句无 bool.boost;翻译子句带 translation_boost;phrase should 继承 phrase_match_boost ---
330 302
331 303
332 -def test_empty_index_languages_treats_source_as_in_index_boosts(): 304 +def test_de_base_and_en_translation_phrase_boosts():
333 qb = _builder_multilingual_title_only(default_language="en") 305 qb = _builder_multilingual_title_only(default_language="en")
334 q = _build( 306 q = _build(
335 qb, 307 qb,
@@ -337,7 +309,6 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): @@ -337,7 +309,6 @@ def test_empty_index_languages_treats_source_as_in_index_boosts():
337 rewritten="x", 309 rewritten="x",
338 detected_language="de", 310 detected_language="de",
339 translations={"en": "y"}, 311 translations={"en": "y"},
340 - index_languages=[],  
341 ) 312 )
342 idx = _clauses_index(q) 313 idx = _clauses_index(q)
343 assert "boost" not in idx["base_query"] 314 assert "boost" not in idx["base_query"]
@@ -359,7 +330,6 @@ def test_no_translations_only_base_query(): @@ -359,7 +330,6 @@ def test_no_translations_only_base_query():
359 rewritten="hello", 330 rewritten="hello",
360 detected_language="en", 331 detected_language="en",
361 translations={}, 332 translations={},
362 - index_languages=["en", "zh"],  
363 ) 333 )
364 idx = _clauses_index(q) 334 idx = _clauses_index(q)
365 assert set(idx) == {"base_query"} 335 assert set(idx) == {"base_query"}
@@ -374,15 +344,12 @@ def test_text_clauses_present_alongside_knn(): @@ -374,15 +344,12 @@ def test_text_clauses_present_alongside_knn():
374 rewritten_query="dress", 344 rewritten_query="dress",
375 detected_language="en", 345 detected_language="en",
376 translations={"zh": "连衣裙"}, 346 translations={"zh": "连衣裙"},
377 - contains_chinese=False,  
378 - contains_english=True,  
379 ) 347 )
380 q = qb.build_query( 348 q = qb.build_query(
381 query_text="dress", 349 query_text="dress",
382 query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32), 350 query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32),
383 parsed_query=parsed, 351 parsed_query=parsed,
384 enable_knn=True, 352 enable_knn=True,
385 - index_languages=["en", "zh"],  
386 ) 353 )
387 assert "knn" in q 354 assert "knn" in q
388 idx = _clauses_index(q) 355 idx = _clauses_index(q)
@@ -396,14 +363,11 @@ def test_detected_language_unknown_falls_back_to_default_language(): @@ -396,14 +363,11 @@ def test_detected_language_unknown_falls_back_to_default_language():
396 rewritten_query="shirt", 363 rewritten_query="shirt",
397 detected_language="unknown", 364 detected_language="unknown",
398 translations={"zh": "衬衫"}, 365 translations={"zh": "衬衫"},
399 - contains_chinese=False,  
400 - contains_english=True,  
401 ) 366 )
402 q = qb.build_query( 367 q = qb.build_query(
403 query_text="shirt", 368 query_text="shirt",
404 parsed_query=parsed, 369 parsed_query=parsed,
405 enable_knn=False, 370 enable_knn=False,
406 - index_languages=["en", "zh"],  
407 ) 371 )
408 idx = _clauses_index(q) 372 idx = _clauses_index(q)
409 assert set(idx) == {"base_query", "base_query_trans_zh"} 373 assert set(idx) == {"base_query", "base_query_trans_zh"}
@@ -419,7 +383,6 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): @@ -419,7 +383,6 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
419 rewritten="платье", 383 rewritten="платье",
420 detected_language="ru", 384 detected_language="ru",
421 translations={"en": "dress"}, 385 translations={"en": "dress"},
422 - index_languages=["ru", "en"],  
423 ) 386 )
424 idx = _clauses_index(q) 387 idx = _clauses_index(q)
425 assert set(idx) == {"base_query", "base_query_trans_en"} 388 assert set(idx) == {"base_query", "base_query_trans_en"}
@@ -428,11 +391,8 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): @@ -428,11 +391,8 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
428 assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" 391 assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress"
429 392
430 393
431 -def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():  
432 - """  
433 - 当前实现:凡是 translations 里非空的条目都会生成子句;  
434 - index_languages 只约束混写扩列,不用于过滤翻译子句。  
435 - """ 394 +def test_translation_generates_clause_for_any_target_lang_key():
  395 + """translations 里非空的每个语种键都会生成对应 base_query_trans_* 子句。"""
436 qb = _builder_multilingual_title_only(default_language="en") 396 qb = _builder_multilingual_title_only(default_language="en")
437 q = _build( 397 q = _build(
438 qb, 398 qb,
@@ -440,7 +400,6 @@ def test_translation_for_lang_not_listed_in_index_languages_still_generates_clau @@ -440,7 +400,6 @@ def test_translation_for_lang_not_listed_in_index_languages_still_generates_clau
440 rewritten="dress", 400 rewritten="dress",
441 detected_language="en", 401 detected_language="en",
442 translations={"zh": "连衣裙", "de": "Kleid"}, 402 translations={"zh": "连衣裙", "de": "Kleid"},
443 - index_languages=["en", "zh"],  
444 ) 403 )
445 idx = _clauses_index(q) 404 idx = _clauses_index(q)
446 assert "base_query_trans_de" in idx 405 assert "base_query_trans_de" in idx
@@ -457,9 +416,6 @@ def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_bas @@ -457,9 +416,6 @@ def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_bas
457 rewritten="红色连衣裙", 416 rewritten="红色连衣裙",
458 detected_language="zh", 417 detected_language="zh",
459 translations={"en": "red dress"}, 418 translations={"en": "red dress"},
460 - index_languages=["zh", "en"],  
461 - contains_chinese=True,  
462 - contains_english=False,  
463 ) 419 )
464 idx = _clauses_index(q) 420 idx = _clauses_index(q)
465 assert _combined_fields_clause(idx["base_query"])["query"] == "红色连衣裙" 421 assert _combined_fields_clause(idx["base_query"])["query"] == "红色连衣裙"
tests/test_query_parser_mixed_language.py
@@ -11,14 +11,6 @@ def _tokenizer(text): @@ -11,14 +11,6 @@ def _tokenizer(text):
11 return str(text).split() 11 return str(text).split()
12 12
13 13
14 -def test_pure_english_word_token_length_and_script():  
15 - assert QueryParser._is_pure_english_word_token("ab") is False  
16 - assert QueryParser._is_pure_english_word_token("abc") is True  
17 - assert QueryParser._is_pure_english_word_token("wi-fi") is True  
18 - assert QueryParser._is_pure_english_word_token("连衣裙") is False  
19 - assert QueryParser._is_pure_english_word_token("ab12") is False  
20 -  
21 -  
22 def _build_config() -> SearchConfig: 14 def _build_config() -> SearchConfig:
23 return SearchConfig( 15 return SearchConfig(
24 es_index_name="test_products", 16 es_index_name="test_products",
@@ -36,7 +28,7 @@ def _build_config() -> SearchConfig: @@ -36,7 +28,7 @@ def _build_config() -> SearchConfig:
36 ) 28 )
37 29
38 30
39 -def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): 31 +def test_parse_mixed_zh_query_translates_to_en(monkeypatch):
40 parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) 32 parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
41 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") 33 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh")
42 34
@@ -48,15 +40,13 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo @@ -48,15 +40,13 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo
48 ) 40 )
49 41
50 assert result.detected_language == "zh" 42 assert result.detected_language == "zh"
51 - assert result.contains_chinese is True  
52 - assert result.contains_english is True  
53 assert result.translations == {"en": "法式 dress 连衣裙-en"} 43 assert result.translations == {"en": "法式 dress 连衣裙-en"}
54 assert result.query_tokens == ["法式", "dress", "连衣裙"] 44 assert result.query_tokens == ["法式", "dress", "连衣裙"]
55 assert not hasattr(result, "query_text_by_lang") 45 assert not hasattr(result, "query_text_by_lang")
56 assert not hasattr(result, "search_langs") 46 assert not hasattr(result, "search_langs")
57 47
58 48
59 -def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): 49 +def test_parse_mixed_en_query_translates_to_zh(monkeypatch):
60 parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) 50 parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
61 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") 51 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
62 52
@@ -68,8 +58,6 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): @@ -68,8 +58,6 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
68 ) 58 )
69 59
70 assert result.detected_language == "en" 60 assert result.detected_language == "en"
71 - assert result.contains_chinese is True  
72 - assert result.contains_english is True  
73 assert result.translations == {"zh": "red 连衣裙-zh"} 61 assert result.translations == {"zh": "red 连衣裙-zh"}
74 assert result.query_tokens == ["red", "连衣裙"] 62 assert result.query_tokens == ["red", "连衣裙"]
75 63
@@ -87,7 +75,5 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch) @@ -87,7 +75,5 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch)
87 ) 75 )
88 76
89 assert result.detected_language == "en" 77 assert result.detected_language == "en"
90 - assert result.contains_chinese is False  
91 - assert result.contains_english is True  
92 assert result.translations.get("zh") == "off shoulder top-zh" 78 assert result.translations.get("zh") == "off shoulder top-zh"
93 assert not hasattr(result, "source_in_index_languages") 79 assert not hasattr(result, "source_in_index_languages")