Commit 35da381333a86fda7b179f767b6f5a5dcd157a65

Authored by tangwang
1 parent 445496cd

中英混写 query 的优化逻辑不适合新的 combined_fields+best_fields+phrase 查询方式,且带来较多复杂度,故清理该部分逻辑

docs/相关性检索优化说明.md
... ... @@ -17,9 +17,9 @@
17 17 查询链路(文本相关):
18 18  
19 19 1. `QueryParser.parse()`
20   - 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`
  20 + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`
21 21 2. `Searcher.search()`
22   - 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束
  22 + 负责读取租户 `index_languages`,并将其传给 `QueryParser` 作为 `target_languages`(控制翻译目标语种);`ESQueryBuilder` 仅根据 `detected_language` 与各条译文构建子句字段,不再接收 `index_languages`
23 23 3. `ESQueryBuilder._build_advanced_text_query()`
24 24 基于 `rewritten_query + detected_language + translations` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。
25 25 4. `build_query()`
... ... @@ -76,9 +76,6 @@
76 76  
77 77 最终按 `bool.should` 组合,`minimum_should_match: 1`。
78 78  
79   -> **附 — 混写辅助召回**
80   -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。
81   -
82 79 ## 5. 关键配置项(文本策略)
83 80  
84 81 `query_config` 下与解析等相关的项:
... ... @@ -147,11 +144,9 @@
147 144 - `translations`
148 145 - `query_vector`
149 146 - `query_tokens`
150   - - `contains_chinese` / `contains_english`
151 147 - `Searcher` 负责“租户语境”:
152 148 - `index_languages`
153 149 - 将其传给 parser 作为 `target_languages`
154   - - 将其传给 builder 作为字段展开约束
155 150 - `ESQueryBuilder` 负责“表达式展开”:
156 151 - 动态字段组装
157 152 - 子句权重分配
... ...
query/query_parser.py
... ... @@ -50,8 +50,6 @@ class ParsedQuery:
50 50 translations: Dict[str, str] = field(default_factory=dict)
51 51 query_vector: Optional[np.ndarray] = None
52 52 query_tokens: List[str] = field(default_factory=list)
53   - contains_chinese: bool = False
54   - contains_english: bool = False
55 53  
56 54 def to_dict(self) -> Dict[str, Any]:
57 55 """Convert to dictionary representation."""
... ... @@ -62,8 +60,6 @@ class ParsedQuery:
62 60 "detected_language": self.detected_language,
63 61 "translations": self.translations,
64 62 "query_tokens": self.query_tokens,
65   - "contains_chinese": self.contains_chinese,
66   - "contains_english": self.contains_english,
67 63 }
68 64  
69 65  
... ... @@ -202,21 +198,6 @@ class QueryParser:
202 198 def _get_query_tokens(self, query: str) -> List[str]:
203 199 return self._extract_tokens(self._tokenizer(query))
204 200  
205   - @staticmethod
206   - def _contains_cjk(text: str) -> bool:
207   - """Whether query contains any CJK ideograph."""
208   - return bool(re.search(r"[\u4e00-\u9fff]", text or ""))
209   -
210   - @staticmethod
211   - def _is_pure_english_word_token(token: str) -> bool:
212   - """
213   - A tokenizer token counts as English iff it is letters only (optional internal hyphens)
214   - and length >= 3.
215   - """
216   - if not token or len(token) < 3:
217   - return False
218   - return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token))
219   -
220 201 def parse(
221 202 self,
222 203 query: str,
... ... @@ -285,19 +266,12 @@ class QueryParser:
285 266 log_info(f"Language detection | Detected language: {detected_lang}")
286 267 if context:
287 268 context.store_intermediate_result('detected_language', detected_lang)
288   - # Stage 4: Query analysis (tokenization + script flags)
  269 + # Stage 4: Query analysis (tokenization)
289 270 query_tokens = self._get_query_tokens(query_text)
290   - contains_chinese = self._contains_cjk(query_text)
291   - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)
292 271  
293   - log_debug(
294   - f"Query analysis | Query tokens: {query_tokens} | "
295   - f"contains_chinese={contains_chinese} | contains_english={contains_english}"
296   - )
  272 + log_debug(f"Query analysis | Query tokens: {query_tokens}")
297 273 if context:
298 274 context.store_intermediate_result('query_tokens', query_tokens)
299   - context.store_intermediate_result('contains_chinese', contains_chinese)
300   - context.store_intermediate_result('contains_english', contains_english)
301 275  
302 276 # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the
303 277 # caller decides translation targets and later search-field planning.
... ... @@ -459,8 +433,6 @@ class QueryParser:
459 433 translations=translations,
460 434 query_vector=query_vector,
461 435 query_tokens=query_tokens,
462   - contains_chinese=contains_chinese,
463   - contains_english=contains_english,
464 436 )
465 437  
466 438 if context and hasattr(context, 'logger'):
... ...
search/es_query_builder.py
... ... @@ -8,14 +8,11 @@ Simplified architecture:
8 8 - function_score wrapper for boosting fields
9 9 """
10 10  
11   -from typing import Dict, Any, List, Optional, Union, Tuple
  11 +from typing import Dict, Any, List, Optional, Tuple
12 12  
13 13 import numpy as np
14 14 from config import FunctionScoreConfig
15 15  
16   -# (Elasticsearch field path, boost before formatting as "path^boost")
17   -MatchFieldSpec = Tuple[str, float]
18   -
19 16  
20 17 class ESQueryBuilder:
21 18 """Builds Elasticsearch DSL queries."""
... ... @@ -39,7 +36,6 @@ class ESQueryBuilder:
39 36 tie_breaker_base_query: float = 0.9,
40 37 best_fields_boosts: Optional[Dict[str, float]] = None,
41 38 best_fields_clause_boost: float = 2.0,
42   - mixed_script_merged_field_boost_scale: float = 0.6,
43 39 phrase_field_boosts: Optional[Dict[str, float]] = None,
44 40 phrase_match_base_fields: Optional[Tuple[str, ...]] = None,
45 41 phrase_match_slop: int = 0,
... ... @@ -60,7 +56,6 @@ class ESQueryBuilder:
60 56 function_score_config: Function score configuration
61 57 default_language: Default language to use when detection fails or returns "unknown"
62 58 knn_boost: Boost value for KNN (embedding recall)
63   - mixed_script_merged_field_boost_scale: Multiply per-field ^boost for cross-script merged fields
64 59 """
65 60 self.match_fields = match_fields
66 61 self.field_boosts = field_boosts or {}
... ... @@ -77,7 +72,6 @@ class ESQueryBuilder:
77 72 self.translation_minimum_should_match = translation_minimum_should_match
78 73 self.translation_boost = float(translation_boost)
79 74 self.tie_breaker_base_query = float(tie_breaker_base_query)
80   - self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale)
81 75 default_best_fields = {
82 76 base: self._get_field_boost(base)
83 77 for base in self.core_multilingual_fields
... ... @@ -180,7 +174,6 @@ class ESQueryBuilder:
180 174 knn_num_candidates: int = 200,
181 175 min_score: Optional[float] = None,
182 176 parsed_query: Optional[Any] = None,
183   - index_languages: Optional[List[str]] = None,
184 177 ) -> Dict[str, Any]:
185 178 """
186 179 Build complete ES query with post_filter support for multi-select faceting.
... ... @@ -223,11 +216,7 @@ class ESQueryBuilder:
223 216 # Text recall (always include if query_text exists)
224 217 if query_text:
225 218 # Unified text query strategy
226   - text_query = self._build_advanced_text_query(
227   - query_text,
228   - parsed_query,
229   - index_languages=index_languages,
230   - )
  219 + text_query = self._build_advanced_text_query(query_text, parsed_query)
231 220 recall_clauses.append(text_query)
232 221  
233 222 # Embedding recall (KNN - separate from query, handled below)
... ... @@ -434,90 +423,36 @@ class ESQueryBuilder:
434 423 return float(self.field_boosts[base_field])
435 424 return 1.0
436 425  
437   - def _build_match_field_specs(
  426 + def _match_field_strings(
438 427 self,
439 428 language: str,
440 429 *,
441 430 multilingual_fields: Optional[List[str]] = None,
442 431 shared_fields: Optional[List[str]] = None,
443 432 boost_overrides: Optional[Dict[str, float]] = None,
444   - ) -> List[MatchFieldSpec]:
445   - """
446   - Per-language match targets as (field_path, boost). Single source of truth before
447   - formatting as Elasticsearch ``fields`` strings.
448   - """
  433 + ) -> List[str]:
  434 + """Build ``multi_match`` / ``combined_fields`` field entries for one language code."""
449 435 lang = (language or "").strip().lower()
450   - specs: List[MatchFieldSpec] = []
451   - text_fields = multilingual_fields if multilingual_fields is not None else self.multilingual_fields
  436 + text_bases = multilingual_fields if multilingual_fields is not None else self.multilingual_fields
452 437 term_fields = shared_fields if shared_fields is not None else self.shared_fields
453 438 overrides = boost_overrides or {}
454   -
455   - for base in text_fields:
456   - field = f"{base}.{lang}"
  439 + out: List[str] = []
  440 + for base in text_bases:
  441 + path = f"{base}.{lang}"
457 442 boost = float(overrides.get(base, self._get_field_boost(base, lang)))
458   - specs.append((field, boost))
459   -
  443 + out.append(self._format_field_with_boost(path, boost))
460 444 for shared in term_fields:
461 445 boost = float(overrides.get(shared, self._get_field_boost(shared, None)))
462   - specs.append((shared, boost))
463   - return specs
464   -
465   - def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]:
466   - """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``."""
467   - return [self._format_field_with_boost(path, boost) for path, boost in specs]
468   -
469   - def _merge_supplemental_lang_field_specs(
470   - self,
471   - specs: List[MatchFieldSpec],
472   - supplemental_lang: str,
473   - ) -> List[MatchFieldSpec]:
474   - """Append supplemental-language columns; boosts multiplied by mixed_script scale."""
475   - scale = float(self.mixed_script_merged_field_boost_scale)
476   - extra_all = self._build_match_field_specs(supplemental_lang)
477   - seen = {path for path, _ in specs}
478   - out = list(specs)
479   - for path, boost in extra_all:
480   - if path not in seen:
481   - out.append((path, boost * scale))
482   - seen.add(path)
483   - return out
484   -
485   - def _expand_match_field_specs_for_mixed_script(
486   - self,
487   - lang: str,
488   - specs: List[MatchFieldSpec],
489   - contains_chinese: bool,
490   - contains_english: bool,
491   - index_languages: List[str],
492   - is_source: bool = False
493   - ) -> List[MatchFieldSpec]:
494   - """
495   - When the query mixes scripts, widen each clause to indexed fields for the other script
496   - (e.g. zh clause also searches title.en when the query contains an English word token).
497   - """
498   - norm = {str(x or "").strip().lower() for x in (index_languages or []) if str(x or "").strip()}
499   - allow = norm or {"zh", "en"}
500   -
501   - def can_use(lcode: str) -> bool:
502   - return lcode in allow if norm else True
503   -
504   - out = list(specs)
505   - lnorm = (lang or "").strip().lower()
506   - if is_source:
507   - if contains_english and lnorm != "en" and can_use("en"):
508   - out = self._merge_supplemental_lang_field_specs(out, "en")
509   - if contains_chinese and lnorm != "zh" and can_use("zh"):
510   - out = self._merge_supplemental_lang_field_specs(out, "zh")
  446 + out.append(self._format_field_with_boost(shared, boost))
511 447 return out
512 448  
513 449 def _build_best_fields_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]:
514   - specs = self._build_match_field_specs(
  450 + fields = self._match_field_strings(
515 451 language,
516 452 multilingual_fields=list(self.best_fields_boosts),
517 453 shared_fields=[],
518 454 boost_overrides=self.best_fields_boosts,
519 455 )
520   - fields = self._format_match_field_specs(specs)
521 456 if not fields:
522 457 return None
523 458 return {
... ... @@ -530,13 +465,12 @@ class ESQueryBuilder:
530 465 }
531 466  
532 467 def _build_phrase_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]:
533   - specs = self._build_match_field_specs(
  468 + fields = self._match_field_strings(
534 469 language,
535 470 multilingual_fields=list(self.phrase_field_boosts),
536 471 shared_fields=[],
537 472 boost_overrides=self.phrase_field_boosts,
538 473 )
539   - fields = self._format_match_field_specs(specs)
540 474 if not fields:
541 475 return None
542 476 clause: Dict[str, Any] = {
... ... @@ -560,20 +494,8 @@ class ESQueryBuilder:
560 494 clause_name: str,
561 495 *,
562 496 is_source: bool,
563   - contains_chinese: bool,
564   - contains_english: bool,
565   - index_languages: List[str],
566 497 ) -> Optional[Dict[str, Any]]:
567   - all_specs = self._build_match_field_specs(lang)
568   - expanded_specs = self._expand_match_field_specs_for_mixed_script(
569   - lang,
570   - all_specs,
571   - contains_chinese,
572   - contains_english,
573   - index_languages,
574   - is_source,
575   - )
576   - combined_fields = self._format_match_field_specs(expanded_specs)
  498 + combined_fields = self._match_field_strings(lang)
577 499 if not combined_fields:
578 500 return None
579 501 minimum_should_match = (
... ... @@ -607,29 +529,10 @@ class ESQueryBuilder:
607 529 clause["bool"]["boost"] = float(self.translation_boost)
608 530 return clause
609 531  
610   - def _get_embedding_field(self, language: str) -> str:
611   - """Get embedding field name for a language."""
612   - # Currently using unified embedding field
613   - return self.text_embedding_field or "title_embedding"
614   -
615   - @staticmethod
616   - def _normalize_language_list(languages: Optional[List[str]]) -> List[str]:
617   - normalized: List[str] = []
618   - seen = set()
619   - for language in languages or []:
620   - token = str(language or "").strip().lower()
621   - if not token or token in seen:
622   - continue
623   - seen.add(token)
624   - normalized.append(token)
625   - return normalized
626   -
627 532 def _build_advanced_text_query(
628 533 self,
629 534 query_text: str,
630 535 parsed_query: Optional[Any] = None,
631   - *,
632   - index_languages: Optional[List[str]] = None,
633 536 ) -> Dict[str, Any]:
634 537 """
635 538 Build advanced text query using base and translated lexical clauses.
... ... @@ -649,39 +552,26 @@ class ESQueryBuilder:
649 552 should_clauses = []
650 553 source_lang = self.default_language
651 554 translations: Dict[str, str] = {}
652   - contains_chinese = False
653   - contains_english = False
654   - normalized_index_languages = self._normalize_language_list(index_languages)
655 555  
656 556 if parsed_query:
657 557 detected_lang = getattr(parsed_query, "detected_language", None)
658 558 source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language
659 559 translations = getattr(parsed_query, "translations", None) or {}
660   - contains_chinese = bool(getattr(parsed_query, "contains_chinese", False))
661   - contains_english = bool(getattr(parsed_query, "contains_english", False))
662 560  
663 561 source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language
664 562 base_query_text = (
665 563 getattr(parsed_query, "rewritten_query", None) if parsed_query else None
666 564 ) or query_text
667 565  
668   - def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None:
669   - nonlocal should_clauses
670   - clause = self._build_lexical_language_clause(
671   - lang,
672   - lang_query,
673   - clause_name,
674   - is_source=is_source,
675   - contains_chinese=contains_chinese,
676   - contains_english=contains_english,
677   - index_languages=normalized_index_languages,
678   - )
679   - if not clause:
680   - return
681   - should_clauses.append(clause)
682   -
683 566 if base_query_text:
684   - append_clause(source_lang, base_query_text, "base_query", True)
  567 + base_clause = self._build_lexical_language_clause(
  568 + source_lang,
  569 + base_query_text,
  570 + "base_query",
  571 + is_source=True,
  572 + )
  573 + if base_clause:
  574 + should_clauses.append(base_clause)
685 575  
686 576 for lang, translated_text in translations.items():
687 577 normalized_lang = str(lang or "").strip().lower()
... ... @@ -690,7 +580,14 @@ class ESQueryBuilder:
690 580 continue
691 581 if normalized_lang == source_lang and normalized_text == base_query_text:
692 582 continue
693   - append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False)
  583 + trans_clause = self._build_lexical_language_clause(
  584 + normalized_lang,
  585 + normalized_text,
  586 + f"base_query_trans_{normalized_lang}",
  587 + is_source=False,
  588 + )
  589 + if trans_clause:
  590 + should_clauses.append(trans_clause)
694 591  
695 592 # Fallback to a simple query when language fields cannot be resolved.
696 593 if not should_clauses:
... ...
search/searcher.py
... ... @@ -645,7 +645,6 @@ class Searcher:
645 645 enable_knn=enable_embedding and parsed_query.query_vector is not None,
646 646 min_score=min_score,
647 647 parsed_query=parsed_query,
648   - index_languages=index_langs,
649 648 )
650 649  
651 650 # Add facets for faceted search
... ...
tests/test_es_query_builder.py
... ... @@ -9,6 +9,9 @@ from search.es_query_builder import ESQueryBuilder
9 9 def _builder() -> ESQueryBuilder:
10 10 return ESQueryBuilder(
11 11 match_fields=["title.en^3.0", "brief.en^1.0"],
  12 + multilingual_fields=["title", "brief"],
  13 + core_multilingual_fields=["title", "brief"],
  14 + shared_fields=[],
12 15 text_embedding_field="title_embedding",
13 16 default_language="en",
14 17 )
... ... @@ -25,10 +28,6 @@ def _lexical_clause(query_root: Dict[str, Any]) -> Dict[str, Any]:
25 28 raise AssertionError("no lexical bool clause in query_root")
26 29  
27 30  
28   -def _lexical_combined_fields(query_root: Dict[str, Any]) -> list:
29   - return _lexical_clause(query_root)["must"][0]["combined_fields"]["fields"]
30   -
31   -
32 31 def test_knn_prefilter_includes_range_filters():
33 32 qb = _builder()
34 33 q = qb.build_query(
... ... @@ -93,7 +92,6 @@ def test_text_query_contains_only_base_and_translation_named_queries():
93 92 query_text="dress",
94 93 parsed_query=parsed_query,
95 94 enable_knn=False,
96   - index_languages=["en", "zh", "fr"],
97 95 )
98 96 should = q["query"]["bool"]["should"]
99 97 names = [clause["bool"]["_name"] for clause in should]
... ... @@ -115,120 +113,8 @@ def test_text_query_skips_duplicate_translation_same_as_base():
115 113 query_text="dress",
116 114 parsed_query=parsed_query,
117 115 enable_knn=False,
118   - index_languages=["en", "zh"],
119 116 )
120 117  
121 118 root = q["query"]
122 119 assert root["bool"]["_name"] == "base_query"
123 120 assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"]
124   -
125   -
126   -def test_mixed_script_merges_en_fields_into_zh_clause():
127   - qb = ESQueryBuilder(
128   - match_fields=["title.en^3.0"],
129   - multilingual_fields=["title", "brief"],
130   - shared_fields=[],
131   - text_embedding_field="title_embedding",
132   - default_language="en",
133   - )
134   - parsed_query = SimpleNamespace(
135   - rewritten_query="法式 dress",
136   - detected_language="zh",
137   - translations={},
138   - contains_chinese=True,
139   - contains_english=True,
140   - )
141   - q = qb.build_query(
142   - query_text="法式 dress",
143   - parsed_query=parsed_query,
144   - enable_knn=False,
145   - index_languages=["zh", "en"],
146   - )
147   - fields = _lexical_combined_fields(q["query"])
148   - bases = {f.split("^", 1)[0] for f in fields}
149   - assert "title.zh" in bases and "title.en" in bases
150   - assert "brief.zh" in bases and "brief.en" in bases
151   - # Merged supplemental language fields use boost * 0.6 by default.
152   - assert "title.en^0.6" in fields
153   - assert "brief.en^0.6" in fields
154   -
155   -
156   -def test_mixed_script_merges_zh_fields_into_en_clause():
157   - qb = ESQueryBuilder(
158   - match_fields=["title.en^3.0"],
159   - multilingual_fields=["title"],
160   - shared_fields=[],
161   - text_embedding_field="title_embedding",
162   - default_language="en",
163   - )
164   - parsed_query = SimpleNamespace(
165   - rewritten_query="red 连衣裙",
166   - detected_language="en",
167   - translations={},
168   - contains_chinese=True,
169   - contains_english=True,
170   - )
171   - q = qb.build_query(
172   - query_text="red 连衣裙",
173   - parsed_query=parsed_query,
174   - enable_knn=False,
175   - index_languages=["zh", "en"],
176   - )
177   - fields = _lexical_combined_fields(q["query"])
178   - bases = {f.split("^", 1)[0] for f in fields}
179   - assert "title.en" in bases and "title.zh" in bases
180   - assert "title.zh^0.6" in fields
181   -
182   -
183   -def test_mixed_script_merged_fields_scale_configured_boosts():
184   - qb = ESQueryBuilder(
185   - match_fields=["title.en^3.0"],
186   - multilingual_fields=["title"],
187   - shared_fields=[],
188   - field_boosts={"title.zh": 5.0, "title.en": 10.0},
189   - text_embedding_field="title_embedding",
190   - default_language="en",
191   - )
192   - parsed_query = SimpleNamespace(
193   - rewritten_query="法式 dress",
194   - detected_language="zh",
195   - translations={},
196   - contains_chinese=True,
197   - contains_english=True,
198   - )
199   - q = qb.build_query(
200   - query_text="法式 dress",
201   - parsed_query=parsed_query,
202   - enable_knn=False,
203   - index_languages=["zh", "en"],
204   - )
205   - fields = _lexical_combined_fields(q["query"])
206   - assert "title.zh^5.0" in fields
207   - assert "title.en^6.0" in fields # 10.0 * 0.6
208   -
209   -
210   -def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
211   - qb = ESQueryBuilder(
212   - match_fields=["title.zh^3.0"],
213   - multilingual_fields=["title"],
214   - shared_fields=[],
215   - text_embedding_field="title_embedding",
216   - default_language="zh",
217   - )
218   - parsed_query = SimpleNamespace(
219   - rewritten_query="法式 dress",
220   - detected_language="zh",
221   - translations={},
222   - contains_chinese=True,
223   - contains_english=True,
224   - )
225   - q = qb.build_query(
226   - query_text="法式 dress",
227   - parsed_query=parsed_query,
228   - enable_knn=False,
229   - index_languages=["zh"],
230   - )
231   - fields = _lexical_combined_fields(q["query"])
232   - bases = {f.split("^", 1)[0] for f in fields}
233   - assert "title.zh" in bases
234   - assert "title.en" not in bases
... ...
tests/test_es_query_builder_text_recall_languages.py
1 1 """
2 2 ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*.
3 3  
4   -Covers combinations of query language vs tenant index_languages, translations,
5   -and mixed Chinese/English queries. Asserts named lexical clause boundaries,
6   -combined_fields payloads, and per-language target fields (title.{lang}).
  4 +Covers translation routing, mixed-script queries (per-clause language fields only),
  5 +and clause naming. Asserts named lexical clause boundaries, combined_fields payloads,
  6 +and per-language target fields (title.{lang}).
7 7 """
8 8  
9 9 from types import SimpleNamespace
... ... @@ -14,11 +14,7 @@ import numpy as np
14 14 from search.es_query_builder import ESQueryBuilder
15 15  
16 16  
17   -def _builder_multilingual_title_only(
18   - *,
19   - default_language: str = "en",
20   - mixed_script_scale: float = 0.6,
21   -) -> ESQueryBuilder:
  17 +def _builder_multilingual_title_only(*, default_language: str = "en") -> ESQueryBuilder:
22 18 """Minimal builder: only title.{lang} for easy field assertions."""
23 19 return ESQueryBuilder(
24 20 match_fields=["title.en^1.0"],
... ... @@ -26,7 +22,6 @@ def _builder_multilingual_title_only(
26 22 shared_fields=[],
27 23 text_embedding_field="title_embedding",
28 24 default_language=default_language,
29   - mixed_script_merged_field_boost_scale=mixed_script_scale,
30 25 function_score_config=None,
31 26 )
32 27  
... ... @@ -101,22 +96,16 @@ def _build(
101 96 rewritten: str,
102 97 detected_language: str,
103 98 translations: Dict[str, str],
104   - index_languages: List[str],
105   - contains_chinese: bool = False,
106   - contains_english: bool = False,
107 99 ) -> Dict[str, Any]:
108 100 parsed = SimpleNamespace(
109 101 rewritten_query=rewritten,
110 102 detected_language=detected_language,
111 103 translations=dict(translations),
112   - contains_chinese=contains_chinese,
113   - contains_english=contains_english,
114 104 )
115 105 return qb.build_query(
116 106 query_text=query_text,
117 107 parsed_query=parsed,
118 108 enable_knn=False,
119   - index_languages=index_languages,
120 109 )
121 110  
122 111  
... ... @@ -131,7 +120,6 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en():
131 120 rewritten="连衣裙",
132 121 detected_language="zh",
133 122 translations={"en": "dress"},
134   - index_languages=["zh", "en"],
135 123 )
136 124 idx = _clauses_index(q)
137 125 assert set(idx) == {"base_query", "base_query_trans_en"}
... ... @@ -149,7 +137,6 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh():
149 137 rewritten="dress",
150 138 detected_language="en",
151 139 translations={"zh": "连衣裙"},
152   - index_languages=["en", "zh"],
153 140 )
154 141 idx = _clauses_index(q)
155 142 assert set(idx) == {"base_query", "base_query_trans_zh"}
... ... @@ -167,7 +154,6 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations():
167 154 rewritten="kleid",
168 155 detected_language="de",
169 156 translations={"en": "dress", "fr": "robe"},
170   - index_languages=["de", "en", "fr"],
171 157 )
172 158 idx = _clauses_index(q)
173 159 assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"}
... ... @@ -188,7 +174,6 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
188 174 rewritten="schuh",
189 175 detected_language="de",
190 176 translations={"en": "shoe", "zh": "鞋"},
191   - index_languages=["en", "zh"],
192 177 )
193 178 idx = _clauses_index(q)
194 179 assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"}
... ... @@ -201,10 +186,10 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
201 186 assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost
202 187  
203 188  
204   -# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 ---
  189 +# --- 中英混写:base 打在检测语种字段;翻译子句打在译文语种字段 ---
205 190  
206 191  
207   -def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
  192 +def test_mixed_zh_detected_base_clause_zh_fields_only_with_en_translation():
208 193 qb = _builder_multilingual_title_only(default_language="en")
209 194 q = _build(
210 195 qb,
... ... @@ -212,19 +197,16 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
212 197 rewritten="红色 dress",
213 198 detected_language="zh",
214 199 translations={"en": "red dress"},
215   - index_languages=["zh", "en"],
216   - contains_chinese=True,
217   - contains_english=True,
218 200 )
219 201 idx = _clauses_index(q)
220 202 assert set(idx) == {"base_query", "base_query_trans_en"}
221 203 assert _combined_fields_clause(idx["base_query"])["query"] == "红色 dress"
222   - assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en")
  204 + assert _has_title_lang(idx["base_query"], "zh") and not _has_title_lang(idx["base_query"], "en")
223 205 assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress"
224 206 assert _has_title_lang(idx["base_query_trans_en"], "en")
225 207  
226 208  
227   -def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
  209 +def test_mixed_en_detected_base_clause_en_fields_only_with_zh_translation():
228 210 qb = _builder_multilingual_title_only(default_language="en")
229 211 q = _build(
230 212 qb,
... ... @@ -232,18 +214,15 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
232 214 rewritten="nike 运动鞋",
233 215 detected_language="en",
234 216 translations={"zh": "耐克运动鞋"},
235   - index_languages=["zh", "en"],
236   - contains_chinese=True,
237   - contains_english=True,
238 217 )
239 218 idx = _clauses_index(q)
240 219 assert set(idx) == {"base_query", "base_query_trans_zh"}
241 220 assert _combined_fields_clause(idx["base_query"])["query"] == "nike 运动鞋"
242   - assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh")
  221 + assert _has_title_lang(idx["base_query"], "en") and not _has_title_lang(idx["base_query"], "zh")
243 222 assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "耐克运动鞋"
244 223  
245 224  
246   -def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
  225 +def test_zh_query_no_translations_only_zh_fields():
247 226 qb = _builder_multilingual_title_only(default_language="en")
248 227 q = _build(
249 228 qb,
... ... @@ -251,9 +230,6 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
251 230 rewritten="法式 dress",
252 231 detected_language="zh",
253 232 translations={},
254   - index_languages=["zh"],
255   - contains_chinese=True,
256   - contains_english=True,
257 233 )
258 234 idx = _clauses_index(q)
259 235 assert set(idx) == {"base_query"}
... ... @@ -272,7 +248,6 @@ def test_skips_translation_when_same_lang_and_same_text_as_base():
272 248 rewritten="NIKE",
273 249 detected_language="en",
274 250 translations={"en": "NIKE", "zh": "耐克"},
275   - index_languages=["en", "zh"],
276 251 )
277 252 idx = _clauses_index(q)
278 253 assert set(idx) == {"base_query", "base_query_trans_zh"}
... ... @@ -286,7 +261,6 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base():
286 261 rewritten="NIKE",
287 262 detected_language="en",
288 263 translations={"zh": "NIKE"},
289   - index_languages=["en", "zh"],
290 264 )
291 265 idx = _clauses_index(q)
292 266 assert set(idx) == {"base_query", "base_query_trans_zh"}
... ... @@ -304,7 +278,6 @@ def test_translation_language_key_is_normalized_case_insensitive():
304 278 rewritten="dress",
305 279 detected_language="en",
306 280 translations={"ZH": "连衣裙"},
307   - index_languages=["en", "zh"],
308 281 )
309 282 idx = _clauses_index(q)
310 283 assert "base_query_trans_zh" in idx
... ... @@ -319,17 +292,16 @@ def test_empty_translation_value_is_skipped():
319 292 rewritten="dress",
320 293 detected_language="en",
321 294 translations={"zh": " ", "fr": "robe"},
322   - index_languages=["en", "zh", "fr"],
323 295 )
324 296 idx = _clauses_index(q)
325 297 assert "base_query_trans_zh" not in idx
326 298 assert "base_query_trans_fr" in idx
327 299  
328 300  
329   -# --- index_languages 为空:视为「未约束」source_in_index 为 True ---
  301 +# --- base 子句无 bool.boost;翻译子句带 translation_boost;phrase should 继承 phrase_match_boost ---
330 302  
331 303  
332   -def test_empty_index_languages_treats_source_as_in_index_boosts():
  304 +def test_de_base_and_en_translation_phrase_boosts():
333 305 qb = _builder_multilingual_title_only(default_language="en")
334 306 q = _build(
335 307 qb,
... ... @@ -337,7 +309,6 @@ def test_empty_index_languages_treats_source_as_in_index_boosts():
337 309 rewritten="x",
338 310 detected_language="de",
339 311 translations={"en": "y"},
340   - index_languages=[],
341 312 )
342 313 idx = _clauses_index(q)
343 314 assert "boost" not in idx["base_query"]
... ... @@ -359,7 +330,6 @@ def test_no_translations_only_base_query():
359 330 rewritten="hello",
360 331 detected_language="en",
361 332 translations={},
362   - index_languages=["en", "zh"],
363 333 )
364 334 idx = _clauses_index(q)
365 335 assert set(idx) == {"base_query"}
... ... @@ -374,15 +344,12 @@ def test_text_clauses_present_alongside_knn():
374 344 rewritten_query="dress",
375 345 detected_language="en",
376 346 translations={"zh": "连衣裙"},
377   - contains_chinese=False,
378   - contains_english=True,
379 347 )
380 348 q = qb.build_query(
381 349 query_text="dress",
382 350 query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32),
383 351 parsed_query=parsed,
384 352 enable_knn=True,
385   - index_languages=["en", "zh"],
386 353 )
387 354 assert "knn" in q
388 355 idx = _clauses_index(q)
... ... @@ -396,14 +363,11 @@ def test_detected_language_unknown_falls_back_to_default_language():
396 363 rewritten_query="shirt",
397 364 detected_language="unknown",
398 365 translations={"zh": "衬衫"},
399   - contains_chinese=False,
400   - contains_english=True,
401 366 )
402 367 q = qb.build_query(
403 368 query_text="shirt",
404 369 parsed_query=parsed,
405 370 enable_knn=False,
406   - index_languages=["en", "zh"],
407 371 )
408 372 idx = _clauses_index(q)
409 373 assert set(idx) == {"base_query", "base_query_trans_zh"}
... ... @@ -419,7 +383,6 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
419 383 rewritten="платье",
420 384 detected_language="ru",
421 385 translations={"en": "dress"},
422   - index_languages=["ru", "en"],
423 386 )
424 387 idx = _clauses_index(q)
425 388 assert set(idx) == {"base_query", "base_query_trans_en"}
... ... @@ -428,11 +391,8 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
428 391 assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress"
429 392  
430 393  
431   -def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():
432   - """
433   - 当前实现:凡是 translations 里非空的条目都会生成子句;
434   - index_languages 只约束混写扩列,不用于过滤翻译子句。
435   - """
  394 +def test_translation_generates_clause_for_any_target_lang_key():
  395 + """translations 里非空的每个语种键都会生成对应 base_query_trans_* 子句。"""
436 396 qb = _builder_multilingual_title_only(default_language="en")
437 397 q = _build(
438 398 qb,
... ... @@ -440,7 +400,6 @@ def test_translation_for_lang_not_listed_in_index_languages_still_generates_clau
440 400 rewritten="dress",
441 401 detected_language="en",
442 402 translations={"zh": "连衣裙", "de": "Kleid"},
443   - index_languages=["en", "zh"],
444 403 )
445 404 idx = _clauses_index(q)
446 405 assert "base_query_trans_de" in idx
... ... @@ -457,9 +416,6 @@ def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_bas
457 416 rewritten="红色连衣裙",
458 417 detected_language="zh",
459 418 translations={"en": "red dress"},
460   - index_languages=["zh", "en"],
461   - contains_chinese=True,
462   - contains_english=False,
463 419 )
464 420 idx = _clauses_index(q)
465 421 assert _combined_fields_clause(idx["base_query"])["query"] == "红色连衣裙"
... ...
tests/test_query_parser_mixed_language.py
... ... @@ -11,14 +11,6 @@ def _tokenizer(text):
11 11 return str(text).split()
12 12  
13 13  
14   -def test_pure_english_word_token_length_and_script():
15   - assert QueryParser._is_pure_english_word_token("ab") is False
16   - assert QueryParser._is_pure_english_word_token("abc") is True
17   - assert QueryParser._is_pure_english_word_token("wi-fi") is True
18   - assert QueryParser._is_pure_english_word_token("连衣裙") is False
19   - assert QueryParser._is_pure_english_word_token("ab12") is False
20   -
21   -
22 14 def _build_config() -> SearchConfig:
23 15 return SearchConfig(
24 16 es_index_name="test_products",
... ... @@ -36,7 +28,7 @@ def _build_config() -&gt; SearchConfig:
36 28 )
37 29  
38 30  
39   -def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch):
  31 +def test_parse_mixed_zh_query_translates_to_en(monkeypatch):
40 32 parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
41 33 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh")
42 34  
... ... @@ -48,15 +40,13 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo
48 40 )
49 41  
50 42 assert result.detected_language == "zh"
51   - assert result.contains_chinese is True
52   - assert result.contains_english is True
53 43 assert result.translations == {"en": "法式 dress 连衣裙-en"}
54 44 assert result.query_tokens == ["法式", "dress", "连衣裙"]
55 45 assert not hasattr(result, "query_text_by_lang")
56 46 assert not hasattr(result, "search_langs")
57 47  
58 48  
59   -def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
  49 +def test_parse_mixed_en_query_translates_to_zh(monkeypatch):
60 50 parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
61 51 monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
62 52  
... ... @@ -68,8 +58,6 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
68 58 )
69 59  
70 60 assert result.detected_language == "en"
71   - assert result.contains_chinese is True
72   - assert result.contains_english is True
73 61 assert result.translations == {"zh": "red 连衣裙-zh"}
74 62 assert result.query_tokens == ["red", "连衣裙"]
75 63  
... ... @@ -87,7 +75,5 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch)
87 75 )
88 76  
89 77 assert result.detected_language == "en"
90   - assert result.contains_chinese is False
91   - assert result.contains_english is True
92 78 assert result.translations.get("zh") == "off shoulder top-zh"
93 79 assert not hasattr(result, "source_in_index_languages")
... ...