Commit 35da381333a86fda7b179f767b6f5a5dcd157a65
1 parent
445496cd
中英混写query的优化逻辑,不适合新的combined_fields+best_fields+phrase查询方式,带来的复杂度较多,清理该部分逻辑
Showing
7 changed files
with
53 additions
and
362 deletions
Show diff stats
docs/相关性检索优化说明.md
| ... | ... | @@ -17,9 +17,9 @@ |
| 17 | 17 | 查询链路(文本相关): |
| 18 | 18 | |
| 19 | 19 | 1. `QueryParser.parse()` |
| 20 | - 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。 | |
| 20 | + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`。 | |
| 21 | 21 | 2. `Searcher.search()` |
| 22 | - 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。 | |
| 22 | + 负责读取租户 `index_languages`,并将其传给 `QueryParser` 作为 `target_languages`(控制翻译目标语种);`ESQueryBuilder` 仅根据 `detected_language` 与各条译文构建子句字段,不再接收 `index_languages`。 | |
| 23 | 23 | 3. `ESQueryBuilder._build_advanced_text_query()` |
| 24 | 24 | 基于 `rewritten_query + detected_language + translations` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。 |
| 25 | 25 | 4. `build_query()` |
| ... | ... | @@ -76,9 +76,6 @@ |
| 76 | 76 | |
| 77 | 77 | 最终按 `bool.should` 组合,`minimum_should_match: 1`。 |
| 78 | 78 | |
| 79 | -> **附 — 混写辅助召回** | |
| 80 | -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 | |
| 81 | - | |
| 82 | 79 | ## 5. 关键配置项(文本策略) |
| 83 | 80 | |
| 84 | 81 | `query_config` 下与解析等相关的项: |
| ... | ... | @@ -147,11 +144,9 @@ |
| 147 | 144 | - `translations` |
| 148 | 145 | - `query_vector` |
| 149 | 146 | - `query_tokens` |
| 150 | - - `contains_chinese` / `contains_english` | |
| 151 | 147 | - `Searcher` 负责“租户语境”: |
| 152 | 148 | - `index_languages` |
| 153 | 149 | - 将其传给 parser 作为 `target_languages` |
| 154 | - - 将其传给 builder 作为字段展开约束 | |
| 155 | 150 | - `ESQueryBuilder` 负责“表达式展开”: |
| 156 | 151 | - 动态字段组装 |
| 157 | 152 | - 子句权重分配 | ... | ... |
query/query_parser.py
| ... | ... | @@ -50,8 +50,6 @@ class ParsedQuery: |
| 50 | 50 | translations: Dict[str, str] = field(default_factory=dict) |
| 51 | 51 | query_vector: Optional[np.ndarray] = None |
| 52 | 52 | query_tokens: List[str] = field(default_factory=list) |
| 53 | - contains_chinese: bool = False | |
| 54 | - contains_english: bool = False | |
| 55 | 53 | |
| 56 | 54 | def to_dict(self) -> Dict[str, Any]: |
| 57 | 55 | """Convert to dictionary representation.""" |
| ... | ... | @@ -62,8 +60,6 @@ class ParsedQuery: |
| 62 | 60 | "detected_language": self.detected_language, |
| 63 | 61 | "translations": self.translations, |
| 64 | 62 | "query_tokens": self.query_tokens, |
| 65 | - "contains_chinese": self.contains_chinese, | |
| 66 | - "contains_english": self.contains_english, | |
| 67 | 63 | } |
| 68 | 64 | |
| 69 | 65 | |
| ... | ... | @@ -202,21 +198,6 @@ class QueryParser: |
| 202 | 198 | def _get_query_tokens(self, query: str) -> List[str]: |
| 203 | 199 | return self._extract_tokens(self._tokenizer(query)) |
| 204 | 200 | |
| 205 | - @staticmethod | |
| 206 | - def _contains_cjk(text: str) -> bool: | |
| 207 | - """Whether query contains any CJK ideograph.""" | |
| 208 | - return bool(re.search(r"[\u4e00-\u9fff]", text or "")) | |
| 209 | - | |
| 210 | - @staticmethod | |
| 211 | - def _is_pure_english_word_token(token: str) -> bool: | |
| 212 | - """ | |
| 213 | - A tokenizer token counts as English iff it is letters only (optional internal hyphens) | |
| 214 | - and length >= 3. | |
| 215 | - """ | |
| 216 | - if not token or len(token) < 3: | |
| 217 | - return False | |
| 218 | - return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token)) | |
| 219 | - | |
| 220 | 201 | def parse( |
| 221 | 202 | self, |
| 222 | 203 | query: str, |
| ... | ... | @@ -285,19 +266,12 @@ class QueryParser: |
| 285 | 266 | log_info(f"Language detection | Detected language: {detected_lang}") |
| 286 | 267 | if context: |
| 287 | 268 | context.store_intermediate_result('detected_language', detected_lang) |
| 288 | - # Stage 4: Query analysis (tokenization + script flags) | |
| 269 | + # Stage 4: Query analysis (tokenization) | |
| 289 | 270 | query_tokens = self._get_query_tokens(query_text) |
| 290 | - contains_chinese = self._contains_cjk(query_text) | |
| 291 | - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) | |
| 292 | 271 | |
| 293 | - log_debug( | |
| 294 | - f"Query analysis | Query tokens: {query_tokens} | " | |
| 295 | - f"contains_chinese={contains_chinese} | contains_english={contains_english}" | |
| 296 | - ) | |
| 272 | + log_debug(f"Query analysis | Query tokens: {query_tokens}") | |
| 297 | 273 | if context: |
| 298 | 274 | context.store_intermediate_result('query_tokens', query_tokens) |
| 299 | - context.store_intermediate_result('contains_chinese', contains_chinese) | |
| 300 | - context.store_intermediate_result('contains_english', contains_english) | |
| 301 | 275 | |
| 302 | 276 | # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the |
| 303 | 277 | # caller decides translation targets and later search-field planning. |
| ... | ... | @@ -459,8 +433,6 @@ class QueryParser: |
| 459 | 433 | translations=translations, |
| 460 | 434 | query_vector=query_vector, |
| 461 | 435 | query_tokens=query_tokens, |
| 462 | - contains_chinese=contains_chinese, | |
| 463 | - contains_english=contains_english, | |
| 464 | 436 | ) |
| 465 | 437 | |
| 466 | 438 | if context and hasattr(context, 'logger'): | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -8,14 +8,11 @@ Simplified architecture: |
| 8 | 8 | - function_score wrapper for boosting fields |
| 9 | 9 | """ |
| 10 | 10 | |
| 11 | -from typing import Dict, Any, List, Optional, Union, Tuple | |
| 11 | +from typing import Dict, Any, List, Optional, Tuple | |
| 12 | 12 | |
| 13 | 13 | import numpy as np |
| 14 | 14 | from config import FunctionScoreConfig |
| 15 | 15 | |
| 16 | -# (Elasticsearch field path, boost before formatting as "path^boost") | |
| 17 | -MatchFieldSpec = Tuple[str, float] | |
| 18 | - | |
| 19 | 16 | |
| 20 | 17 | class ESQueryBuilder: |
| 21 | 18 | """Builds Elasticsearch DSL queries.""" |
| ... | ... | @@ -39,7 +36,6 @@ class ESQueryBuilder: |
| 39 | 36 | tie_breaker_base_query: float = 0.9, |
| 40 | 37 | best_fields_boosts: Optional[Dict[str, float]] = None, |
| 41 | 38 | best_fields_clause_boost: float = 2.0, |
| 42 | - mixed_script_merged_field_boost_scale: float = 0.6, | |
| 43 | 39 | phrase_field_boosts: Optional[Dict[str, float]] = None, |
| 44 | 40 | phrase_match_base_fields: Optional[Tuple[str, ...]] = None, |
| 45 | 41 | phrase_match_slop: int = 0, |
| ... | ... | @@ -60,7 +56,6 @@ class ESQueryBuilder: |
| 60 | 56 | function_score_config: Function score configuration |
| 61 | 57 | default_language: Default language to use when detection fails or returns "unknown" |
| 62 | 58 | knn_boost: Boost value for KNN (embedding recall) |
| 63 | - mixed_script_merged_field_boost_scale: Multiply per-field ^boost for cross-script merged fields | |
| 64 | 59 | """ |
| 65 | 60 | self.match_fields = match_fields |
| 66 | 61 | self.field_boosts = field_boosts or {} |
| ... | ... | @@ -77,7 +72,6 @@ class ESQueryBuilder: |
| 77 | 72 | self.translation_minimum_should_match = translation_minimum_should_match |
| 78 | 73 | self.translation_boost = float(translation_boost) |
| 79 | 74 | self.tie_breaker_base_query = float(tie_breaker_base_query) |
| 80 | - self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) | |
| 81 | 75 | default_best_fields = { |
| 82 | 76 | base: self._get_field_boost(base) |
| 83 | 77 | for base in self.core_multilingual_fields |
| ... | ... | @@ -180,7 +174,6 @@ class ESQueryBuilder: |
| 180 | 174 | knn_num_candidates: int = 200, |
| 181 | 175 | min_score: Optional[float] = None, |
| 182 | 176 | parsed_query: Optional[Any] = None, |
| 183 | - index_languages: Optional[List[str]] = None, | |
| 184 | 177 | ) -> Dict[str, Any]: |
| 185 | 178 | """ |
| 186 | 179 | Build complete ES query with post_filter support for multi-select faceting. |
| ... | ... | @@ -223,11 +216,7 @@ class ESQueryBuilder: |
| 223 | 216 | # Text recall (always include if query_text exists) |
| 224 | 217 | if query_text: |
| 225 | 218 | # Unified text query strategy |
| 226 | - text_query = self._build_advanced_text_query( | |
| 227 | - query_text, | |
| 228 | - parsed_query, | |
| 229 | - index_languages=index_languages, | |
| 230 | - ) | |
| 219 | + text_query = self._build_advanced_text_query(query_text, parsed_query) | |
| 231 | 220 | recall_clauses.append(text_query) |
| 232 | 221 | |
| 233 | 222 | # Embedding recall (KNN - separate from query, handled below) |
| ... | ... | @@ -434,90 +423,36 @@ class ESQueryBuilder: |
| 434 | 423 | return float(self.field_boosts[base_field]) |
| 435 | 424 | return 1.0 |
| 436 | 425 | |
| 437 | - def _build_match_field_specs( | |
| 426 | + def _match_field_strings( | |
| 438 | 427 | self, |
| 439 | 428 | language: str, |
| 440 | 429 | *, |
| 441 | 430 | multilingual_fields: Optional[List[str]] = None, |
| 442 | 431 | shared_fields: Optional[List[str]] = None, |
| 443 | 432 | boost_overrides: Optional[Dict[str, float]] = None, |
| 444 | - ) -> List[MatchFieldSpec]: | |
| 445 | - """ | |
| 446 | - Per-language match targets as (field_path, boost). Single source of truth before | |
| 447 | - formatting as Elasticsearch ``fields`` strings. | |
| 448 | - """ | |
| 433 | + ) -> List[str]: | |
| 434 | + """Build ``multi_match`` / ``combined_fields`` field entries for one language code.""" | |
| 449 | 435 | lang = (language or "").strip().lower() |
| 450 | - specs: List[MatchFieldSpec] = [] | |
| 451 | - text_fields = multilingual_fields if multilingual_fields is not None else self.multilingual_fields | |
| 436 | + text_bases = multilingual_fields if multilingual_fields is not None else self.multilingual_fields | |
| 452 | 437 | term_fields = shared_fields if shared_fields is not None else self.shared_fields |
| 453 | 438 | overrides = boost_overrides or {} |
| 454 | - | |
| 455 | - for base in text_fields: | |
| 456 | - field = f"{base}.{lang}" | |
| 439 | + out: List[str] = [] | |
| 440 | + for base in text_bases: | |
| 441 | + path = f"{base}.{lang}" | |
| 457 | 442 | boost = float(overrides.get(base, self._get_field_boost(base, lang))) |
| 458 | - specs.append((field, boost)) | |
| 459 | - | |
| 443 | + out.append(self._format_field_with_boost(path, boost)) | |
| 460 | 444 | for shared in term_fields: |
| 461 | 445 | boost = float(overrides.get(shared, self._get_field_boost(shared, None))) |
| 462 | - specs.append((shared, boost)) | |
| 463 | - return specs | |
| 464 | - | |
| 465 | - def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]: | |
| 466 | - """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``.""" | |
| 467 | - return [self._format_field_with_boost(path, boost) for path, boost in specs] | |
| 468 | - | |
| 469 | - def _merge_supplemental_lang_field_specs( | |
| 470 | - self, | |
| 471 | - specs: List[MatchFieldSpec], | |
| 472 | - supplemental_lang: str, | |
| 473 | - ) -> List[MatchFieldSpec]: | |
| 474 | - """Append supplemental-language columns; boosts multiplied by mixed_script scale.""" | |
| 475 | - scale = float(self.mixed_script_merged_field_boost_scale) | |
| 476 | - extra_all = self._build_match_field_specs(supplemental_lang) | |
| 477 | - seen = {path for path, _ in specs} | |
| 478 | - out = list(specs) | |
| 479 | - for path, boost in extra_all: | |
| 480 | - if path not in seen: | |
| 481 | - out.append((path, boost * scale)) | |
| 482 | - seen.add(path) | |
| 483 | - return out | |
| 484 | - | |
| 485 | - def _expand_match_field_specs_for_mixed_script( | |
| 486 | - self, | |
| 487 | - lang: str, | |
| 488 | - specs: List[MatchFieldSpec], | |
| 489 | - contains_chinese: bool, | |
| 490 | - contains_english: bool, | |
| 491 | - index_languages: List[str], | |
| 492 | - is_source: bool = False | |
| 493 | - ) -> List[MatchFieldSpec]: | |
| 494 | - """ | |
| 495 | - When the query mixes scripts, widen each clause to indexed fields for the other script | |
| 496 | - (e.g. zh clause also searches title.en when the query contains an English word token). | |
| 497 | - """ | |
| 498 | - norm = {str(x or "").strip().lower() for x in (index_languages or []) if str(x or "").strip()} | |
| 499 | - allow = norm or {"zh", "en"} | |
| 500 | - | |
| 501 | - def can_use(lcode: str) -> bool: | |
| 502 | - return lcode in allow if norm else True | |
| 503 | - | |
| 504 | - out = list(specs) | |
| 505 | - lnorm = (lang or "").strip().lower() | |
| 506 | - if is_source: | |
| 507 | - if contains_english and lnorm != "en" and can_use("en"): | |
| 508 | - out = self._merge_supplemental_lang_field_specs(out, "en") | |
| 509 | - if contains_chinese and lnorm != "zh" and can_use("zh"): | |
| 510 | - out = self._merge_supplemental_lang_field_specs(out, "zh") | |
| 446 | + out.append(self._format_field_with_boost(shared, boost)) | |
| 511 | 447 | return out |
| 512 | 448 | |
| 513 | 449 | def _build_best_fields_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: |
| 514 | - specs = self._build_match_field_specs( | |
| 450 | + fields = self._match_field_strings( | |
| 515 | 451 | language, |
| 516 | 452 | multilingual_fields=list(self.best_fields_boosts), |
| 517 | 453 | shared_fields=[], |
| 518 | 454 | boost_overrides=self.best_fields_boosts, |
| 519 | 455 | ) |
| 520 | - fields = self._format_match_field_specs(specs) | |
| 521 | 456 | if not fields: |
| 522 | 457 | return None |
| 523 | 458 | return { |
| ... | ... | @@ -530,13 +465,12 @@ class ESQueryBuilder: |
| 530 | 465 | } |
| 531 | 466 | |
| 532 | 467 | def _build_phrase_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: |
| 533 | - specs = self._build_match_field_specs( | |
| 468 | + fields = self._match_field_strings( | |
| 534 | 469 | language, |
| 535 | 470 | multilingual_fields=list(self.phrase_field_boosts), |
| 536 | 471 | shared_fields=[], |
| 537 | 472 | boost_overrides=self.phrase_field_boosts, |
| 538 | 473 | ) |
| 539 | - fields = self._format_match_field_specs(specs) | |
| 540 | 474 | if not fields: |
| 541 | 475 | return None |
| 542 | 476 | clause: Dict[str, Any] = { |
| ... | ... | @@ -560,20 +494,8 @@ class ESQueryBuilder: |
| 560 | 494 | clause_name: str, |
| 561 | 495 | *, |
| 562 | 496 | is_source: bool, |
| 563 | - contains_chinese: bool, | |
| 564 | - contains_english: bool, | |
| 565 | - index_languages: List[str], | |
| 566 | 497 | ) -> Optional[Dict[str, Any]]: |
| 567 | - all_specs = self._build_match_field_specs(lang) | |
| 568 | - expanded_specs = self._expand_match_field_specs_for_mixed_script( | |
| 569 | - lang, | |
| 570 | - all_specs, | |
| 571 | - contains_chinese, | |
| 572 | - contains_english, | |
| 573 | - index_languages, | |
| 574 | - is_source, | |
| 575 | - ) | |
| 576 | - combined_fields = self._format_match_field_specs(expanded_specs) | |
| 498 | + combined_fields = self._match_field_strings(lang) | |
| 577 | 499 | if not combined_fields: |
| 578 | 500 | return None |
| 579 | 501 | minimum_should_match = ( |
| ... | ... | @@ -607,29 +529,10 @@ class ESQueryBuilder: |
| 607 | 529 | clause["bool"]["boost"] = float(self.translation_boost) |
| 608 | 530 | return clause |
| 609 | 531 | |
| 610 | - def _get_embedding_field(self, language: str) -> str: | |
| 611 | - """Get embedding field name for a language.""" | |
| 612 | - # Currently using unified embedding field | |
| 613 | - return self.text_embedding_field or "title_embedding" | |
| 614 | - | |
| 615 | - @staticmethod | |
| 616 | - def _normalize_language_list(languages: Optional[List[str]]) -> List[str]: | |
| 617 | - normalized: List[str] = [] | |
| 618 | - seen = set() | |
| 619 | - for language in languages or []: | |
| 620 | - token = str(language or "").strip().lower() | |
| 621 | - if not token or token in seen: | |
| 622 | - continue | |
| 623 | - seen.add(token) | |
| 624 | - normalized.append(token) | |
| 625 | - return normalized | |
| 626 | - | |
| 627 | 532 | def _build_advanced_text_query( |
| 628 | 533 | self, |
| 629 | 534 | query_text: str, |
| 630 | 535 | parsed_query: Optional[Any] = None, |
| 631 | - *, | |
| 632 | - index_languages: Optional[List[str]] = None, | |
| 633 | 536 | ) -> Dict[str, Any]: |
| 634 | 537 | """ |
| 635 | 538 | Build advanced text query using base and translated lexical clauses. |
| ... | ... | @@ -649,39 +552,26 @@ class ESQueryBuilder: |
| 649 | 552 | should_clauses = [] |
| 650 | 553 | source_lang = self.default_language |
| 651 | 554 | translations: Dict[str, str] = {} |
| 652 | - contains_chinese = False | |
| 653 | - contains_english = False | |
| 654 | - normalized_index_languages = self._normalize_language_list(index_languages) | |
| 655 | 555 | |
| 656 | 556 | if parsed_query: |
| 657 | 557 | detected_lang = getattr(parsed_query, "detected_language", None) |
| 658 | 558 | source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language |
| 659 | 559 | translations = getattr(parsed_query, "translations", None) or {} |
| 660 | - contains_chinese = bool(getattr(parsed_query, "contains_chinese", False)) | |
| 661 | - contains_english = bool(getattr(parsed_query, "contains_english", False)) | |
| 662 | 560 | |
| 663 | 561 | source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language |
| 664 | 562 | base_query_text = ( |
| 665 | 563 | getattr(parsed_query, "rewritten_query", None) if parsed_query else None |
| 666 | 564 | ) or query_text |
| 667 | 565 | |
| 668 | - def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None: | |
| 669 | - nonlocal should_clauses | |
| 670 | - clause = self._build_lexical_language_clause( | |
| 671 | - lang, | |
| 672 | - lang_query, | |
| 673 | - clause_name, | |
| 674 | - is_source=is_source, | |
| 675 | - contains_chinese=contains_chinese, | |
| 676 | - contains_english=contains_english, | |
| 677 | - index_languages=normalized_index_languages, | |
| 678 | - ) | |
| 679 | - if not clause: | |
| 680 | - return | |
| 681 | - should_clauses.append(clause) | |
| 682 | - | |
| 683 | 566 | if base_query_text: |
| 684 | - append_clause(source_lang, base_query_text, "base_query", True) | |
| 567 | + base_clause = self._build_lexical_language_clause( | |
| 568 | + source_lang, | |
| 569 | + base_query_text, | |
| 570 | + "base_query", | |
| 571 | + is_source=True, | |
| 572 | + ) | |
| 573 | + if base_clause: | |
| 574 | + should_clauses.append(base_clause) | |
| 685 | 575 | |
| 686 | 576 | for lang, translated_text in translations.items(): |
| 687 | 577 | normalized_lang = str(lang or "").strip().lower() |
| ... | ... | @@ -690,7 +580,14 @@ class ESQueryBuilder: |
| 690 | 580 | continue |
| 691 | 581 | if normalized_lang == source_lang and normalized_text == base_query_text: |
| 692 | 582 | continue |
| 693 | - append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False) | |
| 583 | + trans_clause = self._build_lexical_language_clause( | |
| 584 | + normalized_lang, | |
| 585 | + normalized_text, | |
| 586 | + f"base_query_trans_{normalized_lang}", | |
| 587 | + is_source=False, | |
| 588 | + ) | |
| 589 | + if trans_clause: | |
| 590 | + should_clauses.append(trans_clause) | |
| 694 | 591 | |
| 695 | 592 | # Fallback to a simple query when language fields cannot be resolved. |
| 696 | 593 | if not should_clauses: | ... | ... |
search/searcher.py
tests/test_es_query_builder.py
| ... | ... | @@ -9,6 +9,9 @@ from search.es_query_builder import ESQueryBuilder |
| 9 | 9 | def _builder() -> ESQueryBuilder: |
| 10 | 10 | return ESQueryBuilder( |
| 11 | 11 | match_fields=["title.en^3.0", "brief.en^1.0"], |
| 12 | + multilingual_fields=["title", "brief"], | |
| 13 | + core_multilingual_fields=["title", "brief"], | |
| 14 | + shared_fields=[], | |
| 12 | 15 | text_embedding_field="title_embedding", |
| 13 | 16 | default_language="en", |
| 14 | 17 | ) |
| ... | ... | @@ -25,10 +28,6 @@ def _lexical_clause(query_root: Dict[str, Any]) -> Dict[str, Any]: |
| 25 | 28 | raise AssertionError("no lexical bool clause in query_root") |
| 26 | 29 | |
| 27 | 30 | |
| 28 | -def _lexical_combined_fields(query_root: Dict[str, Any]) -> list: | |
| 29 | - return _lexical_clause(query_root)["must"][0]["combined_fields"]["fields"] | |
| 30 | - | |
| 31 | - | |
| 32 | 31 | def test_knn_prefilter_includes_range_filters(): |
| 33 | 32 | qb = _builder() |
| 34 | 33 | q = qb.build_query( |
| ... | ... | @@ -93,7 +92,6 @@ def test_text_query_contains_only_base_and_translation_named_queries(): |
| 93 | 92 | query_text="dress", |
| 94 | 93 | parsed_query=parsed_query, |
| 95 | 94 | enable_knn=False, |
| 96 | - index_languages=["en", "zh", "fr"], | |
| 97 | 95 | ) |
| 98 | 96 | should = q["query"]["bool"]["should"] |
| 99 | 97 | names = [clause["bool"]["_name"] for clause in should] |
| ... | ... | @@ -115,120 +113,8 @@ def test_text_query_skips_duplicate_translation_same_as_base(): |
| 115 | 113 | query_text="dress", |
| 116 | 114 | parsed_query=parsed_query, |
| 117 | 115 | enable_knn=False, |
| 118 | - index_languages=["en", "zh"], | |
| 119 | 116 | ) |
| 120 | 117 | |
| 121 | 118 | root = q["query"] |
| 122 | 119 | assert root["bool"]["_name"] == "base_query" |
| 123 | 120 | assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] |
| 124 | - | |
| 125 | - | |
| 126 | -def test_mixed_script_merges_en_fields_into_zh_clause(): | |
| 127 | - qb = ESQueryBuilder( | |
| 128 | - match_fields=["title.en^3.0"], | |
| 129 | - multilingual_fields=["title", "brief"], | |
| 130 | - shared_fields=[], | |
| 131 | - text_embedding_field="title_embedding", | |
| 132 | - default_language="en", | |
| 133 | - ) | |
| 134 | - parsed_query = SimpleNamespace( | |
| 135 | - rewritten_query="法式 dress", | |
| 136 | - detected_language="zh", | |
| 137 | - translations={}, | |
| 138 | - contains_chinese=True, | |
| 139 | - contains_english=True, | |
| 140 | - ) | |
| 141 | - q = qb.build_query( | |
| 142 | - query_text="法式 dress", | |
| 143 | - parsed_query=parsed_query, | |
| 144 | - enable_knn=False, | |
| 145 | - index_languages=["zh", "en"], | |
| 146 | - ) | |
| 147 | - fields = _lexical_combined_fields(q["query"]) | |
| 148 | - bases = {f.split("^", 1)[0] for f in fields} | |
| 149 | - assert "title.zh" in bases and "title.en" in bases | |
| 150 | - assert "brief.zh" in bases and "brief.en" in bases | |
| 151 | - # Merged supplemental language fields use boost * 0.6 by default. | |
| 152 | - assert "title.en^0.6" in fields | |
| 153 | - assert "brief.en^0.6" in fields | |
| 154 | - | |
| 155 | - | |
| 156 | -def test_mixed_script_merges_zh_fields_into_en_clause(): | |
| 157 | - qb = ESQueryBuilder( | |
| 158 | - match_fields=["title.en^3.0"], | |
| 159 | - multilingual_fields=["title"], | |
| 160 | - shared_fields=[], | |
| 161 | - text_embedding_field="title_embedding", | |
| 162 | - default_language="en", | |
| 163 | - ) | |
| 164 | - parsed_query = SimpleNamespace( | |
| 165 | - rewritten_query="red 连衣裙", | |
| 166 | - detected_language="en", | |
| 167 | - translations={}, | |
| 168 | - contains_chinese=True, | |
| 169 | - contains_english=True, | |
| 170 | - ) | |
| 171 | - q = qb.build_query( | |
| 172 | - query_text="red 连衣裙", | |
| 173 | - parsed_query=parsed_query, | |
| 174 | - enable_knn=False, | |
| 175 | - index_languages=["zh", "en"], | |
| 176 | - ) | |
| 177 | - fields = _lexical_combined_fields(q["query"]) | |
| 178 | - bases = {f.split("^", 1)[0] for f in fields} | |
| 179 | - assert "title.en" in bases and "title.zh" in bases | |
| 180 | - assert "title.zh^0.6" in fields | |
| 181 | - | |
| 182 | - | |
| 183 | -def test_mixed_script_merged_fields_scale_configured_boosts(): | |
| 184 | - qb = ESQueryBuilder( | |
| 185 | - match_fields=["title.en^3.0"], | |
| 186 | - multilingual_fields=["title"], | |
| 187 | - shared_fields=[], | |
| 188 | - field_boosts={"title.zh": 5.0, "title.en": 10.0}, | |
| 189 | - text_embedding_field="title_embedding", | |
| 190 | - default_language="en", | |
| 191 | - ) | |
| 192 | - parsed_query = SimpleNamespace( | |
| 193 | - rewritten_query="法式 dress", | |
| 194 | - detected_language="zh", | |
| 195 | - translations={}, | |
| 196 | - contains_chinese=True, | |
| 197 | - contains_english=True, | |
| 198 | - ) | |
| 199 | - q = qb.build_query( | |
| 200 | - query_text="法式 dress", | |
| 201 | - parsed_query=parsed_query, | |
| 202 | - enable_knn=False, | |
| 203 | - index_languages=["zh", "en"], | |
| 204 | - ) | |
| 205 | - fields = _lexical_combined_fields(q["query"]) | |
| 206 | - assert "title.zh^5.0" in fields | |
| 207 | - assert "title.en^6.0" in fields # 10.0 * 0.6 | |
| 208 | - | |
| 209 | - | |
| 210 | -def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): | |
| 211 | - qb = ESQueryBuilder( | |
| 212 | - match_fields=["title.zh^3.0"], | |
| 213 | - multilingual_fields=["title"], | |
| 214 | - shared_fields=[], | |
| 215 | - text_embedding_field="title_embedding", | |
| 216 | - default_language="zh", | |
| 217 | - ) | |
| 218 | - parsed_query = SimpleNamespace( | |
| 219 | - rewritten_query="法式 dress", | |
| 220 | - detected_language="zh", | |
| 221 | - translations={}, | |
| 222 | - contains_chinese=True, | |
| 223 | - contains_english=True, | |
| 224 | - ) | |
| 225 | - q = qb.build_query( | |
| 226 | - query_text="法式 dress", | |
| 227 | - parsed_query=parsed_query, | |
| 228 | - enable_knn=False, | |
| 229 | - index_languages=["zh"], | |
| 230 | - ) | |
| 231 | - fields = _lexical_combined_fields(q["query"]) | |
| 232 | - bases = {f.split("^", 1)[0] for f in fields} | |
| 233 | - assert "title.zh" in bases | |
| 234 | - assert "title.en" not in bases | ... | ... |
tests/test_es_query_builder_text_recall_languages.py
| 1 | 1 | """ |
| 2 | 2 | ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. |
| 3 | 3 | |
| 4 | -Covers combinations of query language vs tenant index_languages, translations, | |
| 5 | -and mixed Chinese/English queries. Asserts named lexical clause boundaries, | |
| 6 | -combined_fields payloads, and per-language target fields (title.{lang}). | |
| 4 | +Covers translation routing, mixed-script queries (per-clause language fields only), | |
| 5 | +and clause naming. Asserts named lexical clause boundaries, combined_fields payloads, | |
| 6 | +and per-language target fields (title.{lang}). | |
| 7 | 7 | """ |
| 8 | 8 | |
| 9 | 9 | from types import SimpleNamespace |
| ... | ... | @@ -14,11 +14,7 @@ import numpy as np |
| 14 | 14 | from search.es_query_builder import ESQueryBuilder |
| 15 | 15 | |
| 16 | 16 | |
| 17 | -def _builder_multilingual_title_only( | |
| 18 | - *, | |
| 19 | - default_language: str = "en", | |
| 20 | - mixed_script_scale: float = 0.6, | |
| 21 | -) -> ESQueryBuilder: | |
| 17 | +def _builder_multilingual_title_only(*, default_language: str = "en") -> ESQueryBuilder: | |
| 22 | 18 | """Minimal builder: only title.{lang} for easy field assertions.""" |
| 23 | 19 | return ESQueryBuilder( |
| 24 | 20 | match_fields=["title.en^1.0"], |
| ... | ... | @@ -26,7 +22,6 @@ def _builder_multilingual_title_only( |
| 26 | 22 | shared_fields=[], |
| 27 | 23 | text_embedding_field="title_embedding", |
| 28 | 24 | default_language=default_language, |
| 29 | - mixed_script_merged_field_boost_scale=mixed_script_scale, | |
| 30 | 25 | function_score_config=None, |
| 31 | 26 | ) |
| 32 | 27 | |
| ... | ... | @@ -101,22 +96,16 @@ def _build( |
| 101 | 96 | rewritten: str, |
| 102 | 97 | detected_language: str, |
| 103 | 98 | translations: Dict[str, str], |
| 104 | - index_languages: List[str], | |
| 105 | - contains_chinese: bool = False, | |
| 106 | - contains_english: bool = False, | |
| 107 | 99 | ) -> Dict[str, Any]: |
| 108 | 100 | parsed = SimpleNamespace( |
| 109 | 101 | rewritten_query=rewritten, |
| 110 | 102 | detected_language=detected_language, |
| 111 | 103 | translations=dict(translations), |
| 112 | - contains_chinese=contains_chinese, | |
| 113 | - contains_english=contains_english, | |
| 114 | 104 | ) |
| 115 | 105 | return qb.build_query( |
| 116 | 106 | query_text=query_text, |
| 117 | 107 | parsed_query=parsed, |
| 118 | 108 | enable_knn=False, |
| 119 | - index_languages=index_languages, | |
| 120 | 109 | ) |
| 121 | 110 | |
| 122 | 111 | |
| ... | ... | @@ -131,7 +120,6 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): |
| 131 | 120 | rewritten="连衣裙", |
| 132 | 121 | detected_language="zh", |
| 133 | 122 | translations={"en": "dress"}, |
| 134 | - index_languages=["zh", "en"], | |
| 135 | 123 | ) |
| 136 | 124 | idx = _clauses_index(q) |
| 137 | 125 | assert set(idx) == {"base_query", "base_query_trans_en"} |
| ... | ... | @@ -149,7 +137,6 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): |
| 149 | 137 | rewritten="dress", |
| 150 | 138 | detected_language="en", |
| 151 | 139 | translations={"zh": "连衣裙"}, |
| 152 | - index_languages=["en", "zh"], | |
| 153 | 140 | ) |
| 154 | 141 | idx = _clauses_index(q) |
| 155 | 142 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| ... | ... | @@ -167,7 +154,6 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): |
| 167 | 154 | rewritten="kleid", |
| 168 | 155 | detected_language="de", |
| 169 | 156 | translations={"en": "dress", "fr": "robe"}, |
| 170 | - index_languages=["de", "en", "fr"], | |
| 171 | 157 | ) |
| 172 | 158 | idx = _clauses_index(q) |
| 173 | 159 | assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} |
| ... | ... | @@ -188,7 +174,6 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): |
| 188 | 174 | rewritten="schuh", |
| 189 | 175 | detected_language="de", |
| 190 | 176 | translations={"en": "shoe", "zh": "鞋"}, |
| 191 | - index_languages=["en", "zh"], | |
| 192 | 177 | ) |
| 193 | 178 | idx = _clauses_index(q) |
| 194 | 179 | assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} |
| ... | ... | @@ -201,10 +186,10 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): |
| 201 | 186 | assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost |
| 202 | 187 | |
| 203 | 188 | |
| 204 | -# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 --- | |
| 189 | +# --- 中英混写:base 打在检测语种字段;翻译子句打在译文语种字段 --- | |
| 205 | 190 | |
| 206 | 191 | |
| 207 | -def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): | |
| 192 | +def test_mixed_zh_detected_base_clause_zh_fields_only_with_en_translation(): | |
| 208 | 193 | qb = _builder_multilingual_title_only(default_language="en") |
| 209 | 194 | q = _build( |
| 210 | 195 | qb, |
| ... | ... | @@ -212,19 +197,16 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): |
| 212 | 197 | rewritten="红色 dress", |
| 213 | 198 | detected_language="zh", |
| 214 | 199 | translations={"en": "red dress"}, |
| 215 | - index_languages=["zh", "en"], | |
| 216 | - contains_chinese=True, | |
| 217 | - contains_english=True, | |
| 218 | 200 | ) |
| 219 | 201 | idx = _clauses_index(q) |
| 220 | 202 | assert set(idx) == {"base_query", "base_query_trans_en"} |
| 221 | 203 | assert _combined_fields_clause(idx["base_query"])["query"] == "红色 dress" |
| 222 | - assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") | |
| 204 | + assert _has_title_lang(idx["base_query"], "zh") and not _has_title_lang(idx["base_query"], "en") | |
| 223 | 205 | assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress" |
| 224 | 206 | assert _has_title_lang(idx["base_query_trans_en"], "en") |
| 225 | 207 | |
| 226 | 208 | |
| 227 | -def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): | |
| 209 | +def test_mixed_en_detected_base_clause_en_fields_only_with_zh_translation(): | |
| 228 | 210 | qb = _builder_multilingual_title_only(default_language="en") |
| 229 | 211 | q = _build( |
| 230 | 212 | qb, |
| ... | ... | @@ -232,18 +214,15 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): |
| 232 | 214 | rewritten="nike 运动鞋", |
| 233 | 215 | detected_language="en", |
| 234 | 216 | translations={"zh": "耐克运动鞋"}, |
| 235 | - index_languages=["zh", "en"], | |
| 236 | - contains_chinese=True, | |
| 237 | - contains_english=True, | |
| 238 | 217 | ) |
| 239 | 218 | idx = _clauses_index(q) |
| 240 | 219 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| 241 | 220 | assert _combined_fields_clause(idx["base_query"])["query"] == "nike 运动鞋" |
| 242 | - assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") | |
| 221 | + assert _has_title_lang(idx["base_query"], "en") and not _has_title_lang(idx["base_query"], "zh") | |
| 243 | 222 | assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "耐克运动鞋" |
| 244 | 223 | |
| 245 | 224 | |
| 246 | -def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): | |
| 225 | +def test_zh_query_no_translations_only_zh_fields(): | |
| 247 | 226 | qb = _builder_multilingual_title_only(default_language="en") |
| 248 | 227 | q = _build( |
| 249 | 228 | qb, |
| ... | ... | @@ -251,9 +230,6 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): |
| 251 | 230 | rewritten="法式 dress", |
| 252 | 231 | detected_language="zh", |
| 253 | 232 | translations={}, |
| 254 | - index_languages=["zh"], | |
| 255 | - contains_chinese=True, | |
| 256 | - contains_english=True, | |
| 257 | 233 | ) |
| 258 | 234 | idx = _clauses_index(q) |
| 259 | 235 | assert set(idx) == {"base_query"} |
| ... | ... | @@ -272,7 +248,6 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): |
| 272 | 248 | rewritten="NIKE", |
| 273 | 249 | detected_language="en", |
| 274 | 250 | translations={"en": "NIKE", "zh": "耐克"}, |
| 275 | - index_languages=["en", "zh"], | |
| 276 | 251 | ) |
| 277 | 252 | idx = _clauses_index(q) |
| 278 | 253 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| ... | ... | @@ -286,7 +261,6 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): |
| 286 | 261 | rewritten="NIKE", |
| 287 | 262 | detected_language="en", |
| 288 | 263 | translations={"zh": "NIKE"}, |
| 289 | - index_languages=["en", "zh"], | |
| 290 | 264 | ) |
| 291 | 265 | idx = _clauses_index(q) |
| 292 | 266 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| ... | ... | @@ -304,7 +278,6 @@ def test_translation_language_key_is_normalized_case_insensitive(): |
| 304 | 278 | rewritten="dress", |
| 305 | 279 | detected_language="en", |
| 306 | 280 | translations={"ZH": "连衣裙"}, |
| 307 | - index_languages=["en", "zh"], | |
| 308 | 281 | ) |
| 309 | 282 | idx = _clauses_index(q) |
| 310 | 283 | assert "base_query_trans_zh" in idx |
| ... | ... | @@ -319,17 +292,16 @@ def test_empty_translation_value_is_skipped(): |
| 319 | 292 | rewritten="dress", |
| 320 | 293 | detected_language="en", |
| 321 | 294 | translations={"zh": " ", "fr": "robe"}, |
| 322 | - index_languages=["en", "zh", "fr"], | |
| 323 | 295 | ) |
| 324 | 296 | idx = _clauses_index(q) |
| 325 | 297 | assert "base_query_trans_zh" not in idx |
| 326 | 298 | assert "base_query_trans_fr" in idx |
| 327 | 299 | |
| 328 | 300 | |
| 329 | -# --- index_languages 为空:视为「未约束」source_in_index 为 True --- | |
| 301 | +# --- base 子句无 bool.boost;翻译子句带 translation_boost;phrase should 继承 phrase_match_boost --- | |
| 330 | 302 | |
| 331 | 303 | |
| 332 | -def test_empty_index_languages_treats_source_as_in_index_boosts(): | |
| 304 | +def test_de_base_and_en_translation_phrase_boosts(): | |
| 333 | 305 | qb = _builder_multilingual_title_only(default_language="en") |
| 334 | 306 | q = _build( |
| 335 | 307 | qb, |
| ... | ... | @@ -337,7 +309,6 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): |
| 337 | 309 | rewritten="x", |
| 338 | 310 | detected_language="de", |
| 339 | 311 | translations={"en": "y"}, |
| 340 | - index_languages=[], | |
| 341 | 312 | ) |
| 342 | 313 | idx = _clauses_index(q) |
| 343 | 314 | assert "boost" not in idx["base_query"] |
| ... | ... | @@ -359,7 +330,6 @@ def test_no_translations_only_base_query(): |
| 359 | 330 | rewritten="hello", |
| 360 | 331 | detected_language="en", |
| 361 | 332 | translations={}, |
| 362 | - index_languages=["en", "zh"], | |
| 363 | 333 | ) |
| 364 | 334 | idx = _clauses_index(q) |
| 365 | 335 | assert set(idx) == {"base_query"} |
| ... | ... | @@ -374,15 +344,12 @@ def test_text_clauses_present_alongside_knn(): |
| 374 | 344 | rewritten_query="dress", |
| 375 | 345 | detected_language="en", |
| 376 | 346 | translations={"zh": "连衣裙"}, |
| 377 | - contains_chinese=False, | |
| 378 | - contains_english=True, | |
| 379 | 347 | ) |
| 380 | 348 | q = qb.build_query( |
| 381 | 349 | query_text="dress", |
| 382 | 350 | query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32), |
| 383 | 351 | parsed_query=parsed, |
| 384 | 352 | enable_knn=True, |
| 385 | - index_languages=["en", "zh"], | |
| 386 | 353 | ) |
| 387 | 354 | assert "knn" in q |
| 388 | 355 | idx = _clauses_index(q) |
| ... | ... | @@ -396,14 +363,11 @@ def test_detected_language_unknown_falls_back_to_default_language(): |
| 396 | 363 | rewritten_query="shirt", |
| 397 | 364 | detected_language="unknown", |
| 398 | 365 | translations={"zh": "衬衫"}, |
| 399 | - contains_chinese=False, | |
| 400 | - contains_english=True, | |
| 401 | 366 | ) |
| 402 | 367 | q = qb.build_query( |
| 403 | 368 | query_text="shirt", |
| 404 | 369 | parsed_query=parsed, |
| 405 | 370 | enable_knn=False, |
| 406 | - index_languages=["en", "zh"], | |
| 407 | 371 | ) |
| 408 | 372 | idx = _clauses_index(q) |
| 409 | 373 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| ... | ... | @@ -419,7 +383,6 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): |
| 419 | 383 | rewritten="платье", |
| 420 | 384 | detected_language="ru", |
| 421 | 385 | translations={"en": "dress"}, |
| 422 | - index_languages=["ru", "en"], | |
| 423 | 386 | ) |
| 424 | 387 | idx = _clauses_index(q) |
| 425 | 388 | assert set(idx) == {"base_query", "base_query_trans_en"} |
| ... | ... | @@ -428,11 +391,8 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): |
| 428 | 391 | assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" |
| 429 | 392 | |
| 430 | 393 | |
| 431 | -def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): | |
| 432 | - """ | |
| 433 | - 当前实现:凡是 translations 里非空的条目都会生成子句; | |
| 434 | - index_languages 只约束混写扩列,不用于过滤翻译子句。 | |
| 435 | - """ | |
| 394 | +def test_translation_generates_clause_for_any_target_lang_key(): | |
| 395 | + """translations 里非空的每个语种键都会生成对应 base_query_trans_* 子句。""" | |
| 436 | 396 | qb = _builder_multilingual_title_only(default_language="en") |
| 437 | 397 | q = _build( |
| 438 | 398 | qb, |
| ... | ... | @@ -440,7 +400,6 @@ def test_translation_for_lang_not_listed_in_index_languages_still_generates_clau |
| 440 | 400 | rewritten="dress", |
| 441 | 401 | detected_language="en", |
| 442 | 402 | translations={"zh": "连衣裙", "de": "Kleid"}, |
| 443 | - index_languages=["en", "zh"], | |
| 444 | 403 | ) |
| 445 | 404 | idx = _clauses_index(q) |
| 446 | 405 | assert "base_query_trans_de" in idx |
| ... | ... | @@ -457,9 +416,6 @@ def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_bas |
| 457 | 416 | rewritten="红色连衣裙", |
| 458 | 417 | detected_language="zh", |
| 459 | 418 | translations={"en": "red dress"}, |
| 460 | - index_languages=["zh", "en"], | |
| 461 | - contains_chinese=True, | |
| 462 | - contains_english=False, | |
| 463 | 419 | ) |
| 464 | 420 | idx = _clauses_index(q) |
| 465 | 421 | assert _combined_fields_clause(idx["base_query"])["query"] == "红色连衣裙" | ... | ... |
tests/test_query_parser_mixed_language.py
| ... | ... | @@ -11,14 +11,6 @@ def _tokenizer(text): |
| 11 | 11 | return str(text).split() |
| 12 | 12 | |
| 13 | 13 | |
| 14 | -def test_pure_english_word_token_length_and_script(): | |
| 15 | - assert QueryParser._is_pure_english_word_token("ab") is False | |
| 16 | - assert QueryParser._is_pure_english_word_token("abc") is True | |
| 17 | - assert QueryParser._is_pure_english_word_token("wi-fi") is True | |
| 18 | - assert QueryParser._is_pure_english_word_token("连衣裙") is False | |
| 19 | - assert QueryParser._is_pure_english_word_token("ab12") is False | |
| 20 | - | |
| 21 | - | |
| 22 | 14 | def _build_config() -> SearchConfig: |
| 23 | 15 | return SearchConfig( |
| 24 | 16 | es_index_name="test_products", |
| ... | ... | @@ -36,7 +28,7 @@ def _build_config() -> SearchConfig: |
| 36 | 28 | ) |
| 37 | 29 | |
| 38 | 30 | |
| 39 | -def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): | |
| 31 | +def test_parse_mixed_zh_query_translates_to_en(monkeypatch): | |
| 40 | 32 | parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) |
| 41 | 33 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") |
| 42 | 34 | |
| ... | ... | @@ -48,15 +40,13 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo |
| 48 | 40 | ) |
| 49 | 41 | |
| 50 | 42 | assert result.detected_language == "zh" |
| 51 | - assert result.contains_chinese is True | |
| 52 | - assert result.contains_english is True | |
| 53 | 43 | assert result.translations == {"en": "法式 dress 连衣裙-en"} |
| 54 | 44 | assert result.query_tokens == ["法式", "dress", "连衣裙"] |
| 55 | 45 | assert not hasattr(result, "query_text_by_lang") |
| 56 | 46 | assert not hasattr(result, "search_langs") |
| 57 | 47 | |
| 58 | 48 | |
| 59 | -def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): | |
| 49 | +def test_parse_mixed_en_query_translates_to_zh(monkeypatch): | |
| 60 | 50 | parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) |
| 61 | 51 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") |
| 62 | 52 | |
| ... | ... | @@ -68,8 +58,6 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): |
| 68 | 58 | ) |
| 69 | 59 | |
| 70 | 60 | assert result.detected_language == "en" |
| 71 | - assert result.contains_chinese is True | |
| 72 | - assert result.contains_english is True | |
| 73 | 61 | assert result.translations == {"zh": "red 连衣裙-zh"} |
| 74 | 62 | assert result.query_tokens == ["red", "连衣裙"] |
| 75 | 63 | |
| ... | ... | @@ -87,7 +75,5 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch) |
| 87 | 75 | ) |
| 88 | 76 | |
| 89 | 77 | assert result.detected_language == "en" |
| 90 | - assert result.contains_chinese is False | |
| 91 | - assert result.contains_english is True | |
| 92 | 78 | assert result.translations.get("zh") == "off shoulder top-zh" |
| 93 | 79 | assert not hasattr(result, "source_in_index_languages") | ... | ... |