Commit 35da381333a86fda7b179f767b6f5a5dcd157a65
1 parent
445496cd
中英混写query的优化逻辑,不适合新的combined_fields+best_fields+phrase查询方式,带来的复杂度较多,清理该部分逻辑
Showing
7 changed files
with
53 additions
and
362 deletions
Show diff stats
docs/相关性检索优化说明.md
| @@ -17,9 +17,9 @@ | @@ -17,9 +17,9 @@ | ||
| 17 | 查询链路(文本相关): | 17 | 查询链路(文本相关): |
| 18 | 18 | ||
| 19 | 1. `QueryParser.parse()` | 19 | 1. `QueryParser.parse()` |
| 20 | - 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。 | 20 | + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`。 |
| 21 | 2. `Searcher.search()` | 21 | 2. `Searcher.search()` |
| 22 | - 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。 | 22 | + 负责读取租户 `index_languages`,并将其传给 `QueryParser` 作为 `target_languages`(控制翻译目标语种);`ESQueryBuilder` 仅根据 `detected_language` 与各条译文构建子句字段,不再接收 `index_languages`。 |
| 23 | 3. `ESQueryBuilder._build_advanced_text_query()` | 23 | 3. `ESQueryBuilder._build_advanced_text_query()` |
| 24 | 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。 | 24 | 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。 |
| 25 | 4. `build_query()` | 25 | 4. `build_query()` |
| @@ -76,9 +76,6 @@ | @@ -76,9 +76,6 @@ | ||
| 76 | 76 | ||
| 77 | 最终按 `bool.should` 组合,`minimum_should_match: 1`。 | 77 | 最终按 `bool.should` 组合,`minimum_should_match: 1`。 |
| 78 | 78 | ||
| 79 | -> **附 — 混写辅助召回** | ||
| 80 | -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 | ||
| 81 | - | ||
| 82 | ## 5. 关键配置项(文本策略) | 79 | ## 5. 关键配置项(文本策略) |
| 83 | 80 | ||
| 84 | `query_config` 下与解析等待相关的项: | 81 | `query_config` 下与解析等待相关的项: |
| @@ -147,11 +144,9 @@ | @@ -147,11 +144,9 @@ | ||
| 147 | - `translations` | 144 | - `translations` |
| 148 | - `query_vector` | 145 | - `query_vector` |
| 149 | - `query_tokens` | 146 | - `query_tokens` |
| 150 | - - `contains_chinese` / `contains_english` | ||
| 151 | - `Searcher` 负责“租户语境”: | 147 | - `Searcher` 负责“租户语境”: |
| 152 | - `index_languages` | 148 | - `index_languages` |
| 153 | - 将其传给 parser 作为 `target_languages` | 149 | - 将其传给 parser 作为 `target_languages` |
| 154 | - - 将其传给 builder 作为字段展开约束 | ||
| 155 | - `ESQueryBuilder` 负责“表达式展开”: | 150 | - `ESQueryBuilder` 负责“表达式展开”: |
| 156 | - 动态字段组装 | 151 | - 动态字段组装 |
| 157 | - 子句权重分配 | 152 | - 子句权重分配 |
query/query_parser.py
| @@ -50,8 +50,6 @@ class ParsedQuery: | @@ -50,8 +50,6 @@ class ParsedQuery: | ||
| 50 | translations: Dict[str, str] = field(default_factory=dict) | 50 | translations: Dict[str, str] = field(default_factory=dict) |
| 51 | query_vector: Optional[np.ndarray] = None | 51 | query_vector: Optional[np.ndarray] = None |
| 52 | query_tokens: List[str] = field(default_factory=list) | 52 | query_tokens: List[str] = field(default_factory=list) |
| 53 | - contains_chinese: bool = False | ||
| 54 | - contains_english: bool = False | ||
| 55 | 53 | ||
| 56 | def to_dict(self) -> Dict[str, Any]: | 54 | def to_dict(self) -> Dict[str, Any]: |
| 57 | """Convert to dictionary representation.""" | 55 | """Convert to dictionary representation.""" |
| @@ -62,8 +60,6 @@ class ParsedQuery: | @@ -62,8 +60,6 @@ class ParsedQuery: | ||
| 62 | "detected_language": self.detected_language, | 60 | "detected_language": self.detected_language, |
| 63 | "translations": self.translations, | 61 | "translations": self.translations, |
| 64 | "query_tokens": self.query_tokens, | 62 | "query_tokens": self.query_tokens, |
| 65 | - "contains_chinese": self.contains_chinese, | ||
| 66 | - "contains_english": self.contains_english, | ||
| 67 | } | 63 | } |
| 68 | 64 | ||
| 69 | 65 | ||
| @@ -202,21 +198,6 @@ class QueryParser: | @@ -202,21 +198,6 @@ class QueryParser: | ||
| 202 | def _get_query_tokens(self, query: str) -> List[str]: | 198 | def _get_query_tokens(self, query: str) -> List[str]: |
| 203 | return self._extract_tokens(self._tokenizer(query)) | 199 | return self._extract_tokens(self._tokenizer(query)) |
| 204 | 200 | ||
| 205 | - @staticmethod | ||
| 206 | - def _contains_cjk(text: str) -> bool: | ||
| 207 | - """Whether query contains any CJK ideograph.""" | ||
| 208 | - return bool(re.search(r"[\u4e00-\u9fff]", text or "")) | ||
| 209 | - | ||
| 210 | - @staticmethod | ||
| 211 | - def _is_pure_english_word_token(token: str) -> bool: | ||
| 212 | - """ | ||
| 213 | - A tokenizer token counts as English iff it is letters only (optional internal hyphens) | ||
| 214 | - and length >= 3. | ||
| 215 | - """ | ||
| 216 | - if not token or len(token) < 3: | ||
| 217 | - return False | ||
| 218 | - return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token)) | ||
| 219 | - | ||
| 220 | def parse( | 201 | def parse( |
| 221 | self, | 202 | self, |
| 222 | query: str, | 203 | query: str, |
| @@ -285,19 +266,12 @@ class QueryParser: | @@ -285,19 +266,12 @@ class QueryParser: | ||
| 285 | log_info(f"Language detection | Detected language: {detected_lang}") | 266 | log_info(f"Language detection | Detected language: {detected_lang}") |
| 286 | if context: | 267 | if context: |
| 287 | context.store_intermediate_result('detected_language', detected_lang) | 268 | context.store_intermediate_result('detected_language', detected_lang) |
| 288 | - # Stage 4: Query analysis (tokenization + script flags) | 269 | + # Stage 4: Query analysis (tokenization) |
| 289 | query_tokens = self._get_query_tokens(query_text) | 270 | query_tokens = self._get_query_tokens(query_text) |
| 290 | - contains_chinese = self._contains_cjk(query_text) | ||
| 291 | - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) | ||
| 292 | 271 | ||
| 293 | - log_debug( | ||
| 294 | - f"Query analysis | Query tokens: {query_tokens} | " | ||
| 295 | - f"contains_chinese={contains_chinese} | contains_english={contains_english}" | ||
| 296 | - ) | 272 | + log_debug(f"Query analysis | Query tokens: {query_tokens}") |
| 297 | if context: | 273 | if context: |
| 298 | context.store_intermediate_result('query_tokens', query_tokens) | 274 | context.store_intermediate_result('query_tokens', query_tokens) |
| 299 | - context.store_intermediate_result('contains_chinese', contains_chinese) | ||
| 300 | - context.store_intermediate_result('contains_english', contains_english) | ||
| 301 | 275 | ||
| 302 | # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the | 276 | # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the |
| 303 | # caller decides translation targets and later search-field planning. | 277 | # caller decides translation targets and later search-field planning. |
| @@ -459,8 +433,6 @@ class QueryParser: | @@ -459,8 +433,6 @@ class QueryParser: | ||
| 459 | translations=translations, | 433 | translations=translations, |
| 460 | query_vector=query_vector, | 434 | query_vector=query_vector, |
| 461 | query_tokens=query_tokens, | 435 | query_tokens=query_tokens, |
| 462 | - contains_chinese=contains_chinese, | ||
| 463 | - contains_english=contains_english, | ||
| 464 | ) | 436 | ) |
| 465 | 437 | ||
| 466 | if context and hasattr(context, 'logger'): | 438 | if context and hasattr(context, 'logger'): |
search/es_query_builder.py
| @@ -8,14 +8,11 @@ Simplified architecture: | @@ -8,14 +8,11 @@ Simplified architecture: | ||
| 8 | - function_score wrapper for boosting fields | 8 | - function_score wrapper for boosting fields |
| 9 | """ | 9 | """ |
| 10 | 10 | ||
| 11 | -from typing import Dict, Any, List, Optional, Union, Tuple | 11 | +from typing import Dict, Any, List, Optional, Tuple |
| 12 | 12 | ||
| 13 | import numpy as np | 13 | import numpy as np |
| 14 | from config import FunctionScoreConfig | 14 | from config import FunctionScoreConfig |
| 15 | 15 | ||
| 16 | -# (Elasticsearch field path, boost before formatting as "path^boost") | ||
| 17 | -MatchFieldSpec = Tuple[str, float] | ||
| 18 | - | ||
| 19 | 16 | ||
| 20 | class ESQueryBuilder: | 17 | class ESQueryBuilder: |
| 21 | """Builds Elasticsearch DSL queries.""" | 18 | """Builds Elasticsearch DSL queries.""" |
| @@ -39,7 +36,6 @@ class ESQueryBuilder: | @@ -39,7 +36,6 @@ class ESQueryBuilder: | ||
| 39 | tie_breaker_base_query: float = 0.9, | 36 | tie_breaker_base_query: float = 0.9, |
| 40 | best_fields_boosts: Optional[Dict[str, float]] = None, | 37 | best_fields_boosts: Optional[Dict[str, float]] = None, |
| 41 | best_fields_clause_boost: float = 2.0, | 38 | best_fields_clause_boost: float = 2.0, |
| 42 | - mixed_script_merged_field_boost_scale: float = 0.6, | ||
| 43 | phrase_field_boosts: Optional[Dict[str, float]] = None, | 39 | phrase_field_boosts: Optional[Dict[str, float]] = None, |
| 44 | phrase_match_base_fields: Optional[Tuple[str, ...]] = None, | 40 | phrase_match_base_fields: Optional[Tuple[str, ...]] = None, |
| 45 | phrase_match_slop: int = 0, | 41 | phrase_match_slop: int = 0, |
| @@ -60,7 +56,6 @@ class ESQueryBuilder: | @@ -60,7 +56,6 @@ class ESQueryBuilder: | ||
| 60 | function_score_config: Function score configuration | 56 | function_score_config: Function score configuration |
| 61 | default_language: Default language to use when detection fails or returns "unknown" | 57 | default_language: Default language to use when detection fails or returns "unknown" |
| 62 | knn_boost: Boost value for KNN (embedding recall) | 58 | knn_boost: Boost value for KNN (embedding recall) |
| 63 | - mixed_script_merged_field_boost_scale: Multiply per-field ^boost for cross-script merged fields | ||
| 64 | """ | 59 | """ |
| 65 | self.match_fields = match_fields | 60 | self.match_fields = match_fields |
| 66 | self.field_boosts = field_boosts or {} | 61 | self.field_boosts = field_boosts or {} |
| @@ -77,7 +72,6 @@ class ESQueryBuilder: | @@ -77,7 +72,6 @@ class ESQueryBuilder: | ||
| 77 | self.translation_minimum_should_match = translation_minimum_should_match | 72 | self.translation_minimum_should_match = translation_minimum_should_match |
| 78 | self.translation_boost = float(translation_boost) | 73 | self.translation_boost = float(translation_boost) |
| 79 | self.tie_breaker_base_query = float(tie_breaker_base_query) | 74 | self.tie_breaker_base_query = float(tie_breaker_base_query) |
| 80 | - self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) | ||
| 81 | default_best_fields = { | 75 | default_best_fields = { |
| 82 | base: self._get_field_boost(base) | 76 | base: self._get_field_boost(base) |
| 83 | for base in self.core_multilingual_fields | 77 | for base in self.core_multilingual_fields |
| @@ -180,7 +174,6 @@ class ESQueryBuilder: | @@ -180,7 +174,6 @@ class ESQueryBuilder: | ||
| 180 | knn_num_candidates: int = 200, | 174 | knn_num_candidates: int = 200, |
| 181 | min_score: Optional[float] = None, | 175 | min_score: Optional[float] = None, |
| 182 | parsed_query: Optional[Any] = None, | 176 | parsed_query: Optional[Any] = None, |
| 183 | - index_languages: Optional[List[str]] = None, | ||
| 184 | ) -> Dict[str, Any]: | 177 | ) -> Dict[str, Any]: |
| 185 | """ | 178 | """ |
| 186 | Build complete ES query with post_filter support for multi-select faceting. | 179 | Build complete ES query with post_filter support for multi-select faceting. |
| @@ -223,11 +216,7 @@ class ESQueryBuilder: | @@ -223,11 +216,7 @@ class ESQueryBuilder: | ||
| 223 | # Text recall (always include if query_text exists) | 216 | # Text recall (always include if query_text exists) |
| 224 | if query_text: | 217 | if query_text: |
| 225 | # Unified text query strategy | 218 | # Unified text query strategy |
| 226 | - text_query = self._build_advanced_text_query( | ||
| 227 | - query_text, | ||
| 228 | - parsed_query, | ||
| 229 | - index_languages=index_languages, | ||
| 230 | - ) | 219 | + text_query = self._build_advanced_text_query(query_text, parsed_query) |
| 231 | recall_clauses.append(text_query) | 220 | recall_clauses.append(text_query) |
| 232 | 221 | ||
| 233 | # Embedding recall (KNN - separate from query, handled below) | 222 | # Embedding recall (KNN - separate from query, handled below) |
| @@ -434,90 +423,36 @@ class ESQueryBuilder: | @@ -434,90 +423,36 @@ class ESQueryBuilder: | ||
| 434 | return float(self.field_boosts[base_field]) | 423 | return float(self.field_boosts[base_field]) |
| 435 | return 1.0 | 424 | return 1.0 |
| 436 | 425 | ||
| 437 | - def _build_match_field_specs( | 426 | + def _match_field_strings( |
| 438 | self, | 427 | self, |
| 439 | language: str, | 428 | language: str, |
| 440 | *, | 429 | *, |
| 441 | multilingual_fields: Optional[List[str]] = None, | 430 | multilingual_fields: Optional[List[str]] = None, |
| 442 | shared_fields: Optional[List[str]] = None, | 431 | shared_fields: Optional[List[str]] = None, |
| 443 | boost_overrides: Optional[Dict[str, float]] = None, | 432 | boost_overrides: Optional[Dict[str, float]] = None, |
| 444 | - ) -> List[MatchFieldSpec]: | ||
| 445 | - """ | ||
| 446 | - Per-language match targets as (field_path, boost). Single source of truth before | ||
| 447 | - formatting as Elasticsearch ``fields`` strings. | ||
| 448 | - """ | 433 | + ) -> List[str]: |
| 434 | + """Build ``multi_match`` / ``combined_fields`` field entries for one language code.""" | ||
| 449 | lang = (language or "").strip().lower() | 435 | lang = (language or "").strip().lower() |
| 450 | - specs: List[MatchFieldSpec] = [] | ||
| 451 | - text_fields = multilingual_fields if multilingual_fields is not None else self.multilingual_fields | 436 | + text_bases = multilingual_fields if multilingual_fields is not None else self.multilingual_fields |
| 452 | term_fields = shared_fields if shared_fields is not None else self.shared_fields | 437 | term_fields = shared_fields if shared_fields is not None else self.shared_fields |
| 453 | overrides = boost_overrides or {} | 438 | overrides = boost_overrides or {} |
| 454 | - | ||
| 455 | - for base in text_fields: | ||
| 456 | - field = f"{base}.{lang}" | 439 | + out: List[str] = [] |
| 440 | + for base in text_bases: | ||
| 441 | + path = f"{base}.{lang}" | ||
| 457 | boost = float(overrides.get(base, self._get_field_boost(base, lang))) | 442 | boost = float(overrides.get(base, self._get_field_boost(base, lang))) |
| 458 | - specs.append((field, boost)) | ||
| 459 | - | 443 | + out.append(self._format_field_with_boost(path, boost)) |
| 460 | for shared in term_fields: | 444 | for shared in term_fields: |
| 461 | boost = float(overrides.get(shared, self._get_field_boost(shared, None))) | 445 | boost = float(overrides.get(shared, self._get_field_boost(shared, None))) |
| 462 | - specs.append((shared, boost)) | ||
| 463 | - return specs | ||
| 464 | - | ||
| 465 | - def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]: | ||
| 466 | - """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``.""" | ||
| 467 | - return [self._format_field_with_boost(path, boost) for path, boost in specs] | ||
| 468 | - | ||
| 469 | - def _merge_supplemental_lang_field_specs( | ||
| 470 | - self, | ||
| 471 | - specs: List[MatchFieldSpec], | ||
| 472 | - supplemental_lang: str, | ||
| 473 | - ) -> List[MatchFieldSpec]: | ||
| 474 | - """Append supplemental-language columns; boosts multiplied by mixed_script scale.""" | ||
| 475 | - scale = float(self.mixed_script_merged_field_boost_scale) | ||
| 476 | - extra_all = self._build_match_field_specs(supplemental_lang) | ||
| 477 | - seen = {path for path, _ in specs} | ||
| 478 | - out = list(specs) | ||
| 479 | - for path, boost in extra_all: | ||
| 480 | - if path not in seen: | ||
| 481 | - out.append((path, boost * scale)) | ||
| 482 | - seen.add(path) | ||
| 483 | - return out | ||
| 484 | - | ||
| 485 | - def _expand_match_field_specs_for_mixed_script( | ||
| 486 | - self, | ||
| 487 | - lang: str, | ||
| 488 | - specs: List[MatchFieldSpec], | ||
| 489 | - contains_chinese: bool, | ||
| 490 | - contains_english: bool, | ||
| 491 | - index_languages: List[str], | ||
| 492 | - is_source: bool = False | ||
| 493 | - ) -> List[MatchFieldSpec]: | ||
| 494 | - """ | ||
| 495 | - When the query mixes scripts, widen each clause to indexed fields for the other script | ||
| 496 | - (e.g. zh clause also searches title.en when the query contains an English word token). | ||
| 497 | - """ | ||
| 498 | - norm = {str(x or "").strip().lower() for x in (index_languages or []) if str(x or "").strip()} | ||
| 499 | - allow = norm or {"zh", "en"} | ||
| 500 | - | ||
| 501 | - def can_use(lcode: str) -> bool: | ||
| 502 | - return lcode in allow if norm else True | ||
| 503 | - | ||
| 504 | - out = list(specs) | ||
| 505 | - lnorm = (lang or "").strip().lower() | ||
| 506 | - if is_source: | ||
| 507 | - if contains_english and lnorm != "en" and can_use("en"): | ||
| 508 | - out = self._merge_supplemental_lang_field_specs(out, "en") | ||
| 509 | - if contains_chinese and lnorm != "zh" and can_use("zh"): | ||
| 510 | - out = self._merge_supplemental_lang_field_specs(out, "zh") | 446 | + out.append(self._format_field_with_boost(shared, boost)) |
| 511 | return out | 447 | return out |
| 512 | 448 | ||
| 513 | def _build_best_fields_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: | 449 | def _build_best_fields_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: |
| 514 | - specs = self._build_match_field_specs( | 450 | + fields = self._match_field_strings( |
| 515 | language, | 451 | language, |
| 516 | multilingual_fields=list(self.best_fields_boosts), | 452 | multilingual_fields=list(self.best_fields_boosts), |
| 517 | shared_fields=[], | 453 | shared_fields=[], |
| 518 | boost_overrides=self.best_fields_boosts, | 454 | boost_overrides=self.best_fields_boosts, |
| 519 | ) | 455 | ) |
| 520 | - fields = self._format_match_field_specs(specs) | ||
| 521 | if not fields: | 456 | if not fields: |
| 522 | return None | 457 | return None |
| 523 | return { | 458 | return { |
| @@ -530,13 +465,12 @@ class ESQueryBuilder: | @@ -530,13 +465,12 @@ class ESQueryBuilder: | ||
| 530 | } | 465 | } |
| 531 | 466 | ||
| 532 | def _build_phrase_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: | 467 | def _build_phrase_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: |
| 533 | - specs = self._build_match_field_specs( | 468 | + fields = self._match_field_strings( |
| 534 | language, | 469 | language, |
| 535 | multilingual_fields=list(self.phrase_field_boosts), | 470 | multilingual_fields=list(self.phrase_field_boosts), |
| 536 | shared_fields=[], | 471 | shared_fields=[], |
| 537 | boost_overrides=self.phrase_field_boosts, | 472 | boost_overrides=self.phrase_field_boosts, |
| 538 | ) | 473 | ) |
| 539 | - fields = self._format_match_field_specs(specs) | ||
| 540 | if not fields: | 474 | if not fields: |
| 541 | return None | 475 | return None |
| 542 | clause: Dict[str, Any] = { | 476 | clause: Dict[str, Any] = { |
| @@ -560,20 +494,8 @@ class ESQueryBuilder: | @@ -560,20 +494,8 @@ class ESQueryBuilder: | ||
| 560 | clause_name: str, | 494 | clause_name: str, |
| 561 | *, | 495 | *, |
| 562 | is_source: bool, | 496 | is_source: bool, |
| 563 | - contains_chinese: bool, | ||
| 564 | - contains_english: bool, | ||
| 565 | - index_languages: List[str], | ||
| 566 | ) -> Optional[Dict[str, Any]]: | 497 | ) -> Optional[Dict[str, Any]]: |
| 567 | - all_specs = self._build_match_field_specs(lang) | ||
| 568 | - expanded_specs = self._expand_match_field_specs_for_mixed_script( | ||
| 569 | - lang, | ||
| 570 | - all_specs, | ||
| 571 | - contains_chinese, | ||
| 572 | - contains_english, | ||
| 573 | - index_languages, | ||
| 574 | - is_source, | ||
| 575 | - ) | ||
| 576 | - combined_fields = self._format_match_field_specs(expanded_specs) | 498 | + combined_fields = self._match_field_strings(lang) |
| 577 | if not combined_fields: | 499 | if not combined_fields: |
| 578 | return None | 500 | return None |
| 579 | minimum_should_match = ( | 501 | minimum_should_match = ( |
| @@ -607,29 +529,10 @@ class ESQueryBuilder: | @@ -607,29 +529,10 @@ class ESQueryBuilder: | ||
| 607 | clause["bool"]["boost"] = float(self.translation_boost) | 529 | clause["bool"]["boost"] = float(self.translation_boost) |
| 608 | return clause | 530 | return clause |
| 609 | 531 | ||
| 610 | - def _get_embedding_field(self, language: str) -> str: | ||
| 611 | - """Get embedding field name for a language.""" | ||
| 612 | - # Currently using unified embedding field | ||
| 613 | - return self.text_embedding_field or "title_embedding" | ||
| 614 | - | ||
| 615 | - @staticmethod | ||
| 616 | - def _normalize_language_list(languages: Optional[List[str]]) -> List[str]: | ||
| 617 | - normalized: List[str] = [] | ||
| 618 | - seen = set() | ||
| 619 | - for language in languages or []: | ||
| 620 | - token = str(language or "").strip().lower() | ||
| 621 | - if not token or token in seen: | ||
| 622 | - continue | ||
| 623 | - seen.add(token) | ||
| 624 | - normalized.append(token) | ||
| 625 | - return normalized | ||
| 626 | - | ||
| 627 | def _build_advanced_text_query( | 532 | def _build_advanced_text_query( |
| 628 | self, | 533 | self, |
| 629 | query_text: str, | 534 | query_text: str, |
| 630 | parsed_query: Optional[Any] = None, | 535 | parsed_query: Optional[Any] = None, |
| 631 | - *, | ||
| 632 | - index_languages: Optional[List[str]] = None, | ||
| 633 | ) -> Dict[str, Any]: | 536 | ) -> Dict[str, Any]: |
| 634 | """ | 537 | """ |
| 635 | Build advanced text query using base and translated lexical clauses. | 538 | Build advanced text query using base and translated lexical clauses. |
| @@ -649,39 +552,26 @@ class ESQueryBuilder: | @@ -649,39 +552,26 @@ class ESQueryBuilder: | ||
| 649 | should_clauses = [] | 552 | should_clauses = [] |
| 650 | source_lang = self.default_language | 553 | source_lang = self.default_language |
| 651 | translations: Dict[str, str] = {} | 554 | translations: Dict[str, str] = {} |
| 652 | - contains_chinese = False | ||
| 653 | - contains_english = False | ||
| 654 | - normalized_index_languages = self._normalize_language_list(index_languages) | ||
| 655 | 555 | ||
| 656 | if parsed_query: | 556 | if parsed_query: |
| 657 | detected_lang = getattr(parsed_query, "detected_language", None) | 557 | detected_lang = getattr(parsed_query, "detected_language", None) |
| 658 | source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language | 558 | source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language |
| 659 | translations = getattr(parsed_query, "translations", None) or {} | 559 | translations = getattr(parsed_query, "translations", None) or {} |
| 660 | - contains_chinese = bool(getattr(parsed_query, "contains_chinese", False)) | ||
| 661 | - contains_english = bool(getattr(parsed_query, "contains_english", False)) | ||
| 662 | 560 | ||
| 663 | source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language | 561 | source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language |
| 664 | base_query_text = ( | 562 | base_query_text = ( |
| 665 | getattr(parsed_query, "rewritten_query", None) if parsed_query else None | 563 | getattr(parsed_query, "rewritten_query", None) if parsed_query else None |
| 666 | ) or query_text | 564 | ) or query_text |
| 667 | 565 | ||
| 668 | - def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None: | ||
| 669 | - nonlocal should_clauses | ||
| 670 | - clause = self._build_lexical_language_clause( | ||
| 671 | - lang, | ||
| 672 | - lang_query, | ||
| 673 | - clause_name, | ||
| 674 | - is_source=is_source, | ||
| 675 | - contains_chinese=contains_chinese, | ||
| 676 | - contains_english=contains_english, | ||
| 677 | - index_languages=normalized_index_languages, | ||
| 678 | - ) | ||
| 679 | - if not clause: | ||
| 680 | - return | ||
| 681 | - should_clauses.append(clause) | ||
| 682 | - | ||
| 683 | if base_query_text: | 566 | if base_query_text: |
| 684 | - append_clause(source_lang, base_query_text, "base_query", True) | 567 | + base_clause = self._build_lexical_language_clause( |
| 568 | + source_lang, | ||
| 569 | + base_query_text, | ||
| 570 | + "base_query", | ||
| 571 | + is_source=True, | ||
| 572 | + ) | ||
| 573 | + if base_clause: | ||
| 574 | + should_clauses.append(base_clause) | ||
| 685 | 575 | ||
| 686 | for lang, translated_text in translations.items(): | 576 | for lang, translated_text in translations.items(): |
| 687 | normalized_lang = str(lang or "").strip().lower() | 577 | normalized_lang = str(lang or "").strip().lower() |
| @@ -690,7 +580,14 @@ class ESQueryBuilder: | @@ -690,7 +580,14 @@ class ESQueryBuilder: | ||
| 690 | continue | 580 | continue |
| 691 | if normalized_lang == source_lang and normalized_text == base_query_text: | 581 | if normalized_lang == source_lang and normalized_text == base_query_text: |
| 692 | continue | 582 | continue |
| 693 | - append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False) | 583 | + trans_clause = self._build_lexical_language_clause( |
| 584 | + normalized_lang, | ||
| 585 | + normalized_text, | ||
| 586 | + f"base_query_trans_{normalized_lang}", | ||
| 587 | + is_source=False, | ||
| 588 | + ) | ||
| 589 | + if trans_clause: | ||
| 590 | + should_clauses.append(trans_clause) | ||
| 694 | 591 | ||
| 695 | # Fallback to a simple query when language fields cannot be resolved. | 592 | # Fallback to a simple query when language fields cannot be resolved. |
| 696 | if not should_clauses: | 593 | if not should_clauses: |
search/searcher.py
| @@ -645,7 +645,6 @@ class Searcher: | @@ -645,7 +645,6 @@ class Searcher: | ||
| 645 | enable_knn=enable_embedding and parsed_query.query_vector is not None, | 645 | enable_knn=enable_embedding and parsed_query.query_vector is not None, |
| 646 | min_score=min_score, | 646 | min_score=min_score, |
| 647 | parsed_query=parsed_query, | 647 | parsed_query=parsed_query, |
| 648 | - index_languages=index_langs, | ||
| 649 | ) | 648 | ) |
| 650 | 649 | ||
| 651 | # Add facets for faceted search | 650 | # Add facets for faceted search |
tests/test_es_query_builder.py
| @@ -9,6 +9,9 @@ from search.es_query_builder import ESQueryBuilder | @@ -9,6 +9,9 @@ from search.es_query_builder import ESQueryBuilder | ||
| 9 | def _builder() -> ESQueryBuilder: | 9 | def _builder() -> ESQueryBuilder: |
| 10 | return ESQueryBuilder( | 10 | return ESQueryBuilder( |
| 11 | match_fields=["title.en^3.0", "brief.en^1.0"], | 11 | match_fields=["title.en^3.0", "brief.en^1.0"], |
| 12 | + multilingual_fields=["title", "brief"], | ||
| 13 | + core_multilingual_fields=["title", "brief"], | ||
| 14 | + shared_fields=[], | ||
| 12 | text_embedding_field="title_embedding", | 15 | text_embedding_field="title_embedding", |
| 13 | default_language="en", | 16 | default_language="en", |
| 14 | ) | 17 | ) |
| @@ -25,10 +28,6 @@ def _lexical_clause(query_root: Dict[str, Any]) -> Dict[str, Any]: | @@ -25,10 +28,6 @@ def _lexical_clause(query_root: Dict[str, Any]) -> Dict[str, Any]: | ||
| 25 | raise AssertionError("no lexical bool clause in query_root") | 28 | raise AssertionError("no lexical bool clause in query_root") |
| 26 | 29 | ||
| 27 | 30 | ||
| 28 | -def _lexical_combined_fields(query_root: Dict[str, Any]) -> list: | ||
| 29 | - return _lexical_clause(query_root)["must"][0]["combined_fields"]["fields"] | ||
| 30 | - | ||
| 31 | - | ||
| 32 | def test_knn_prefilter_includes_range_filters(): | 31 | def test_knn_prefilter_includes_range_filters(): |
| 33 | qb = _builder() | 32 | qb = _builder() |
| 34 | q = qb.build_query( | 33 | q = qb.build_query( |
| @@ -93,7 +92,6 @@ def test_text_query_contains_only_base_and_translation_named_queries(): | @@ -93,7 +92,6 @@ def test_text_query_contains_only_base_and_translation_named_queries(): | ||
| 93 | query_text="dress", | 92 | query_text="dress", |
| 94 | parsed_query=parsed_query, | 93 | parsed_query=parsed_query, |
| 95 | enable_knn=False, | 94 | enable_knn=False, |
| 96 | - index_languages=["en", "zh", "fr"], | ||
| 97 | ) | 95 | ) |
| 98 | should = q["query"]["bool"]["should"] | 96 | should = q["query"]["bool"]["should"] |
| 99 | names = [clause["bool"]["_name"] for clause in should] | 97 | names = [clause["bool"]["_name"] for clause in should] |
| @@ -115,120 +113,8 @@ def test_text_query_skips_duplicate_translation_same_as_base(): | @@ -115,120 +113,8 @@ def test_text_query_skips_duplicate_translation_same_as_base(): | ||
| 115 | query_text="dress", | 113 | query_text="dress", |
| 116 | parsed_query=parsed_query, | 114 | parsed_query=parsed_query, |
| 117 | enable_knn=False, | 115 | enable_knn=False, |
| 118 | - index_languages=["en", "zh"], | ||
| 119 | ) | 116 | ) |
| 120 | 117 | ||
| 121 | root = q["query"] | 118 | root = q["query"] |
| 122 | assert root["bool"]["_name"] == "base_query" | 119 | assert root["bool"]["_name"] == "base_query" |
| 123 | assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] | 120 | assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] |
| 124 | - | ||
| 125 | - | ||
| 126 | -def test_mixed_script_merges_en_fields_into_zh_clause(): | ||
| 127 | - qb = ESQueryBuilder( | ||
| 128 | - match_fields=["title.en^3.0"], | ||
| 129 | - multilingual_fields=["title", "brief"], | ||
| 130 | - shared_fields=[], | ||
| 131 | - text_embedding_field="title_embedding", | ||
| 132 | - default_language="en", | ||
| 133 | - ) | ||
| 134 | - parsed_query = SimpleNamespace( | ||
| 135 | - rewritten_query="法式 dress", | ||
| 136 | - detected_language="zh", | ||
| 137 | - translations={}, | ||
| 138 | - contains_chinese=True, | ||
| 139 | - contains_english=True, | ||
| 140 | - ) | ||
| 141 | - q = qb.build_query( | ||
| 142 | - query_text="法式 dress", | ||
| 143 | - parsed_query=parsed_query, | ||
| 144 | - enable_knn=False, | ||
| 145 | - index_languages=["zh", "en"], | ||
| 146 | - ) | ||
| 147 | - fields = _lexical_combined_fields(q["query"]) | ||
| 148 | - bases = {f.split("^", 1)[0] for f in fields} | ||
| 149 | - assert "title.zh" in bases and "title.en" in bases | ||
| 150 | - assert "brief.zh" in bases and "brief.en" in bases | ||
| 151 | - # Merged supplemental language fields use boost * 0.6 by default. | ||
| 152 | - assert "title.en^0.6" in fields | ||
| 153 | - assert "brief.en^0.6" in fields | ||
| 154 | - | ||
| 155 | - | ||
| 156 | -def test_mixed_script_merges_zh_fields_into_en_clause(): | ||
| 157 | - qb = ESQueryBuilder( | ||
| 158 | - match_fields=["title.en^3.0"], | ||
| 159 | - multilingual_fields=["title"], | ||
| 160 | - shared_fields=[], | ||
| 161 | - text_embedding_field="title_embedding", | ||
| 162 | - default_language="en", | ||
| 163 | - ) | ||
| 164 | - parsed_query = SimpleNamespace( | ||
| 165 | - rewritten_query="red 连衣裙", | ||
| 166 | - detected_language="en", | ||
| 167 | - translations={}, | ||
| 168 | - contains_chinese=True, | ||
| 169 | - contains_english=True, | ||
| 170 | - ) | ||
| 171 | - q = qb.build_query( | ||
| 172 | - query_text="red 连衣裙", | ||
| 173 | - parsed_query=parsed_query, | ||
| 174 | - enable_knn=False, | ||
| 175 | - index_languages=["zh", "en"], | ||
| 176 | - ) | ||
| 177 | - fields = _lexical_combined_fields(q["query"]) | ||
| 178 | - bases = {f.split("^", 1)[0] for f in fields} | ||
| 179 | - assert "title.en" in bases and "title.zh" in bases | ||
| 180 | - assert "title.zh^0.6" in fields | ||
| 181 | - | ||
| 182 | - | ||
| 183 | -def test_mixed_script_merged_fields_scale_configured_boosts(): | ||
| 184 | - qb = ESQueryBuilder( | ||
| 185 | - match_fields=["title.en^3.0"], | ||
| 186 | - multilingual_fields=["title"], | ||
| 187 | - shared_fields=[], | ||
| 188 | - field_boosts={"title.zh": 5.0, "title.en": 10.0}, | ||
| 189 | - text_embedding_field="title_embedding", | ||
| 190 | - default_language="en", | ||
| 191 | - ) | ||
| 192 | - parsed_query = SimpleNamespace( | ||
| 193 | - rewritten_query="法式 dress", | ||
| 194 | - detected_language="zh", | ||
| 195 | - translations={}, | ||
| 196 | - contains_chinese=True, | ||
| 197 | - contains_english=True, | ||
| 198 | - ) | ||
| 199 | - q = qb.build_query( | ||
| 200 | - query_text="法式 dress", | ||
| 201 | - parsed_query=parsed_query, | ||
| 202 | - enable_knn=False, | ||
| 203 | - index_languages=["zh", "en"], | ||
| 204 | - ) | ||
| 205 | - fields = _lexical_combined_fields(q["query"]) | ||
| 206 | - assert "title.zh^5.0" in fields | ||
| 207 | - assert "title.en^6.0" in fields # 10.0 * 0.6 | ||
| 208 | - | ||
| 209 | - | ||
| 210 | -def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): | ||
| 211 | - qb = ESQueryBuilder( | ||
| 212 | - match_fields=["title.zh^3.0"], | ||
| 213 | - multilingual_fields=["title"], | ||
| 214 | - shared_fields=[], | ||
| 215 | - text_embedding_field="title_embedding", | ||
| 216 | - default_language="zh", | ||
| 217 | - ) | ||
| 218 | - parsed_query = SimpleNamespace( | ||
| 219 | - rewritten_query="法式 dress", | ||
| 220 | - detected_language="zh", | ||
| 221 | - translations={}, | ||
| 222 | - contains_chinese=True, | ||
| 223 | - contains_english=True, | ||
| 224 | - ) | ||
| 225 | - q = qb.build_query( | ||
| 226 | - query_text="法式 dress", | ||
| 227 | - parsed_query=parsed_query, | ||
| 228 | - enable_knn=False, | ||
| 229 | - index_languages=["zh"], | ||
| 230 | - ) | ||
| 231 | - fields = _lexical_combined_fields(q["query"]) | ||
| 232 | - bases = {f.split("^", 1)[0] for f in fields} | ||
| 233 | - assert "title.zh" in bases | ||
| 234 | - assert "title.en" not in bases |
tests/test_es_query_builder_text_recall_languages.py
| 1 | """ | 1 | """ |
| 2 | ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. | 2 | ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. |
| 3 | 3 | ||
| 4 | -Covers combinations of query language vs tenant index_languages, translations, | ||
| 5 | -and mixed Chinese/English queries. Asserts named lexical clause boundaries, | ||
| 6 | -combined_fields payloads, and per-language target fields (title.{lang}). | 4 | +Covers translation routing, mixed-script queries (per-clause language fields only), |
| 5 | +and clause naming. Asserts named lexical clause boundaries, combined_fields payloads, | ||
| 6 | +and per-language target fields (title.{lang}). | ||
| 7 | """ | 7 | """ |
| 8 | 8 | ||
| 9 | from types import SimpleNamespace | 9 | from types import SimpleNamespace |
| @@ -14,11 +14,7 @@ import numpy as np | @@ -14,11 +14,7 @@ import numpy as np | ||
| 14 | from search.es_query_builder import ESQueryBuilder | 14 | from search.es_query_builder import ESQueryBuilder |
| 15 | 15 | ||
| 16 | 16 | ||
| 17 | -def _builder_multilingual_title_only( | ||
| 18 | - *, | ||
| 19 | - default_language: str = "en", | ||
| 20 | - mixed_script_scale: float = 0.6, | ||
| 21 | -) -> ESQueryBuilder: | 17 | +def _builder_multilingual_title_only(*, default_language: str = "en") -> ESQueryBuilder: |
| 22 | """Minimal builder: only title.{lang} for easy field assertions.""" | 18 | """Minimal builder: only title.{lang} for easy field assertions.""" |
| 23 | return ESQueryBuilder( | 19 | return ESQueryBuilder( |
| 24 | match_fields=["title.en^1.0"], | 20 | match_fields=["title.en^1.0"], |
| @@ -26,7 +22,6 @@ def _builder_multilingual_title_only( | @@ -26,7 +22,6 @@ def _builder_multilingual_title_only( | ||
| 26 | shared_fields=[], | 22 | shared_fields=[], |
| 27 | text_embedding_field="title_embedding", | 23 | text_embedding_field="title_embedding", |
| 28 | default_language=default_language, | 24 | default_language=default_language, |
| 29 | - mixed_script_merged_field_boost_scale=mixed_script_scale, | ||
| 30 | function_score_config=None, | 25 | function_score_config=None, |
| 31 | ) | 26 | ) |
| 32 | 27 | ||
| @@ -101,22 +96,16 @@ def _build( | @@ -101,22 +96,16 @@ def _build( | ||
| 101 | rewritten: str, | 96 | rewritten: str, |
| 102 | detected_language: str, | 97 | detected_language: str, |
| 103 | translations: Dict[str, str], | 98 | translations: Dict[str, str], |
| 104 | - index_languages: List[str], | ||
| 105 | - contains_chinese: bool = False, | ||
| 106 | - contains_english: bool = False, | ||
| 107 | ) -> Dict[str, Any]: | 99 | ) -> Dict[str, Any]: |
| 108 | parsed = SimpleNamespace( | 100 | parsed = SimpleNamespace( |
| 109 | rewritten_query=rewritten, | 101 | rewritten_query=rewritten, |
| 110 | detected_language=detected_language, | 102 | detected_language=detected_language, |
| 111 | translations=dict(translations), | 103 | translations=dict(translations), |
| 112 | - contains_chinese=contains_chinese, | ||
| 113 | - contains_english=contains_english, | ||
| 114 | ) | 104 | ) |
| 115 | return qb.build_query( | 105 | return qb.build_query( |
| 116 | query_text=query_text, | 106 | query_text=query_text, |
| 117 | parsed_query=parsed, | 107 | parsed_query=parsed, |
| 118 | enable_knn=False, | 108 | enable_knn=False, |
| 119 | - index_languages=index_languages, | ||
| 120 | ) | 109 | ) |
| 121 | 110 | ||
| 122 | 111 | ||
| @@ -131,7 +120,6 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): | @@ -131,7 +120,6 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): | ||
| 131 | rewritten="连衣裙", | 120 | rewritten="连衣裙", |
| 132 | detected_language="zh", | 121 | detected_language="zh", |
| 133 | translations={"en": "dress"}, | 122 | translations={"en": "dress"}, |
| 134 | - index_languages=["zh", "en"], | ||
| 135 | ) | 123 | ) |
| 136 | idx = _clauses_index(q) | 124 | idx = _clauses_index(q) |
| 137 | assert set(idx) == {"base_query", "base_query_trans_en"} | 125 | assert set(idx) == {"base_query", "base_query_trans_en"} |
| @@ -149,7 +137,6 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): | @@ -149,7 +137,6 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): | ||
| 149 | rewritten="dress", | 137 | rewritten="dress", |
| 150 | detected_language="en", | 138 | detected_language="en", |
| 151 | translations={"zh": "连衣裙"}, | 139 | translations={"zh": "连衣裙"}, |
| 152 | - index_languages=["en", "zh"], | ||
| 153 | ) | 140 | ) |
| 154 | idx = _clauses_index(q) | 141 | idx = _clauses_index(q) |
| 155 | assert set(idx) == {"base_query", "base_query_trans_zh"} | 142 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| @@ -167,7 +154,6 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): | @@ -167,7 +154,6 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): | ||
| 167 | rewritten="kleid", | 154 | rewritten="kleid", |
| 168 | detected_language="de", | 155 | detected_language="de", |
| 169 | translations={"en": "dress", "fr": "robe"}, | 156 | translations={"en": "dress", "fr": "robe"}, |
| 170 | - index_languages=["de", "en", "fr"], | ||
| 171 | ) | 157 | ) |
| 172 | idx = _clauses_index(q) | 158 | idx = _clauses_index(q) |
| 173 | assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} | 159 | assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} |
| @@ -188,7 +174,6 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): | @@ -188,7 +174,6 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): | ||
| 188 | rewritten="schuh", | 174 | rewritten="schuh", |
| 189 | detected_language="de", | 175 | detected_language="de", |
| 190 | translations={"en": "shoe", "zh": "鞋"}, | 176 | translations={"en": "shoe", "zh": "鞋"}, |
| 191 | - index_languages=["en", "zh"], | ||
| 192 | ) | 177 | ) |
| 193 | idx = _clauses_index(q) | 178 | idx = _clauses_index(q) |
| 194 | assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} | 179 | assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} |
| @@ -201,10 +186,10 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): | @@ -201,10 +186,10 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): | ||
| 201 | assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost | 186 | assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost |
| 202 | 187 | ||
| 203 | 188 | ||
| 204 | -# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 --- | 189 | +# --- 中英混写:base 打在检测语种字段;翻译子句打在译文语种字段 --- |
| 205 | 190 | ||
| 206 | 191 | ||
| 207 | -def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): | 192 | +def test_mixed_zh_detected_base_clause_zh_fields_only_with_en_translation(): |
| 208 | qb = _builder_multilingual_title_only(default_language="en") | 193 | qb = _builder_multilingual_title_only(default_language="en") |
| 209 | q = _build( | 194 | q = _build( |
| 210 | qb, | 195 | qb, |
| @@ -212,19 +197,16 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): | @@ -212,19 +197,16 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): | ||
| 212 | rewritten="红色 dress", | 197 | rewritten="红色 dress", |
| 213 | detected_language="zh", | 198 | detected_language="zh", |
| 214 | translations={"en": "red dress"}, | 199 | translations={"en": "red dress"}, |
| 215 | - index_languages=["zh", "en"], | ||
| 216 | - contains_chinese=True, | ||
| 217 | - contains_english=True, | ||
| 218 | ) | 200 | ) |
| 219 | idx = _clauses_index(q) | 201 | idx = _clauses_index(q) |
| 220 | assert set(idx) == {"base_query", "base_query_trans_en"} | 202 | assert set(idx) == {"base_query", "base_query_trans_en"} |
| 221 | assert _combined_fields_clause(idx["base_query"])["query"] == "红色 dress" | 203 | assert _combined_fields_clause(idx["base_query"])["query"] == "红色 dress" |
| 222 | - assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") | 204 | + assert _has_title_lang(idx["base_query"], "zh") and not _has_title_lang(idx["base_query"], "en") |
| 223 | assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress" | 205 | assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress" |
| 224 | assert _has_title_lang(idx["base_query_trans_en"], "en") | 206 | assert _has_title_lang(idx["base_query_trans_en"], "en") |
| 225 | 207 | ||
| 226 | 208 | ||
| 227 | -def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): | 209 | +def test_mixed_en_detected_base_clause_en_fields_only_with_zh_translation(): |
| 228 | qb = _builder_multilingual_title_only(default_language="en") | 210 | qb = _builder_multilingual_title_only(default_language="en") |
| 229 | q = _build( | 211 | q = _build( |
| 230 | qb, | 212 | qb, |
| @@ -232,18 +214,15 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): | @@ -232,18 +214,15 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): | ||
| 232 | rewritten="nike 运动鞋", | 214 | rewritten="nike 运动鞋", |
| 233 | detected_language="en", | 215 | detected_language="en", |
| 234 | translations={"zh": "耐克运动鞋"}, | 216 | translations={"zh": "耐克运动鞋"}, |
| 235 | - index_languages=["zh", "en"], | ||
| 236 | - contains_chinese=True, | ||
| 237 | - contains_english=True, | ||
| 238 | ) | 217 | ) |
| 239 | idx = _clauses_index(q) | 218 | idx = _clauses_index(q) |
| 240 | assert set(idx) == {"base_query", "base_query_trans_zh"} | 219 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| 241 | assert _combined_fields_clause(idx["base_query"])["query"] == "nike 运动鞋" | 220 | assert _combined_fields_clause(idx["base_query"])["query"] == "nike 运动鞋" |
| 242 | - assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") | 221 | + assert _has_title_lang(idx["base_query"], "en") and not _has_title_lang(idx["base_query"], "zh") |
| 243 | assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "耐克运动鞋" | 222 | assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "耐克运动鞋" |
| 244 | 223 | ||
| 245 | 224 | ||
| 246 | -def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): | 225 | +def test_zh_query_no_translations_only_zh_fields(): |
| 247 | qb = _builder_multilingual_title_only(default_language="en") | 226 | qb = _builder_multilingual_title_only(default_language="en") |
| 248 | q = _build( | 227 | q = _build( |
| 249 | qb, | 228 | qb, |
| @@ -251,9 +230,6 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): | @@ -251,9 +230,6 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): | ||
| 251 | rewritten="法式 dress", | 230 | rewritten="法式 dress", |
| 252 | detected_language="zh", | 231 | detected_language="zh", |
| 253 | translations={}, | 232 | translations={}, |
| 254 | - index_languages=["zh"], | ||
| 255 | - contains_chinese=True, | ||
| 256 | - contains_english=True, | ||
| 257 | ) | 233 | ) |
| 258 | idx = _clauses_index(q) | 234 | idx = _clauses_index(q) |
| 259 | assert set(idx) == {"base_query"} | 235 | assert set(idx) == {"base_query"} |
| @@ -272,7 +248,6 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): | @@ -272,7 +248,6 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): | ||
| 272 | rewritten="NIKE", | 248 | rewritten="NIKE", |
| 273 | detected_language="en", | 249 | detected_language="en", |
| 274 | translations={"en": "NIKE", "zh": "耐克"}, | 250 | translations={"en": "NIKE", "zh": "耐克"}, |
| 275 | - index_languages=["en", "zh"], | ||
| 276 | ) | 251 | ) |
| 277 | idx = _clauses_index(q) | 252 | idx = _clauses_index(q) |
| 278 | assert set(idx) == {"base_query", "base_query_trans_zh"} | 253 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| @@ -286,7 +261,6 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): | @@ -286,7 +261,6 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): | ||
| 286 | rewritten="NIKE", | 261 | rewritten="NIKE", |
| 287 | detected_language="en", | 262 | detected_language="en", |
| 288 | translations={"zh": "NIKE"}, | 263 | translations={"zh": "NIKE"}, |
| 289 | - index_languages=["en", "zh"], | ||
| 290 | ) | 264 | ) |
| 291 | idx = _clauses_index(q) | 265 | idx = _clauses_index(q) |
| 292 | assert set(idx) == {"base_query", "base_query_trans_zh"} | 266 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| @@ -304,7 +278,6 @@ def test_translation_language_key_is_normalized_case_insensitive(): | @@ -304,7 +278,6 @@ def test_translation_language_key_is_normalized_case_insensitive(): | ||
| 304 | rewritten="dress", | 278 | rewritten="dress", |
| 305 | detected_language="en", | 279 | detected_language="en", |
| 306 | translations={"ZH": "连衣裙"}, | 280 | translations={"ZH": "连衣裙"}, |
| 307 | - index_languages=["en", "zh"], | ||
| 308 | ) | 281 | ) |
| 309 | idx = _clauses_index(q) | 282 | idx = _clauses_index(q) |
| 310 | assert "base_query_trans_zh" in idx | 283 | assert "base_query_trans_zh" in idx |
| @@ -319,17 +292,16 @@ def test_empty_translation_value_is_skipped(): | @@ -319,17 +292,16 @@ def test_empty_translation_value_is_skipped(): | ||
| 319 | rewritten="dress", | 292 | rewritten="dress", |
| 320 | detected_language="en", | 293 | detected_language="en", |
| 321 | translations={"zh": " ", "fr": "robe"}, | 294 | translations={"zh": " ", "fr": "robe"}, |
| 322 | - index_languages=["en", "zh", "fr"], | ||
| 323 | ) | 295 | ) |
| 324 | idx = _clauses_index(q) | 296 | idx = _clauses_index(q) |
| 325 | assert "base_query_trans_zh" not in idx | 297 | assert "base_query_trans_zh" not in idx |
| 326 | assert "base_query_trans_fr" in idx | 298 | assert "base_query_trans_fr" in idx |
| 327 | 299 | ||
| 328 | 300 | ||
| 329 | -# --- index_languages 为空:视为「未约束」source_in_index 为 True --- | 301 | +# --- base 子句无 bool.boost;翻译子句带 translation_boost;phrase should 继承 phrase_match_boost --- |
| 330 | 302 | ||
| 331 | 303 | ||
| 332 | -def test_empty_index_languages_treats_source_as_in_index_boosts(): | 304 | +def test_de_base_and_en_translation_phrase_boosts(): |
| 333 | qb = _builder_multilingual_title_only(default_language="en") | 305 | qb = _builder_multilingual_title_only(default_language="en") |
| 334 | q = _build( | 306 | q = _build( |
| 335 | qb, | 307 | qb, |
| @@ -337,7 +309,6 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): | @@ -337,7 +309,6 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): | ||
| 337 | rewritten="x", | 309 | rewritten="x", |
| 338 | detected_language="de", | 310 | detected_language="de", |
| 339 | translations={"en": "y"}, | 311 | translations={"en": "y"}, |
| 340 | - index_languages=[], | ||
| 341 | ) | 312 | ) |
| 342 | idx = _clauses_index(q) | 313 | idx = _clauses_index(q) |
| 343 | assert "boost" not in idx["base_query"] | 314 | assert "boost" not in idx["base_query"] |
| @@ -359,7 +330,6 @@ def test_no_translations_only_base_query(): | @@ -359,7 +330,6 @@ def test_no_translations_only_base_query(): | ||
| 359 | rewritten="hello", | 330 | rewritten="hello", |
| 360 | detected_language="en", | 331 | detected_language="en", |
| 361 | translations={}, | 332 | translations={}, |
| 362 | - index_languages=["en", "zh"], | ||
| 363 | ) | 333 | ) |
| 364 | idx = _clauses_index(q) | 334 | idx = _clauses_index(q) |
| 365 | assert set(idx) == {"base_query"} | 335 | assert set(idx) == {"base_query"} |
| @@ -374,15 +344,12 @@ def test_text_clauses_present_alongside_knn(): | @@ -374,15 +344,12 @@ def test_text_clauses_present_alongside_knn(): | ||
| 374 | rewritten_query="dress", | 344 | rewritten_query="dress", |
| 375 | detected_language="en", | 345 | detected_language="en", |
| 376 | translations={"zh": "连衣裙"}, | 346 | translations={"zh": "连衣裙"}, |
| 377 | - contains_chinese=False, | ||
| 378 | - contains_english=True, | ||
| 379 | ) | 347 | ) |
| 380 | q = qb.build_query( | 348 | q = qb.build_query( |
| 381 | query_text="dress", | 349 | query_text="dress", |
| 382 | query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32), | 350 | query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32), |
| 383 | parsed_query=parsed, | 351 | parsed_query=parsed, |
| 384 | enable_knn=True, | 352 | enable_knn=True, |
| 385 | - index_languages=["en", "zh"], | ||
| 386 | ) | 353 | ) |
| 387 | assert "knn" in q | 354 | assert "knn" in q |
| 388 | idx = _clauses_index(q) | 355 | idx = _clauses_index(q) |
| @@ -396,14 +363,11 @@ def test_detected_language_unknown_falls_back_to_default_language(): | @@ -396,14 +363,11 @@ def test_detected_language_unknown_falls_back_to_default_language(): | ||
| 396 | rewritten_query="shirt", | 363 | rewritten_query="shirt", |
| 397 | detected_language="unknown", | 364 | detected_language="unknown", |
| 398 | translations={"zh": "衬衫"}, | 365 | translations={"zh": "衬衫"}, |
| 399 | - contains_chinese=False, | ||
| 400 | - contains_english=True, | ||
| 401 | ) | 366 | ) |
| 402 | q = qb.build_query( | 367 | q = qb.build_query( |
| 403 | query_text="shirt", | 368 | query_text="shirt", |
| 404 | parsed_query=parsed, | 369 | parsed_query=parsed, |
| 405 | enable_knn=False, | 370 | enable_knn=False, |
| 406 | - index_languages=["en", "zh"], | ||
| 407 | ) | 371 | ) |
| 408 | idx = _clauses_index(q) | 372 | idx = _clauses_index(q) |
| 409 | assert set(idx) == {"base_query", "base_query_trans_zh"} | 373 | assert set(idx) == {"base_query", "base_query_trans_zh"} |
| @@ -419,7 +383,6 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | @@ -419,7 +383,6 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | ||
| 419 | rewritten="платье", | 383 | rewritten="платье", |
| 420 | detected_language="ru", | 384 | detected_language="ru", |
| 421 | translations={"en": "dress"}, | 385 | translations={"en": "dress"}, |
| 422 | - index_languages=["ru", "en"], | ||
| 423 | ) | 386 | ) |
| 424 | idx = _clauses_index(q) | 387 | idx = _clauses_index(q) |
| 425 | assert set(idx) == {"base_query", "base_query_trans_en"} | 388 | assert set(idx) == {"base_query", "base_query_trans_en"} |
| @@ -428,11 +391,8 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | @@ -428,11 +391,8 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | ||
| 428 | assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" | 391 | assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" |
| 429 | 392 | ||
| 430 | 393 | ||
| 431 | -def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): | ||
| 432 | - """ | ||
| 433 | - 当前实现:凡是 translations 里非空的条目都会生成子句; | ||
| 434 | - index_languages 只约束混写扩列,不用于过滤翻译子句。 | ||
| 435 | - """ | 394 | +def test_translation_generates_clause_for_any_target_lang_key(): |
| 395 | + """translations 里非空的每个语种键都会生成对应 base_query_trans_* 子句。""" | ||
| 436 | qb = _builder_multilingual_title_only(default_language="en") | 396 | qb = _builder_multilingual_title_only(default_language="en") |
| 437 | q = _build( | 397 | q = _build( |
| 438 | qb, | 398 | qb, |
| @@ -440,7 +400,6 @@ def test_translation_for_lang_not_listed_in_index_languages_still_generates_clau | @@ -440,7 +400,6 @@ def test_translation_for_lang_not_listed_in_index_languages_still_generates_clau | ||
| 440 | rewritten="dress", | 400 | rewritten="dress", |
| 441 | detected_language="en", | 401 | detected_language="en", |
| 442 | translations={"zh": "连衣裙", "de": "Kleid"}, | 402 | translations={"zh": "连衣裙", "de": "Kleid"}, |
| 443 | - index_languages=["en", "zh"], | ||
| 444 | ) | 403 | ) |
| 445 | idx = _clauses_index(q) | 404 | idx = _clauses_index(q) |
| 446 | assert "base_query_trans_de" in idx | 405 | assert "base_query_trans_de" in idx |
| @@ -457,9 +416,6 @@ def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_bas | @@ -457,9 +416,6 @@ def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_bas | ||
| 457 | rewritten="红色连衣裙", | 416 | rewritten="红色连衣裙", |
| 458 | detected_language="zh", | 417 | detected_language="zh", |
| 459 | translations={"en": "red dress"}, | 418 | translations={"en": "red dress"}, |
| 460 | - index_languages=["zh", "en"], | ||
| 461 | - contains_chinese=True, | ||
| 462 | - contains_english=False, | ||
| 463 | ) | 419 | ) |
| 464 | idx = _clauses_index(q) | 420 | idx = _clauses_index(q) |
| 465 | assert _combined_fields_clause(idx["base_query"])["query"] == "红色连衣裙" | 421 | assert _combined_fields_clause(idx["base_query"])["query"] == "红色连衣裙" |
tests/test_query_parser_mixed_language.py
| @@ -11,14 +11,6 @@ def _tokenizer(text): | @@ -11,14 +11,6 @@ def _tokenizer(text): | ||
| 11 | return str(text).split() | 11 | return str(text).split() |
| 12 | 12 | ||
| 13 | 13 | ||
| 14 | -def test_pure_english_word_token_length_and_script(): | ||
| 15 | - assert QueryParser._is_pure_english_word_token("ab") is False | ||
| 16 | - assert QueryParser._is_pure_english_word_token("abc") is True | ||
| 17 | - assert QueryParser._is_pure_english_word_token("wi-fi") is True | ||
| 18 | - assert QueryParser._is_pure_english_word_token("连衣裙") is False | ||
| 19 | - assert QueryParser._is_pure_english_word_token("ab12") is False | ||
| 20 | - | ||
| 21 | - | ||
| 22 | def _build_config() -> SearchConfig: | 14 | def _build_config() -> SearchConfig: |
| 23 | return SearchConfig( | 15 | return SearchConfig( |
| 24 | es_index_name="test_products", | 16 | es_index_name="test_products", |
| @@ -36,7 +28,7 @@ def _build_config() -> SearchConfig: | @@ -36,7 +28,7 @@ def _build_config() -> SearchConfig: | ||
| 36 | ) | 28 | ) |
| 37 | 29 | ||
| 38 | 30 | ||
| 39 | -def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): | 31 | +def test_parse_mixed_zh_query_translates_to_en(monkeypatch): |
| 40 | parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) | 32 | parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) |
| 41 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") | 33 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") |
| 42 | 34 | ||
| @@ -48,15 +40,13 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo | @@ -48,15 +40,13 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo | ||
| 48 | ) | 40 | ) |
| 49 | 41 | ||
| 50 | assert result.detected_language == "zh" | 42 | assert result.detected_language == "zh" |
| 51 | - assert result.contains_chinese is True | ||
| 52 | - assert result.contains_english is True | ||
| 53 | assert result.translations == {"en": "法式 dress 连衣裙-en"} | 43 | assert result.translations == {"en": "法式 dress 连衣裙-en"} |
| 54 | assert result.query_tokens == ["法式", "dress", "连衣裙"] | 44 | assert result.query_tokens == ["法式", "dress", "连衣裙"] |
| 55 | assert not hasattr(result, "query_text_by_lang") | 45 | assert not hasattr(result, "query_text_by_lang") |
| 56 | assert not hasattr(result, "search_langs") | 46 | assert not hasattr(result, "search_langs") |
| 57 | 47 | ||
| 58 | 48 | ||
| 59 | -def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): | 49 | +def test_parse_mixed_en_query_translates_to_zh(monkeypatch): |
| 60 | parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) | 50 | parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) |
| 61 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") | 51 | monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") |
| 62 | 52 | ||
| @@ -68,8 +58,6 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): | @@ -68,8 +58,6 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): | ||
| 68 | ) | 58 | ) |
| 69 | 59 | ||
| 70 | assert result.detected_language == "en" | 60 | assert result.detected_language == "en" |
| 71 | - assert result.contains_chinese is True | ||
| 72 | - assert result.contains_english is True | ||
| 73 | assert result.translations == {"zh": "red 连衣裙-zh"} | 61 | assert result.translations == {"zh": "red 连衣裙-zh"} |
| 74 | assert result.query_tokens == ["red", "连衣裙"] | 62 | assert result.query_tokens == ["red", "连衣裙"] |
| 75 | 63 | ||
| @@ -87,7 +75,5 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch) | @@ -87,7 +75,5 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch) | ||
| 87 | ) | 75 | ) |
| 88 | 76 | ||
| 89 | assert result.detected_language == "en" | 77 | assert result.detected_language == "en" |
| 90 | - assert result.contains_chinese is False | ||
| 91 | - assert result.contains_english is True | ||
| 92 | assert result.translations.get("zh") == "off shoulder top-zh" | 78 | assert result.translations.get("zh") == "off shoulder top-zh" |
| 93 | assert not hasattr(result, "source_in_index_languages") | 79 | assert not hasattr(result, "source_in_index_languages") |