Commit 6823fe3e333e350417e58a7c9da6a4a9dff5253e
1 parent
1556989b
feat(search): 混合语种查询分析与跨语言字段召回
## 背景 多语言索引下,用户查询常中英混写;需在解析阶段显式标记脚本类型,并在 BM25 子句中同时覆盖对应语言字段。 ## 方案 ### 1. Query 分析(query_parser.ParsedQuery) - 新增 `contains_chinese`:query 文本含 CJK(沿用 _contains_cjk)。 - 新增 `contains_english`:分词结果中存在「纯英文、len>=3」token(fullmatch 字母及可选连字符)。 - 写入 to_dict、请求 context 中间结果,便于调试与 API 透出。 ### 2. ES 文本召回(es_query_builder._build_advanced_text_query) - 对每个 search_lang 子句:若含英文且子句语言非 en(且租户 index_languages 含 en),合并 en 列字段;若含中文且子句语言非 zh(且含 zh),合并 zh 列字段。 - 合并进来的字段 boost 乘以 `mixed_script_merged_field_boost_scale`(默认 0.8,可在 ESQueryBuilder 构造参数调整)。 - fallback_original_query_* 分支同样应用上述逻辑。 ### 3. 实现整理 - 引入 `MatchFieldSpec = (field_path, boost)`:`_build_match_field_specs` 为唯一权重来源;`_merge_supplemental_lang_field_specs` / `_expand_match_field_specs_for_mixed_script` 在 tuple 上合并与缩放;最后 `_format_match_field_specs` 再格式化为 ES `path^boost`,避免先拼字符串再解析。 ## 测试 - tests/test_query_parser_mixed_language.py:脚本标记与 token 规则。 - tests/test_es_query_builder.py:合并字段、0.8 缩放、index_languages 限制。 Made-with: Cursor
Showing
4 changed files
with
223 additions
and
22 deletions
Show diff stats
query/query_parser.py
| ... | ... | @@ -42,6 +42,8 @@ class ParsedQuery: |
| 42 | 42 | search_langs: Optional[List[str]] = None, |
| 43 | 43 | index_languages: Optional[List[str]] = None, |
| 44 | 44 | source_in_index_languages: bool = True, |
| 45 | + contains_chinese: bool = False, | |
| 46 | + contains_english: bool = False, | |
| 45 | 47 | ): |
| 46 | 48 | self.original_query = original_query |
| 47 | 49 | self.query_normalized = query_normalized |
| ... | ... | @@ -58,6 +60,8 @@ class ParsedQuery: |
| 58 | 60 | self.search_langs = search_langs or [] |
| 59 | 61 | self.index_languages = index_languages or [] |
| 60 | 62 | self.source_in_index_languages = bool(source_in_index_languages) |
| 63 | + self.contains_chinese = bool(contains_chinese) | |
| 64 | + self.contains_english = bool(contains_english) | |
| 61 | 65 | |
| 62 | 66 | def to_dict(self) -> Dict[str, Any]: |
| 63 | 67 | """Convert to dictionary representation.""" |
| ... | ... | @@ -73,6 +77,8 @@ class ParsedQuery: |
| 73 | 77 | result["search_langs"] = self.search_langs |
| 74 | 78 | result["index_languages"] = self.index_languages |
| 75 | 79 | result["source_in_index_languages"] = self.source_in_index_languages |
| 80 | + result["contains_chinese"] = self.contains_chinese | |
| 81 | + result["contains_english"] = self.contains_english | |
| 76 | 82 | return result |
| 77 | 83 | |
| 78 | 84 | |
| ... | ... | @@ -217,6 +223,16 @@ class QueryParser: |
| 217 | 223 | return bool(re.search(r"[\u4e00-\u9fff]", text or "")) |
| 218 | 224 | |
| 219 | 225 | @staticmethod |
| 226 | + def _is_pure_english_word_token(token: str) -> bool: | |
| 227 | + """ | |
| 228 | + A tokenizer token counts as English iff it is letters only (optional internal hyphens) | |
| 229 | + and length >= 3. | |
| 230 | + """ | |
| 231 | + if not token or len(token) < 3: | |
| 232 | + return False | |
| 233 | + return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token)) | |
| 234 | + | |
| 235 | + @staticmethod | |
| 220 | 236 | def _extract_latin_tokens(text: str) -> List[str]: |
| 221 | 237 | """Extract latin word tokens from query text.""" |
| 222 | 238 | return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "") |
| ... | ... | @@ -391,13 +407,18 @@ class QueryParser: |
| 391 | 407 | keywords = self._extract_keywords(query_text) |
| 392 | 408 | query_tokens = self._get_query_tokens(query_text) |
| 393 | 409 | token_count = len(query_tokens) |
| 410 | + contains_chinese = self._contains_cjk(query_text) | |
| 411 | + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) | |
| 394 | 412 | |
| 395 | 413 | log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " |
| 396 | - f"Query tokens: {query_tokens}") | |
| 414 | + f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | " | |
| 415 | + f"contains_english={contains_english}") | |
| 397 | 416 | if context: |
| 398 | 417 | context.store_intermediate_result('keywords', keywords) |
| 399 | 418 | context.store_intermediate_result('token_count', token_count) |
| 400 | 419 | context.store_intermediate_result('query_tokens', query_tokens) |
| 420 | + context.store_intermediate_result('contains_chinese', contains_chinese) | |
| 421 | + context.store_intermediate_result('contains_english', contains_english) | |
| 401 | 422 | |
| 402 | 423 | # Stage 6: Text embedding (only for non-short queries) - async execution |
| 403 | 424 | query_vector = None |
| ... | ... | @@ -578,6 +599,8 @@ class QueryParser: |
| 578 | 599 | search_langs=ordered_search_langs, |
| 579 | 600 | index_languages=index_langs, |
| 580 | 601 | source_in_index_languages=source_in_index_languages, |
| 602 | + contains_chinese=contains_chinese, | |
| 603 | + contains_english=contains_english, | |
| 581 | 604 | ) |
| 582 | 605 | |
| 583 | 606 | if context and hasattr(context, 'logger'): | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -9,9 +9,13 @@ Simplified architecture: |
| 9 | 9 | """ |
| 10 | 10 | |
| 11 | 11 | from typing import Dict, Any, List, Optional, Union, Tuple |
| 12 | + | |
| 12 | 13 | import numpy as np |
| 13 | 14 | from config import FunctionScoreConfig |
| 14 | 15 | |
| 16 | +# (Elasticsearch field path, boost before formatting as "path^boost") | |
| 17 | +MatchFieldSpec = Tuple[str, float] | |
| 18 | + | |
| 15 | 19 | |
| 16 | 20 | class ESQueryBuilder: |
| 17 | 21 | """Builds Elasticsearch DSL queries.""" |
| ... | ... | @@ -36,6 +40,7 @@ class ESQueryBuilder: |
| 36 | 40 | source_boost_when_missing: float = 0.6, |
| 37 | 41 | original_query_fallback_boost_when_translation_missing: float = 0.2, |
| 38 | 42 | tie_breaker_base_query: float = 0.9, |
|  | 43 | + mixed_script_merged_field_boost_scale: float = 0.8, | |
| 39 | 44 | ): |
| 40 | 45 | """ |
| 41 | 46 | Initialize query builder. |
| ... | ... | @@ -51,6 +56,7 @@ class ESQueryBuilder: |
| 51 | 56 | function_score_config: Function score configuration |
| 52 | 57 | default_language: Default language to use when detection fails or returns "unknown" |
| 53 | 58 | knn_boost: Boost value for KNN (embedding recall) |
| 59 | + mixed_script_merged_field_boost_scale: Multiply per-field ^boost for cross-script merged fields | |
| 54 | 60 | """ |
| 55 | 61 | self.match_fields = match_fields |
| 56 | 62 | self.field_boosts = field_boosts or {} |
| ... | ... | @@ -74,6 +80,7 @@ class ESQueryBuilder: |
| 74 | 80 | original_query_fallback_boost_when_translation_missing |
| 75 | 81 | ) |
| 76 | 82 | self.tie_breaker_base_query = float(tie_breaker_base_query) |
| 83 | + self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) | |
| 77 | 84 | |
| 78 | 85 | def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: |
| 79 | 86 | """ |
| ... | ... | @@ -414,7 +421,7 @@ class ESQueryBuilder: |
| 414 | 421 | def _format_field_with_boost(self, field_name: str, boost: float) -> str: |
| 415 | 422 | if abs(float(boost) - 1.0) < 1e-9: |
| 416 | 423 | return field_name |
| 417 | - return f"{field_name}^{boost}" | |
| 424 | + return f"{field_name}^{round(boost, 2)}" | |
| 418 | 425 | |
| 419 | 426 | def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float: |
| 420 | 427 | # Language-specific override first (e.g. title.de), then base field (e.g. title) |
| ... | ... | @@ -426,36 +433,74 @@ class ESQueryBuilder: |
| 426 | 433 | return float(self.field_boosts[base_field]) |
| 427 | 434 | return 1.0 |
| 428 | 435 | |
| 429 | - def _get_match_fields(self, language: str) -> Tuple[List[str], List[str]]: | |
| 436 | + def _build_match_field_specs(self, language: str) -> Tuple[List[MatchFieldSpec], List[MatchFieldSpec]]: | |
| 430 | 437 | """ |
| 431 | - Build dynamic match fields for one language. | |
| 432 | - | |
| 433 | - Args: | |
| 434 | - language: Language code (e.g. zh/en/de/fr/...) | |
| 435 | - | |
| 436 | - Returns: | |
| 437 | - (all_fields, core_fields) - core_fields are for phrase/keyword queries | |
| 438 | + Per-language match targets as (field_path, boost). Single source of truth before string formatting. | |
| 439 | + Returns (all_fields, core_fields); core_fields are for phrase/keyword strategies elsewhere. | |
| 438 | 440 | """ |
| 439 | 441 | lang = (language or "").strip().lower() |
| 440 | - all_fields: List[str] = [] | |
| 441 | - core_fields: List[str] = [] | |
| 442 | + all_specs: List[MatchFieldSpec] = [] | |
| 443 | + core_specs: List[MatchFieldSpec] = [] | |
| 442 | 444 | |
| 443 | 445 | for base in self.multilingual_fields: |
| 444 | 446 | field = f"{base}.{lang}" |
| 445 | - boost = self._get_field_boost(base, lang) | |
| 446 | - all_fields.append(self._format_field_with_boost(field, boost)) | |
| 447 | + all_specs.append((field, self._get_field_boost(base, lang))) | |
| 447 | 448 | |
| 448 | 449 | for shared in self.shared_fields: |
| 449 | - boost = self._get_field_boost(shared, None) | |
| 450 | - all_fields.append(self._format_field_with_boost(shared, boost)) | |
| 450 | + all_specs.append((shared, self._get_field_boost(shared, None))) | |
| 451 | 451 | |
| 452 | 452 | for base in self.core_multilingual_fields: |
| 453 | 453 | field = f"{base}.{lang}" |
| 454 | - boost = self._get_field_boost(base, lang) | |
| 455 | - core_fields.append(self._format_field_with_boost(field, boost)) | |
| 454 | + core_specs.append((field, self._get_field_boost(base, lang))) | |
| 455 | + | |
| 456 | + return all_specs, core_specs | |
| 457 | + | |
| 458 | + def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]: | |
| 459 | + """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``.""" | |
| 460 | + return [self._format_field_with_boost(path, boost) for path, boost in specs] | |
| 461 | + | |
| 462 | + def _merge_supplemental_lang_field_specs( | |
| 463 | + self, | |
| 464 | + specs: List[MatchFieldSpec], | |
| 465 | + supplemental_lang: str, | |
| 466 | + ) -> List[MatchFieldSpec]: | |
| 467 | + """Append supplemental-language columns; boosts multiplied by mixed_script scale.""" | |
| 468 | + scale = float(self.mixed_script_merged_field_boost_scale) | |
| 469 | + extra_all, _ = self._build_match_field_specs(supplemental_lang) | |
| 470 | + seen = {path for path, _ in specs} | |
| 471 | + out = list(specs) | |
| 472 | + for path, boost in extra_all: | |
| 473 | + if path not in seen: | |
| 474 | + out.append((path, boost * scale)) | |
| 475 | + seen.add(path) | |
| 476 | + return out | |
| 477 | + | |
| 478 | + def _expand_match_field_specs_for_mixed_script( | |
| 479 | + self, | |
| 480 | + lang: str, | |
| 481 | + specs: List[MatchFieldSpec], | |
| 482 | + contains_chinese: bool, | |
| 483 | + contains_english: bool, | |
| 484 | + index_languages: List[str], | |
| 485 | + ) -> List[MatchFieldSpec]: | |
| 486 | + """ | |
| 487 | + When the query mixes scripts, widen each clause to indexed fields for the other script | |
| 488 | + (e.g. zh clause also searches title.en when the query contains an English word token). | |
| 489 | + """ | |
| 490 | + norm = {str(x or "").strip().lower() for x in (index_languages or []) if str(x or "").strip()} | |
| 491 | + allow = norm or {"zh", "en"} | |
| 492 | + | |
| 493 | + def can_use(lcode: str) -> bool: | |
| 494 | + return lcode in allow if norm else True | |
| 495 | + | |
| 496 | + out = list(specs) | |
| 497 | + lnorm = (lang or "").strip().lower() | |
| 498 | + if contains_english and lnorm != "en" and can_use("en"): | |
| 499 | + out = self._merge_supplemental_lang_field_specs(out, "en") | |
| 500 | + if contains_chinese and lnorm != "zh" and can_use("zh"): | |
| 501 | + out = self._merge_supplemental_lang_field_specs(out, "zh") | |
| 502 | + return out | |
| 456 | 503 | |
| 457 | - return all_fields, core_fields | |
| 458 | - | |
| 459 | 504 | def _get_embedding_field(self, language: str) -> str: |
| 460 | 505 | """Get embedding field name for a language.""" |
| 461 | 506 | # Currently using unified embedding field |
| ... | ... | @@ -486,6 +531,8 @@ class ESQueryBuilder: |
| 486 | 531 | source_in_index_languages = True |
| 487 | 532 | index_languages: List[str] = [] |
| 488 | 533 | |
| 534 | + contains_chinese = False | |
| 535 | + contains_english = False | |
| 489 | 536 | if parsed_query: |
| 490 | 537 | query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {} |
| 491 | 538 | search_langs = getattr(parsed_query, "search_langs", None) or [] |
| ... | ... | @@ -495,6 +542,8 @@ class ESQueryBuilder: |
| 495 | 542 | getattr(parsed_query, "source_in_index_languages", True) |
| 496 | 543 | ) |
| 497 | 544 | index_languages = getattr(parsed_query, "index_languages", None) or [] |
| 545 | + contains_chinese = bool(getattr(parsed_query, "contains_chinese", False)) | |
| 546 | + contains_english = bool(getattr(parsed_query, "contains_english", False)) | |
| 498 | 547 | |
| 499 | 548 | if not query_text_by_lang: |
| 500 | 549 | query_text_by_lang = {source_lang: query_text} |
| ... | ... | @@ -508,7 +557,15 @@ class ESQueryBuilder: |
| 508 | 557 | lang_query = query_text_by_lang.get(lang) |
| 509 | 558 | if not lang_query: |
| 510 | 559 | continue |
| 511 | - match_fields, _ = self._get_match_fields(lang) | |
| 560 | + all_specs, _ = self._build_match_field_specs(lang) | |
| 561 | + expanded_specs = self._expand_match_field_specs_for_mixed_script( | |
| 562 | + lang, | |
| 563 | + all_specs, | |
| 564 | + contains_chinese, | |
| 565 | + contains_english, | |
| 566 | + index_languages, | |
| 567 | + ) | |
| 568 | + match_fields = self._format_match_field_specs(expanded_specs) | |
| 512 | 569 | if not match_fields: |
| 513 | 570 | continue |
| 514 | 571 | |
| ... | ... | @@ -559,7 +616,15 @@ class ESQueryBuilder: |
| 559 | 616 | continue |
| 560 | 617 | if lang in query_text_by_lang: |
| 561 | 618 | continue |
| 562 | - match_fields, _ = self._get_match_fields(lang) | |
| 619 | + fb_specs, _ = self._build_match_field_specs(lang) | |
| 620 | + expanded_fb = self._expand_match_field_specs_for_mixed_script( | |
| 621 | + lang, | |
| 622 | + fb_specs, | |
| 623 | + contains_chinese, | |
| 624 | + contains_english, | |
| 625 | + index_languages, | |
| 626 | + ) | |
| 627 | + match_fields = self._format_match_field_specs(expanded_fb) | |
| 563 | 628 | if not match_fields: |
| 564 | 629 | continue |
| 565 | 630 | should_clauses.append({ | ... | ... |
tests/test_es_query_builder.py
| ... | ... | @@ -80,3 +80,102 @@ def test_text_query_contains_only_base_translation_and_fallback_named_queries(): |
| 80 | 80 | names = [clause["multi_match"]["_name"] for clause in should] |
| 81 | 81 | |
| 82 | 82 | assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"] |
| 83 | + | |
| 84 | + | |
| 85 | +def test_mixed_script_merges_en_fields_into_zh_clause(): | |
| 86 | + qb = ESQueryBuilder( | |
| 87 | + match_fields=["title.en^3.0"], | |
| 88 | + multilingual_fields=["title", "brief"], | |
| 89 | + shared_fields=[], | |
| 90 | + text_embedding_field="title_embedding", | |
| 91 | + default_language="en", | |
| 92 | + ) | |
| 93 | + parsed_query = SimpleNamespace( | |
| 94 | + query_text_by_lang={"zh": "法式 dress"}, | |
| 95 | + search_langs=["zh"], | |
| 96 | + detected_language="zh", | |
| 97 | + source_in_index_languages=True, | |
| 98 | + index_languages=["zh", "en"], | |
| 99 | + contains_chinese=True, | |
| 100 | + contains_english=True, | |
| 101 | + ) | |
| 102 | + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) | |
| 103 | + fields = q["query"]["multi_match"]["fields"] | |
| 104 | + bases = {f.split("^", 1)[0] for f in fields} | |
| 105 | + assert "title.zh" in bases and "title.en" in bases | |
| 106 | + assert "brief.zh" in bases and "brief.en" in bases | |
| 107 | + # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8) | |
| 108 | + assert "title.en^0.8" in fields | |
| 109 | + assert "brief.en^0.8" in fields | |
| 110 | + | |
| 111 | + | |
| 112 | +def test_mixed_script_merges_zh_fields_into_en_clause(): | |
| 113 | + qb = ESQueryBuilder( | |
| 114 | + match_fields=["title.en^3.0"], | |
| 115 | + multilingual_fields=["title"], | |
| 116 | + shared_fields=[], | |
| 117 | + text_embedding_field="title_embedding", | |
| 118 | + default_language="en", | |
| 119 | + ) | |
| 120 | + parsed_query = SimpleNamespace( | |
| 121 | + query_text_by_lang={"en": "red 连衣裙"}, | |
| 122 | + search_langs=["en"], | |
| 123 | + detected_language="en", | |
| 124 | + source_in_index_languages=True, | |
| 125 | + index_languages=["zh", "en"], | |
| 126 | + contains_chinese=True, | |
| 127 | + contains_english=True, | |
| 128 | + ) | |
| 129 | + q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False) | |
| 130 | + fields = q["query"]["multi_match"]["fields"] | |
| 131 | + bases = {f.split("^", 1)[0] for f in fields} | |
| 132 | + assert "title.en" in bases and "title.zh" in bases | |
| 133 | + assert "title.zh^0.8" in fields | |
| 134 | + | |
| 135 | + | |
| 136 | +def test_mixed_script_merged_fields_scale_configured_boosts(): | |
| 137 | + qb = ESQueryBuilder( | |
| 138 | + match_fields=["title.en^3.0"], | |
| 139 | + multilingual_fields=["title"], | |
| 140 | + shared_fields=[], | |
| 141 | + field_boosts={"title.zh": 5.0, "title.en": 10.0}, | |
| 142 | + text_embedding_field="title_embedding", | |
| 143 | + default_language="en", | |
| 144 | + ) | |
| 145 | + parsed_query = SimpleNamespace( | |
| 146 | + query_text_by_lang={"zh": "法式 dress"}, | |
| 147 | + search_langs=["zh"], | |
| 148 | + detected_language="zh", | |
| 149 | + source_in_index_languages=True, | |
| 150 | + index_languages=["zh", "en"], | |
| 151 | + contains_chinese=True, | |
| 152 | + contains_english=True, | |
| 153 | + ) | |
| 154 | + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) | |
| 155 | + fields = q["query"]["multi_match"]["fields"] | |
| 156 | + assert "title.zh^5.0" in fields | |
| 157 | + assert "title.en^8.0" in fields # 10.0 * 0.8 | |
| 158 | + | |
| 159 | + | |
| 160 | +def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): | |
| 161 | + qb = ESQueryBuilder( | |
| 162 | + match_fields=["title.zh^3.0"], | |
| 163 | + multilingual_fields=["title"], | |
| 164 | + shared_fields=[], | |
| 165 | + text_embedding_field="title_embedding", | |
| 166 | + default_language="zh", | |
| 167 | + ) | |
| 168 | + parsed_query = SimpleNamespace( | |
| 169 | + query_text_by_lang={"zh": "法式 dress"}, | |
| 170 | + search_langs=["zh"], | |
| 171 | + detected_language="zh", | |
| 172 | + source_in_index_languages=True, | |
| 173 | + index_languages=["zh"], | |
| 174 | + contains_chinese=True, | |
| 175 | + contains_english=True, | |
| 176 | + ) | |
| 177 | + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) | |
| 178 | + fields = q["query"]["multi_match"]["fields"] | |
| 179 | + bases = {f.split("^", 1)[0] for f in fields} | |
| 180 | + assert "title.zh" in bases | |
| 181 | + assert "title.en" not in bases | ... | ... |
tests/test_query_parser_mixed_language.py
| ... | ... | @@ -9,6 +9,14 @@ class _DummyTranslator: |
| 9 | 9 | return f"{text}-{target_lang}" |
| 10 | 10 | |
| 11 | 11 | |
| 12 | +def test_pure_english_word_token_length_and_script(): | |
| 13 | + assert QueryParser._is_pure_english_word_token("ab") is False | |
| 14 | + assert QueryParser._is_pure_english_word_token("abc") is True | |
| 15 | + assert QueryParser._is_pure_english_word_token("wi-fi") is True | |
| 16 | + assert QueryParser._is_pure_english_word_token("连衣裙") is False | |
| 17 | + assert QueryParser._is_pure_english_word_token("ab12") is False | |
| 18 | + | |
| 19 | + | |
| 12 | 20 | def _build_config() -> SearchConfig: |
| 13 | 21 | return SearchConfig( |
| 14 | 22 | es_index_name="test_products", |
| ... | ... | @@ -38,6 +46,8 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo |
| 38 | 46 | result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) |
| 39 | 47 | |
| 40 | 48 | assert result.detected_language == "zh" |
| 49 | + assert result.contains_chinese is True | |
| 50 | + assert result.contains_english is True | |
| 41 | 51 | assert "en" in result.search_langs |
| 42 | 52 | # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测) |
| 43 | 53 | assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en" |
| ... | ... | @@ -56,6 +66,8 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): |
| 56 | 66 | result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) |
| 57 | 67 | |
| 58 | 68 | assert result.detected_language == "en" |
| 69 | + assert result.contains_chinese is True | |
| 70 | + assert result.contains_english is True | |
| 59 | 71 | assert "zh" in result.search_langs |
| 60 | 72 | assert result.query_text_by_lang["zh"] == "red 连衣裙-zh" |
| 61 | 73 | assert result.query_text_by_lang["en"] == "red 连衣裙" |
| ... | ... | @@ -74,6 +86,8 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch) |
| 74 | 86 | result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False) |
| 75 | 87 | |
| 76 | 88 | assert result.detected_language == "en" |
| 89 | + assert result.contains_chinese is False | |
| 90 | + assert result.contains_english is True | |
| 77 | 91 | assert result.translations.get("zh") == "off shoulder top-zh" |
| 78 | 92 | assert result.query_text_by_lang.get("zh") == "off shoulder top-zh" |
| 79 | 93 | assert result.source_in_index_languages is True | ... | ... |