Commit 6823fe3e333e350417e58a7c9da6a4a9dff5253e

Authored by tangwang
1 parent 1556989b

feat(search): 混合语种查询分析与跨语言字段召回

## 背景
多语言索引下,用户查询常中英混写;需在解析阶段显式标记脚本类型,并在 BM25 子句中同时覆盖对应语言字段。

## 方案

### 1. Query 分析(query_parser.ParsedQuery)
- 新增 `contains_chinese`:query 文本含 CJK(沿用 _contains_cjk)。
- 新增 `contains_english`:分词结果中存在「纯英文、len>=3」token(fullmatch 字母及可选连字符)。
- 写入 to_dict、请求 context 中间结果,便于调试与 API 透出。

### 2. ES 文本召回(es_query_builder._build_advanced_text_query)
- 对每个 search_lang 子句:若含英文且子句语言非 en(且租户 index_languages 含 en),合并 en 列字段;若含中文且子句语言非 zh(且含 zh),合并 zh 列字段。
- 合并进来的字段 boost 乘以 `mixed_script_merged_field_boost_scale`(默认 0.8,可在 ESQueryBuilder 构造参数调整)。
- fallback_original_query_* 分支同样应用上述逻辑。

### 3. 实现整理
- 引入 `MatchFieldSpec = (field_path, boost)`:`_build_match_field_specs` 为唯一权重来源;`_merge_supplemental_lang_field_specs` / `_expand_match_field_specs_for_mixed_script` 在 tuple 上合并与缩放;最后 `_format_match_field_specs` 再格式化为 ES `path^boost`,避免先拼字符串再解析。

## 测试
- tests/test_query_parser_mixed_language.py:脚本标记与 token 规则。
- tests/test_es_query_builder.py:合并字段、0.8 缩放、index_languages 限制。

Made-with: Cursor
query/query_parser.py
@@ -42,6 +42,8 @@ class ParsedQuery: @@ -42,6 +42,8 @@ class ParsedQuery:
42 search_langs: Optional[List[str]] = None, 42 search_langs: Optional[List[str]] = None,
43 index_languages: Optional[List[str]] = None, 43 index_languages: Optional[List[str]] = None,
44 source_in_index_languages: bool = True, 44 source_in_index_languages: bool = True,
  45 + contains_chinese: bool = False,
  46 + contains_english: bool = False,
45 ): 47 ):
46 self.original_query = original_query 48 self.original_query = original_query
47 self.query_normalized = query_normalized 49 self.query_normalized = query_normalized
@@ -58,6 +60,8 @@ class ParsedQuery: @@ -58,6 +60,8 @@ class ParsedQuery:
58 self.search_langs = search_langs or [] 60 self.search_langs = search_langs or []
59 self.index_languages = index_languages or [] 61 self.index_languages = index_languages or []
60 self.source_in_index_languages = bool(source_in_index_languages) 62 self.source_in_index_languages = bool(source_in_index_languages)
  63 + self.contains_chinese = bool(contains_chinese)
  64 + self.contains_english = bool(contains_english)
61 65
62 def to_dict(self) -> Dict[str, Any]: 66 def to_dict(self) -> Dict[str, Any]:
63 """Convert to dictionary representation.""" 67 """Convert to dictionary representation."""
@@ -73,6 +77,8 @@ class ParsedQuery: @@ -73,6 +77,8 @@ class ParsedQuery:
73 result["search_langs"] = self.search_langs 77 result["search_langs"] = self.search_langs
74 result["index_languages"] = self.index_languages 78 result["index_languages"] = self.index_languages
75 result["source_in_index_languages"] = self.source_in_index_languages 79 result["source_in_index_languages"] = self.source_in_index_languages
  80 + result["contains_chinese"] = self.contains_chinese
  81 + result["contains_english"] = self.contains_english
76 return result 82 return result
77 83
78 84
@@ -217,6 +223,16 @@ class QueryParser: @@ -217,6 +223,16 @@ class QueryParser:
217 return bool(re.search(r"[\u4e00-\u9fff]", text or "")) 223 return bool(re.search(r"[\u4e00-\u9fff]", text or ""))
218 224
219 @staticmethod 225 @staticmethod
  226 + def _is_pure_english_word_token(token: str) -> bool:
  227 + """
  228 + A tokenizer token counts as English iff it is letters only (optional internal hyphens)
  229 + and length >= 3.
  230 + """
  231 + if not token or len(token) < 3:
  232 + return False
  233 + return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token))
  234 +
  235 + @staticmethod
220 def _extract_latin_tokens(text: str) -> List[str]: 236 def _extract_latin_tokens(text: str) -> List[str]:
221 """Extract latin word tokens from query text.""" 237 """Extract latin word tokens from query text."""
222 return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "") 238 return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "")
@@ -391,13 +407,18 @@ class QueryParser: @@ -391,13 +407,18 @@ class QueryParser:
391 keywords = self._extract_keywords(query_text) 407 keywords = self._extract_keywords(query_text)
392 query_tokens = self._get_query_tokens(query_text) 408 query_tokens = self._get_query_tokens(query_text)
393 token_count = len(query_tokens) 409 token_count = len(query_tokens)
  410 + contains_chinese = self._contains_cjk(query_text)
  411 + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)
394 412
395 log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " 413 log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | "
396 - f"Query tokens: {query_tokens}") 414 + f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | "
  415 + f"contains_english={contains_english}")
397 if context: 416 if context:
398 context.store_intermediate_result('keywords', keywords) 417 context.store_intermediate_result('keywords', keywords)
399 context.store_intermediate_result('token_count', token_count) 418 context.store_intermediate_result('token_count', token_count)
400 context.store_intermediate_result('query_tokens', query_tokens) 419 context.store_intermediate_result('query_tokens', query_tokens)
  420 + context.store_intermediate_result('contains_chinese', contains_chinese)
  421 + context.store_intermediate_result('contains_english', contains_english)
401 422
402 # Stage 6: Text embedding (only for non-short queries) - async execution 423 # Stage 6: Text embedding (only for non-short queries) - async execution
403 query_vector = None 424 query_vector = None
@@ -578,6 +599,8 @@ class QueryParser: @@ -578,6 +599,8 @@ class QueryParser:
578 search_langs=ordered_search_langs, 599 search_langs=ordered_search_langs,
579 index_languages=index_langs, 600 index_languages=index_langs,
580 source_in_index_languages=source_in_index_languages, 601 source_in_index_languages=source_in_index_languages,
  602 + contains_chinese=contains_chinese,
  603 + contains_english=contains_english,
581 ) 604 )
582 605
583 if context and hasattr(context, 'logger'): 606 if context and hasattr(context, 'logger'):
search/es_query_builder.py
@@ -9,9 +9,13 @@ Simplified architecture: @@ -9,9 +9,13 @@ Simplified architecture:
9 """ 9 """
10 10
11 from typing import Dict, Any, List, Optional, Union, Tuple 11 from typing import Dict, Any, List, Optional, Union, Tuple
  12 +
12 import numpy as np 13 import numpy as np
13 from config import FunctionScoreConfig 14 from config import FunctionScoreConfig
14 15
  16 +# (Elasticsearch field path, boost before formatting as "path^boost")
  17 +MatchFieldSpec = Tuple[str, float]
  18 +
15 19
16 class ESQueryBuilder: 20 class ESQueryBuilder:
17 """Builds Elasticsearch DSL queries.""" 21 """Builds Elasticsearch DSL queries."""
@@ -36,6 +40,7 @@ class ESQueryBuilder: @@ -36,6 +40,7 @@ class ESQueryBuilder:
36 source_boost_when_missing: float = 0.6, 40 source_boost_when_missing: float = 0.6,
37 original_query_fallback_boost_when_translation_missing: float = 0.2, 41 original_query_fallback_boost_when_translation_missing: float = 0.2,
38 tie_breaker_base_query: float = 0.9, 42 tie_breaker_base_query: float = 0.9,
  43 + mixed_script_merged_field_boost_scale: float = 0.8,
39 ): 44 ):
40 """ 45 """
41 Initialize query builder. 46 Initialize query builder.
@@ -51,6 +56,7 @@ class ESQueryBuilder: @@ -51,6 +56,7 @@ class ESQueryBuilder:
51 function_score_config: Function score configuration 56 function_score_config: Function score configuration
52 default_language: Default language to use when detection fails or returns "unknown" 57 default_language: Default language to use when detection fails or returns "unknown"
53 knn_boost: Boost value for KNN (embedding recall) 58 knn_boost: Boost value for KNN (embedding recall)
  59 + mixed_script_merged_field_boost_scale: Multiply per-field ^boost for cross-script merged fields
54 """ 60 """
55 self.match_fields = match_fields 61 self.match_fields = match_fields
56 self.field_boosts = field_boosts or {} 62 self.field_boosts = field_boosts or {}
@@ -74,6 +80,7 @@ class ESQueryBuilder: @@ -74,6 +80,7 @@ class ESQueryBuilder:
74 original_query_fallback_boost_when_translation_missing 80 original_query_fallback_boost_when_translation_missing
75 ) 81 )
76 self.tie_breaker_base_query = float(tie_breaker_base_query) 82 self.tie_breaker_base_query = float(tie_breaker_base_query)
  83 + self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale)
77 84
78 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: 85 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None:
79 """ 86 """
@@ -414,7 +421,7 @@ class ESQueryBuilder: @@ -414,7 +421,7 @@ class ESQueryBuilder:
414 def _format_field_with_boost(self, field_name: str, boost: float) -> str: 421 def _format_field_with_boost(self, field_name: str, boost: float) -> str:
415 if abs(float(boost) - 1.0) < 1e-9: 422 if abs(float(boost) - 1.0) < 1e-9:
416 return field_name 423 return field_name
417 - return f"{field_name}^{boost}" 424 + return f"{field_name}^{round(boost, 2)}"
418 425
419 def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float: 426 def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float:
420 # Language-specific override first (e.g. title.de), then base field (e.g. title) 427 # Language-specific override first (e.g. title.de), then base field (e.g. title)
@@ -426,36 +433,74 @@ class ESQueryBuilder: @@ -426,36 +433,74 @@ class ESQueryBuilder:
426 return float(self.field_boosts[base_field]) 433 return float(self.field_boosts[base_field])
427 return 1.0 434 return 1.0
428 435
429 - def _get_match_fields(self, language: str) -> Tuple[List[str], List[str]]: 436 + def _build_match_field_specs(self, language: str) -> Tuple[List[MatchFieldSpec], List[MatchFieldSpec]]:
430 """ 437 """
431 - Build dynamic match fields for one language.  
432 -  
433 - Args:  
434 - language: Language code (e.g. zh/en/de/fr/...)  
435 -  
436 - Returns:  
437 - (all_fields, core_fields) - core_fields are for phrase/keyword queries 438 + Per-language match targets as (field_path, boost). Single source of truth before string formatting.
  439 + Returns (all_fields, core_fields); core_fields are for phrase/keyword strategies elsewhere.
438 """ 440 """
439 lang = (language or "").strip().lower() 441 lang = (language or "").strip().lower()
440 - all_fields: List[str] = []  
441 - core_fields: List[str] = [] 442 + all_specs: List[MatchFieldSpec] = []
  443 + core_specs: List[MatchFieldSpec] = []
442 444
443 for base in self.multilingual_fields: 445 for base in self.multilingual_fields:
444 field = f"{base}.{lang}" 446 field = f"{base}.{lang}"
445 - boost = self._get_field_boost(base, lang)  
446 - all_fields.append(self._format_field_with_boost(field, boost)) 447 + all_specs.append((field, self._get_field_boost(base, lang)))
447 448
448 for shared in self.shared_fields: 449 for shared in self.shared_fields:
449 - boost = self._get_field_boost(shared, None)  
450 - all_fields.append(self._format_field_with_boost(shared, boost)) 450 + all_specs.append((shared, self._get_field_boost(shared, None)))
451 451
452 for base in self.core_multilingual_fields: 452 for base in self.core_multilingual_fields:
453 field = f"{base}.{lang}" 453 field = f"{base}.{lang}"
454 - boost = self._get_field_boost(base, lang)  
455 - core_fields.append(self._format_field_with_boost(field, boost)) 454 + core_specs.append((field, self._get_field_boost(base, lang)))
  455 +
  456 + return all_specs, core_specs
  457 +
  458 + def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]:
  459 + """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``."""
  460 + return [self._format_field_with_boost(path, boost) for path, boost in specs]
  461 +
  462 + def _merge_supplemental_lang_field_specs(
  463 + self,
  464 + specs: List[MatchFieldSpec],
  465 + supplemental_lang: str,
  466 + ) -> List[MatchFieldSpec]:
  467 + """Append supplemental-language columns; boosts multiplied by mixed_script scale."""
  468 + scale = float(self.mixed_script_merged_field_boost_scale)
  469 + extra_all, _ = self._build_match_field_specs(supplemental_lang)
  470 + seen = {path for path, _ in specs}
  471 + out = list(specs)
  472 + for path, boost in extra_all:
  473 + if path not in seen:
  474 + out.append((path, boost * scale))
  475 + seen.add(path)
  476 + return out
  477 +
  478 + def _expand_match_field_specs_for_mixed_script(
  479 + self,
  480 + lang: str,
  481 + specs: List[MatchFieldSpec],
  482 + contains_chinese: bool,
  483 + contains_english: bool,
  484 + index_languages: List[str],
  485 + ) -> List[MatchFieldSpec]:
  486 + """
  487 + When the query mixes scripts, widen each clause to indexed fields for the other script
  488 + (e.g. zh clause also searches title.en when the query contains an English word token).
  489 + """
  490 + norm = {str(x or "").strip().lower() for x in (index_languages or []) if str(x or "").strip()}
  491 + allow = norm or {"zh", "en"}
  492 +
  493 + def can_use(lcode: str) -> bool:
  494 + return lcode in allow if norm else True
  495 +
  496 + out = list(specs)
  497 + lnorm = (lang or "").strip().lower()
  498 + if contains_english and lnorm != "en" and can_use("en"):
  499 + out = self._merge_supplemental_lang_field_specs(out, "en")
  500 + if contains_chinese and lnorm != "zh" and can_use("zh"):
  501 + out = self._merge_supplemental_lang_field_specs(out, "zh")
  502 + return out
456 503
457 - return all_fields, core_fields  
458 -  
459 def _get_embedding_field(self, language: str) -> str: 504 def _get_embedding_field(self, language: str) -> str:
460 """Get embedding field name for a language.""" 505 """Get embedding field name for a language."""
461 # Currently using unified embedding field 506 # Currently using unified embedding field
@@ -486,6 +531,8 @@ class ESQueryBuilder: @@ -486,6 +531,8 @@ class ESQueryBuilder:
486 source_in_index_languages = True 531 source_in_index_languages = True
487 index_languages: List[str] = [] 532 index_languages: List[str] = []
488 533
  534 + contains_chinese = False
  535 + contains_english = False
489 if parsed_query: 536 if parsed_query:
490 query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {} 537 query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {}
491 search_langs = getattr(parsed_query, "search_langs", None) or [] 538 search_langs = getattr(parsed_query, "search_langs", None) or []
@@ -495,6 +542,8 @@ class ESQueryBuilder: @@ -495,6 +542,8 @@ class ESQueryBuilder:
495 getattr(parsed_query, "source_in_index_languages", True) 542 getattr(parsed_query, "source_in_index_languages", True)
496 ) 543 )
497 index_languages = getattr(parsed_query, "index_languages", None) or [] 544 index_languages = getattr(parsed_query, "index_languages", None) or []
  545 + contains_chinese = bool(getattr(parsed_query, "contains_chinese", False))
  546 + contains_english = bool(getattr(parsed_query, "contains_english", False))
498 547
499 if not query_text_by_lang: 548 if not query_text_by_lang:
500 query_text_by_lang = {source_lang: query_text} 549 query_text_by_lang = {source_lang: query_text}
@@ -508,7 +557,15 @@ class ESQueryBuilder: @@ -508,7 +557,15 @@ class ESQueryBuilder:
508 lang_query = query_text_by_lang.get(lang) 557 lang_query = query_text_by_lang.get(lang)
509 if not lang_query: 558 if not lang_query:
510 continue 559 continue
511 - match_fields, _ = self._get_match_fields(lang) 560 + all_specs, _ = self._build_match_field_specs(lang)
  561 + expanded_specs = self._expand_match_field_specs_for_mixed_script(
  562 + lang,
  563 + all_specs,
  564 + contains_chinese,
  565 + contains_english,
  566 + index_languages,
  567 + )
  568 + match_fields = self._format_match_field_specs(expanded_specs)
512 if not match_fields: 569 if not match_fields:
513 continue 570 continue
514 571
@@ -559,7 +616,15 @@ class ESQueryBuilder: @@ -559,7 +616,15 @@ class ESQueryBuilder:
559 continue 616 continue
560 if lang in query_text_by_lang: 617 if lang in query_text_by_lang:
561 continue 618 continue
562 - match_fields, _ = self._get_match_fields(lang) 619 + fb_specs, _ = self._build_match_field_specs(lang)
  620 + expanded_fb = self._expand_match_field_specs_for_mixed_script(
  621 + lang,
  622 + fb_specs,
  623 + contains_chinese,
  624 + contains_english,
  625 + index_languages,
  626 + )
  627 + match_fields = self._format_match_field_specs(expanded_fb)
563 if not match_fields: 628 if not match_fields:
564 continue 629 continue
565 should_clauses.append({ 630 should_clauses.append({
tests/test_es_query_builder.py
@@ -80,3 +80,102 @@ def test_text_query_contains_only_base_translation_and_fallback_named_queries(): @@ -80,3 +80,102 @@ def test_text_query_contains_only_base_translation_and_fallback_named_queries():
80 names = [clause["multi_match"]["_name"] for clause in should] 80 names = [clause["multi_match"]["_name"] for clause in should]
81 81
82 assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"] 82 assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"]
  83 +
  84 +
  85 +def test_mixed_script_merges_en_fields_into_zh_clause():
  86 + qb = ESQueryBuilder(
  87 + match_fields=["title.en^3.0"],
  88 + multilingual_fields=["title", "brief"],
  89 + shared_fields=[],
  90 + text_embedding_field="title_embedding",
  91 + default_language="en",
  92 + )
  93 + parsed_query = SimpleNamespace(
  94 + query_text_by_lang={"zh": "法式 dress"},
  95 + search_langs=["zh"],
  96 + detected_language="zh",
  97 + source_in_index_languages=True,
  98 + index_languages=["zh", "en"],
  99 + contains_chinese=True,
  100 + contains_english=True,
  101 + )
  102 + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  103 + fields = q["query"]["multi_match"]["fields"]
  104 + bases = {f.split("^", 1)[0] for f in fields}
  105 + assert "title.zh" in bases and "title.en" in bases
  106 + assert "brief.zh" in bases and "brief.en" in bases
  107 + # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8)
  108 + assert "title.en^0.8" in fields
  109 + assert "brief.en^0.8" in fields
  110 +
  111 +
  112 +def test_mixed_script_merges_zh_fields_into_en_clause():
  113 + qb = ESQueryBuilder(
  114 + match_fields=["title.en^3.0"],
  115 + multilingual_fields=["title"],
  116 + shared_fields=[],
  117 + text_embedding_field="title_embedding",
  118 + default_language="en",
  119 + )
  120 + parsed_query = SimpleNamespace(
  121 + query_text_by_lang={"en": "red 连衣裙"},
  122 + search_langs=["en"],
  123 + detected_language="en",
  124 + source_in_index_languages=True,
  125 + index_languages=["zh", "en"],
  126 + contains_chinese=True,
  127 + contains_english=True,
  128 + )
  129 + q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False)
  130 + fields = q["query"]["multi_match"]["fields"]
  131 + bases = {f.split("^", 1)[0] for f in fields}
  132 + assert "title.en" in bases and "title.zh" in bases
  133 + assert "title.zh^0.8" in fields
  134 +
  135 +
  136 +def test_mixed_script_merged_fields_scale_configured_boosts():
  137 + qb = ESQueryBuilder(
  138 + match_fields=["title.en^3.0"],
  139 + multilingual_fields=["title"],
  140 + shared_fields=[],
  141 + field_boosts={"title.zh": 5.0, "title.en": 10.0},
  142 + text_embedding_field="title_embedding",
  143 + default_language="en",
  144 + )
  145 + parsed_query = SimpleNamespace(
  146 + query_text_by_lang={"zh": "法式 dress"},
  147 + search_langs=["zh"],
  148 + detected_language="zh",
  149 + source_in_index_languages=True,
  150 + index_languages=["zh", "en"],
  151 + contains_chinese=True,
  152 + contains_english=True,
  153 + )
  154 + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  155 + fields = q["query"]["multi_match"]["fields"]
  156 + assert "title.zh^5.0" in fields
  157 + assert "title.en^8.0" in fields # 10.0 * 0.8
  158 +
  159 +
  160 +def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
  161 + qb = ESQueryBuilder(
  162 + match_fields=["title.zh^3.0"],
  163 + multilingual_fields=["title"],
  164 + shared_fields=[],
  165 + text_embedding_field="title_embedding",
  166 + default_language="zh",
  167 + )
  168 + parsed_query = SimpleNamespace(
  169 + query_text_by_lang={"zh": "法式 dress"},
  170 + search_langs=["zh"],
  171 + detected_language="zh",
  172 + source_in_index_languages=True,
  173 + index_languages=["zh"],
  174 + contains_chinese=True,
  175 + contains_english=True,
  176 + )
  177 + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  178 + fields = q["query"]["multi_match"]["fields"]
  179 + bases = {f.split("^", 1)[0] for f in fields}
  180 + assert "title.zh" in bases
  181 + assert "title.en" not in bases
tests/test_query_parser_mixed_language.py
@@ -9,6 +9,14 @@ class _DummyTranslator: @@ -9,6 +9,14 @@ class _DummyTranslator:
9 return f"{text}-{target_lang}" 9 return f"{text}-{target_lang}"
10 10
11 11
  12 +def test_pure_english_word_token_length_and_script():
  13 + assert QueryParser._is_pure_english_word_token("ab") is False
  14 + assert QueryParser._is_pure_english_word_token("abc") is True
  15 + assert QueryParser._is_pure_english_word_token("wi-fi") is True
  16 + assert QueryParser._is_pure_english_word_token("连衣裙") is False
  17 + assert QueryParser._is_pure_english_word_token("ab12") is False
  18 +
  19 +
12 def _build_config() -> SearchConfig: 20 def _build_config() -> SearchConfig:
13 return SearchConfig( 21 return SearchConfig(
14 es_index_name="test_products", 22 es_index_name="test_products",
@@ -38,6 +46,8 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo @@ -38,6 +46,8 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo
38 result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) 46 result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False)
39 47
40 assert result.detected_language == "zh" 48 assert result.detected_language == "zh"
  49 + assert result.contains_chinese is True
  50 + assert result.contains_english is True
41 assert "en" in result.search_langs 51 assert "en" in result.search_langs
42 # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测) 52 # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测)
43 assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en" 53 assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en"
@@ -56,6 +66,8 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): @@ -56,6 +66,8 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
56 result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) 66 result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False)
57 67
58 assert result.detected_language == "en" 68 assert result.detected_language == "en"
  69 + assert result.contains_chinese is True
  70 + assert result.contains_english is True
59 assert "zh" in result.search_langs 71 assert "zh" in result.search_langs
60 assert result.query_text_by_lang["zh"] == "red 连衣裙-zh" 72 assert result.query_text_by_lang["zh"] == "red 连衣裙-zh"
61 assert result.query_text_by_lang["en"] == "red 连衣裙" 73 assert result.query_text_by_lang["en"] == "red 连衣裙"
@@ -74,6 +86,8 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch) @@ -74,6 +86,8 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch)
74 result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False) 86 result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False)
75 87
76 assert result.detected_language == "en" 88 assert result.detected_language == "en"
  89 + assert result.contains_chinese is False
  90 + assert result.contains_english is True
77 assert result.translations.get("zh") == "off shoulder top-zh" 91 assert result.translations.get("zh") == "off shoulder top-zh"
78 assert result.query_text_by_lang.get("zh") == "off shoulder top-zh" 92 assert result.query_text_by_lang.get("zh") == "off shoulder top-zh"
79 assert result.source_in_index_languages is True 93 assert result.source_in_index_languages is True