Commit 6823fe3e333e350417e58a7c9da6a4a9dff5253e

Authored by tangwang
1 parent 1556989b

feat(search): 混合语种查询分析与跨语言字段召回

## 背景
多语言索引下,用户查询常中英混写;需在解析阶段显式标记脚本类型,并在 BM25 子句中同时覆盖对应语言字段。

## 方案

### 1. Query 分析(query_parser.ParsedQuery)
- 新增 `contains_chinese`:query 文本含 CJK(沿用 _contains_cjk)。
- 新增 `contains_english`:分词结果中存在「纯英文、len>=3」token(fullmatch 字母及可选连字符)。
- 写入 to_dict、请求 context 中间结果,便于调试与 API 透出。

### 2. ES 文本召回(es_query_builder._build_advanced_text_query)
- 对每个 search_lang 子句:若含英文且子句语言非 en(且租户 index_languages 含 en),合并 en 列字段;若含中文且子句语言非 zh(且含 zh),合并 zh 列字段。
- 合并进来的字段 boost 乘以 `mixed_script_merged_field_boost_scale`(默认 0.8,可在 ESQueryBuilder 构造参数调整)。
- fallback_original_query_* 分支同样应用上述逻辑。

### 3. 实现整理
- 引入 `MatchFieldSpec = (field_path, boost)`:`_build_match_field_specs` 为唯一权重来源;`_merge_supplemental_lang_field_specs` / `_expand_match_field_specs_for_mixed_script` 在 tuple 上合并与缩放;最后 `_format_match_field_specs` 再格式化为 ES `path^boost`,避免先拼字符串再解析。

## 测试
- tests/test_query_parser_mixed_language.py:脚本标记与 token 规则。
- tests/test_es_query_builder.py:合并字段、0.8 缩放、index_languages 限制。

Made-with: Cursor
query/query_parser.py
... ... @@ -42,6 +42,8 @@ class ParsedQuery:
42 42 search_langs: Optional[List[str]] = None,
43 43 index_languages: Optional[List[str]] = None,
44 44 source_in_index_languages: bool = True,
  45 + contains_chinese: bool = False,
  46 + contains_english: bool = False,
45 47 ):
46 48 self.original_query = original_query
47 49 self.query_normalized = query_normalized
... ... @@ -58,6 +60,8 @@ class ParsedQuery:
58 60 self.search_langs = search_langs or []
59 61 self.index_languages = index_languages or []
60 62 self.source_in_index_languages = bool(source_in_index_languages)
  63 + self.contains_chinese = bool(contains_chinese)
  64 + self.contains_english = bool(contains_english)
61 65  
62 66 def to_dict(self) -> Dict[str, Any]:
63 67 """Convert to dictionary representation."""
... ... @@ -73,6 +77,8 @@ class ParsedQuery:
73 77 result["search_langs"] = self.search_langs
74 78 result["index_languages"] = self.index_languages
75 79 result["source_in_index_languages"] = self.source_in_index_languages
  80 + result["contains_chinese"] = self.contains_chinese
  81 + result["contains_english"] = self.contains_english
76 82 return result
77 83  
78 84  
... ... @@ -217,6 +223,16 @@ class QueryParser:
217 223 return bool(re.search(r"[\u4e00-\u9fff]", text or ""))
218 224  
219 225 @staticmethod
  226 + def _is_pure_english_word_token(token: str) -> bool:
  227 + """
  228 + A tokenizer token counts as English iff it is letters only (optional internal hyphens)
  229 + and length >= 3.
  230 + """
  231 + if not token or len(token) < 3:
  232 + return False
  233 + return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token))
  234 +
  235 + @staticmethod
220 236 def _extract_latin_tokens(text: str) -> List[str]:
221 237 """Extract latin word tokens from query text."""
222 238 return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "")
... ... @@ -391,13 +407,18 @@ class QueryParser:
391 407 keywords = self._extract_keywords(query_text)
392 408 query_tokens = self._get_query_tokens(query_text)
393 409 token_count = len(query_tokens)
  410 + contains_chinese = self._contains_cjk(query_text)
  411 + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens)
394 412  
395 413 log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | "
396   - f"Query tokens: {query_tokens}")
  414 + f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | "
  415 + f"contains_english={contains_english}")
397 416 if context:
398 417 context.store_intermediate_result('keywords', keywords)
399 418 context.store_intermediate_result('token_count', token_count)
400 419 context.store_intermediate_result('query_tokens', query_tokens)
  420 + context.store_intermediate_result('contains_chinese', contains_chinese)
  421 + context.store_intermediate_result('contains_english', contains_english)
401 422  
402 423 # Stage 6: Text embedding (only for non-short queries) - async execution
403 424 query_vector = None
... ... @@ -578,6 +599,8 @@ class QueryParser:
578 599 search_langs=ordered_search_langs,
579 600 index_languages=index_langs,
580 601 source_in_index_languages=source_in_index_languages,
  602 + contains_chinese=contains_chinese,
  603 + contains_english=contains_english,
581 604 )
582 605  
583 606 if context and hasattr(context, 'logger'):
... ...
search/es_query_builder.py
... ... @@ -9,9 +9,13 @@ Simplified architecture:
9 9 """
10 10  
11 11 from typing import Dict, Any, List, Optional, Union, Tuple
  12 +
12 13 import numpy as np
13 14 from config import FunctionScoreConfig
14 15  
  16 +# (Elasticsearch field path, boost before formatting as "path^boost")
  17 +MatchFieldSpec = Tuple[str, float]
  18 +
15 19  
16 20 class ESQueryBuilder:
17 21 """Builds Elasticsearch DSL queries."""
... ... @@ -36,6 +40,7 @@ class ESQueryBuilder:
36 40 source_boost_when_missing: float = 0.6,
37 41 original_query_fallback_boost_when_translation_missing: float = 0.2,
38 42 tie_breaker_base_query: float = 0.9,
  43 + mixed_script_merged_field_boost_scale: float = 0.8,
39 44 ):
40 45 """
41 46 Initialize query builder.
... ... @@ -51,6 +56,7 @@ class ESQueryBuilder:
51 56 function_score_config: Function score configuration
52 57 default_language: Default language to use when detection fails or returns "unknown"
53 58 knn_boost: Boost value for KNN (embedding recall)
  59 + mixed_script_merged_field_boost_scale: Multiply per-field ^boost for cross-script merged fields
54 60 """
55 61 self.match_fields = match_fields
56 62 self.field_boosts = field_boosts or {}
... ... @@ -74,6 +80,7 @@ class ESQueryBuilder:
74 80 original_query_fallback_boost_when_translation_missing
75 81 )
76 82 self.tie_breaker_base_query = float(tie_breaker_base_query)
  83 + self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale)
77 84  
78 85 def _apply_source_filter(self, es_query: Dict[str, Any]) -> None:
79 86 """
... ... @@ -414,7 +421,7 @@ class ESQueryBuilder:
414 421 def _format_field_with_boost(self, field_name: str, boost: float) -> str:
415 422 if abs(float(boost) - 1.0) < 1e-9:
416 423 return field_name
417   - return f"{field_name}^{boost}"
  424 + return f"{field_name}^{round(boost, 2)}"
418 425  
419 426 def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float:
420 427 # Language-specific override first (e.g. title.de), then base field (e.g. title)
... ... @@ -426,36 +433,74 @@ class ESQueryBuilder:
426 433 return float(self.field_boosts[base_field])
427 434 return 1.0
428 435  
429   - def _get_match_fields(self, language: str) -> Tuple[List[str], List[str]]:
  436 + def _build_match_field_specs(self, language: str) -> Tuple[List[MatchFieldSpec], List[MatchFieldSpec]]:
430 437 """
431   - Build dynamic match fields for one language.
432   -
433   - Args:
434   - language: Language code (e.g. zh/en/de/fr/...)
435   -
436   - Returns:
437   - (all_fields, core_fields) - core_fields are for phrase/keyword queries
  438 + Per-language match targets as (field_path, boost). Single source of truth before string formatting.
  439 + Returns (all_fields, core_fields); core_fields are for phrase/keyword strategies elsewhere.
438 440 """
439 441 lang = (language or "").strip().lower()
440   - all_fields: List[str] = []
441   - core_fields: List[str] = []
  442 + all_specs: List[MatchFieldSpec] = []
  443 + core_specs: List[MatchFieldSpec] = []
442 444  
443 445 for base in self.multilingual_fields:
444 446 field = f"{base}.{lang}"
445   - boost = self._get_field_boost(base, lang)
446   - all_fields.append(self._format_field_with_boost(field, boost))
  447 + all_specs.append((field, self._get_field_boost(base, lang)))
447 448  
448 449 for shared in self.shared_fields:
449   - boost = self._get_field_boost(shared, None)
450   - all_fields.append(self._format_field_with_boost(shared, boost))
  450 + all_specs.append((shared, self._get_field_boost(shared, None)))
451 451  
452 452 for base in self.core_multilingual_fields:
453 453 field = f"{base}.{lang}"
454   - boost = self._get_field_boost(base, lang)
455   - core_fields.append(self._format_field_with_boost(field, boost))
  454 + core_specs.append((field, self._get_field_boost(base, lang)))
  455 +
  456 + return all_specs, core_specs
  457 +
  458 + def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]:
  459 + """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``."""
  460 + return [self._format_field_with_boost(path, boost) for path, boost in specs]
  461 +
  462 + def _merge_supplemental_lang_field_specs(
  463 + self,
  464 + specs: List[MatchFieldSpec],
  465 + supplemental_lang: str,
  466 + ) -> List[MatchFieldSpec]:
  467 + """Append supplemental-language columns; boosts multiplied by mixed_script scale."""
  468 + scale = float(self.mixed_script_merged_field_boost_scale)
  469 + extra_all, _ = self._build_match_field_specs(supplemental_lang)
  470 + seen = {path for path, _ in specs}
  471 + out = list(specs)
  472 + for path, boost in extra_all:
  473 + if path not in seen:
  474 + out.append((path, boost * scale))
  475 + seen.add(path)
  476 + return out
  477 +
  478 + def _expand_match_field_specs_for_mixed_script(
  479 + self,
  480 + lang: str,
  481 + specs: List[MatchFieldSpec],
  482 + contains_chinese: bool,
  483 + contains_english: bool,
  484 + index_languages: List[str],
  485 + ) -> List[MatchFieldSpec]:
  486 + """
  487 + When the query mixes scripts, widen each clause to indexed fields for the other script
  488 + (e.g. zh clause also searches title.en when the query contains an English word token).
  489 + """
  490 + norm = {str(x or "").strip().lower() for x in (index_languages or []) if str(x or "").strip()}
  491 + allow = norm or {"zh", "en"}
  492 +
  493 + def can_use(lcode: str) -> bool:
  494 + return lcode in allow if norm else True
  495 +
  496 + out = list(specs)
  497 + lnorm = (lang or "").strip().lower()
  498 + if contains_english and lnorm != "en" and can_use("en"):
  499 + out = self._merge_supplemental_lang_field_specs(out, "en")
  500 + if contains_chinese and lnorm != "zh" and can_use("zh"):
  501 + out = self._merge_supplemental_lang_field_specs(out, "zh")
  502 + return out
456 503  
457   - return all_fields, core_fields
458   -
459 504 def _get_embedding_field(self, language: str) -> str:
460 505 """Get embedding field name for a language."""
461 506 # Currently using unified embedding field
... ... @@ -486,6 +531,8 @@ class ESQueryBuilder:
486 531 source_in_index_languages = True
487 532 index_languages: List[str] = []
488 533  
  534 + contains_chinese = False
  535 + contains_english = False
489 536 if parsed_query:
490 537 query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {}
491 538 search_langs = getattr(parsed_query, "search_langs", None) or []
... ... @@ -495,6 +542,8 @@ class ESQueryBuilder:
495 542 getattr(parsed_query, "source_in_index_languages", True)
496 543 )
497 544 index_languages = getattr(parsed_query, "index_languages", None) or []
  545 + contains_chinese = bool(getattr(parsed_query, "contains_chinese", False))
  546 + contains_english = bool(getattr(parsed_query, "contains_english", False))
498 547  
499 548 if not query_text_by_lang:
500 549 query_text_by_lang = {source_lang: query_text}
... ... @@ -508,7 +557,15 @@ class ESQueryBuilder:
508 557 lang_query = query_text_by_lang.get(lang)
509 558 if not lang_query:
510 559 continue
511   - match_fields, _ = self._get_match_fields(lang)
  560 + all_specs, _ = self._build_match_field_specs(lang)
  561 + expanded_specs = self._expand_match_field_specs_for_mixed_script(
  562 + lang,
  563 + all_specs,
  564 + contains_chinese,
  565 + contains_english,
  566 + index_languages,
  567 + )
  568 + match_fields = self._format_match_field_specs(expanded_specs)
512 569 if not match_fields:
513 570 continue
514 571  
... ... @@ -559,7 +616,15 @@ class ESQueryBuilder:
559 616 continue
560 617 if lang in query_text_by_lang:
561 618 continue
562   - match_fields, _ = self._get_match_fields(lang)
  619 + fb_specs, _ = self._build_match_field_specs(lang)
  620 + expanded_fb = self._expand_match_field_specs_for_mixed_script(
  621 + lang,
  622 + fb_specs,
  623 + contains_chinese,
  624 + contains_english,
  625 + index_languages,
  626 + )
  627 + match_fields = self._format_match_field_specs(expanded_fb)
563 628 if not match_fields:
564 629 continue
565 630 should_clauses.append({
... ...
tests/test_es_query_builder.py
... ... @@ -80,3 +80,102 @@ def test_text_query_contains_only_base_translation_and_fallback_named_queries():
80 80 names = [clause["multi_match"]["_name"] for clause in should]
81 81  
82 82 assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"]
  83 +
  84 +
  85 +def test_mixed_script_merges_en_fields_into_zh_clause():
  86 + qb = ESQueryBuilder(
  87 + match_fields=["title.en^3.0"],
  88 + multilingual_fields=["title", "brief"],
  89 + shared_fields=[],
  90 + text_embedding_field="title_embedding",
  91 + default_language="en",
  92 + )
  93 + parsed_query = SimpleNamespace(
  94 + query_text_by_lang={"zh": "法式 dress"},
  95 + search_langs=["zh"],
  96 + detected_language="zh",
  97 + source_in_index_languages=True,
  98 + index_languages=["zh", "en"],
  99 + contains_chinese=True,
  100 + contains_english=True,
  101 + )
  102 + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  103 + fields = q["query"]["multi_match"]["fields"]
  104 + bases = {f.split("^", 1)[0] for f in fields}
  105 + assert "title.zh" in bases and "title.en" in bases
  106 + assert "brief.zh" in bases and "brief.en" in bases
  107 + # Merged supplemental language fields use boost * 0.8 (implicit 1.0 -> ^0.8)
  108 + assert "title.en^0.8" in fields
  109 + assert "brief.en^0.8" in fields
  110 +
  111 +
  112 +def test_mixed_script_merges_zh_fields_into_en_clause():
  113 + qb = ESQueryBuilder(
  114 + match_fields=["title.en^3.0"],
  115 + multilingual_fields=["title"],
  116 + shared_fields=[],
  117 + text_embedding_field="title_embedding",
  118 + default_language="en",
  119 + )
  120 + parsed_query = SimpleNamespace(
  121 + query_text_by_lang={"en": "red 连衣裙"},
  122 + search_langs=["en"],
  123 + detected_language="en",
  124 + source_in_index_languages=True,
  125 + index_languages=["zh", "en"],
  126 + contains_chinese=True,
  127 + contains_english=True,
  128 + )
  129 + q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False)
  130 + fields = q["query"]["multi_match"]["fields"]
  131 + bases = {f.split("^", 1)[0] for f in fields}
  132 + assert "title.en" in bases and "title.zh" in bases
  133 + assert "title.zh^0.8" in fields
  134 +
  135 +
  136 +def test_mixed_script_merged_fields_scale_configured_boosts():
  137 + qb = ESQueryBuilder(
  138 + match_fields=["title.en^3.0"],
  139 + multilingual_fields=["title"],
  140 + shared_fields=[],
  141 + field_boosts={"title.zh": 5.0, "title.en": 10.0},
  142 + text_embedding_field="title_embedding",
  143 + default_language="en",
  144 + )
  145 + parsed_query = SimpleNamespace(
  146 + query_text_by_lang={"zh": "法式 dress"},
  147 + search_langs=["zh"],
  148 + detected_language="zh",
  149 + source_in_index_languages=True,
  150 + index_languages=["zh", "en"],
  151 + contains_chinese=True,
  152 + contains_english=True,
  153 + )
  154 + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  155 + fields = q["query"]["multi_match"]["fields"]
  156 + assert "title.zh^5.0" in fields
  157 + assert "title.en^8.0" in fields # 10.0 * 0.8
  158 +
  159 +
  160 +def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
  161 + qb = ESQueryBuilder(
  162 + match_fields=["title.zh^3.0"],
  163 + multilingual_fields=["title"],
  164 + shared_fields=[],
  165 + text_embedding_field="title_embedding",
  166 + default_language="zh",
  167 + )
  168 + parsed_query = SimpleNamespace(
  169 + query_text_by_lang={"zh": "法式 dress"},
  170 + search_langs=["zh"],
  171 + detected_language="zh",
  172 + source_in_index_languages=True,
  173 + index_languages=["zh"],
  174 + contains_chinese=True,
  175 + contains_english=True,
  176 + )
  177 + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False)
  178 + fields = q["query"]["multi_match"]["fields"]
  179 + bases = {f.split("^", 1)[0] for f in fields}
  180 + assert "title.zh" in bases
  181 + assert "title.en" not in bases
... ...
tests/test_query_parser_mixed_language.py
... ... @@ -9,6 +9,14 @@ class _DummyTranslator:
9 9 return f"{text}-{target_lang}"
10 10  
11 11  
  12 +def test_pure_english_word_token_length_and_script():
  13 + assert QueryParser._is_pure_english_word_token("ab") is False
  14 + assert QueryParser._is_pure_english_word_token("abc") is True
  15 + assert QueryParser._is_pure_english_word_token("wi-fi") is True
  16 + assert QueryParser._is_pure_english_word_token("连衣裙") is False
  17 + assert QueryParser._is_pure_english_word_token("ab12") is False
  18 +
  19 +
12 20 def _build_config() -> SearchConfig:
13 21 return SearchConfig(
14 22 es_index_name="test_products",
... ... @@ -38,6 +46,8 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo
38 46 result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False)
39 47  
40 48 assert result.detected_language == "zh"
  49 + assert result.contains_chinese is True
  50 + assert result.contains_english is True
41 51 assert "en" in result.search_langs
42 52 # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测)
43 53 assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en"
... ... @@ -56,6 +66,8 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
56 66 result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False)
57 67  
58 68 assert result.detected_language == "en"
  69 + assert result.contains_chinese is True
  70 + assert result.contains_english is True
59 71 assert "zh" in result.search_langs
60 72 assert result.query_text_by_lang["zh"] == "red 连衣裙-zh"
61 73 assert result.query_text_by_lang["en"] == "red 连衣裙"
... ... @@ -74,6 +86,8 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch)
74 86 result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False)
75 87  
76 88 assert result.detected_language == "en"
  89 + assert result.contains_chinese is False
  90 + assert result.contains_english is True
77 91 assert result.translations.get("zh") == "off shoulder top-zh"
78 92 assert result.query_text_by_lang.get("zh") == "off shoulder top-zh"
79 93 assert result.source_in_index_languages is True
... ...