From 6823fe3e333e350417e58a7c9da6a4a9dff5253e Mon Sep 17 00:00:00 2001 From: tangwang Date: Fri, 20 Mar 2026 14:45:57 +0800 Subject: [PATCH] feat(search): 混合语种查询分析与跨语言字段召回 --- query/query_parser.py | 25 ++++++++++++++++++++++++- search/es_query_builder.py | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------- tests/test_es_query_builder.py | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ tests/test_query_parser_mixed_language.py | 14 ++++++++++++++ 4 files changed, 223 insertions(+), 22 deletions(-) diff --git a/query/query_parser.py b/query/query_parser.py index 0dc829b..e09202c 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -42,6 +42,8 @@ class ParsedQuery: search_langs: Optional[List[str]] = None, index_languages: Optional[List[str]] = None, source_in_index_languages: bool = True, + contains_chinese: bool = False, + contains_english: bool = False, ): self.original_query = original_query self.query_normalized = query_normalized @@ -58,6 +60,8 @@ class ParsedQuery: self.search_langs = search_langs or [] self.index_languages = index_languages or [] self.source_in_index_languages = bool(source_in_index_languages) + self.contains_chinese = bool(contains_chinese) + self.contains_english = bool(contains_english) def to_dict(self) -> Dict[str, Any]: """Convert to dictionary representation.""" @@ -73,6 +77,8 @@ class ParsedQuery: result["search_langs"] = self.search_langs result["index_languages"] = self.index_languages result["source_in_index_languages"] = self.source_in_index_languages + result["contains_chinese"] = self.contains_chinese + result["contains_english"] = self.contains_english return result @@ -217,6 +223,16 @@ class QueryParser: return bool(re.search(r"[\u4e00-\u9fff]", text or "")) @staticmethod + def _is_pure_english_word_token(token: str) -> bool: + """ + A tokenizer token counts as English iff it is letters only 
(optional internal hyphens) + and length >= 3. + """ + if not token or len(token) < 3: + return False + return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token)) + + @staticmethod def _extract_latin_tokens(text: str) -> List[str]: """Extract latin word tokens from query text.""" return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "") @@ -391,13 +407,18 @@ class QueryParser: keywords = self._extract_keywords(query_text) query_tokens = self._get_query_tokens(query_text) token_count = len(query_tokens) + contains_chinese = self._contains_cjk(query_text) + contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " - f"Query tokens: {query_tokens}") + f"Query tokens: {query_tokens} | contains_chinese={contains_chinese} | " + f"contains_english={contains_english}") if context: context.store_intermediate_result('keywords', keywords) context.store_intermediate_result('token_count', token_count) context.store_intermediate_result('query_tokens', query_tokens) + context.store_intermediate_result('contains_chinese', contains_chinese) + context.store_intermediate_result('contains_english', contains_english) # Stage 6: Text embedding (only for non-short queries) - async execution query_vector = None @@ -578,6 +599,8 @@ class QueryParser: search_langs=ordered_search_langs, index_languages=index_langs, source_in_index_languages=source_in_index_languages, + contains_chinese=contains_chinese, + contains_english=contains_english, ) if context and hasattr(context, 'logger'): diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 23481e4..7266aa4 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -9,9 +9,13 @@ Simplified architecture: """ from typing import Dict, Any, List, Optional, Union, Tuple + import numpy as np from config import FunctionScoreConfig +# (Elasticsearch field path, boost before formatting as "path^boost") 
+MatchFieldSpec = Tuple[str, float] + class ESQueryBuilder: """Builds Elasticsearch DSL queries.""" @@ -36,6 +40,7 @@ class ESQueryBuilder: source_boost_when_missing: float = 0.6, original_query_fallback_boost_when_translation_missing: float = 0.2, tie_breaker_base_query: float = 0.9, + mixed_script_merged_field_boost_scale: float = 0.8, ): """ Initialize query builder. @@ -51,6 +56,7 @@ class ESQueryBuilder: function_score_config: Function score configuration default_language: Default language to use when detection fails or returns "unknown" knn_boost: Boost value for KNN (embedding recall) + mixed_script_merged_field_boost_scale: Multiplier applied to each per-field ^boost of cross-script merged fields (default 0.8) """ self.match_fields = match_fields self.field_boosts = field_boosts or {} @@ -74,6 +80,7 @@ class ESQueryBuilder: original_query_fallback_boost_when_translation_missing ) self.tie_breaker_base_query = float(tie_breaker_base_query) + self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) def _apply_source_filter(self, es_query: Dict[str, Any]) -> None: """ @@ -414,7 +421,7 @@ class ESQueryBuilder: def _format_field_with_boost(self, field_name: str, boost: float) -> str: if abs(float(boost) - 1.0) < 1e-9: return field_name - return f"{field_name}^{boost}" + return f"{field_name}^{round(boost, 2)}" def _get_field_boost(self, base_field: str, language: Optional[str] = None) -> float: # Language-specific override first (e.g. title.de), then base field (e.g. title) @@ -426,36 +433,74 @@ class ESQueryBuilder: return float(self.field_boosts[base_field]) return 1.0 - def _get_match_fields(self, language: str) -> Tuple[List[str], List[str]]: + def _build_match_field_specs(self, language: str) -> Tuple[List[MatchFieldSpec], List[MatchFieldSpec]]: """ - Build dynamic match fields for one language. - - Args: - language: Language code (e.g. zh/en/de/fr/...) 
- - Returns: - (all_fields, core_fields) - core_fields are for phrase/keyword queries + Per-language match targets as (field_path, boost). Single source of truth before string formatting. + Returns (all_fields, core_fields); core_fields are for phrase/keyword strategies elsewhere. """ lang = (language or "").strip().lower() - all_fields: List[str] = [] - core_fields: List[str] = [] + all_specs: List[MatchFieldSpec] = [] + core_specs: List[MatchFieldSpec] = [] for base in self.multilingual_fields: field = f"{base}.{lang}" - boost = self._get_field_boost(base, lang) - all_fields.append(self._format_field_with_boost(field, boost)) + all_specs.append((field, self._get_field_boost(base, lang))) for shared in self.shared_fields: - boost = self._get_field_boost(shared, None) - all_fields.append(self._format_field_with_boost(shared, boost)) + all_specs.append((shared, self._get_field_boost(shared, None))) for base in self.core_multilingual_fields: field = f"{base}.{lang}" - boost = self._get_field_boost(base, lang) - core_fields.append(self._format_field_with_boost(field, boost)) + core_specs.append((field, self._get_field_boost(base, lang))) + + return all_specs, core_specs + + def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]: + """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``.""" + return [self._format_field_with_boost(path, boost) for path, boost in specs] + + def _merge_supplemental_lang_field_specs( + self, + specs: List[MatchFieldSpec], + supplemental_lang: str, + ) -> List[MatchFieldSpec]: + """Append supplemental-language columns; boosts multiplied by mixed_script scale.""" + scale = float(self.mixed_script_merged_field_boost_scale) + extra_all, _ = self._build_match_field_specs(supplemental_lang) + seen = {path for path, _ in specs} + out = list(specs) + for path, boost in extra_all: + if path not in seen: + out.append((path, boost * scale)) + seen.add(path) + return out + + def 
_expand_match_field_specs_for_mixed_script( + self, + lang: str, + specs: List[MatchFieldSpec], + contains_chinese: bool, + contains_english: bool, + index_languages: List[str], + ) -> List[MatchFieldSpec]: + """ + When the query mixes scripts, widen each clause to indexed fields for the other script + (e.g. zh clause also searches title.en when the query contains an English word token). + """ + norm = {str(x or "").strip().lower() for x in (index_languages or []) if str(x or "").strip()} + allow = norm or {"zh", "en"} + + def can_use(lcode: str) -> bool: + return lcode in allow if norm else True + + out = list(specs) + lnorm = (lang or "").strip().lower() + if contains_english and lnorm != "en" and can_use("en"): + out = self._merge_supplemental_lang_field_specs(out, "en") + if contains_chinese and lnorm != "zh" and can_use("zh"): + out = self._merge_supplemental_lang_field_specs(out, "zh") + return out - return all_fields, core_fields - def _get_embedding_field(self, language: str) -> str: """Get embedding field name for a language.""" # Currently using unified embedding field @@ -486,6 +531,8 @@ class ESQueryBuilder: source_in_index_languages = True index_languages: List[str] = [] + contains_chinese = False + contains_english = False if parsed_query: query_text_by_lang = getattr(parsed_query, "query_text_by_lang", None) or {} search_langs = getattr(parsed_query, "search_langs", None) or [] @@ -495,6 +542,8 @@ class ESQueryBuilder: getattr(parsed_query, "source_in_index_languages", True) ) index_languages = getattr(parsed_query, "index_languages", None) or [] + contains_chinese = bool(getattr(parsed_query, "contains_chinese", False)) + contains_english = bool(getattr(parsed_query, "contains_english", False)) if not query_text_by_lang: query_text_by_lang = {source_lang: query_text} @@ -508,7 +557,15 @@ class ESQueryBuilder: lang_query = query_text_by_lang.get(lang) if not lang_query: continue - match_fields, _ = self._get_match_fields(lang) + all_specs, _ = 
self._build_match_field_specs(lang) + expanded_specs = self._expand_match_field_specs_for_mixed_script( + lang, + all_specs, + contains_chinese, + contains_english, + index_languages, + ) + match_fields = self._format_match_field_specs(expanded_specs) if not match_fields: continue @@ -559,7 +616,15 @@ class ESQueryBuilder: continue if lang in query_text_by_lang: continue - match_fields, _ = self._get_match_fields(lang) + fb_specs, _ = self._build_match_field_specs(lang) + expanded_fb = self._expand_match_field_specs_for_mixed_script( + lang, + fb_specs, + contains_chinese, + contains_english, + index_languages, + ) + match_fields = self._format_match_field_specs(expanded_fb) if not match_fields: continue should_clauses.append({ diff --git a/tests/test_es_query_builder.py b/tests/test_es_query_builder.py index 5a82b61..71a1f07 100644 --- a/tests/test_es_query_builder.py +++ b/tests/test_es_query_builder.py @@ -80,3 +80,102 @@ def test_text_query_contains_only_base_translation_and_fallback_named_queries(): names = [clause["multi_match"]["_name"] for clause in should] assert names == ["base_query", "base_query_trans_zh", "fallback_original_query_fr"] + + +def test_mixed_script_merges_en_fields_into_zh_clause(): + qb = ESQueryBuilder( + match_fields=["title.en^3.0"], + multilingual_fields=["title", "brief"], + shared_fields=[], + text_embedding_field="title_embedding", + default_language="en", + ) + parsed_query = SimpleNamespace( + query_text_by_lang={"zh": "法式 dress"}, + search_langs=["zh"], + detected_language="zh", + source_in_index_languages=True, + index_languages=["zh", "en"], + contains_chinese=True, + contains_english=True, + ) + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) + fields = q["query"]["multi_match"]["fields"] + bases = {f.split("^", 1)[0] for f in fields} + assert "title.zh" in bases and "title.en" in bases + assert "brief.zh" in bases and "brief.en" in bases + # Merged supplemental language fields use boost 
* 0.8 (implicit 1.0 -> ^0.8) + assert "title.en^0.8" in fields + assert "brief.en^0.8" in fields + + +def test_mixed_script_merges_zh_fields_into_en_clause(): + qb = ESQueryBuilder( + match_fields=["title.en^3.0"], + multilingual_fields=["title"], + shared_fields=[], + text_embedding_field="title_embedding", + default_language="en", + ) + parsed_query = SimpleNamespace( + query_text_by_lang={"en": "red 连衣裙"}, + search_langs=["en"], + detected_language="en", + source_in_index_languages=True, + index_languages=["zh", "en"], + contains_chinese=True, + contains_english=True, + ) + q = qb.build_query(query_text="red 连衣裙", parsed_query=parsed_query, enable_knn=False) + fields = q["query"]["multi_match"]["fields"] + bases = {f.split("^", 1)[0] for f in fields} + assert "title.en" in bases and "title.zh" in bases + assert "title.zh^0.8" in fields + + +def test_mixed_script_merged_fields_scale_configured_boosts(): + qb = ESQueryBuilder( + match_fields=["title.en^3.0"], + multilingual_fields=["title"], + shared_fields=[], + field_boosts={"title.zh": 5.0, "title.en": 10.0}, + text_embedding_field="title_embedding", + default_language="en", + ) + parsed_query = SimpleNamespace( + query_text_by_lang={"zh": "法式 dress"}, + search_langs=["zh"], + detected_language="zh", + source_in_index_languages=True, + index_languages=["zh", "en"], + contains_chinese=True, + contains_english=True, + ) + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) + fields = q["query"]["multi_match"]["fields"] + assert "title.zh^5.0" in fields + assert "title.en^8.0" in fields # 10.0 * 0.8 + + +def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): + qb = ESQueryBuilder( + match_fields=["title.zh^3.0"], + multilingual_fields=["title"], + shared_fields=[], + text_embedding_field="title_embedding", + default_language="zh", + ) + parsed_query = SimpleNamespace( + query_text_by_lang={"zh": "法式 dress"}, + search_langs=["zh"], + detected_language="zh", + 
source_in_index_languages=True, + index_languages=["zh"], + contains_chinese=True, + contains_english=True, + ) + q = qb.build_query(query_text="法式 dress", parsed_query=parsed_query, enable_knn=False) + fields = q["query"]["multi_match"]["fields"] + bases = {f.split("^", 1)[0] for f in fields} + assert "title.zh" in bases + assert "title.en" not in bases diff --git a/tests/test_query_parser_mixed_language.py b/tests/test_query_parser_mixed_language.py index 5cf1fa5..218de59 100644 --- a/tests/test_query_parser_mixed_language.py +++ b/tests/test_query_parser_mixed_language.py @@ -9,6 +9,14 @@ class _DummyTranslator: return f"{text}-{target_lang}" +def test_pure_english_word_token_length_and_script(): + assert QueryParser._is_pure_english_word_token("ab") is False + assert QueryParser._is_pure_english_word_token("abc") is True + assert QueryParser._is_pure_english_word_token("wi-fi") is True + assert QueryParser._is_pure_english_word_token("连衣裙") is False + assert QueryParser._is_pure_english_word_token("ab12") is False + + def _build_config() -> SearchConfig: return SearchConfig( es_index_name="test_products", @@ -38,6 +46,8 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) assert result.detected_language == "zh" + assert result.contains_chinese is True + assert result.contains_english is True assert "en" in result.search_langs # 翻译在预算内完成时会写入目标语言字段(优于仅用原文做 supplemental 探测) assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en" @@ -56,6 +66,8 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) assert result.detected_language == "en" + assert result.contains_chinese is True + assert result.contains_english is True assert "zh" in result.search_langs assert result.query_text_by_lang["zh"] == "red 连衣裙-zh" assert result.query_text_by_lang["en"] == "red 连衣裙" 
@@ -74,6 +86,8 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch) result = parser.parse("off shoulder top", tenant_id="0", generate_vector=False) assert result.detected_language == "en" + assert result.contains_chinese is False + assert result.contains_english is True assert result.translations.get("zh") == "off shoulder top-zh" assert result.query_text_by_lang.get("zh") == "off shoulder top-zh" assert result.source_in_index_languages is True -- libgit2 0.21.2