diff --git a/docs/相关性检索优化说明.md b/docs/相关性检索优化说明.md index 1822573..b398c75 100644 --- a/docs/相关性检索优化说明.md +++ b/docs/相关性检索优化说明.md @@ -17,9 +17,9 @@ 查询链路(文本相关): 1. `QueryParser.parse()` - 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`、`contains_chinese`、`contains_english`。 + 负责产出解析事实:`query_normalized`、`rewritten_query`、`detected_language`、`translations`、`query_vector`、`query_tokens`。 2. `Searcher.search()` - 负责读取租户 `index_languages`,并将其一方面传给 `QueryParser` 作为 `target_languages`,另一方面传给 `ESQueryBuilder` 作为字段展开约束。 + 负责读取租户 `index_languages`,并将其传给 `QueryParser` 作为 `target_languages`(控制翻译目标语种);`ESQueryBuilder` 仅根据 `detected_language` 与各条译文构建子句字段,不再接收 `index_languages`。 2. `ESQueryBuilder._build_advanced_text_query()` - 基于 `rewritten_query + detected_language + translations + index_languages` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。 + 基于 `rewritten_query + detected_language + translations` 构建 `base_query` 与 `base_query_trans_*`;并按语言动态拼接 `title/brief/description/vendor/category_*` 的 `.{lang}` 字段,叠加 shared 字段(`tags`、`option*_values`)。 3. `build_query()` @@ -76,9 +76,6 @@ 最终按 `bool.should` 组合,`minimum_should_match: 1`。 -> **附 — 混写辅助召回** -> 当中英(或多脚本)混写时,为略抬召回:`QueryParser` 用 `contains_chinese`(文中有汉字)、`contains_english`(分词中有长度 ≥3 的纯英文 token)打标;`ESQueryBuilder` 在某一语言的 `multi_match` 上,按规则把**另一语种**的同类字段并入同一 `fields`(受 `index_languages` 限制),并入列的 boost 为配置值再乘 **`mixed_script_merged_field_boost_scale`(默认 0.6,`ESQueryBuilder` 构造参数)**。字段在内部以 `(path, boost)` 列表合并后再格式化为 ES 字符串。 - ## 5. 
关键配置项(文本策略) `query_config` 下与解析等待相关的项: @@ -147,11 +144,9 @@ - `translations` - `query_vector` - `query_tokens` - - `contains_chinese` / `contains_english` - `Searcher` 负责“租户语境”: - `index_languages` - 将其传给 parser 作为 `target_languages` - - 将其传给 builder 作为字段展开约束 - `ESQueryBuilder` 负责“表达式展开”: - 动态字段组装 - 子句权重分配 diff --git a/query/query_parser.py b/query/query_parser.py index 3ee2f85..655d827 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -50,8 +50,6 @@ class ParsedQuery: translations: Dict[str, str] = field(default_factory=dict) query_vector: Optional[np.ndarray] = None query_tokens: List[str] = field(default_factory=list) - contains_chinese: bool = False - contains_english: bool = False def to_dict(self) -> Dict[str, Any]: """Convert to dictionary representation.""" @@ -62,8 +60,6 @@ class ParsedQuery: "detected_language": self.detected_language, "translations": self.translations, "query_tokens": self.query_tokens, - "contains_chinese": self.contains_chinese, - "contains_english": self.contains_english, } @@ -202,21 +198,6 @@ class QueryParser: def _get_query_tokens(self, query: str) -> List[str]: return self._extract_tokens(self._tokenizer(query)) - @staticmethod - def _contains_cjk(text: str) -> bool: - """Whether query contains any CJK ideograph.""" - return bool(re.search(r"[\u4e00-\u9fff]", text or "")) - - @staticmethod - def _is_pure_english_word_token(token: str) -> bool: - """ - A tokenizer token counts as English iff it is letters only (optional internal hyphens) - and length >= 3. 
- """ - if not token or len(token) < 3: - return False - return bool(re.fullmatch(r"[A-Za-z]+(?:-[A-Za-z]+)*", token)) - def parse( self, query: str, @@ -285,19 +266,12 @@ class QueryParser: log_info(f"Language detection | Detected language: {detected_lang}") if context: context.store_intermediate_result('detected_language', detected_lang) - # Stage 4: Query analysis (tokenization + script flags) + # Stage 4: Query analysis (tokenization) query_tokens = self._get_query_tokens(query_text) - contains_chinese = self._contains_cjk(query_text) - contains_english = any(self._is_pure_english_word_token(t) for t in query_tokens) - log_debug( - f"Query analysis | Query tokens: {query_tokens} | " - f"contains_chinese={contains_chinese} | contains_english={contains_english}" - ) + log_debug(f"Query analysis | Query tokens: {query_tokens}") if context: context.store_intermediate_result('query_tokens', query_tokens) - context.store_intermediate_result('contains_chinese', contains_chinese) - context.store_intermediate_result('contains_english', contains_english) # Stage 5: Translation + embedding. Parser only coordinates async enrichment work; the # caller decides translation targets and later search-field planning. 
@@ -459,8 +433,6 @@ class QueryParser: translations=translations, query_vector=query_vector, query_tokens=query_tokens, - contains_chinese=contains_chinese, - contains_english=contains_english, ) if context and hasattr(context, 'logger'): diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 32eb0d3..4b99a09 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -8,14 +8,11 @@ Simplified architecture: - function_score wrapper for boosting fields """ -from typing import Dict, Any, List, Optional, Union, Tuple +from typing import Dict, Any, List, Optional, Tuple import numpy as np from config import FunctionScoreConfig -# (Elasticsearch field path, boost before formatting as "path^boost") -MatchFieldSpec = Tuple[str, float] - class ESQueryBuilder: """Builds Elasticsearch DSL queries.""" @@ -39,7 +36,6 @@ class ESQueryBuilder: tie_breaker_base_query: float = 0.9, best_fields_boosts: Optional[Dict[str, float]] = None, best_fields_clause_boost: float = 2.0, - mixed_script_merged_field_boost_scale: float = 0.6, phrase_field_boosts: Optional[Dict[str, float]] = None, phrase_match_base_fields: Optional[Tuple[str, ...]] = None, phrase_match_slop: int = 0, @@ -60,7 +56,6 @@ class ESQueryBuilder: function_score_config: Function score configuration default_language: Default language to use when detection fails or returns "unknown" knn_boost: Boost value for KNN (embedding recall) - mixed_script_merged_field_boost_scale: Multiply per-field ^boost for cross-script merged fields """ self.match_fields = match_fields self.field_boosts = field_boosts or {} @@ -77,7 +72,6 @@ class ESQueryBuilder: self.translation_minimum_should_match = translation_minimum_should_match self.translation_boost = float(translation_boost) self.tie_breaker_base_query = float(tie_breaker_base_query) - self.mixed_script_merged_field_boost_scale = float(mixed_script_merged_field_boost_scale) default_best_fields = { base: self._get_field_boost(base) for base in 
self.core_multilingual_fields @@ -180,7 +174,6 @@ class ESQueryBuilder: knn_num_candidates: int = 200, min_score: Optional[float] = None, parsed_query: Optional[Any] = None, - index_languages: Optional[List[str]] = None, ) -> Dict[str, Any]: """ Build complete ES query with post_filter support for multi-select faceting. @@ -223,11 +216,7 @@ class ESQueryBuilder: # Text recall (always include if query_text exists) if query_text: # Unified text query strategy - text_query = self._build_advanced_text_query( - query_text, - parsed_query, - index_languages=index_languages, - ) + text_query = self._build_advanced_text_query(query_text, parsed_query) recall_clauses.append(text_query) # Embedding recall (KNN - separate from query, handled below) @@ -434,90 +423,36 @@ class ESQueryBuilder: return float(self.field_boosts[base_field]) return 1.0 - def _build_match_field_specs( + def _match_field_strings( self, language: str, *, multilingual_fields: Optional[List[str]] = None, shared_fields: Optional[List[str]] = None, boost_overrides: Optional[Dict[str, float]] = None, - ) -> List[MatchFieldSpec]: - """ - Per-language match targets as (field_path, boost). Single source of truth before - formatting as Elasticsearch ``fields`` strings. 
- """ + ) -> List[str]: + """Build ``multi_match`` / ``combined_fields`` field entries for one language code.""" lang = (language or "").strip().lower() - specs: List[MatchFieldSpec] = [] - text_fields = multilingual_fields if multilingual_fields is not None else self.multilingual_fields + text_bases = multilingual_fields if multilingual_fields is not None else self.multilingual_fields term_fields = shared_fields if shared_fields is not None else self.shared_fields overrides = boost_overrides or {} - - for base in text_fields: - field = f"{base}.{lang}" + out: List[str] = [] + for base in text_bases: + path = f"{base}.{lang}" boost = float(overrides.get(base, self._get_field_boost(base, lang))) - specs.append((field, boost)) - + out.append(self._format_field_with_boost(path, boost)) for shared in term_fields: boost = float(overrides.get(shared, self._get_field_boost(shared, None))) - specs.append((shared, boost)) - return specs - - def _format_match_field_specs(self, specs: List[MatchFieldSpec]) -> List[str]: - """Format (field_path, boost) pairs for Elasticsearch multi_match ``fields``.""" - return [self._format_field_with_boost(path, boost) for path, boost in specs] - - def _merge_supplemental_lang_field_specs( - self, - specs: List[MatchFieldSpec], - supplemental_lang: str, - ) -> List[MatchFieldSpec]: - """Append supplemental-language columns; boosts multiplied by mixed_script scale.""" - scale = float(self.mixed_script_merged_field_boost_scale) - extra_all = self._build_match_field_specs(supplemental_lang) - seen = {path for path, _ in specs} - out = list(specs) - for path, boost in extra_all: - if path not in seen: - out.append((path, boost * scale)) - seen.add(path) - return out - - def _expand_match_field_specs_for_mixed_script( - self, - lang: str, - specs: List[MatchFieldSpec], - contains_chinese: bool, - contains_english: bool, - index_languages: List[str], - is_source: bool = False - ) -> List[MatchFieldSpec]: - """ - When the query mixes scripts, widen 
each clause to indexed fields for the other script - (e.g. zh clause also searches title.en when the query contains an English word token). - """ - norm = {str(x or "").strip().lower() for x in (index_languages or []) if str(x or "").strip()} - allow = norm or {"zh", "en"} - - def can_use(lcode: str) -> bool: - return lcode in allow if norm else True - - out = list(specs) - lnorm = (lang or "").strip().lower() - if is_source: - if contains_english and lnorm != "en" and can_use("en"): - out = self._merge_supplemental_lang_field_specs(out, "en") - if contains_chinese and lnorm != "zh" and can_use("zh"): - out = self._merge_supplemental_lang_field_specs(out, "zh") + out.append(self._format_field_with_boost(shared, boost)) return out def _build_best_fields_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: - specs = self._build_match_field_specs( + fields = self._match_field_strings( language, multilingual_fields=list(self.best_fields_boosts), shared_fields=[], boost_overrides=self.best_fields_boosts, ) - fields = self._format_match_field_specs(specs) if not fields: return None return { @@ -530,13 +465,12 @@ class ESQueryBuilder: } def _build_phrase_clause(self, language: str, query_text: str) -> Optional[Dict[str, Any]]: - specs = self._build_match_field_specs( + fields = self._match_field_strings( language, multilingual_fields=list(self.phrase_field_boosts), shared_fields=[], boost_overrides=self.phrase_field_boosts, ) - fields = self._format_match_field_specs(specs) if not fields: return None clause: Dict[str, Any] = { @@ -560,20 +494,8 @@ class ESQueryBuilder: clause_name: str, *, is_source: bool, - contains_chinese: bool, - contains_english: bool, - index_languages: List[str], ) -> Optional[Dict[str, Any]]: - all_specs = self._build_match_field_specs(lang) - expanded_specs = self._expand_match_field_specs_for_mixed_script( - lang, - all_specs, - contains_chinese, - contains_english, - index_languages, - is_source, - ) - combined_fields = 
self._format_match_field_specs(expanded_specs) + combined_fields = self._match_field_strings(lang) if not combined_fields: return None minimum_should_match = ( @@ -607,29 +529,10 @@ class ESQueryBuilder: clause["bool"]["boost"] = float(self.translation_boost) return clause - def _get_embedding_field(self, language: str) -> str: - """Get embedding field name for a language.""" - # Currently using unified embedding field - return self.text_embedding_field or "title_embedding" - - @staticmethod - def _normalize_language_list(languages: Optional[List[str]]) -> List[str]: - normalized: List[str] = [] - seen = set() - for language in languages or []: - token = str(language or "").strip().lower() - if not token or token in seen: - continue - seen.add(token) - normalized.append(token) - return normalized - def _build_advanced_text_query( self, query_text: str, parsed_query: Optional[Any] = None, - *, - index_languages: Optional[List[str]] = None, ) -> Dict[str, Any]: """ Build advanced text query using base and translated lexical clauses. 
@@ -649,39 +552,26 @@ class ESQueryBuilder: should_clauses = [] source_lang = self.default_language translations: Dict[str, str] = {} - contains_chinese = False - contains_english = False - normalized_index_languages = self._normalize_language_list(index_languages) if parsed_query: detected_lang = getattr(parsed_query, "detected_language", None) source_lang = detected_lang if detected_lang and detected_lang != "unknown" else self.default_language translations = getattr(parsed_query, "translations", None) or {} - contains_chinese = bool(getattr(parsed_query, "contains_chinese", False)) - contains_english = bool(getattr(parsed_query, "contains_english", False)) source_lang = str(source_lang or self.default_language).strip().lower() or self.default_language base_query_text = ( getattr(parsed_query, "rewritten_query", None) if parsed_query else None ) or query_text - def append_clause(lang: str, lang_query: str, clause_name: str, is_source: bool) -> None: - nonlocal should_clauses - clause = self._build_lexical_language_clause( - lang, - lang_query, - clause_name, - is_source=is_source, - contains_chinese=contains_chinese, - contains_english=contains_english, - index_languages=normalized_index_languages, - ) - if not clause: - return - should_clauses.append(clause) - if base_query_text: - append_clause(source_lang, base_query_text, "base_query", True) + base_clause = self._build_lexical_language_clause( + source_lang, + base_query_text, + "base_query", + is_source=True, + ) + if base_clause: + should_clauses.append(base_clause) for lang, translated_text in translations.items(): normalized_lang = str(lang or "").strip().lower() @@ -690,7 +580,14 @@ class ESQueryBuilder: continue if normalized_lang == source_lang and normalized_text == base_query_text: continue - append_clause(normalized_lang, normalized_text, f"base_query_trans_{normalized_lang}", False) + trans_clause = self._build_lexical_language_clause( + normalized_lang, + normalized_text, + 
f"base_query_trans_{normalized_lang}", + is_source=False, + ) + if trans_clause: + should_clauses.append(trans_clause) # Fallback to a simple query when language fields cannot be resolved. if not should_clauses: diff --git a/search/searcher.py b/search/searcher.py index 21bbadd..5285d2c 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -645,7 +645,6 @@ class Searcher: enable_knn=enable_embedding and parsed_query.query_vector is not None, min_score=min_score, parsed_query=parsed_query, - index_languages=index_langs, ) # Add facets for faceted search diff --git a/tests/test_es_query_builder.py b/tests/test_es_query_builder.py index bd30498..763bda2 100644 --- a/tests/test_es_query_builder.py +++ b/tests/test_es_query_builder.py @@ -9,6 +9,9 @@ from search.es_query_builder import ESQueryBuilder def _builder() -> ESQueryBuilder: return ESQueryBuilder( match_fields=["title.en^3.0", "brief.en^1.0"], + multilingual_fields=["title", "brief"], + core_multilingual_fields=["title", "brief"], + shared_fields=[], text_embedding_field="title_embedding", default_language="en", ) @@ -25,10 +28,6 @@ def _lexical_clause(query_root: Dict[str, Any]) -> Dict[str, Any]: raise AssertionError("no lexical bool clause in query_root") -def _lexical_combined_fields(query_root: Dict[str, Any]) -> list: - return _lexical_clause(query_root)["must"][0]["combined_fields"]["fields"] - - def test_knn_prefilter_includes_range_filters(): qb = _builder() q = qb.build_query( @@ -93,7 +92,6 @@ def test_text_query_contains_only_base_and_translation_named_queries(): query_text="dress", parsed_query=parsed_query, enable_knn=False, - index_languages=["en", "zh", "fr"], ) should = q["query"]["bool"]["should"] names = [clause["bool"]["_name"] for clause in should] @@ -115,120 +113,8 @@ def test_text_query_skips_duplicate_translation_same_as_base(): query_text="dress", parsed_query=parsed_query, enable_knn=False, - index_languages=["en", "zh"], ) root = q["query"] assert root["bool"]["_name"] == 
"base_query" assert [clause["multi_match"]["type"] for clause in root["bool"]["should"]] == ["best_fields", "phrase"] - - -def test_mixed_script_merges_en_fields_into_zh_clause(): - qb = ESQueryBuilder( - match_fields=["title.en^3.0"], - multilingual_fields=["title", "brief"], - shared_fields=[], - text_embedding_field="title_embedding", - default_language="en", - ) - parsed_query = SimpleNamespace( - rewritten_query="法式 dress", - detected_language="zh", - translations={}, - contains_chinese=True, - contains_english=True, - ) - q = qb.build_query( - query_text="法式 dress", - parsed_query=parsed_query, - enable_knn=False, - index_languages=["zh", "en"], - ) - fields = _lexical_combined_fields(q["query"]) - bases = {f.split("^", 1)[0] for f in fields} - assert "title.zh" in bases and "title.en" in bases - assert "brief.zh" in bases and "brief.en" in bases - # Merged supplemental language fields use boost * 0.6 by default. - assert "title.en^0.6" in fields - assert "brief.en^0.6" in fields - - -def test_mixed_script_merges_zh_fields_into_en_clause(): - qb = ESQueryBuilder( - match_fields=["title.en^3.0"], - multilingual_fields=["title"], - shared_fields=[], - text_embedding_field="title_embedding", - default_language="en", - ) - parsed_query = SimpleNamespace( - rewritten_query="red 连衣裙", - detected_language="en", - translations={}, - contains_chinese=True, - contains_english=True, - ) - q = qb.build_query( - query_text="red 连衣裙", - parsed_query=parsed_query, - enable_knn=False, - index_languages=["zh", "en"], - ) - fields = _lexical_combined_fields(q["query"]) - bases = {f.split("^", 1)[0] for f in fields} - assert "title.en" in bases and "title.zh" in bases - assert "title.zh^0.6" in fields - - -def test_mixed_script_merged_fields_scale_configured_boosts(): - qb = ESQueryBuilder( - match_fields=["title.en^3.0"], - multilingual_fields=["title"], - shared_fields=[], - field_boosts={"title.zh": 5.0, "title.en": 10.0}, - text_embedding_field="title_embedding", - 
default_language="en", - ) - parsed_query = SimpleNamespace( - rewritten_query="法式 dress", - detected_language="zh", - translations={}, - contains_chinese=True, - contains_english=True, - ) - q = qb.build_query( - query_text="法式 dress", - parsed_query=parsed_query, - enable_knn=False, - index_languages=["zh", "en"], - ) - fields = _lexical_combined_fields(q["query"]) - assert "title.zh^5.0" in fields - assert "title.en^6.0" in fields # 10.0 * 0.6 - - -def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): - qb = ESQueryBuilder( - match_fields=["title.zh^3.0"], - multilingual_fields=["title"], - shared_fields=[], - text_embedding_field="title_embedding", - default_language="zh", - ) - parsed_query = SimpleNamespace( - rewritten_query="法式 dress", - detected_language="zh", - translations={}, - contains_chinese=True, - contains_english=True, - ) - q = qb.build_query( - query_text="法式 dress", - parsed_query=parsed_query, - enable_knn=False, - index_languages=["zh"], - ) - fields = _lexical_combined_fields(q["query"]) - bases = {f.split("^", 1)[0] for f in fields} - assert "title.zh" in bases - assert "title.en" not in bases diff --git a/tests/test_es_query_builder_text_recall_languages.py b/tests/test_es_query_builder_text_recall_languages.py index a115028..6db0c1e 100644 --- a/tests/test_es_query_builder_text_recall_languages.py +++ b/tests/test_es_query_builder_text_recall_languages.py @@ -1,9 +1,9 @@ """ ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. -Covers combinations of query language vs tenant index_languages, translations, -and mixed Chinese/English queries. Asserts named lexical clause boundaries, -combined_fields payloads, and per-language target fields (title.{lang}). +Covers translation routing, mixed-script queries (per-clause language fields only), +and clause naming. Asserts named lexical clause boundaries, combined_fields payloads, +and per-language target fields (title.{lang}). 
""" from types import SimpleNamespace @@ -14,11 +14,7 @@ import numpy as np from search.es_query_builder import ESQueryBuilder -def _builder_multilingual_title_only( - *, - default_language: str = "en", - mixed_script_scale: float = 0.6, -) -> ESQueryBuilder: +def _builder_multilingual_title_only(*, default_language: str = "en") -> ESQueryBuilder: """Minimal builder: only title.{lang} for easy field assertions.""" return ESQueryBuilder( match_fields=["title.en^1.0"], @@ -26,7 +22,6 @@ def _builder_multilingual_title_only( shared_fields=[], text_embedding_field="title_embedding", default_language=default_language, - mixed_script_merged_field_boost_scale=mixed_script_scale, function_score_config=None, ) @@ -101,22 +96,16 @@ def _build( rewritten: str, detected_language: str, translations: Dict[str, str], - index_languages: List[str], - contains_chinese: bool = False, - contains_english: bool = False, ) -> Dict[str, Any]: parsed = SimpleNamespace( rewritten_query=rewritten, detected_language=detected_language, translations=dict(translations), - contains_chinese=contains_chinese, - contains_english=contains_english, ) return qb.build_query( query_text=query_text, parsed_query=parsed, enable_knn=False, - index_languages=index_languages, ) @@ -131,7 +120,6 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): rewritten="连衣裙", detected_language="zh", translations={"en": "dress"}, - index_languages=["zh", "en"], ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_en"} @@ -149,7 +137,6 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): rewritten="dress", detected_language="en", translations={"zh": "连衣裙"}, - index_languages=["en", "zh"], ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} @@ -167,7 +154,6 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): rewritten="kleid", detected_language="de", translations={"en": "dress", "fr": "robe"}, - index_languages=["de", 
"en", "fr"], ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} @@ -188,7 +174,6 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): rewritten="schuh", detected_language="de", translations={"en": "shoe", "zh": "鞋"}, - index_languages=["en", "zh"], ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} @@ -201,10 +186,10 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost -# --- 中英混写:原文在 base_query;翻译子句独立;混写时 base 子句扩列 --- +# --- 中英混写:base 打在检测语种字段;翻译子句打在译文语种字段 --- -def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): +def test_mixed_zh_detected_base_clause_zh_fields_only_with_en_translation(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, @@ -212,19 +197,16 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): rewritten="红色 dress", detected_language="zh", translations={"en": "red dress"}, - index_languages=["zh", "en"], - contains_chinese=True, - contains_english=True, ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_en"} assert _combined_fields_clause(idx["base_query"])["query"] == "红色 dress" - assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") + assert _has_title_lang(idx["base_query"], "zh") and not _has_title_lang(idx["base_query"], "en") assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress" assert _has_title_lang(idx["base_query_trans_en"], "en") -def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): +def test_mixed_en_detected_base_clause_en_fields_only_with_zh_translation(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, @@ -232,18 +214,15 @@ def 
test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): rewritten="nike 运动鞋", detected_language="en", translations={"zh": "耐克运动鞋"}, - index_languages=["zh", "en"], - contains_chinese=True, - contains_english=True, ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} assert _combined_fields_clause(idx["base_query"])["query"] == "nike 运动鞋" - assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") + assert _has_title_lang(idx["base_query"], "en") and not _has_title_lang(idx["base_query"], "zh") assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "耐克运动鞋" -def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): +def test_zh_query_no_translations_only_zh_fields(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, @@ -251,9 +230,6 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): rewritten="法式 dress", detected_language="zh", translations={}, - index_languages=["zh"], - contains_chinese=True, - contains_english=True, ) idx = _clauses_index(q) assert set(idx) == {"base_query"} @@ -272,7 +248,6 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): rewritten="NIKE", detected_language="en", translations={"en": "NIKE", "zh": "耐克"}, - index_languages=["en", "zh"], ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} @@ -286,7 +261,6 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): rewritten="NIKE", detected_language="en", translations={"zh": "NIKE"}, - index_languages=["en", "zh"], ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} @@ -304,7 +278,6 @@ def test_translation_language_key_is_normalized_case_insensitive(): rewritten="dress", detected_language="en", translations={"ZH": "连衣裙"}, - index_languages=["en", "zh"], ) idx = _clauses_index(q) assert "base_query_trans_zh" in idx @@ -319,17 +292,16 @@ def 
test_empty_translation_value_is_skipped(): rewritten="dress", detected_language="en", translations={"zh": " ", "fr": "robe"}, - index_languages=["en", "zh", "fr"], ) idx = _clauses_index(q) assert "base_query_trans_zh" not in idx assert "base_query_trans_fr" in idx -# --- index_languages 为空:视为「未约束」source_in_index 为 True --- +# --- base 子句无 bool.boost;翻译子句带 translation_boost;phrase should 继承 phrase_match_boost --- -def test_empty_index_languages_treats_source_as_in_index_boosts(): +def test_de_base_and_en_translation_phrase_boosts(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, @@ -337,7 +309,6 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): rewritten="x", detected_language="de", translations={"en": "y"}, - index_languages=[], ) idx = _clauses_index(q) assert "boost" not in idx["base_query"] @@ -359,7 +330,6 @@ def test_no_translations_only_base_query(): rewritten="hello", detected_language="en", translations={}, - index_languages=["en", "zh"], ) idx = _clauses_index(q) assert set(idx) == {"base_query"} @@ -374,15 +344,12 @@ def test_text_clauses_present_alongside_knn(): rewritten_query="dress", detected_language="en", translations={"zh": "连衣裙"}, - contains_chinese=False, - contains_english=True, ) q = qb.build_query( query_text="dress", query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32), parsed_query=parsed, enable_knn=True, - index_languages=["en", "zh"], ) assert "knn" in q idx = _clauses_index(q) @@ -396,14 +363,11 @@ def test_detected_language_unknown_falls_back_to_default_language(): rewritten_query="shirt", detected_language="unknown", translations={"zh": "衬衫"}, - contains_chinese=False, - contains_english=True, ) q = qb.build_query( query_text="shirt", parsed_query=parsed, enable_knn=False, - index_languages=["en", "zh"], ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} @@ -419,7 +383,6 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): 
rewritten="платье", detected_language="ru", translations={"en": "dress"}, - index_languages=["ru", "en"], ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_en"} @@ -428,11 +391,8 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" -def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): - """ - 当前实现:凡是 translations 里非空的条目都会生成子句; - index_languages 只约束混写扩列,不用于过滤翻译子句。 - """ +def test_translation_generates_clause_for_any_target_lang_key(): + """translations 里非空的每个语种键都会生成对应 base_query_trans_* 子句。""" qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, @@ -440,7 +400,6 @@ def test_translation_for_lang_not_listed_in_index_languages_still_generates_clau rewritten="dress", detected_language="en", translations={"zh": "连衣裙", "de": "Kleid"}, - index_languages=["en", "zh"], ) idx = _clauses_index(q) assert "base_query_trans_de" in idx @@ -457,9 +416,6 @@ def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_bas rewritten="红色连衣裙", detected_language="zh", translations={"en": "red dress"}, - index_languages=["zh", "en"], - contains_chinese=True, - contains_english=False, ) idx = _clauses_index(q) assert _combined_fields_clause(idx["base_query"])["query"] == "红色连衣裙" diff --git a/tests/test_query_parser_mixed_language.py b/tests/test_query_parser_mixed_language.py index 70d2502..dcae93a 100644 --- a/tests/test_query_parser_mixed_language.py +++ b/tests/test_query_parser_mixed_language.py @@ -11,14 +11,6 @@ def _tokenizer(text): return str(text).split() -def test_pure_english_word_token_length_and_script(): - assert QueryParser._is_pure_english_word_token("ab") is False - assert QueryParser._is_pure_english_word_token("abc") is True - assert QueryParser._is_pure_english_word_token("wi-fi") is True - assert QueryParser._is_pure_english_word_token("连衣裙") is False - assert 
QueryParser._is_pure_english_word_token("ab12") is False - - def _build_config() -> SearchConfig: return SearchConfig( es_index_name="test_products", @@ -36,7 +28,7 @@ def _build_config() -> SearchConfig: ) -def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): +def test_parse_mixed_zh_query_translates_to_en(monkeypatch): parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") @@ -48,15 +40,13 @@ def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(mo ) assert result.detected_language == "zh" - assert result.contains_chinese is True - assert result.contains_english is True assert result.translations == {"en": "法式 dress 连衣裙-en"} assert result.query_tokens == ["法式", "dress", "连衣裙"] assert not hasattr(result, "query_text_by_lang") assert not hasattr(result, "search_langs") -def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): +def test_parse_mixed_en_query_translates_to_zh(monkeypatch): parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer) monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") @@ -68,8 +58,6 @@ def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): ) assert result.detected_language == "en" - assert result.contains_chinese is True - assert result.contains_english is True assert result.translations == {"zh": "red 连衣裙-zh"} assert result.query_tokens == ["red", "连衣裙"] @@ -87,7 +75,5 @@ def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch) ) assert result.detected_language == "en" - assert result.contains_chinese is False - assert result.contains_english is True assert result.translations.get("zh") == "off shoulder top-zh" assert not hasattr(result, "source_in_index_languages") -- libgit2 0.21.2