"""
ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*.

Covers combinations of query language vs tenant index_languages, translations,
and mixed Chinese/English queries. Asserts multi_match _name, query text, and
target language fields (title.{lang}).
"""

from types import SimpleNamespace
from typing import Any, Dict, List, Set

import numpy as np

from search.es_query_builder import ESQueryBuilder


def _builder_multilingual_title_only(
    *,
    default_language: str = "en",
    mixed_script_scale: float = 0.6,
) -> ESQueryBuilder:
    """Minimal builder: only title.{lang} for easy field assertions.

    Args:
        default_language: Forwarded as the builder's ``default_language``.
        mixed_script_scale: Forwarded as
            ``mixed_script_merged_field_boost_scale``.

    Returns:
        An ``ESQueryBuilder`` whose only multilingual field is ``title`` and
        which has no function_score configuration.
    """
    return ESQueryBuilder(
        match_fields=["title.en^1.0"],
        multilingual_fields=["title"],
        shared_fields=[],
        text_embedding_field="title_embedding",
        default_language=default_language,
        mixed_script_merged_field_boost_scale=mixed_script_scale,
        function_score_config=None,
    )


def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]:
    """Navigate bool.must / function_score wrappers to the text recall root.

    Args:
        es_body: Full ES request body as returned by ``build_query``.

    Returns:
        The query node left after stripping one ``bool.must`` wrapper (first
        clause) and then one ``function_score`` wrapper, when present. An
        empty dict is returned if the body has no ``query``.
    """
    q = es_body.get("query") or {}
    if "bool" in q and "must" in q["bool"] and q["bool"]["must"]:
        must = q["bool"]["must"]
        # Elasticsearch accepts ``must`` as either a single clause object or
        # an array of clauses; only index into it when it is a sequence.
        q = must[0] if isinstance(must, (list, tuple)) else must
    if "function_score" in q:
        q = q["function_score"]["query"]
    return q


def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect the multi_match bodies found at the text recall root.

    Returns a one-element list when the root itself is a ``multi_match``;
    otherwise the ``multi_match`` bodies of the root's ``bool.should`` clauses
    (clauses of any other type are ignored).
    """
    inner = _unwrap_inner_query(es_body)
    if "multi_match" in inner:
        return [inner["multi_match"]]
    should = (inner.get("bool") or {}).get("should") or []
    return [c["multi_match"] for c in should if "multi_match" in c]


def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Map _name -> multi_match dict.

    Clauses without a truthy ``_name`` are skipped; if two clauses share a
    name the later one wins.
    """
    out: Dict[str, Dict[str, Any]] = {}
    for mm in _extract_multi_match_clauses(es_body):
        name = mm.get("_name")
        if name:
            out[str(name)] = mm
    return out


def _with_phrase(lexical_names: Set[str]) -> Set[str]:
    """Each lexical recall clause has a companion ``*_phrase`` multi_match.

    Returns the input names plus ``"<name>_phrase"`` for every input name.
    """
    return lexical_names | {f"{n}_phrase" for n in lexical_names}


def _title_fields(mm: Dict[str, Any]) -> List[str]:
    """Return the clause's ``fields`` entries that start with ``title.``.

    Entries are returned verbatim, including any ``^boost`` suffix.
    """
    fields = mm.get("fields") or []
    return [f for f in fields if str(f).startswith("title.")]


def _has_title_lang(mm: Dict[str, Any], lang: str) -> bool:
    """True if any field is title.{lang} with optional ^boost suffix."""
    prefix = f"title.{lang}"
    for f in mm.get("fields") or []:
        s = str(f)
        if s == prefix or s.startswith(prefix + "^"):
            return True
    return False


def _build(
    qb: ESQueryBuilder,
    *,
    query_text: str,
    rewritten: str,
    detected_language: str,
    translations: Dict[str, str],
    index_languages: List[str],
    contains_chinese: bool = False,
    contains_english: bool = False,
    enable_knn: bool = False,
    query_vector: Any = None,
) -> Dict[str, Any]:
    """Build an ES body from a stand-in parsed query.

    Args:
        qb: Builder under test.
        query_text: Raw query text passed to ``build_query``.
        rewritten: Value for ``parsed_query.rewritten_query``.
        detected_language: Value for ``parsed_query.detected_language``.
        translations: Language -> translated text; copied so the caller's
            dict is never shared with the builder.
        index_languages: Passed through to ``build_query``.
        contains_chinese: Value for ``parsed_query.contains_chinese``.
        contains_english: Value for ``parsed_query.contains_english``.
        enable_knn: Passed through to ``build_query``.
        query_vector: Optional embedding; forwarded as ``query_vector`` only
            when it is not ``None``.

    Returns:
        Whatever ``qb.build_query`` returns (the ES request body).
    """
    parsed = SimpleNamespace(
        rewritten_query=rewritten,
        detected_language=detected_language,
        translations=dict(translations),
        contains_chinese=contains_chinese,
        contains_english=contains_english,
    )
    # Only pass ``query_vector`` when supplied so that text-only calls stay
    # exactly the same as before this parameter existed.
    extra: Dict[str, Any] = {}
    if query_vector is not None:
        extra["query_vector"] = query_vector
    return qb.build_query(
        query_text=query_text,
        parsed_query=parsed,
        enable_knn=enable_knn,
        index_languages=index_languages,
        **extra,
    )


# --- Detected language is in index_languages: primary recall + translation recall ---


def test_zh_query_index_zh_en_includes_base_zh_and_trans_en():
    """zh query on a zh/en index: base clause on title.zh, en translation on title.en."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="连衣裙",
        rewritten="连衣裙",
        detected_language="zh",
        translations={"en": "dress"},
        index_languages=["zh", "en"],
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"})
    assert idx["base_query"]["query"] == "连衣裙"
    assert "title.zh" in _title_fields(idx["base_query"])
    assert idx["base_query_trans_en"]["query"] == "dress"
    assert "title.en" in _title_fields(idx["base_query_trans_en"])


def test_en_query_index_zh_en_includes_base_en_and_trans_zh():
    """en query on an en/zh index: base clause on title.en, zh translation on title.zh."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        index_languages=["en", "zh"],
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})
    assert idx["base_query"]["query"] == "dress"
    assert "title.en" in _title_fields(idx["base_query"])
    assert idx["base_query_trans_zh"]["query"] == "连衣裙"
    assert "title.zh" in _title_fields(idx["base_query_trans_zh"])


def test_de_query_index_de_en_fr_includes_base_and_two_translations():
    """de query on a de/en/fr index: base clause plus one clause per translation."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="kleid",
        rewritten="kleid",
        detected_language="de",
        translations={"en": "dress", "fr": "robe"},
        index_languages=["de", "en", "fr"],
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase(
        {"base_query", "base_query_trans_en", "base_query_trans_fr"}
    )
    assert idx["base_query"]["query"] == "kleid"
    assert "title.de" in _title_fields(idx["base_query"])
    assert idx["base_query_trans_en"]["query"] == "dress"
    assert idx["base_query_trans_fr"]["query"] == "robe"


# --- Detected language NOT in index_languages: base (weak) + translations (strong) ---


def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
    """de query on an en/zh index: base stays on title.de without a boost key,
    while each translation clause carries ``qb.translation_boost``."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="schuh",
        rewritten="schuh",
        detected_language="de",
        translations={"en": "shoe", "zh": "鞋"},
        index_languages=["en", "zh"],
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase(
        {"base_query", "base_query_trans_en", "base_query_trans_zh"}
    )
    assert idx["base_query"]["query"] == "schuh"
    assert "title.de" in _title_fields(idx["base_query"])
    assert "boost" not in idx["base_query"]
    assert idx["base_query_trans_en"]["query"] == "shoe"
    assert idx["base_query_trans_en"]["boost"] == qb.translation_boost
    assert idx["base_query_trans_zh"]["query"] == "鞋"
    assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost


# --- Mixed Chinese/English: original text in base_query; translation clauses are
# --- separate; the base clause's field list is expanded when scripts are mixed.


def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
    """Mixed query detected as zh: base clause targets both title.zh and title.en."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="红色 dress",
        rewritten="红色 dress",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"})
    assert idx["base_query"]["query"] == "红色 dress"
    assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(
        idx["base_query"], "en"
    )
    assert idx["base_query_trans_en"]["query"] == "red dress"
    assert _has_title_lang(idx["base_query_trans_en"], "en")


def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
    """Mixed query detected as en: base clause targets both title.en and title.zh."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="nike 运动鞋",
        rewritten="nike 运动鞋",
        detected_language="en",
        translations={"zh": "耐克运动鞋"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=True,
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})
    assert idx["base_query"]["query"] == "nike 运动鞋"
    assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(
        idx["base_query"], "zh"
    )
    assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋"


def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
    """Mixed query on a zh-only index: base clause's title fields are title.zh only."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="法式 dress",
        rewritten="法式 dress",
        detected_language="zh",
        translations={},
        index_languages=["zh"],
        contains_chinese=True,
        contains_english=True,
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query"})
    # Strip any ^boost suffix before comparing field names.
    bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])}
    assert bases == {"title.zh"}


# --- Dedup: a translation with the same language AND same text as base is skipped ---


def test_skips_translation_when_same_lang_and_same_text_as_base():
    """An en translation identical to the en base query produces no extra clause."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"en": "NIKE", "zh": "耐克"},
        index_languages=["en", "zh"],
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})


def test_keeps_translation_when_same_text_but_different_lang_than_base():
    """Same text under a different language key still yields a translation clause."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="NIKE",
        rewritten="NIKE",
        detected_language="en",
        translations={"zh": "NIKE"},
        index_languages=["en", "zh"],
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})
    assert idx["base_query_trans_zh"]["query"] == "NIKE"


# --- Translation key normalization; empty translations are skipped ---


def test_translation_language_key_is_normalized_case_insensitive():
    """An upper-case ``ZH`` key is emitted as the ``base_query_trans_zh`` clause."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"ZH": "连衣裙"},
        index_languages=["en", "zh"],
    )
    idx = _clauses_index(q)
    assert "base_query_trans_zh" in idx
    assert idx["base_query_trans_zh"]["query"] == "连衣裙"


def test_empty_translation_value_is_skipped():
    """A whitespace-only translation yields no clause; other translations remain."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": " ", "fr": "robe"},
        index_languages=["en", "zh", "fr"],
    )
    idx = _clauses_index(q)
    assert "base_query_trans_zh" not in idx
    assert "base_query_trans_fr" in idx


# --- Empty index_languages: treated as "unconstrained", source_in_index is True ---


def test_empty_index_languages_treats_source_as_in_index_boosts():
    """With no index_languages: base has no boost key, the translation clause
    uses ``qb.translation_boost`` and both phrase clauses use
    ``qb.phrase_match_boost``."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="x",
        rewritten="x",
        detected_language="de",
        translations={"en": "y"},
        index_languages=[],
    )
    idx = _clauses_index(q)
    assert "boost" not in idx["base_query"]
    assert idx["base_query_trans_en"]["boost"] == qb.translation_boost
    assert idx["base_query_phrase"]["boost"] == qb.phrase_match_boost
    assert idx["base_query_trans_en_phrase"]["boost"] == qb.phrase_match_boost


# --- No translations: base_query only ---


def test_no_translations_only_base_query():
    """Without translations only base_query and its phrase companion exist."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="hello",
        rewritten="hello",
        detected_language="en",
        translations={},
        index_languages=["en", "zh"],
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query"})


# --- Text clauses stay parseable alongside KNN (a top-level knn section does not
# --- affect the structure inside ``query``).


def test_text_clauses_present_alongside_knn():
    """With KNN enabled the body has a ``knn`` key and the same text clauses."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙"},
        index_languages=["en", "zh"],
        contains_chinese=False,
        contains_english=True,
        enable_knn=True,
        query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32),
    )
    assert "knn" in q
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})


def test_detected_language_unknown_falls_back_to_default_language():
    """Aligned with QueryConfig.default_language behavior when LanguageDetector fails."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="shirt",
        rewritten="shirt",
        detected_language="unknown",
        translations={"zh": "衬衫"},
        index_languages=["en", "zh"],
        contains_chinese=False,
        contains_english=True,
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})
    assert idx["base_query"]["query"] == "shirt"
    assert _has_title_lang(idx["base_query"], "en")


def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
    """ru query on a ru/en index: base clause on title.ru plus an en translation."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="платье",
        rewritten="платье",
        detected_language="ru",
        translations={"en": "dress"},
        index_languages=["ru", "en"],
    )
    idx = _clauses_index(q)
    assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"})
    assert idx["base_query"]["query"] == "платье"
    assert _has_title_lang(idx["base_query"], "ru")
    assert idx["base_query_trans_en"]["query"] == "dress"


def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():
    """
    Current implementation: every non-empty entry in translations generates a
    clause; index_languages only constrains mixed-script field expansion and is
    not used to filter translation clauses.
    """
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙", "de": "Kleid"},
        index_languages=["en", "zh"],
    )
    idx = _clauses_index(q)
    assert "base_query_trans_de" in idx
    assert idx["base_query_trans_de"]["query"] == "Kleid"
    assert _has_title_lang(idx["base_query_trans_de"], "de")


def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base():
    """base_query always uses rewritten_query rather than just query_text."""
    qb = _builder_multilingual_title_only(default_language="en")
    q = _build(
        qb,
        query_text=" 红色 ",
        rewritten="红色连衣裙",
        detected_language="zh",
        translations={"en": "red dress"},
        index_languages=["zh", "en"],
        contains_chinese=True,
        contains_english=False,
    )
    idx = _clauses_index(q)
    assert idx["base_query"]["query"] == "红色连衣裙"
    assert idx["base_query_trans_en"]["query"] == "red dress"