""" ES text recall: base_query (rewritten @ detected_language) + base_query_trans_*. Covers translation routing, mixed-script queries (per-clause language fields only), and clause naming. Asserts named lexical clause boundaries, combined_fields payloads, and per-language target fields (title.{lang}). """ from types import SimpleNamespace from typing import Any, Dict, List import numpy as np from query.keyword_extractor import KEYWORDS_QUERY_BASE_KEY from search.es_query_builder import ESQueryBuilder def _builder_multilingual_title_only(*, default_language: str = "en") -> ESQueryBuilder: """Minimal builder: only title.{lang} for easy field assertions.""" return ESQueryBuilder( match_fields=["title.en^1.0"], multilingual_fields=["title"], shared_fields=[], text_embedding_field="title_embedding", default_language=default_language, function_score_config=None, ) def _unwrap_inner_query(es_body: Dict[str, Any]) -> Dict[str, Any]: """Navigate bool.must / function_score wrappers to the text recall root.""" q = es_body.get("query") or {} if "bool" in q and not q["bool"].get("_name") and "must" in q["bool"] and q["bool"]["must"]: q = q["bool"]["must"][0] if "function_score" in q: q = q["function_score"]["query"] return q def _extract_multi_match_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]: inner = _unwrap_inner_query(es_body) if "multi_match" in inner: return [inner["multi_match"]] should = (inner.get("bool") or {}).get("should") or [] return [c["multi_match"] for c in should if "multi_match" in c] def _extract_named_lexical_clauses(es_body: Dict[str, Any]) -> List[Dict[str, Any]]: inner = _unwrap_inner_query(es_body) if "bool" in inner and inner["bool"].get("_name"): return [inner["bool"]] should = (inner.get("bool") or {}).get("should") or [] return [c["bool"] for c in should if "bool" in c and c["bool"].get("_name")] def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: """Map lexical clause _name -> bool query body.""" out: Dict[str, 
Dict[str, Any]] = {} for clause in _extract_named_lexical_clauses(es_body): name = clause.get("_name") if name: out[str(name)] = clause return out def _combined_fields_clause(clause: Dict[str, Any]) -> Dict[str, Any]: return clause["must"][0]["combined_fields"] def _should_multi_matches(clause: Dict[str, Any]) -> List[Dict[str, Any]]: return [item["multi_match"] for item in clause.get("should") or [] if "multi_match" in item] def _should_multi_matches_by_type(clause: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: return {str(mm.get("type") or "best_fields"): mm for mm in _should_multi_matches(clause)} def _title_fields(clause: Dict[str, Any]) -> List[str]: fields = _combined_fields_clause(clause).get("fields") or [] return [f for f in fields if str(f).startswith("title.")] def _has_title_lang(clause: Dict[str, Any], lang: str) -> bool: """True if any field is title.{lang} with optional ^boost suffix.""" prefix = f"title.{lang}" for f in _combined_fields_clause(clause).get("fields") or []: s = str(f) if s == prefix or s.startswith(prefix + "^"): return True return False def _build( qb: ESQueryBuilder, *, query_text: str, rewritten: str, detected_language: str, translations: Dict[str, str], ) -> Dict[str, Any]: parsed = SimpleNamespace( rewritten_query=rewritten, detected_language=detected_language, translations=dict(translations), ) return qb.build_query( query_text=query_text, parsed_query=parsed, enable_knn=False, ) # --- 检测语言在 index_languages 内:主召回 + 翻译补召回 --- def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="连衣裙", rewritten="连衣裙", detected_language="zh", translations={"en": "dress"}, ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_en"} assert _combined_fields_clause(idx["base_query"])["query"] == "连衣裙" assert "title.zh" in _title_fields(idx["base_query"]) assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" 
assert "title.en" in _title_fields(idx["base_query_trans_en"]) def test_keywords_combined_fields_second_must_same_fields_and_50pct(): """When ParsedQuery.keywords_queries is set, must includes a second combined_fields.""" qb = _builder_multilingual_title_only(default_language="en") parsed = SimpleNamespace( rewritten_query="连衣裙", detected_language="zh", translations={"en": "red dress"}, keywords_queries={KEYWORDS_QUERY_BASE_KEY: "连衣 裙", "en": "dress"}, ) q = qb.build_query(query_text="连衣裙", parsed_query=parsed, enable_knn=False) idx = _clauses_index(q) base = idx["base_query"] assert len(base["must"]) == 2 assert base["must"][0]["combined_fields"]["query"] == "连衣裙" assert base["must"][1]["combined_fields"]["query"] == "连衣 裙" assert base["must"][1]["combined_fields"]["minimum_should_match"] == "50%" assert base["must"][1]["combined_fields"]["fields"] == base["must"][0]["combined_fields"]["fields"] trans = idx["base_query_trans_en"] assert len(trans["must"]) == 2 assert trans["must"][1]["combined_fields"]["query"] == "dress" assert trans["must"][1]["combined_fields"]["minimum_should_match"] == "50%" def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="dress", rewritten="dress", detected_language="en", translations={"zh": "连衣裙"}, ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} assert _combined_fields_clause(idx["base_query"])["query"] == "dress" assert "title.en" in _title_fields(idx["base_query"]) assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "连衣裙" assert "title.zh" in _title_fields(idx["base_query_trans_zh"]) def test_de_query_index_de_en_fr_includes_base_and_two_translations(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="kleid", rewritten="kleid", detected_language="de", translations={"en": "dress", "fr": "robe"}, ) idx = _clauses_index(q) assert set(idx) == 
{"base_query", "base_query_trans_en", "base_query_trans_fr"} assert _combined_fields_clause(idx["base_query"])["query"] == "kleid" assert "title.de" in _title_fields(idx["base_query"]) assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "dress" assert _combined_fields_clause(idx["base_query_trans_fr"])["query"] == "robe" # --- 检测语言不在 index_languages:仍有 base(弱)+ 翻译(强) --- def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="schuh", rewritten="schuh", detected_language="de", translations={"en": "shoe", "zh": "鞋"}, ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} assert _combined_fields_clause(idx["base_query"])["query"] == "schuh" assert "title.de" in _title_fields(idx["base_query"]) assert "boost" not in idx["base_query"] assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "shoe" assert idx["base_query_trans_en"]["boost"] == qb.translation_boost assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "鞋" assert idx["base_query_trans_zh"]["boost"] == qb.translation_boost # --- 中英混写:base 打在检测语种字段;翻译子句打在译文语种字段 --- def test_mixed_zh_detected_base_clause_zh_fields_only_with_en_translation(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="红色 dress", rewritten="红色 dress", detected_language="zh", translations={"en": "red dress"}, ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_en"} assert _combined_fields_clause(idx["base_query"])["query"] == "红色 dress" assert _has_title_lang(idx["base_query"], "zh") and not _has_title_lang(idx["base_query"], "en") assert _combined_fields_clause(idx["base_query_trans_en"])["query"] == "red dress" assert _has_title_lang(idx["base_query_trans_en"], "en") def test_mixed_en_detected_base_clause_en_fields_only_with_zh_translation(): qb = 
_builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="nike 运动鞋", rewritten="nike 运动鞋", detected_language="en", translations={"zh": "耐克运动鞋"}, ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} assert _combined_fields_clause(idx["base_query"])["query"] == "nike 运动鞋" assert _has_title_lang(idx["base_query"], "en") and not _has_title_lang(idx["base_query"], "zh") assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "耐克运动鞋" def test_zh_query_no_translations_only_zh_fields(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="法式 dress", rewritten="法式 dress", detected_language="zh", translations={}, ) idx = _clauses_index(q) assert set(idx) == {"base_query"} bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])} assert bases == {"title.zh"} # --- 去重:与 base 同语言同文本的翻译项跳过 --- def test_skips_translation_when_same_lang_and_same_text_as_base(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="NIKE", rewritten="NIKE", detected_language="en", translations={"en": "NIKE", "zh": "耐克"}, ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} def test_keeps_translation_when_same_text_but_different_lang_than_base(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="NIKE", rewritten="NIKE", detected_language="en", translations={"zh": "NIKE"}, ) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} assert _combined_fields_clause(idx["base_query_trans_zh"])["query"] == "NIKE" # --- 翻译 key 规范化、空翻译跳过 --- def test_translation_language_key_is_normalized_case_insensitive(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="dress", rewritten="dress", detected_language="en", translations={"ZH": "连衣裙"}, ) idx = _clauses_index(q) assert "base_query_trans_zh" in idx assert 
_combined_fields_clause(idx["base_query_trans_zh"])["query"] == "连衣裙" def test_empty_translation_value_is_skipped(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="dress", rewritten="dress", detected_language="en", translations={"zh": " ", "fr": "robe"}, ) idx = _clauses_index(q) assert "base_query_trans_zh" not in idx assert "base_query_trans_fr" in idx # --- base 子句无 bool.boost;翻译子句带 translation_boost;phrase should 继承 phrase_match_boost --- def test_de_base_and_en_translation_phrase_boosts(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="x", rewritten="x", detected_language="de", translations={"en": "y"}, ) idx = _clauses_index(q) assert "boost" not in idx["base_query"] assert idx["base_query_trans_en"]["boost"] == qb.translation_boost base_should = _should_multi_matches_by_type(idx["base_query"]) trans_should = _should_multi_matches_by_type(idx["base_query_trans_en"]) assert base_should["phrase"]["boost"] == qb.phrase_match_boost assert trans_should["phrase"]["boost"] == qb.phrase_match_boost # --- 无翻译:仅 base_query --- def test_no_translations_only_base_query(): qb = _builder_multilingual_title_only(default_language="en") q = _build( qb, query_text="hello", rewritten="hello", detected_language="en", translations={}, ) idx = _clauses_index(q) assert set(idx) == {"base_query"} # --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- def test_text_clauses_present_alongside_knn(): qb = _builder_multilingual_title_only(default_language="en") parsed = SimpleNamespace( rewritten_query="dress", detected_language="en", translations={"zh": "连衣裙"}, ) q = qb.build_query( query_text="dress", query_vector=np.array([0.1, 0.2, 0.3], dtype=np.float32), parsed_query=parsed, enable_knn=True, ) qr = q["query"] if "function_score" in qr: qr = qr["function_score"]["query"] assert any("knn" in c for c in qr["bool"]["should"]) idx = _clauses_index(q) assert set(idx) == {"base_query", "base_query_trans_zh"} 
def test_detected_language_unknown_falls_back_to_default_language():
    """Mirror QueryConfig.default_language behavior for when LanguageDetector fails.

    With ``detected_language="unknown"`` the base clause is expected on the
    builder's default language field (``title.en``).
    """
    builder = _builder_multilingual_title_only(default_language="en")
    parsed_query = SimpleNamespace(
        rewritten_query="shirt",
        detected_language="unknown",
        translations={"zh": "衬衫"},
    )
    es_body = builder.build_query(
        query_text="shirt",
        parsed_query=parsed_query,
        enable_knn=False,
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == {"base_query", "base_query_trans_zh"}

    base_clause = clauses["base_query"]
    assert _combined_fields_clause(base_clause)["query"] == "shirt"
    assert _has_title_lang(base_clause, "en")


def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
    """A ru query with an en translation yields a title.ru base clause and an en translation clause."""
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="платье",
        rewritten="платье",
        detected_language="ru",
        translations={"en": "dress"},
    )
    clauses = _clauses_index(es_body)
    assert set(clauses) == {"base_query", "base_query_trans_en"}

    base_clause = clauses["base_query"]
    assert _combined_fields_clause(base_clause)["query"] == "платье"
    assert _has_title_lang(base_clause, "ru")
    assert _combined_fields_clause(clauses["base_query_trans_en"])["query"] == "dress"


def test_translation_generates_clause_for_any_target_lang_key():
    """Every non-empty language key in translations yields a base_query_trans_* clause."""
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text="dress",
        rewritten="dress",
        detected_language="en",
        translations={"zh": "连衣裙", "de": "Kleid"},
    )
    clauses = _clauses_index(es_body)
    assert "base_query_trans_de" in clauses

    de_clause = clauses["base_query_trans_de"]
    assert _combined_fields_clause(de_clause)["query"] == "Kleid"
    assert _has_title_lang(de_clause, "de")


def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base():
    """base_query always uses rewritten_query rather than just query_text."""
    builder = _builder_multilingual_title_only(default_language="en")
    es_body = _build(
        builder,
        query_text=" 红色 ",
        rewritten="红色连衣裙",
        detected_language="zh",
        translations={"en": "red dress"},
    )
    clauses = _clauses_index(es_body)
    assert _combined_fields_clause(clauses["base_query"])["query"] == "红色连衣裙"
    assert _combined_fields_clause(clauses["base_query_trans_en"])["query"] == "red dress"