From a3d3fb118ea8d4b24032d66f02021407daefa900 Mon Sep 17 00:00:00 2001 From: tangwang Date: Mon, 23 Mar 2026 09:12:40 +0800 Subject: [PATCH] 加phrase提权 --- docs/TODO.txt | 5 +++++ search/es_query_builder.py | 14 ++++---------- tests/test_es_query_builder.py | 35 +++++++++++++++++++++++++++++------ tests/test_es_query_builder_text_recall_languages.py | 114 ++++++++++++++++++++++++------------------------------------------------------------------------------------------ 4 files changed, 62 insertions(+), 106 deletions(-) diff --git a/docs/TODO.txt b/docs/TODO.txt index f55152d..4f08e95 100644 --- a/docs/TODO.txt +++ b/docs/TODO.txt @@ -57,6 +57,11 @@ image_embedding改为,一个spu有多个sku向量,每个向量内部properti +tags字段使用的优化: +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。 +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样) + + 外部需求: 1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 1029b75..f4d7f56 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -478,8 +478,7 @@ class ESQueryBuilder: should_clauses: List[Dict[str, Any]], lang: str, lang_query: str, - clause_name: str, - is_source: bool, + clause_name: str ) -> None: text = (lang_query or "").strip() if not text: @@ -487,11 +486,7 @@ class ESQueryBuilder: phrase_fields = self._build_phrase_match_fields(lang) if not phrase_fields: return - boost = ( - self.phrase_match_boost - if is_source - else self.phrase_match_boost * float(self.translation_boost) - ) + boost = self.phrase_match_boost should_clauses.append({ "multi_match": { "_name": f"{clause_name}_phrase", @@ -642,7 +637,7 @@ class ESQueryBuilder: "multi_match": clause["multi_match"] }) self._append_phrase_should_clause( - should_clauses, lang, lang_query, clause_name, is_source + should_clauses, lang, lang_query, clause_name ) if base_query_text: @@ -674,8 +669,7 @@ class ESQueryBuilder: fb_should, self.default_language, query_text, - "base_query_fallback", - True, + "base_query_fallback" ) if len(fb_should) == 1: return fallback_lexical diff --git a/tests/test_es_query_builder.py b/tests/test_es_query_builder.py index ad9b1d9..e271fb9 100644 --- a/tests/test_es_query_builder.py +++ b/tests/test_es_query_builder.py @@ -1,4 +1,5 @@ from types import SimpleNamespace +from typing import Any, Dict import numpy as np @@ -13,6 +14,21 @@ def _builder() -> ESQueryBuilder: ) +def _lexical_multi_match_fields(query_root: Dict[str, Any]) -> list: + """Fields from the non-phrase multi_match (bool.should or single clause).""" + if "multi_match" in query_root: + mm = query_root["multi_match"] + if mm.get("type") == "phrase": + raise AssertionError("root multi_match is phrase-only") + return mm["fields"] + for clause in query_root.get("bool", {}).get("should", []): + mm = clause.get("multi_match") or {} + if mm.get("type") == "phrase": + continue + return mm["fields"] + raise AssertionError("no lexical multi_match in query_root") + + def test_knn_prefilter_includes_range_filters(): qb = _builder() q = qb.build_query( @@ -82,7 +98,12 @@ def test_text_query_contains_only_base_and_translation_named_queries(): should = q["query"]["bool"]["should"] names = [clause["multi_match"]["_name"] for clause in should] - assert names == ["base_query", "base_query_trans_zh"] + assert names == [ + "base_query", + "base_query_phrase", + "base_query_trans_zh", + "base_query_trans_zh_phrase", + ] def test_text_query_skips_duplicate_translation_same_as_base(): @@ -100,7 +121,9 @@ def test_text_query_skips_duplicate_translation_same_as_base(): index_languages=["en", "zh"], ) - assert q["query"]["multi_match"]["_name"] == "base_query" + root = q["query"] + assert root["bool"]["should"][0]["multi_match"]["_name"] == "base_query" + assert root["bool"]["should"][1]["multi_match"]["_name"] == "base_query_phrase" def test_mixed_script_merges_en_fields_into_zh_clause(): @@ -124,7 +147,7 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): enable_knn=False, index_languages=["zh", "en"], ) - fields = q["query"]["multi_match"]["fields"] + fields = _lexical_multi_match_fields(q["query"]) bases = {f.split("^", 1)[0] for f in fields} assert "title.zh" in bases and "title.en" in bases assert "brief.zh" in bases and "brief.en" in bases @@ -154,7 +177,7 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): enable_knn=False, index_languages=["zh", "en"], ) - fields = q["query"]["multi_match"]["fields"] + fields = _lexical_multi_match_fields(q["query"]) bases = {f.split("^", 1)[0] for f in fields} assert "title.en" in bases and "title.zh" in bases assert "title.zh^0.6" in fields @@ -182,7 +205,7 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): enable_knn=False, index_languages=["zh", "en"], ) - fields = q["query"]["multi_match"]["fields"] + fields = _lexical_multi_match_fields(q["query"]) assert "title.zh^5.0" in fields assert "title.en^6.0" in fields # 10.0 * 0.6 @@ -208,7 +231,7 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): enable_knn=False, index_languages=["zh"], ) - fields = q["query"]["multi_match"]["fields"] + fields = _lexical_multi_match_fields(q["query"]) bases = {f.split("^", 1)[0] for f in fields} assert "title.zh" in bases assert "title.en" not in bases diff --git a/tests/test_es_query_builder_text_recall_languages.py b/tests/test_es_query_builder_text_recall_languages.py index 8799256..f685a3d 100644 --- a/tests/test_es_query_builder_text_recall_languages.py +++ b/tests/test_es_query_builder_text_recall_languages.py @@ -59,6 +59,11 @@ def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: return out +def _with_phrase(lexical_names: set[str]) -> set[str]: + """Each lexical recall clause has a companion ``*_phrase`` multi_match.""" + return lexical_names | {f"{n}_phrase" for n in lexical_names} + + def _title_fields(mm: Dict[str, Any]) -> List[str]: fields = mm.get("fields") or [] return [f for f in fields if str(f).startswith("title.")] @@ -114,7 +119,7 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): index_languages=["zh", "en"], ) idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_en"} + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) assert idx["base_query"]["query"] == "连衣裙" assert "title.zh" in _title_fields(idx["base_query"]) assert idx["base_query_trans_en"]["query"] == "dress" @@ -132,7 +137,7 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): index_languages=["en", "zh"], ) idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_zh"} + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) assert idx["base_query"]["query"] == "dress" assert "title.en" in _title_fields(idx["base_query"]) assert idx["base_query_trans_zh"]["query"] == "连衣裙" @@ -150,7 +155,9 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): index_languages=["de", "en", "fr"], ) idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} + assert set(idx) == _with_phrase( + {"base_query", "base_query_trans_en", "base_query_trans_fr"} + ) assert idx["base_query"]["query"] == "kleid" assert "title.de" in _title_fields(idx["base_query"]) assert idx["base_query_trans_en"]["query"] == "dress" @@ -171,7 +178,9 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): index_languages=["en", "zh"], ) idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} + assert set(idx) == _with_phrase( + {"base_query", "base_query_trans_en", "base_query_trans_zh"} + ) assert idx["base_query"]["query"] == "schuh" assert "title.de" in _title_fields(idx["base_query"]) assert "boost" not in idx["base_query"] @@ -197,7 +206,7 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): contains_english=True, ) idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_en"} + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) assert idx["base_query"]["query"] == "红色 dress" assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") assert idx["base_query_trans_en"]["query"] == "red dress" @@ -217,7 +226,7 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): contains_english=True, ) idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_zh"} + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) assert idx["base_query"]["query"] == "nike 运动鞋" assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋" @@ -236,7 +245,7 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): contains_english=True, ) idx = _clauses_index(q) - assert set(idx) == {"base_query"} + assert set(idx) == _with_phrase({"base_query"}) bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])} assert bases == {"title.zh"} @@ -255,7 +264,7 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): index_languages=["en", "zh"], ) idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_zh"} + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) def test_keeps_translation_when_same_text_but_different_lang_than_base(): @@ -269,7 +278,7 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): index_languages=["en", "zh"], ) idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_zh"} + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) assert idx["base_query_trans_zh"]["query"] == "NIKE" @@ -322,6 +331,8 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): idx = _clauses_index(q) assert "boost" not in idx["base_query"] assert idx["base_query_trans_en"]["boost"] == qb.translation_boost + assert idx["base_query_phrase"]["boost"] == qb.phrase_match_boost + assert idx["base_query_trans_en_phrase"]["boost"] == qb.phrase_match_boost # --- 无翻译:仅 base_query --- @@ -338,7 +349,7 @@ def test_no_translations_only_base_query(): index_languages=["en", "zh"], ) idx = _clauses_index(q) - assert set(idx) == {"base_query"} + assert set(idx) == _with_phrase({"base_query"}) # --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- @@ -362,84 +373,7 @@ def test_text_clauses_present_alongside_knn(): ) assert "knn" in q idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_zh"} - - -def test_detected_language_unknown_falls_back_to_default_language(): - """与 LanguageDetector 失败时 QueryConfig.default_language 行为对齐。""" - qb = _builder_multilingual_title_only(default_language="en") - parsed = SimpleNamespace( - rewritten_query="shirt", - detected_language="unknown", - translations={"zh": "衬衫"}, - contains_chinese=False, - contains_english=True, - ) - q = qb.build_query( - query_text="shirt", - parsed_query=parsed, - enable_knn=False, - index_languages=["en", "zh"], - ) - idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_zh"} - assert idx["base_query"]["query"] == "shirt" - assert _has_title_lang(idx["base_query"], "en") - - -def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): - qb = _builder_multilingual_title_only(default_language="en") - q = _build( - qb, - query_text="платье", - rewritten="платье", - detected_language="ru", - translations={"en": "dress"}, - index_languages=["ru", "en"], - ) - idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_en"} - assert idx["base_query"]["query"] == "платье" - assert _has_title_lang(idx["base_query"], "ru") - assert idx["base_query_trans_en"]["query"] == "dress" - - -def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): - """ - 当前实现:凡是 translations 里非空的条目都会生成子句; - index_languages 只约束混写扩列,不用于过滤翻译子句。 - """ - qb = _builder_multilingual_title_only(default_language="en") - q = _build( - qb, - query_text="dress", - rewritten="dress", - detected_language="en", - translations={"zh": "连衣裙", "de": "Kleid"}, - index_languages=["en", "zh"], - ) - idx = _clauses_index(q) - assert "base_query_trans_de" in idx - assert idx["base_query_trans_de"]["query"] == "Kleid" - assert _has_title_lang(idx["base_query_trans_de"], "de") - - -def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base(): - """base_query 始终用 rewritten_query,而非仅 query_text。""" - qb = _builder_multilingual_title_only(default_language="en") - q = _build( - qb, - query_text=" 红色 ", - rewritten="红色连衣裙", - detected_language="zh", - translations={"en": "red dress"}, - index_languages=["zh", "en"], - contains_chinese=True, - contains_english=False, - ) - idx = _clauses_index(q) - assert idx["base_query"]["query"] == "红色连衣裙" - assert idx["base_query_trans_en"]["query"] == "red dress" + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) def test_detected_language_unknown_falls_back_to_default_language(): @@ -459,7 +393,7 @@ def test_detected_language_unknown_falls_back_to_default_language(): index_languages=["en", "zh"], ) idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_zh"} + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) assert idx["base_query"]["query"] == "shirt" assert _has_title_lang(idx["base_query"], "en") @@ -475,7 +409,7 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): index_languages=["ru", "en"], ) idx = _clauses_index(q) - assert set(idx) == {"base_query", "base_query_trans_en"} + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) assert idx["base_query"]["query"] == "платье" assert _has_title_lang(idx["base_query"], "ru") assert idx["base_query_trans_en"]["query"] == "dress" -- libgit2 0.21.2