Commit a3d3fb118ea8d4b24032d66f02021407daefa900
1 parent: 69881ecb
加phrase提权
Showing 4 changed files with 62 additions and 106 deletions
Show diff stats
docs/TODO.txt
| @@ -57,6 +57,11 @@ image_embedding改为,一个spu有多个sku向量,每个向量内部properti | @@ -57,6 +57,11 @@ image_embedding改为,一个spu有多个sku向量,每个向量内部properti | ||
| 57 | 57 | ||
| 58 | 58 | ||
| 59 | 59 | ||
| 60 | +tags字段使用的优化: | ||
| 61 | +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。 | ||
| 62 | +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样) | ||
| 63 | + | ||
| 64 | + | ||
| 60 | 65 | ||
| 61 | 外部需求: | 66 | 外部需求: |
| 62 | 1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 | 67 | 1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 |
search/es_query_builder.py
| @@ -478,8 +478,7 @@ class ESQueryBuilder: | @@ -478,8 +478,7 @@ class ESQueryBuilder: | ||
| 478 | should_clauses: List[Dict[str, Any]], | 478 | should_clauses: List[Dict[str, Any]], |
| 479 | lang: str, | 479 | lang: str, |
| 480 | lang_query: str, | 480 | lang_query: str, |
| 481 | - clause_name: str, | ||
| 482 | - is_source: bool, | 481 | + clause_name: str |
| 483 | ) -> None: | 482 | ) -> None: |
| 484 | text = (lang_query or "").strip() | 483 | text = (lang_query or "").strip() |
| 485 | if not text: | 484 | if not text: |
| @@ -487,11 +486,7 @@ class ESQueryBuilder: | @@ -487,11 +486,7 @@ class ESQueryBuilder: | ||
| 487 | phrase_fields = self._build_phrase_match_fields(lang) | 486 | phrase_fields = self._build_phrase_match_fields(lang) |
| 488 | if not phrase_fields: | 487 | if not phrase_fields: |
| 489 | return | 488 | return |
| 490 | - boost = ( | ||
| 491 | - self.phrase_match_boost | ||
| 492 | - if is_source | ||
| 493 | - else self.phrase_match_boost * float(self.translation_boost) | ||
| 494 | - ) | 489 | + boost = self.phrase_match_boost |
| 495 | should_clauses.append({ | 490 | should_clauses.append({ |
| 496 | "multi_match": { | 491 | "multi_match": { |
| 497 | "_name": f"{clause_name}_phrase", | 492 | "_name": f"{clause_name}_phrase", |
| @@ -642,7 +637,7 @@ class ESQueryBuilder: | @@ -642,7 +637,7 @@ class ESQueryBuilder: | ||
| 642 | "multi_match": clause["multi_match"] | 637 | "multi_match": clause["multi_match"] |
| 643 | }) | 638 | }) |
| 644 | self._append_phrase_should_clause( | 639 | self._append_phrase_should_clause( |
| 645 | - should_clauses, lang, lang_query, clause_name, is_source | 640 | + should_clauses, lang, lang_query, clause_name |
| 646 | ) | 641 | ) |
| 647 | 642 | ||
| 648 | if base_query_text: | 643 | if base_query_text: |
| @@ -674,8 +669,7 @@ class ESQueryBuilder: | @@ -674,8 +669,7 @@ class ESQueryBuilder: | ||
| 674 | fb_should, | 669 | fb_should, |
| 675 | self.default_language, | 670 | self.default_language, |
| 676 | query_text, | 671 | query_text, |
| 677 | - "base_query_fallback", | ||
| 678 | - True, | 672 | + "base_query_fallback" |
| 679 | ) | 673 | ) |
| 680 | if len(fb_should) == 1: | 674 | if len(fb_should) == 1: |
| 681 | return fallback_lexical | 675 | return fallback_lexical |
tests/test_es_query_builder.py
| 1 | from types import SimpleNamespace | 1 | from types import SimpleNamespace |
| 2 | +from typing import Any, Dict | ||
| 2 | 3 | ||
| 3 | import numpy as np | 4 | import numpy as np |
| 4 | 5 | ||
| @@ -13,6 +14,21 @@ def _builder() -> ESQueryBuilder: | @@ -13,6 +14,21 @@ def _builder() -> ESQueryBuilder: | ||
| 13 | ) | 14 | ) |
| 14 | 15 | ||
| 15 | 16 | ||
| 17 | +def _lexical_multi_match_fields(query_root: Dict[str, Any]) -> list: | ||
| 18 | + """Fields from the non-phrase multi_match (bool.should or single clause).""" | ||
| 19 | + if "multi_match" in query_root: | ||
| 20 | + mm = query_root["multi_match"] | ||
| 21 | + if mm.get("type") == "phrase": | ||
| 22 | + raise AssertionError("root multi_match is phrase-only") | ||
| 23 | + return mm["fields"] | ||
| 24 | + for clause in query_root.get("bool", {}).get("should", []): | ||
| 25 | + mm = clause.get("multi_match") or {} | ||
| 26 | + if mm.get("type") == "phrase": | ||
| 27 | + continue | ||
| 28 | + return mm["fields"] | ||
| 29 | + raise AssertionError("no lexical multi_match in query_root") | ||
| 30 | + | ||
| 31 | + | ||
| 16 | def test_knn_prefilter_includes_range_filters(): | 32 | def test_knn_prefilter_includes_range_filters(): |
| 17 | qb = _builder() | 33 | qb = _builder() |
| 18 | q = qb.build_query( | 34 | q = qb.build_query( |
| @@ -82,7 +98,12 @@ def test_text_query_contains_only_base_and_translation_named_queries(): | @@ -82,7 +98,12 @@ def test_text_query_contains_only_base_and_translation_named_queries(): | ||
| 82 | should = q["query"]["bool"]["should"] | 98 | should = q["query"]["bool"]["should"] |
| 83 | names = [clause["multi_match"]["_name"] for clause in should] | 99 | names = [clause["multi_match"]["_name"] for clause in should] |
| 84 | 100 | ||
| 85 | - assert names == ["base_query", "base_query_trans_zh"] | 101 | + assert names == [ |
| 102 | + "base_query", | ||
| 103 | + "base_query_phrase", | ||
| 104 | + "base_query_trans_zh", | ||
| 105 | + "base_query_trans_zh_phrase", | ||
| 106 | + ] | ||
| 86 | 107 | ||
| 87 | 108 | ||
| 88 | def test_text_query_skips_duplicate_translation_same_as_base(): | 109 | def test_text_query_skips_duplicate_translation_same_as_base(): |
| @@ -100,7 +121,9 @@ def test_text_query_skips_duplicate_translation_same_as_base(): | @@ -100,7 +121,9 @@ def test_text_query_skips_duplicate_translation_same_as_base(): | ||
| 100 | index_languages=["en", "zh"], | 121 | index_languages=["en", "zh"], |
| 101 | ) | 122 | ) |
| 102 | 123 | ||
| 103 | - assert q["query"]["multi_match"]["_name"] == "base_query" | 124 | + root = q["query"] |
| 125 | + assert root["bool"]["should"][0]["multi_match"]["_name"] == "base_query" | ||
| 126 | + assert root["bool"]["should"][1]["multi_match"]["_name"] == "base_query_phrase" | ||
| 104 | 127 | ||
| 105 | 128 | ||
| 106 | def test_mixed_script_merges_en_fields_into_zh_clause(): | 129 | def test_mixed_script_merges_en_fields_into_zh_clause(): |
| @@ -124,7 +147,7 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): | @@ -124,7 +147,7 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): | ||
| 124 | enable_knn=False, | 147 | enable_knn=False, |
| 125 | index_languages=["zh", "en"], | 148 | index_languages=["zh", "en"], |
| 126 | ) | 149 | ) |
| 127 | - fields = q["query"]["multi_match"]["fields"] | 150 | + fields = _lexical_multi_match_fields(q["query"]) |
| 128 | bases = {f.split("^", 1)[0] for f in fields} | 151 | bases = {f.split("^", 1)[0] for f in fields} |
| 129 | assert "title.zh" in bases and "title.en" in bases | 152 | assert "title.zh" in bases and "title.en" in bases |
| 130 | assert "brief.zh" in bases and "brief.en" in bases | 153 | assert "brief.zh" in bases and "brief.en" in bases |
| @@ -154,7 +177,7 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): | @@ -154,7 +177,7 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): | ||
| 154 | enable_knn=False, | 177 | enable_knn=False, |
| 155 | index_languages=["zh", "en"], | 178 | index_languages=["zh", "en"], |
| 156 | ) | 179 | ) |
| 157 | - fields = q["query"]["multi_match"]["fields"] | 180 | + fields = _lexical_multi_match_fields(q["query"]) |
| 158 | bases = {f.split("^", 1)[0] for f in fields} | 181 | bases = {f.split("^", 1)[0] for f in fields} |
| 159 | assert "title.en" in bases and "title.zh" in bases | 182 | assert "title.en" in bases and "title.zh" in bases |
| 160 | assert "title.zh^0.6" in fields | 183 | assert "title.zh^0.6" in fields |
| @@ -182,7 +205,7 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): | @@ -182,7 +205,7 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): | ||
| 182 | enable_knn=False, | 205 | enable_knn=False, |
| 183 | index_languages=["zh", "en"], | 206 | index_languages=["zh", "en"], |
| 184 | ) | 207 | ) |
| 185 | - fields = q["query"]["multi_match"]["fields"] | 208 | + fields = _lexical_multi_match_fields(q["query"]) |
| 186 | assert "title.zh^5.0" in fields | 209 | assert "title.zh^5.0" in fields |
| 187 | assert "title.en^6.0" in fields # 10.0 * 0.6 | 210 | assert "title.en^6.0" in fields # 10.0 * 0.6 |
| 188 | 211 | ||
| @@ -208,7 +231,7 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): | @@ -208,7 +231,7 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): | ||
| 208 | enable_knn=False, | 231 | enable_knn=False, |
| 209 | index_languages=["zh"], | 232 | index_languages=["zh"], |
| 210 | ) | 233 | ) |
| 211 | - fields = q["query"]["multi_match"]["fields"] | 234 | + fields = _lexical_multi_match_fields(q["query"]) |
| 212 | bases = {f.split("^", 1)[0] for f in fields} | 235 | bases = {f.split("^", 1)[0] for f in fields} |
| 213 | assert "title.zh" in bases | 236 | assert "title.zh" in bases |
| 214 | assert "title.en" not in bases | 237 | assert "title.en" not in bases |
tests/test_es_query_builder_text_recall_languages.py
| @@ -59,6 +59,11 @@ def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: | @@ -59,6 +59,11 @@ def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: | ||
| 59 | return out | 59 | return out |
| 60 | 60 | ||
| 61 | 61 | ||
| 62 | +def _with_phrase(lexical_names: set[str]) -> set[str]: | ||
| 63 | + """Each lexical recall clause has a companion ``*_phrase`` multi_match.""" | ||
| 64 | + return lexical_names | {f"{n}_phrase" for n in lexical_names} | ||
| 65 | + | ||
| 66 | + | ||
| 62 | def _title_fields(mm: Dict[str, Any]) -> List[str]: | 67 | def _title_fields(mm: Dict[str, Any]) -> List[str]: |
| 63 | fields = mm.get("fields") or [] | 68 | fields = mm.get("fields") or [] |
| 64 | return [f for f in fields if str(f).startswith("title.")] | 69 | return [f for f in fields if str(f).startswith("title.")] |
| @@ -114,7 +119,7 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): | @@ -114,7 +119,7 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): | ||
| 114 | index_languages=["zh", "en"], | 119 | index_languages=["zh", "en"], |
| 115 | ) | 120 | ) |
| 116 | idx = _clauses_index(q) | 121 | idx = _clauses_index(q) |
| 117 | - assert set(idx) == {"base_query", "base_query_trans_en"} | 122 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) |
| 118 | assert idx["base_query"]["query"] == "连衣裙" | 123 | assert idx["base_query"]["query"] == "连衣裙" |
| 119 | assert "title.zh" in _title_fields(idx["base_query"]) | 124 | assert "title.zh" in _title_fields(idx["base_query"]) |
| 120 | assert idx["base_query_trans_en"]["query"] == "dress" | 125 | assert idx["base_query_trans_en"]["query"] == "dress" |
| @@ -132,7 +137,7 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): | @@ -132,7 +137,7 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): | ||
| 132 | index_languages=["en", "zh"], | 137 | index_languages=["en", "zh"], |
| 133 | ) | 138 | ) |
| 134 | idx = _clauses_index(q) | 139 | idx = _clauses_index(q) |
| 135 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | 140 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) |
| 136 | assert idx["base_query"]["query"] == "dress" | 141 | assert idx["base_query"]["query"] == "dress" |
| 137 | assert "title.en" in _title_fields(idx["base_query"]) | 142 | assert "title.en" in _title_fields(idx["base_query"]) |
| 138 | assert idx["base_query_trans_zh"]["query"] == "连衣裙" | 143 | assert idx["base_query_trans_zh"]["query"] == "连衣裙" |
| @@ -150,7 +155,9 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): | @@ -150,7 +155,9 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): | ||
| 150 | index_languages=["de", "en", "fr"], | 155 | index_languages=["de", "en", "fr"], |
| 151 | ) | 156 | ) |
| 152 | idx = _clauses_index(q) | 157 | idx = _clauses_index(q) |
| 153 | - assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} | 158 | + assert set(idx) == _with_phrase( |
| 159 | + {"base_query", "base_query_trans_en", "base_query_trans_fr"} | ||
| 160 | + ) | ||
| 154 | assert idx["base_query"]["query"] == "kleid" | 161 | assert idx["base_query"]["query"] == "kleid" |
| 155 | assert "title.de" in _title_fields(idx["base_query"]) | 162 | assert "title.de" in _title_fields(idx["base_query"]) |
| 156 | assert idx["base_query_trans_en"]["query"] == "dress" | 163 | assert idx["base_query_trans_en"]["query"] == "dress" |
| @@ -171,7 +178,9 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): | @@ -171,7 +178,9 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): | ||
| 171 | index_languages=["en", "zh"], | 178 | index_languages=["en", "zh"], |
| 172 | ) | 179 | ) |
| 173 | idx = _clauses_index(q) | 180 | idx = _clauses_index(q) |
| 174 | - assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} | 181 | + assert set(idx) == _with_phrase( |
| 182 | + {"base_query", "base_query_trans_en", "base_query_trans_zh"} | ||
| 183 | + ) | ||
| 175 | assert idx["base_query"]["query"] == "schuh" | 184 | assert idx["base_query"]["query"] == "schuh" |
| 176 | assert "title.de" in _title_fields(idx["base_query"]) | 185 | assert "title.de" in _title_fields(idx["base_query"]) |
| 177 | assert "boost" not in idx["base_query"] | 186 | assert "boost" not in idx["base_query"] |
| @@ -197,7 +206,7 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): | @@ -197,7 +206,7 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): | ||
| 197 | contains_english=True, | 206 | contains_english=True, |
| 198 | ) | 207 | ) |
| 199 | idx = _clauses_index(q) | 208 | idx = _clauses_index(q) |
| 200 | - assert set(idx) == {"base_query", "base_query_trans_en"} | 209 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) |
| 201 | assert idx["base_query"]["query"] == "红色 dress" | 210 | assert idx["base_query"]["query"] == "红色 dress" |
| 202 | assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") | 211 | assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") |
| 203 | assert idx["base_query_trans_en"]["query"] == "red dress" | 212 | assert idx["base_query_trans_en"]["query"] == "red dress" |
| @@ -217,7 +226,7 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): | @@ -217,7 +226,7 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): | ||
| 217 | contains_english=True, | 226 | contains_english=True, |
| 218 | ) | 227 | ) |
| 219 | idx = _clauses_index(q) | 228 | idx = _clauses_index(q) |
| 220 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | 229 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) |
| 221 | assert idx["base_query"]["query"] == "nike 运动鞋" | 230 | assert idx["base_query"]["query"] == "nike 运动鞋" |
| 222 | assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") | 231 | assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") |
| 223 | assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋" | 232 | assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋" |
| @@ -236,7 +245,7 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): | @@ -236,7 +245,7 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): | ||
| 236 | contains_english=True, | 245 | contains_english=True, |
| 237 | ) | 246 | ) |
| 238 | idx = _clauses_index(q) | 247 | idx = _clauses_index(q) |
| 239 | - assert set(idx) == {"base_query"} | 248 | + assert set(idx) == _with_phrase({"base_query"}) |
| 240 | bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])} | 249 | bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])} |
| 241 | assert bases == {"title.zh"} | 250 | assert bases == {"title.zh"} |
| 242 | 251 | ||
| @@ -255,7 +264,7 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): | @@ -255,7 +264,7 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): | ||
| 255 | index_languages=["en", "zh"], | 264 | index_languages=["en", "zh"], |
| 256 | ) | 265 | ) |
| 257 | idx = _clauses_index(q) | 266 | idx = _clauses_index(q) |
| 258 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | 267 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) |
| 259 | 268 | ||
| 260 | 269 | ||
| 261 | def test_keeps_translation_when_same_text_but_different_lang_than_base(): | 270 | def test_keeps_translation_when_same_text_but_different_lang_than_base(): |
| @@ -269,7 +278,7 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): | @@ -269,7 +278,7 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): | ||
| 269 | index_languages=["en", "zh"], | 278 | index_languages=["en", "zh"], |
| 270 | ) | 279 | ) |
| 271 | idx = _clauses_index(q) | 280 | idx = _clauses_index(q) |
| 272 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | 281 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) |
| 273 | assert idx["base_query_trans_zh"]["query"] == "NIKE" | 282 | assert idx["base_query_trans_zh"]["query"] == "NIKE" |
| 274 | 283 | ||
| 275 | 284 | ||
| @@ -322,6 +331,8 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): | @@ -322,6 +331,8 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): | ||
| 322 | idx = _clauses_index(q) | 331 | idx = _clauses_index(q) |
| 323 | assert "boost" not in idx["base_query"] | 332 | assert "boost" not in idx["base_query"] |
| 324 | assert idx["base_query_trans_en"]["boost"] == qb.translation_boost | 333 | assert idx["base_query_trans_en"]["boost"] == qb.translation_boost |
| 334 | + assert idx["base_query_phrase"]["boost"] == qb.phrase_match_boost | ||
| 335 | + assert idx["base_query_trans_en_phrase"]["boost"] == qb.phrase_match_boost | ||
| 325 | 336 | ||
| 326 | 337 | ||
| 327 | # --- 无翻译:仅 base_query --- | 338 | # --- 无翻译:仅 base_query --- |
| @@ -338,7 +349,7 @@ def test_no_translations_only_base_query(): | @@ -338,7 +349,7 @@ def test_no_translations_only_base_query(): | ||
| 338 | index_languages=["en", "zh"], | 349 | index_languages=["en", "zh"], |
| 339 | ) | 350 | ) |
| 340 | idx = _clauses_index(q) | 351 | idx = _clauses_index(q) |
| 341 | - assert set(idx) == {"base_query"} | 352 | + assert set(idx) == _with_phrase({"base_query"}) |
| 342 | 353 | ||
| 343 | 354 | ||
| 344 | # --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- | 355 | # --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- |
| @@ -362,84 +373,7 @@ def test_text_clauses_present_alongside_knn(): | @@ -362,84 +373,7 @@ def test_text_clauses_present_alongside_knn(): | ||
| 362 | ) | 373 | ) |
| 363 | assert "knn" in q | 374 | assert "knn" in q |
| 364 | idx = _clauses_index(q) | 375 | idx = _clauses_index(q) |
| 365 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | ||
| 366 | - | ||
| 367 | - | ||
| 368 | -def test_detected_language_unknown_falls_back_to_default_language(): | ||
| 369 | - """与 LanguageDetector 失败时 QueryConfig.default_language 行为对齐。""" | ||
| 370 | - qb = _builder_multilingual_title_only(default_language="en") | ||
| 371 | - parsed = SimpleNamespace( | ||
| 372 | - rewritten_query="shirt", | ||
| 373 | - detected_language="unknown", | ||
| 374 | - translations={"zh": "衬衫"}, | ||
| 375 | - contains_chinese=False, | ||
| 376 | - contains_english=True, | ||
| 377 | - ) | ||
| 378 | - q = qb.build_query( | ||
| 379 | - query_text="shirt", | ||
| 380 | - parsed_query=parsed, | ||
| 381 | - enable_knn=False, | ||
| 382 | - index_languages=["en", "zh"], | ||
| 383 | - ) | ||
| 384 | - idx = _clauses_index(q) | ||
| 385 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | ||
| 386 | - assert idx["base_query"]["query"] == "shirt" | ||
| 387 | - assert _has_title_lang(idx["base_query"], "en") | ||
| 388 | - | ||
| 389 | - | ||
| 390 | -def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | ||
| 391 | - qb = _builder_multilingual_title_only(default_language="en") | ||
| 392 | - q = _build( | ||
| 393 | - qb, | ||
| 394 | - query_text="платье", | ||
| 395 | - rewritten="платье", | ||
| 396 | - detected_language="ru", | ||
| 397 | - translations={"en": "dress"}, | ||
| 398 | - index_languages=["ru", "en"], | ||
| 399 | - ) | ||
| 400 | - idx = _clauses_index(q) | ||
| 401 | - assert set(idx) == {"base_query", "base_query_trans_en"} | ||
| 402 | - assert idx["base_query"]["query"] == "платье" | ||
| 403 | - assert _has_title_lang(idx["base_query"], "ru") | ||
| 404 | - assert idx["base_query_trans_en"]["query"] == "dress" | ||
| 405 | - | ||
| 406 | - | ||
| 407 | -def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): | ||
| 408 | - """ | ||
| 409 | - 当前实现:凡是 translations 里非空的条目都会生成子句; | ||
| 410 | - index_languages 只约束混写扩列,不用于过滤翻译子句。 | ||
| 411 | - """ | ||
| 412 | - qb = _builder_multilingual_title_only(default_language="en") | ||
| 413 | - q = _build( | ||
| 414 | - qb, | ||
| 415 | - query_text="dress", | ||
| 416 | - rewritten="dress", | ||
| 417 | - detected_language="en", | ||
| 418 | - translations={"zh": "连衣裙", "de": "Kleid"}, | ||
| 419 | - index_languages=["en", "zh"], | ||
| 420 | - ) | ||
| 421 | - idx = _clauses_index(q) | ||
| 422 | - assert "base_query_trans_de" in idx | ||
| 423 | - assert idx["base_query_trans_de"]["query"] == "Kleid" | ||
| 424 | - assert _has_title_lang(idx["base_query_trans_de"], "de") | ||
| 425 | - | ||
| 426 | - | ||
| 427 | -def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base(): | ||
| 428 | - """base_query 始终用 rewritten_query,而非仅 query_text。""" | ||
| 429 | - qb = _builder_multilingual_title_only(default_language="en") | ||
| 430 | - q = _build( | ||
| 431 | - qb, | ||
| 432 | - query_text=" 红色 ", | ||
| 433 | - rewritten="红色连衣裙", | ||
| 434 | - detected_language="zh", | ||
| 435 | - translations={"en": "red dress"}, | ||
| 436 | - index_languages=["zh", "en"], | ||
| 437 | - contains_chinese=True, | ||
| 438 | - contains_english=False, | ||
| 439 | - ) | ||
| 440 | - idx = _clauses_index(q) | ||
| 441 | - assert idx["base_query"]["query"] == "红色连衣裙" | ||
| 442 | - assert idx["base_query_trans_en"]["query"] == "red dress" | 376 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) |
| 443 | 377 | ||
| 444 | 378 | ||
| 445 | def test_detected_language_unknown_falls_back_to_default_language(): | 379 | def test_detected_language_unknown_falls_back_to_default_language(): |
| @@ -459,7 +393,7 @@ def test_detected_language_unknown_falls_back_to_default_language(): | @@ -459,7 +393,7 @@ def test_detected_language_unknown_falls_back_to_default_language(): | ||
| 459 | index_languages=["en", "zh"], | 393 | index_languages=["en", "zh"], |
| 460 | ) | 394 | ) |
| 461 | idx = _clauses_index(q) | 395 | idx = _clauses_index(q) |
| 462 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | 396 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) |
| 463 | assert idx["base_query"]["query"] == "shirt" | 397 | assert idx["base_query"]["query"] == "shirt" |
| 464 | assert _has_title_lang(idx["base_query"], "en") | 398 | assert _has_title_lang(idx["base_query"], "en") |
| 465 | 399 | ||
| @@ -475,7 +409,7 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | @@ -475,7 +409,7 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | ||
| 475 | index_languages=["ru", "en"], | 409 | index_languages=["ru", "en"], |
| 476 | ) | 410 | ) |
| 477 | idx = _clauses_index(q) | 411 | idx = _clauses_index(q) |
| 478 | - assert set(idx) == {"base_query", "base_query_trans_en"} | 412 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) |
| 479 | assert idx["base_query"]["query"] == "платье" | 413 | assert idx["base_query"]["query"] == "платье" |
| 480 | assert _has_title_lang(idx["base_query"], "ru") | 414 | assert _has_title_lang(idx["base_query"], "ru") |
| 481 | assert idx["base_query_trans_en"]["query"] == "dress" | 415 | assert idx["base_query_trans_en"]["query"] == "dress" |