Commit a3d3fb118ea8d4b24032d66f02021407daefa900
1 parent: 69881ecb
加phrase提权
Showing 4 changed files with 62 additions and 106 deletions
Show diff stats
docs/TODO.txt
| ... | ... | @@ -57,6 +57,11 @@ image_embedding改为,一个spu有多个sku向量,每个向量内部properti |
| 57 | 57 | |
| 58 | 58 | |
| 59 | 59 | |
| 60 | +tags字段使用的优化: | |
| 61 | +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。 | |
| 62 | +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样) | |
| 63 | + | |
| 64 | + | |
| 60 | 65 | |
| 61 | 66 | 外部需求: |
| 62 | 67 | 1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内 | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -478,8 +478,7 @@ class ESQueryBuilder: |
| 478 | 478 | should_clauses: List[Dict[str, Any]], |
| 479 | 479 | lang: str, |
| 480 | 480 | lang_query: str, |
| 481 | - clause_name: str, | |
| 482 | - is_source: bool, | |
| 481 | + clause_name: str | |
| 483 | 482 | ) -> None: |
| 484 | 483 | text = (lang_query or "").strip() |
| 485 | 484 | if not text: |
| ... | ... | @@ -487,11 +486,7 @@ class ESQueryBuilder: |
| 487 | 486 | phrase_fields = self._build_phrase_match_fields(lang) |
| 488 | 487 | if not phrase_fields: |
| 489 | 488 | return |
| 490 | - boost = ( | |
| 491 | - self.phrase_match_boost | |
| 492 | - if is_source | |
| 493 | - else self.phrase_match_boost * float(self.translation_boost) | |
| 494 | - ) | |
| 489 | + boost = self.phrase_match_boost | |
| 495 | 490 | should_clauses.append({ |
| 496 | 491 | "multi_match": { |
| 497 | 492 | "_name": f"{clause_name}_phrase", |
| ... | ... | @@ -642,7 +637,7 @@ class ESQueryBuilder: |
| 642 | 637 | "multi_match": clause["multi_match"] |
| 643 | 638 | }) |
| 644 | 639 | self._append_phrase_should_clause( |
| 645 | - should_clauses, lang, lang_query, clause_name, is_source | |
| 640 | + should_clauses, lang, lang_query, clause_name | |
| 646 | 641 | ) |
| 647 | 642 | |
| 648 | 643 | if base_query_text: |
| ... | ... | @@ -674,8 +669,7 @@ class ESQueryBuilder: |
| 674 | 669 | fb_should, |
| 675 | 670 | self.default_language, |
| 676 | 671 | query_text, |
| 677 | - "base_query_fallback", | |
| 678 | - True, | |
| 672 | + "base_query_fallback" | |
| 679 | 673 | ) |
| 680 | 674 | if len(fb_should) == 1: |
| 681 | 675 | return fallback_lexical | ... | ... |
tests/test_es_query_builder.py
| 1 | 1 | from types import SimpleNamespace |
| 2 | +from typing import Any, Dict | |
| 2 | 3 | |
| 3 | 4 | import numpy as np |
| 4 | 5 | |
| ... | ... | @@ -13,6 +14,21 @@ def _builder() -> ESQueryBuilder: |
| 13 | 14 | ) |
| 14 | 15 | |
| 15 | 16 | |
| 17 | +def _lexical_multi_match_fields(query_root: Dict[str, Any]) -> list: | |
| 18 | + """Fields from the non-phrase multi_match (bool.should or single clause).""" | |
| 19 | + if "multi_match" in query_root: | |
| 20 | + mm = query_root["multi_match"] | |
| 21 | + if mm.get("type") == "phrase": | |
| 22 | + raise AssertionError("root multi_match is phrase-only") | |
| 23 | + return mm["fields"] | |
| 24 | + for clause in query_root.get("bool", {}).get("should", []): | |
| 25 | + mm = clause.get("multi_match") or {} | |
| 26 | + if mm.get("type") == "phrase": | |
| 27 | + continue | |
| 28 | + return mm["fields"] | |
| 29 | + raise AssertionError("no lexical multi_match in query_root") | |
| 30 | + | |
| 31 | + | |
| 16 | 32 | def test_knn_prefilter_includes_range_filters(): |
| 17 | 33 | qb = _builder() |
| 18 | 34 | q = qb.build_query( |
| ... | ... | @@ -82,7 +98,12 @@ def test_text_query_contains_only_base_and_translation_named_queries(): |
| 82 | 98 | should = q["query"]["bool"]["should"] |
| 83 | 99 | names = [clause["multi_match"]["_name"] for clause in should] |
| 84 | 100 | |
| 85 | - assert names == ["base_query", "base_query_trans_zh"] | |
| 101 | + assert names == [ | |
| 102 | + "base_query", | |
| 103 | + "base_query_phrase", | |
| 104 | + "base_query_trans_zh", | |
| 105 | + "base_query_trans_zh_phrase", | |
| 106 | + ] | |
| 86 | 107 | |
| 87 | 108 | |
| 88 | 109 | def test_text_query_skips_duplicate_translation_same_as_base(): |
| ... | ... | @@ -100,7 +121,9 @@ def test_text_query_skips_duplicate_translation_same_as_base(): |
| 100 | 121 | index_languages=["en", "zh"], |
| 101 | 122 | ) |
| 102 | 123 | |
| 103 | - assert q["query"]["multi_match"]["_name"] == "base_query" | |
| 124 | + root = q["query"] | |
| 125 | + assert root["bool"]["should"][0]["multi_match"]["_name"] == "base_query" | |
| 126 | + assert root["bool"]["should"][1]["multi_match"]["_name"] == "base_query_phrase" | |
| 104 | 127 | |
| 105 | 128 | |
| 106 | 129 | def test_mixed_script_merges_en_fields_into_zh_clause(): |
| ... | ... | @@ -124,7 +147,7 @@ def test_mixed_script_merges_en_fields_into_zh_clause(): |
| 124 | 147 | enable_knn=False, |
| 125 | 148 | index_languages=["zh", "en"], |
| 126 | 149 | ) |
| 127 | - fields = q["query"]["multi_match"]["fields"] | |
| 150 | + fields = _lexical_multi_match_fields(q["query"]) | |
| 128 | 151 | bases = {f.split("^", 1)[0] for f in fields} |
| 129 | 152 | assert "title.zh" in bases and "title.en" in bases |
| 130 | 153 | assert "brief.zh" in bases and "brief.en" in bases |
| ... | ... | @@ -154,7 +177,7 @@ def test_mixed_script_merges_zh_fields_into_en_clause(): |
| 154 | 177 | enable_knn=False, |
| 155 | 178 | index_languages=["zh", "en"], |
| 156 | 179 | ) |
| 157 | - fields = q["query"]["multi_match"]["fields"] | |
| 180 | + fields = _lexical_multi_match_fields(q["query"]) | |
| 158 | 181 | bases = {f.split("^", 1)[0] for f in fields} |
| 159 | 182 | assert "title.en" in bases and "title.zh" in bases |
| 160 | 183 | assert "title.zh^0.6" in fields |
| ... | ... | @@ -182,7 +205,7 @@ def test_mixed_script_merged_fields_scale_configured_boosts(): |
| 182 | 205 | enable_knn=False, |
| 183 | 206 | index_languages=["zh", "en"], |
| 184 | 207 | ) |
| 185 | - fields = q["query"]["multi_match"]["fields"] | |
| 208 | + fields = _lexical_multi_match_fields(q["query"]) | |
| 186 | 209 | assert "title.zh^5.0" in fields |
| 187 | 210 | assert "title.en^6.0" in fields # 10.0 * 0.6 |
| 188 | 211 | |
| ... | ... | @@ -208,7 +231,7 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages(): |
| 208 | 231 | enable_knn=False, |
| 209 | 232 | index_languages=["zh"], |
| 210 | 233 | ) |
| 211 | - fields = q["query"]["multi_match"]["fields"] | |
| 234 | + fields = _lexical_multi_match_fields(q["query"]) | |
| 212 | 235 | bases = {f.split("^", 1)[0] for f in fields} |
| 213 | 236 | assert "title.zh" in bases |
| 214 | 237 | assert "title.en" not in bases | ... | ... |
tests/test_es_query_builder_text_recall_languages.py
| ... | ... | @@ -59,6 +59,11 @@ def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: |
| 59 | 59 | return out |
| 60 | 60 | |
| 61 | 61 | |
| 62 | +def _with_phrase(lexical_names: set[str]) -> set[str]: | |
| 63 | + """Each lexical recall clause has a companion ``*_phrase`` multi_match.""" | |
| 64 | + return lexical_names | {f"{n}_phrase" for n in lexical_names} | |
| 65 | + | |
| 66 | + | |
| 62 | 67 | def _title_fields(mm: Dict[str, Any]) -> List[str]: |
| 63 | 68 | fields = mm.get("fields") or [] |
| 64 | 69 | return [f for f in fields if str(f).startswith("title.")] |
| ... | ... | @@ -114,7 +119,7 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en(): |
| 114 | 119 | index_languages=["zh", "en"], |
| 115 | 120 | ) |
| 116 | 121 | idx = _clauses_index(q) |
| 117 | - assert set(idx) == {"base_query", "base_query_trans_en"} | |
| 122 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | |
| 118 | 123 | assert idx["base_query"]["query"] == "连衣裙" |
| 119 | 124 | assert "title.zh" in _title_fields(idx["base_query"]) |
| 120 | 125 | assert idx["base_query_trans_en"]["query"] == "dress" |
| ... | ... | @@ -132,7 +137,7 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh(): |
| 132 | 137 | index_languages=["en", "zh"], |
| 133 | 138 | ) |
| 134 | 139 | idx = _clauses_index(q) |
| 135 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 140 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 136 | 141 | assert idx["base_query"]["query"] == "dress" |
| 137 | 142 | assert "title.en" in _title_fields(idx["base_query"]) |
| 138 | 143 | assert idx["base_query_trans_zh"]["query"] == "连衣裙" |
| ... | ... | @@ -150,7 +155,9 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations(): |
| 150 | 155 | index_languages=["de", "en", "fr"], |
| 151 | 156 | ) |
| 152 | 157 | idx = _clauses_index(q) |
| 153 | - assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"} | |
| 158 | + assert set(idx) == _with_phrase( | |
| 159 | + {"base_query", "base_query_trans_en", "base_query_trans_fr"} | |
| 160 | + ) | |
| 154 | 161 | assert idx["base_query"]["query"] == "kleid" |
| 155 | 162 | assert "title.de" in _title_fields(idx["base_query"]) |
| 156 | 163 | assert idx["base_query_trans_en"]["query"] == "dress" |
| ... | ... | @@ -171,7 +178,9 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields(): |
| 171 | 178 | index_languages=["en", "zh"], |
| 172 | 179 | ) |
| 173 | 180 | idx = _clauses_index(q) |
| 174 | - assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"} | |
| 181 | + assert set(idx) == _with_phrase( | |
| 182 | + {"base_query", "base_query_trans_en", "base_query_trans_zh"} | |
| 183 | + ) | |
| 175 | 184 | assert idx["base_query"]["query"] == "schuh" |
| 176 | 185 | assert "title.de" in _title_fields(idx["base_query"]) |
| 177 | 186 | assert "boost" not in idx["base_query"] |
| ... | ... | @@ -197,7 +206,7 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause(): |
| 197 | 206 | contains_english=True, |
| 198 | 207 | ) |
| 199 | 208 | idx = _clauses_index(q) |
| 200 | - assert set(idx) == {"base_query", "base_query_trans_en"} | |
| 209 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | |
| 201 | 210 | assert idx["base_query"]["query"] == "红色 dress" |
| 202 | 211 | assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en") |
| 203 | 212 | assert idx["base_query_trans_en"]["query"] == "red dress" |
| ... | ... | @@ -217,7 +226,7 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause(): |
| 217 | 226 | contains_english=True, |
| 218 | 227 | ) |
| 219 | 228 | idx = _clauses_index(q) |
| 220 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 229 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 221 | 230 | assert idx["base_query"]["query"] == "nike 运动鞋" |
| 222 | 231 | assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh") |
| 223 | 232 | assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋" |
| ... | ... | @@ -236,7 +245,7 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base(): |
| 236 | 245 | contains_english=True, |
| 237 | 246 | ) |
| 238 | 247 | idx = _clauses_index(q) |
| 239 | - assert set(idx) == {"base_query"} | |
| 248 | + assert set(idx) == _with_phrase({"base_query"}) | |
| 240 | 249 | bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])} |
| 241 | 250 | assert bases == {"title.zh"} |
| 242 | 251 | |
| ... | ... | @@ -255,7 +264,7 @@ def test_skips_translation_when_same_lang_and_same_text_as_base(): |
| 255 | 264 | index_languages=["en", "zh"], |
| 256 | 265 | ) |
| 257 | 266 | idx = _clauses_index(q) |
| 258 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 267 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 259 | 268 | |
| 260 | 269 | |
| 261 | 270 | def test_keeps_translation_when_same_text_but_different_lang_than_base(): |
| ... | ... | @@ -269,7 +278,7 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base(): |
| 269 | 278 | index_languages=["en", "zh"], |
| 270 | 279 | ) |
| 271 | 280 | idx = _clauses_index(q) |
| 272 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 281 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 273 | 282 | assert idx["base_query_trans_zh"]["query"] == "NIKE" |
| 274 | 283 | |
| 275 | 284 | |
| ... | ... | @@ -322,6 +331,8 @@ def test_empty_index_languages_treats_source_as_in_index_boosts(): |
| 322 | 331 | idx = _clauses_index(q) |
| 323 | 332 | assert "boost" not in idx["base_query"] |
| 324 | 333 | assert idx["base_query_trans_en"]["boost"] == qb.translation_boost |
| 334 | + assert idx["base_query_phrase"]["boost"] == qb.phrase_match_boost | |
| 335 | + assert idx["base_query_trans_en_phrase"]["boost"] == qb.phrase_match_boost | |
| 325 | 336 | |
| 326 | 337 | |
| 327 | 338 | # --- 无翻译:仅 base_query --- |
| ... | ... | @@ -338,7 +349,7 @@ def test_no_translations_only_base_query(): |
| 338 | 349 | index_languages=["en", "zh"], |
| 339 | 350 | ) |
| 340 | 351 | idx = _clauses_index(q) |
| 341 | - assert set(idx) == {"base_query"} | |
| 352 | + assert set(idx) == _with_phrase({"base_query"}) | |
| 342 | 353 | |
| 343 | 354 | |
| 344 | 355 | # --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) --- |
| ... | ... | @@ -362,84 +373,7 @@ def test_text_clauses_present_alongside_knn(): |
| 362 | 373 | ) |
| 363 | 374 | assert "knn" in q |
| 364 | 375 | idx = _clauses_index(q) |
| 365 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 366 | - | |
| 367 | - | |
| 368 | -def test_detected_language_unknown_falls_back_to_default_language(): | |
| 369 | - """与 LanguageDetector 失败时 QueryConfig.default_language 行为对齐。""" | |
| 370 | - qb = _builder_multilingual_title_only(default_language="en") | |
| 371 | - parsed = SimpleNamespace( | |
| 372 | - rewritten_query="shirt", | |
| 373 | - detected_language="unknown", | |
| 374 | - translations={"zh": "衬衫"}, | |
| 375 | - contains_chinese=False, | |
| 376 | - contains_english=True, | |
| 377 | - ) | |
| 378 | - q = qb.build_query( | |
| 379 | - query_text="shirt", | |
| 380 | - parsed_query=parsed, | |
| 381 | - enable_knn=False, | |
| 382 | - index_languages=["en", "zh"], | |
| 383 | - ) | |
| 384 | - idx = _clauses_index(q) | |
| 385 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 386 | - assert idx["base_query"]["query"] == "shirt" | |
| 387 | - assert _has_title_lang(idx["base_query"], "en") | |
| 388 | - | |
| 389 | - | |
| 390 | -def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): | |
| 391 | - qb = _builder_multilingual_title_only(default_language="en") | |
| 392 | - q = _build( | |
| 393 | - qb, | |
| 394 | - query_text="платье", | |
| 395 | - rewritten="платье", | |
| 396 | - detected_language="ru", | |
| 397 | - translations={"en": "dress"}, | |
| 398 | - index_languages=["ru", "en"], | |
| 399 | - ) | |
| 400 | - idx = _clauses_index(q) | |
| 401 | - assert set(idx) == {"base_query", "base_query_trans_en"} | |
| 402 | - assert idx["base_query"]["query"] == "платье" | |
| 403 | - assert _has_title_lang(idx["base_query"], "ru") | |
| 404 | - assert idx["base_query_trans_en"]["query"] == "dress" | |
| 405 | - | |
| 406 | - | |
| 407 | -def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause(): | |
| 408 | - """ | |
| 409 | - 当前实现:凡是 translations 里非空的条目都会生成子句; | |
| 410 | - index_languages 只约束混写扩列,不用于过滤翻译子句。 | |
| 411 | - """ | |
| 412 | - qb = _builder_multilingual_title_only(default_language="en") | |
| 413 | - q = _build( | |
| 414 | - qb, | |
| 415 | - query_text="dress", | |
| 416 | - rewritten="dress", | |
| 417 | - detected_language="en", | |
| 418 | - translations={"zh": "连衣裙", "de": "Kleid"}, | |
| 419 | - index_languages=["en", "zh"], | |
| 420 | - ) | |
| 421 | - idx = _clauses_index(q) | |
| 422 | - assert "base_query_trans_de" in idx | |
| 423 | - assert idx["base_query_trans_de"]["query"] == "Kleid" | |
| 424 | - assert _has_title_lang(idx["base_query_trans_de"], "de") | |
| 425 | - | |
| 426 | - | |
| 427 | -def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base(): | |
| 428 | - """base_query 始终用 rewritten_query,而非仅 query_text。""" | |
| 429 | - qb = _builder_multilingual_title_only(default_language="en") | |
| 430 | - q = _build( | |
| 431 | - qb, | |
| 432 | - query_text=" 红色 ", | |
| 433 | - rewritten="红色连衣裙", | |
| 434 | - detected_language="zh", | |
| 435 | - translations={"en": "red dress"}, | |
| 436 | - index_languages=["zh", "en"], | |
| 437 | - contains_chinese=True, | |
| 438 | - contains_english=False, | |
| 439 | - ) | |
| 440 | - idx = _clauses_index(q) | |
| 441 | - assert idx["base_query"]["query"] == "红色连衣裙" | |
| 442 | - assert idx["base_query_trans_en"]["query"] == "red dress" | |
| 376 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 443 | 377 | |
| 444 | 378 | |
| 445 | 379 | def test_detected_language_unknown_falls_back_to_default_language(): |
| ... | ... | @@ -459,7 +393,7 @@ def test_detected_language_unknown_falls_back_to_default_language(): |
| 459 | 393 | index_languages=["en", "zh"], |
| 460 | 394 | ) |
| 461 | 395 | idx = _clauses_index(q) |
| 462 | - assert set(idx) == {"base_query", "base_query_trans_zh"} | |
| 396 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"}) | |
| 463 | 397 | assert idx["base_query"]["query"] == "shirt" |
| 464 | 398 | assert _has_title_lang(idx["base_query"], "en") |
| 465 | 399 | |
| ... | ... | @@ -475,7 +409,7 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en(): |
| 475 | 409 | index_languages=["ru", "en"], |
| 476 | 410 | ) |
| 477 | 411 | idx = _clauses_index(q) |
| 478 | - assert set(idx) == {"base_query", "base_query_trans_en"} | |
| 412 | + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"}) | |
| 479 | 413 | assert idx["base_query"]["query"] == "платье" |
| 480 | 414 | assert _has_title_lang(idx["base_query"], "ru") |
| 481 | 415 | assert idx["base_query_trans_en"]["query"] == "dress" | ... | ... |