Commit a3d3fb118ea8d4b24032d66f02021407daefa900

Authored by tangwang
1 parent 69881ecb

加phrase提权

docs/TODO.txt
... ... @@ -57,6 +57,11 @@ image_embedding改为,一个spu有多个sku向量,每个向量内部properti
57 57  
58 58  
59 59  
  60 +tags字段使用的优化:
  61 +现在是keyword,在搜索中,不太好使用(目前主要用于suggest)。
  62 +可以考虑也拆分多语言,配合analyzer使用(和qanchors一样)
  63 +
  64 +
60 65  
61 66 外部需求:
62 67 1. 对推理能力要求很低、对耗时要求很高的大模型API(或者本地部署一个7b Q4量化的大模型),prompt大概30-50个token,首token响应要求500ms以内
... ...
search/es_query_builder.py
... ... @@ -478,8 +478,7 @@ class ESQueryBuilder:
478 478 should_clauses: List[Dict[str, Any]],
479 479 lang: str,
480 480 lang_query: str,
481   - clause_name: str,
482   - is_source: bool,
  481 + clause_name: str
483 482 ) -> None:
484 483 text = (lang_query or "").strip()
485 484 if not text:
... ... @@ -487,11 +486,7 @@ class ESQueryBuilder:
487 486 phrase_fields = self._build_phrase_match_fields(lang)
488 487 if not phrase_fields:
489 488 return
490   - boost = (
491   - self.phrase_match_boost
492   - if is_source
493   - else self.phrase_match_boost * float(self.translation_boost)
494   - )
  489 + boost = self.phrase_match_boost
495 490 should_clauses.append({
496 491 "multi_match": {
497 492 "_name": f"{clause_name}_phrase",
... ... @@ -642,7 +637,7 @@ class ESQueryBuilder:
642 637 "multi_match": clause["multi_match"]
643 638 })
644 639 self._append_phrase_should_clause(
645   - should_clauses, lang, lang_query, clause_name, is_source
  640 + should_clauses, lang, lang_query, clause_name
646 641 )
647 642  
648 643 if base_query_text:
... ... @@ -674,8 +669,7 @@ class ESQueryBuilder:
674 669 fb_should,
675 670 self.default_language,
676 671 query_text,
677   - "base_query_fallback",
678   - True,
  672 + "base_query_fallback"
679 673 )
680 674 if len(fb_should) == 1:
681 675 return fallback_lexical
... ...
tests/test_es_query_builder.py
1 1 from types import SimpleNamespace
  2 +from typing import Any, Dict
2 3  
3 4 import numpy as np
4 5  
... ... @@ -13,6 +14,21 @@ def _builder() -> ESQueryBuilder:
13 14 )
14 15  
15 16  
  17 +def _lexical_multi_match_fields(query_root: Dict[str, Any]) -> list:
  18 + """Fields from the non-phrase multi_match (bool.should or single clause)."""
  19 + if "multi_match" in query_root:
  20 + mm = query_root["multi_match"]
  21 + if mm.get("type") == "phrase":
  22 + raise AssertionError("root multi_match is phrase-only")
  23 + return mm["fields"]
  24 + for clause in query_root.get("bool", {}).get("should", []):
  25 + mm = clause.get("multi_match") or {}
  26 + if mm.get("type") == "phrase":
  27 + continue
  28 + return mm["fields"]
  29 + raise AssertionError("no lexical multi_match in query_root")
  30 +
  31 +
16 32 def test_knn_prefilter_includes_range_filters():
17 33 qb = _builder()
18 34 q = qb.build_query(
... ... @@ -82,7 +98,12 @@ def test_text_query_contains_only_base_and_translation_named_queries():
82 98 should = q["query"]["bool"]["should"]
83 99 names = [clause["multi_match"]["_name"] for clause in should]
84 100  
85   - assert names == ["base_query", "base_query_trans_zh"]
  101 + assert names == [
  102 + "base_query",
  103 + "base_query_phrase",
  104 + "base_query_trans_zh",
  105 + "base_query_trans_zh_phrase",
  106 + ]
86 107  
87 108  
88 109 def test_text_query_skips_duplicate_translation_same_as_base():
... ... @@ -100,7 +121,9 @@ def test_text_query_skips_duplicate_translation_same_as_base():
100 121 index_languages=["en", "zh"],
101 122 )
102 123  
103   - assert q["query"]["multi_match"]["_name"] == "base_query"
  124 + root = q["query"]
  125 + assert root["bool"]["should"][0]["multi_match"]["_name"] == "base_query"
  126 + assert root["bool"]["should"][1]["multi_match"]["_name"] == "base_query_phrase"
104 127  
105 128  
106 129 def test_mixed_script_merges_en_fields_into_zh_clause():
... ... @@ -124,7 +147,7 @@ def test_mixed_script_merges_en_fields_into_zh_clause():
124 147 enable_knn=False,
125 148 index_languages=["zh", "en"],
126 149 )
127   - fields = q["query"]["multi_match"]["fields"]
  150 + fields = _lexical_multi_match_fields(q["query"])
128 151 bases = {f.split("^", 1)[0] for f in fields}
129 152 assert "title.zh" in bases and "title.en" in bases
130 153 assert "brief.zh" in bases and "brief.en" in bases
... ... @@ -154,7 +177,7 @@ def test_mixed_script_merges_zh_fields_into_en_clause():
154 177 enable_knn=False,
155 178 index_languages=["zh", "en"],
156 179 )
157   - fields = q["query"]["multi_match"]["fields"]
  180 + fields = _lexical_multi_match_fields(q["query"])
158 181 bases = {f.split("^", 1)[0] for f in fields}
159 182 assert "title.en" in bases and "title.zh" in bases
160 183 assert "title.zh^0.6" in fields
... ... @@ -182,7 +205,7 @@ def test_mixed_script_merged_fields_scale_configured_boosts():
182 205 enable_knn=False,
183 206 index_languages=["zh", "en"],
184 207 )
185   - fields = q["query"]["multi_match"]["fields"]
  208 + fields = _lexical_multi_match_fields(q["query"])
186 209 assert "title.zh^5.0" in fields
187 210 assert "title.en^6.0" in fields # 10.0 * 0.6
188 211  
... ... @@ -208,7 +231,7 @@ def test_mixed_script_does_not_merge_en_when_not_in_index_languages():
208 231 enable_knn=False,
209 232 index_languages=["zh"],
210 233 )
211   - fields = q["query"]["multi_match"]["fields"]
  234 + fields = _lexical_multi_match_fields(q["query"])
212 235 bases = {f.split("^", 1)[0] for f in fields}
213 236 assert "title.zh" in bases
214 237 assert "title.en" not in bases
... ...
tests/test_es_query_builder_text_recall_languages.py
... ... @@ -59,6 +59,11 @@ def _clauses_index(es_body: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
59 59 return out
60 60  
61 61  
  62 +def _with_phrase(lexical_names: set[str]) -> set[str]:
  63 + """Each lexical recall clause has a companion ``*_phrase`` multi_match."""
  64 + return lexical_names | {f"{n}_phrase" for n in lexical_names}
  65 +
  66 +
62 67 def _title_fields(mm: Dict[str, Any]) -> List[str]:
63 68 fields = mm.get("fields") or []
64 69 return [f for f in fields if str(f).startswith("title.")]
... ... @@ -114,7 +119,7 @@ def test_zh_query_index_zh_en_includes_base_zh_and_trans_en():
114 119 index_languages=["zh", "en"],
115 120 )
116 121 idx = _clauses_index(q)
117   - assert set(idx) == {"base_query", "base_query_trans_en"}
  122 + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"})
118 123 assert idx["base_query"]["query"] == "连衣裙"
119 124 assert "title.zh" in _title_fields(idx["base_query"])
120 125 assert idx["base_query_trans_en"]["query"] == "dress"
... ... @@ -132,7 +137,7 @@ def test_en_query_index_zh_en_includes_base_en_and_trans_zh():
132 137 index_languages=["en", "zh"],
133 138 )
134 139 idx = _clauses_index(q)
135   - assert set(idx) == {"base_query", "base_query_trans_zh"}
  140 + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})
136 141 assert idx["base_query"]["query"] == "dress"
137 142 assert "title.en" in _title_fields(idx["base_query"])
138 143 assert idx["base_query_trans_zh"]["query"] == "连衣裙"
... ... @@ -150,7 +155,9 @@ def test_de_query_index_de_en_fr_includes_base_and_two_translations():
150 155 index_languages=["de", "en", "fr"],
151 156 )
152 157 idx = _clauses_index(q)
153   - assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_fr"}
  158 + assert set(idx) == _with_phrase(
  159 + {"base_query", "base_query_trans_en", "base_query_trans_fr"}
  160 + )
154 161 assert idx["base_query"]["query"] == "kleid"
155 162 assert "title.de" in _title_fields(idx["base_query"])
156 163 assert idx["base_query_trans_en"]["query"] == "dress"
... ... @@ -171,7 +178,9 @@ def test_de_query_index_only_en_zh_base_on_de_translations_on_target_fields():
171 178 index_languages=["en", "zh"],
172 179 )
173 180 idx = _clauses_index(q)
174   - assert set(idx) == {"base_query", "base_query_trans_en", "base_query_trans_zh"}
  181 + assert set(idx) == _with_phrase(
  182 + {"base_query", "base_query_trans_en", "base_query_trans_zh"}
  183 + )
175 184 assert idx["base_query"]["query"] == "schuh"
176 185 assert "title.de" in _title_fields(idx["base_query"])
177 186 assert "boost" not in idx["base_query"]
... ... @@ -197,7 +206,7 @@ def test_mixed_zh_primary_with_en_translation_merges_en_into_zh_base_clause():
197 206 contains_english=True,
198 207 )
199 208 idx = _clauses_index(q)
200   - assert set(idx) == {"base_query", "base_query_trans_en"}
  209 + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"})
201 210 assert idx["base_query"]["query"] == "红色 dress"
202 211 assert _has_title_lang(idx["base_query"], "zh") and _has_title_lang(idx["base_query"], "en")
203 212 assert idx["base_query_trans_en"]["query"] == "red dress"
... ... @@ -217,7 +226,7 @@ def test_mixed_en_primary_with_zh_translation_merges_zh_into_en_base_clause():
217 226 contains_english=True,
218 227 )
219 228 idx = _clauses_index(q)
220   - assert set(idx) == {"base_query", "base_query_trans_zh"}
  229 + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})
221 230 assert idx["base_query"]["query"] == "nike 运动鞋"
222 231 assert _has_title_lang(idx["base_query"], "en") and _has_title_lang(idx["base_query"], "zh")
223 232 assert idx["base_query_trans_zh"]["query"] == "耐克运动鞋"
... ... @@ -236,7 +245,7 @@ def test_mixed_zh_query_index_zh_only_no_en_merge_in_base():
236 245 contains_english=True,
237 246 )
238 247 idx = _clauses_index(q)
239   - assert set(idx) == {"base_query"}
  248 + assert set(idx) == _with_phrase({"base_query"})
240 249 bases = {f.split("^", 1)[0] for f in _title_fields(idx["base_query"])}
241 250 assert bases == {"title.zh"}
242 251  
... ... @@ -255,7 +264,7 @@ def test_skips_translation_when_same_lang_and_same_text_as_base():
255 264 index_languages=["en", "zh"],
256 265 )
257 266 idx = _clauses_index(q)
258   - assert set(idx) == {"base_query", "base_query_trans_zh"}
  267 + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})
259 268  
260 269  
261 270 def test_keeps_translation_when_same_text_but_different_lang_than_base():
... ... @@ -269,7 +278,7 @@ def test_keeps_translation_when_same_text_but_different_lang_than_base():
269 278 index_languages=["en", "zh"],
270 279 )
271 280 idx = _clauses_index(q)
272   - assert set(idx) == {"base_query", "base_query_trans_zh"}
  281 + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})
273 282 assert idx["base_query_trans_zh"]["query"] == "NIKE"
274 283  
275 284  
... ... @@ -322,6 +331,8 @@ def test_empty_index_languages_treats_source_as_in_index_boosts():
322 331 idx = _clauses_index(q)
323 332 assert "boost" not in idx["base_query"]
324 333 assert idx["base_query_trans_en"]["boost"] == qb.translation_boost
  334 + assert idx["base_query_phrase"]["boost"] == qb.phrase_match_boost
  335 + assert idx["base_query_trans_en_phrase"]["boost"] == qb.phrase_match_boost
325 336  
326 337  
327 338 # --- 无翻译:仅 base_query ---
... ... @@ -338,7 +349,7 @@ def test_no_translations_only_base_query():
338 349 index_languages=["en", "zh"],
339 350 )
340 351 idx = _clauses_index(q)
341   - assert set(idx) == {"base_query"}
  352 + assert set(idx) == _with_phrase({"base_query"})
342 353  
343 354  
344 355 # --- 与 KNN 同存时仍能解析文本子句(顶层 knn 不影响 query 内结构) ---
... ... @@ -362,84 +373,7 @@ def test_text_clauses_present_alongside_knn():
362 373 )
363 374 assert "knn" in q
364 375 idx = _clauses_index(q)
365   - assert set(idx) == {"base_query", "base_query_trans_zh"}
366   -
367   -
368   -def test_detected_language_unknown_falls_back_to_default_language():
369   - """与 LanguageDetector 失败时 QueryConfig.default_language 行为对齐。"""
370   - qb = _builder_multilingual_title_only(default_language="en")
371   - parsed = SimpleNamespace(
372   - rewritten_query="shirt",
373   - detected_language="unknown",
374   - translations={"zh": "衬衫"},
375   - contains_chinese=False,
376   - contains_english=True,
377   - )
378   - q = qb.build_query(
379   - query_text="shirt",
380   - parsed_query=parsed,
381   - enable_knn=False,
382   - index_languages=["en", "zh"],
383   - )
384   - idx = _clauses_index(q)
385   - assert set(idx) == {"base_query", "base_query_trans_zh"}
386   - assert idx["base_query"]["query"] == "shirt"
387   - assert _has_title_lang(idx["base_query"], "en")
388   -
389   -
390   -def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
391   - qb = _builder_multilingual_title_only(default_language="en")
392   - q = _build(
393   - qb,
394   - query_text="платье",
395   - rewritten="платье",
396   - detected_language="ru",
397   - translations={"en": "dress"},
398   - index_languages=["ru", "en"],
399   - )
400   - idx = _clauses_index(q)
401   - assert set(idx) == {"base_query", "base_query_trans_en"}
402   - assert idx["base_query"]["query"] == "платье"
403   - assert _has_title_lang(idx["base_query"], "ru")
404   - assert idx["base_query_trans_en"]["query"] == "dress"
405   -
406   -
407   -def test_translation_for_lang_not_listed_in_index_languages_still_generates_clause():
408   - """
409   - 当前实现:凡是 translations 里非空的条目都会生成子句;
410   - index_languages 只约束混写扩列,不用于过滤翻译子句。
411   - """
412   - qb = _builder_multilingual_title_only(default_language="en")
413   - q = _build(
414   - qb,
415   - query_text="dress",
416   - rewritten="dress",
417   - detected_language="en",
418   - translations={"zh": "连衣裙", "de": "Kleid"},
419   - index_languages=["en", "zh"],
420   - )
421   - idx = _clauses_index(q)
422   - assert "base_query_trans_de" in idx
423   - assert idx["base_query_trans_de"]["query"] == "Kleid"
424   - assert _has_title_lang(idx["base_query_trans_de"], "de")
425   -
426   -
427   -def test_mixed_detected_zh_rewrite_differs_from_query_text_uses_rewritten_in_base():
428   - """base_query 始终用 rewritten_query,而非仅 query_text。"""
429   - qb = _builder_multilingual_title_only(default_language="en")
430   - q = _build(
431   - qb,
432   - query_text=" 红色 ",
433   - rewritten="红色连衣裙",
434   - detected_language="zh",
435   - translations={"en": "red dress"},
436   - index_languages=["zh", "en"],
437   - contains_chinese=True,
438   - contains_english=False,
439   - )
440   - idx = _clauses_index(q)
441   - assert idx["base_query"]["query"] == "红色连衣裙"
442   - assert idx["base_query_trans_en"]["query"] == "red dress"
  376 + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})
443 377  
444 378  
445 379 def test_detected_language_unknown_falls_back_to_default_language():
... ... @@ -459,7 +393,7 @@ def test_detected_language_unknown_falls_back_to_default_language():
459 393 index_languages=["en", "zh"],
460 394 )
461 395 idx = _clauses_index(q)
462   - assert set(idx) == {"base_query", "base_query_trans_zh"}
  396 + assert set(idx) == _with_phrase({"base_query", "base_query_trans_zh"})
463 397 assert idx["base_query"]["query"] == "shirt"
464 398 assert _has_title_lang(idx["base_query"], "en")
465 399  
... ... @@ -475,7 +409,7 @@ def test_ru_query_index_ru_en_includes_base_ru_and_trans_en():
475 409 index_languages=["ru", "en"],
476 410 )
477 411 idx = _clauses_index(q)
478   - assert set(idx) == {"base_query", "base_query_trans_en"}
  412 + assert set(idx) == _with_phrase({"base_query", "base_query_trans_en"})
479 413 assert idx["base_query"]["query"] == "платье"
480 414 assert _has_title_lang(idx["base_query"], "ru")
481 415 assert idx["base_query_trans_en"]["query"] == "dress"
... ...