Blame view

tests/test_query_parser_mixed_language.py 3.98 KB
a8261ece   tangwang   检索效果优化
1
2
3
4
5
6
7
8
9
10
11
  from types import SimpleNamespace
  
  from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
  from query.query_parser import QueryParser
  
  
  class _DummyTranslator:
      def translate(self, text, target_lang, source_lang, scene, model_name):
          return f"{text}-{target_lang}"
  
  
6823fe3e   tangwang   feat(search): 混合语...
12
13
14
15
16
17
18
19
  def test_pure_english_word_token_length_and_script():
      """Check ``QueryParser._is_pure_english_word_token`` against sample tokens.

      Each entry pairs a token with the exact boolean the helper must return.
      """
      expectations = (
          ("ab", False),
          ("abc", True),
          ("wi-fi", True),
          ("连衣裙", False),
          ("ab12", False),
      )
      for token, expected in expectations:
          # Identity comparison: the helper must return a real bool, not a truthy value.
          assert QueryParser._is_pure_english_word_token(token) is expected
  
  
a8261ece   tangwang   检索效果优化
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
  def _build_config() -> SearchConfig:
      """Build the ``SearchConfig`` used by the parser tests in this module.

      The config has one ``default`` index over ``title.en`` / ``title.zh``,
      supports ``en`` and ``zh`` with ``zh`` as default language, and turns off
      text embedding, query rewrite and the SPU config.
      """
      default_index = IndexConfig(
          name="default",
          label="default",
          fields=["title.en", "title.zh"],
      )
      query_settings = QueryConfig(
          enable_text_embedding=False,
          enable_query_rewrite=False,
          supported_languages=["en", "zh"],
          default_language="zh",
      )
      return SearchConfig(
          es_index_name="test_products",
          field_boosts={"title.en": 3.0, "title.zh": 3.0},
          indexes=[default_index],
          query_config=query_settings,
          function_score=FunctionScoreConfig(),
          rerank=RerankConfig(),
          spu_config=SPUConfig(enabled=False),
      )
  
  
  def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch):
      """Parse a mixed Chinese/English query that the detector reports as ``zh``.

      The language detector and the tenant config loader are stubbed; the
      tenant indexes ``zh`` and ``en``. The result must flag both scripts,
      include ``en`` in the search languages, and carry the dummy translation
      for ``en`` next to the untouched original text for ``zh``.
      """

      def _detect_zh(text):
          return "zh"

      def _tenant_config(tenant_id):
          return {"index_languages": ["zh", "en"]}

      def _loader():
          return SimpleNamespace(get_tenant_config=_tenant_config)

      query = "法式 dress 连衣裙"
      parser = QueryParser(_build_config(), translator=_DummyTranslator())
      monkeypatch.setattr(parser.language_detector, "detect", _detect_zh)
      monkeypatch.setattr(
          "query.query_parser.get_tenant_config_loader",
          _loader,
          raising=False,
      )

      result = parser.parse(query, tenant_id="162", generate_vector=False)

      assert result.detected_language == "zh"
      assert result.contains_chinese is True
      assert result.contains_english is True
      assert "en" in result.search_langs
      # When the translation finishes within the budget it is written into the
      # target-language field (preferred over using only the original text for
      # supplemental probing).
      assert result.query_text_by_lang["en"] == "法式 dress 连衣裙-en"
      assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙"
  
  
  def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
      """Parse a query with CJK characters that the detector reports as ``en``.

      The language detector and the tenant config loader are stubbed; the
      tenant indexes ``en`` and ``zh``. The result must flag both scripts,
      include ``zh`` in the search languages, and carry the dummy translation
      for ``zh`` next to the untouched original text for ``en``.
      """

      def _detect_en(text):
          return "en"

      def _tenant_config(tenant_id):
          return {"index_languages": ["en", "zh"]}

      def _loader():
          return SimpleNamespace(get_tenant_config=_tenant_config)

      query = "red 连衣裙"
      parser = QueryParser(_build_config(), translator=_DummyTranslator())
      monkeypatch.setattr(parser.language_detector, "detect", _detect_en)
      monkeypatch.setattr(
          "query.query_parser.get_tenant_config_loader",
          _loader,
          raising=False,
      )

      result = parser.parse(query, tenant_id="0", generate_vector=False)

      assert result.detected_language == "en"
      assert result.contains_chinese is True
      assert result.contains_english is True
      assert "zh" in result.search_langs
      assert result.query_text_by_lang["zh"] == "red 连衣裙-zh"
      assert result.query_text_by_lang["en"] == "red 连衣裙"
1556989b   tangwang   query翻译等待超时逻辑
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
  
  
  def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
      """Even with ``en`` in index_languages, the en->zh translation must be adopted.

      The parser should still wait for the translation result (it shares its
      budget with the vector step) and expose it both in ``translations`` and
      in ``query_text_by_lang``.
      """

      def _detect_en(text):
          return "en"

      def _tenant_config(tenant_id):
          return {"index_languages": ["en", "zh"]}

      def _loader():
          return SimpleNamespace(get_tenant_config=_tenant_config)

      query = "off shoulder top"
      parser = QueryParser(_build_config(), translator=_DummyTranslator())
      monkeypatch.setattr(parser.language_detector, "detect", _detect_en)
      monkeypatch.setattr(
          "query.query_parser.get_tenant_config_loader",
          _loader,
          raising=False,
      )

      result = parser.parse(query, tenant_id="0", generate_vector=False)

      assert result.detected_language == "en"
      assert result.contains_chinese is False
      assert result.contains_english is True
      assert result.translations.get("zh") == "off shoulder top-zh"
      assert result.query_text_by_lang.get("zh") == "off shoulder top-zh"
      assert result.source_in_index_languages is True