Blame view

tests/test_query_parser_mixed_language.py 2.79 KB
a8261ece   tangwang   检索效果优化
1
2
3
4
5
6
7
8
9
  from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
  from query.query_parser import QueryParser
  
  
  class _DummyTranslator:
      def translate(self, text, target_lang, source_lang, scene, model_name):
          return f"{text}-{target_lang}"
  
  
ef5baa86   tangwang   混杂语言处理
10
11
12
13
  def _tokenizer(text):
      return str(text).split()
  
  
a8261ece   tangwang   检索效果优化
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
  def _build_config() -> SearchConfig:
      """Build the SearchConfig used by the tests in this module.

      The config has one "default" index over the two title fields, supports
      "en" and "zh" with "zh" as the default language, and turns off text
      embedding, query rewrite and SPU handling.
      """
      boosts = {"title.en": 3.0, "title.zh": 3.0}
      default_index = IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])
      query_settings = QueryConfig(
          enable_text_embedding=False,
          enable_query_rewrite=False,
          supported_languages=["en", "zh"],
          default_language="zh",
      )
      return SearchConfig(
          es_index_name="test_products",
          field_boosts=boosts,
          indexes=[default_index],
          query_config=query_settings,
          function_score=FunctionScoreConfig(),
          rerank=RerankConfig(),
          spu_config=SPUConfig(enabled=False),
      )
  
  
35da3813   tangwang   中英混写query的优化逻辑,不适...
31
  def test_parse_mixed_zh_query_translates_to_en(monkeypatch):
      # A query mixing Chinese and English tokens, with detection forced to "zh",
      # is expected to yield exactly one translation: the "en" one.
ef5baa86   tangwang   混杂语言处理
32
      parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
a8261ece   tangwang   检索效果优化
33
      # Pin the detector's answer so the test does not depend on real language detection.
      monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh")
a8261ece   tangwang   检索效果优化
34
  
ef5baa86   tangwang   混杂语言处理
35
36
37
38
39
40
      result = parser.parse(
          "法式 dress 连衣裙",
          tenant_id="162",
          generate_vector=False,
          target_languages=["zh", "en"],
      )
a8261ece   tangwang   检索效果优化
41
42
  
      assert result.detected_language == "zh"
ef5baa86   tangwang   混杂语言处理
43
44
45
46
      # The "-en" suffix is what _DummyTranslator appends for target_lang "en".
      assert result.translations == {"en": "法式 dress 连衣裙-en"}
      # Tokens match the whitespace split performed by _tokenizer.
      assert result.query_tokens == ["法式", "dress", "连衣裙"]
      # The parse result is expected not to expose these attributes.
      assert not hasattr(result, "query_text_by_lang")
      assert not hasattr(result, "search_langs")
a8261ece   tangwang   检索效果优化
47
48
  
  
35da3813   tangwang   中英混写query的优化逻辑,不适...
49
  def test_parse_mixed_en_query_translates_to_zh(monkeypatch):
      # A query mixing English and Chinese tokens, with detection forced to "en",
      # is expected to yield exactly one translation: the "zh" one.
ef5baa86   tangwang   混杂语言处理
50
      parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
a8261ece   tangwang   检索效果优化
51
      # Pin the detector's answer so the test does not depend on real language detection.
      monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
a8261ece   tangwang   检索效果优化
52
  
ef5baa86   tangwang   混杂语言处理
53
54
55
56
57
58
      result = parser.parse(
          "red 连衣裙",
          tenant_id="0",
          generate_vector=False,
          target_languages=["en", "zh"],
      )
a8261ece   tangwang   检索效果优化
59
60
  
      assert result.detected_language == "en"
ef5baa86   tangwang   混杂语言处理
61
62
      # The "-zh" suffix is what _DummyTranslator appends for target_lang "zh".
      assert result.translations == {"zh": "red 连衣裙-zh"}
      # Tokens match the whitespace split performed by _tokenizer.
      assert result.query_tokens == ["red", "连衣裙"]
1556989b   tangwang   query翻译等待超时逻辑
63
64
65
66
  
  
  def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
      """When "en" is within index_languages, parsing should still wait for and adopt the en->zh translation result (sharing the budget with the vector)."""
ef5baa86   tangwang   混杂语言处理
67
      parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
1556989b   tangwang   query翻译等待超时逻辑
68
      # Pin the detector's answer so the test does not depend on real language detection.
      monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
1556989b   tangwang   query翻译等待超时逻辑
69
  
ef5baa86   tangwang   混杂语言处理
70
71
72
73
74
75
      result = parser.parse(
          "off shoulder top",
          tenant_id="0",
          generate_vector=False,
          target_languages=["en", "zh"],
      )
1556989b   tangwang   query翻译等待超时逻辑
76
77
  
      assert result.detected_language == "en"
1556989b   tangwang   query翻译等待超时逻辑
78
      # The "-zh" suffix is what _DummyTranslator appends for target_lang "zh".
      assert result.translations.get("zh") == "off shoulder top-zh"
ef5baa86   tangwang   混杂语言处理
79
      # The parse result is expected not to expose this attribute.
      assert not hasattr(result, "source_in_index_languages")