Blame view

tests/test_query_parser_mixed_language.py 5.58 KB
a8261ece   tangwang   检索效果优化
1
2
3
  from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
  from query.query_parser import QueryParser
  
99b72698   tangwang   测试回归钩子梳理
4
5
6
7
  import pytest
  
  pytestmark = [pytest.mark.query, pytest.mark.regression]
  
a8261ece   tangwang   检索效果优化
8
9
10
11
12
13
  
  class _DummyTranslator:
      def translate(self, text, target_lang, source_lang, scene, model_name):
          return f"{text}-{target_lang}"
  
  
ef5baa86   tangwang   混杂语言处理
14
15
16
17
  def _tokenizer(text):
      return str(text).split()
  
  
a8261ece   tangwang   检索效果优化
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
  def _build_config() -> SearchConfig:
      """Build the small en/zh search config shared by the tests in this module.

      Text embedding, query rewrite and SPU handling are switched off, and the
      default language is ``zh``.
      """
      default_index = IndexConfig(
          name="default",
          label="default",
          fields=["title.en", "title.zh"],
      )
      query_settings = QueryConfig(
          enable_text_embedding=False,
          enable_query_rewrite=False,
          supported_languages=["en", "zh"],
          default_language="zh",
      )
      scoring = FunctionScoreConfig()
      reranking = RerankConfig()
      spu_settings = SPUConfig(enabled=False)
      return SearchConfig(
          es_index_name="test_products",
          field_boosts={"title.en": 3.0, "title.zh": 3.0},
          indexes=[default_index],
          query_config=query_settings,
          function_score=scoring,
          rerank=reranking,
          spu_config=spu_settings,
      )
  
  
35da3813   tangwang   中英混写query的优化逻辑,不适...
35
  def test_parse_mixed_zh_query_translates_to_en(monkeypatch):
      """Parse a mixed Chinese/English query with the detector stubbed to "zh".

      Expects "zh" as the detected language, a single "en" translation from the
      dummy translator, query_tokens equal to the whitespace-separated pieces
      of the query, and no ``query_text_by_lang`` or ``search_langs`` attribute
      on the result.
      """
ef5baa86   tangwang   混杂语言处理
36
      parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
a8261ece   tangwang   检索效果优化
37
      monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh")
a8261ece   tangwang   检索效果优化
38
  
ef5baa86   tangwang   混杂语言处理
39
40
41
42
43
44
      result = parser.parse(
          "法式 dress 连衣裙",
          tenant_id="162",
          generate_vector=False,
          target_languages=["zh", "en"],
      )
a8261ece   tangwang   检索效果优化
45
46
  
      assert result.detected_language == "zh"
ef5baa86   tangwang   混杂语言处理
47
48
49
50
      assert result.translations == {"en": "法式 dress 连衣裙-en"}
      assert result.query_tokens == ["法式", "dress", "连衣裙"]
      assert not hasattr(result, "query_text_by_lang")
      assert not hasattr(result, "search_langs")
a8261ece   tangwang   检索效果优化
51
52
  
  
35da3813   tangwang   中英混写query的优化逻辑,不适...
53
  def test_parse_mixed_en_query_translates_to_zh(monkeypatch):
      """Parse a mixed English/Chinese query with the detector stubbed to "en".

      Expects "en" as the detected language, a single "zh" translation from the
      dummy translator, and query_tokens equal to the whitespace-separated
      pieces of the query.
      """
ef5baa86   tangwang   混杂语言处理
54
      parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
a8261ece   tangwang   检索效果优化
55
      monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
a8261ece   tangwang   检索效果优化
56
  
ef5baa86   tangwang   混杂语言处理
57
58
59
60
61
62
      result = parser.parse(
          "red 连衣裙",
          tenant_id="0",
          generate_vector=False,
          target_languages=["en", "zh"],
      )
a8261ece   tangwang   检索效果优化
63
64
  
      assert result.detected_language == "en"
ef5baa86   tangwang   混杂语言处理
65
66
      assert result.translations == {"zh": "red 连衣裙-zh"}
      assert result.query_tokens == ["red", "连衣裙"]
1556989b   tangwang   query翻译等待超时逻辑
67
68
69
70
  
  
  def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
      """When en is within index_languages, parsing should still wait for and adopt the en->zh translation (sharing its budget with vector generation)."""
ef5baa86   tangwang   混杂语言处理
71
      parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
1556989b   tangwang   query翻译等待超时逻辑
72
      monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
1556989b   tangwang   query翻译等待超时逻辑
73
  
ef5baa86   tangwang   混杂语言处理
74
75
76
77
78
79
      result = parser.parse(
          "off shoulder top",
          tenant_id="0",
          generate_vector=False,
          target_languages=["en", "zh"],
      )
1556989b   tangwang   query翻译等待超时逻辑
80
81
  
      assert result.detected_language == "en"
1556989b   tangwang   query翻译等待超时逻辑
82
      assert result.translations.get("zh") == "off shoulder top-zh"
ef5baa86   tangwang   混杂语言处理
83
      assert not hasattr(result, "source_in_index_languages")
45b39796   tangwang   qp性能优化
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
  
  
  def test_parse_reuses_tokenization_across_tail_stages(monkeypatch):
      """Parse an English query with style-intent and title-exclusion rules set.

      The injected tokenizer records every text it is given. The test expects
      the zh translation, active style-intent and title-exclusion profiles, and
      an empty call log for the injected tokenizer.
      """
      recorded_texts = []

      def recording_tokenizer(text):
          as_string = str(text)
          recorded_texts.append(as_string)
          return as_string.split()

      default_index = IndexConfig(
          name="default",
          label="default",
          fields=["title.en", "title.zh"],
      )
      color_intent = {
          "en_terms": ["black"],
          "zh_terms": ["黑色"],
          "attribute_terms": ["black"],
      }
      fitted_exclusion_rule = {
          "zh_trigger_terms": ["修身"],
          "en_trigger_terms": ["fitted"],
          "zh_title_exclusions": ["宽松"],
          "en_title_exclusions": ["loose"],
      }
      query_settings = QueryConfig(
          enable_text_embedding=False,
          enable_query_rewrite=False,
          supported_languages=["en", "zh"],
          default_language="en",
          style_intent_terms={"color": [color_intent]},
          style_intent_dimension_aliases={"color": ["color", "颜色"]},
          product_title_exclusion_rules=[fitted_exclusion_rule],
      )
      search_settings = SearchConfig(
          es_index_name="test_products",
          field_boosts={"title.en": 3.0, "title.zh": 3.0},
          indexes=[default_index],
          query_config=query_settings,
          function_score=FunctionScoreConfig(),
          rerank=RerankConfig(),
          spu_config=SPUConfig(enabled=False),
      )
      parser = QueryParser(
          search_settings,
          translator=_DummyTranslator(),
          tokenizer=recording_tokenizer,
      )
      # Pin detection to English so the outcome does not depend on the real detector.
      monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")

      parsed = parser.parse(
          "black fitted dress",
          tenant_id="0",
          generate_vector=False,
          target_languages=["en", "zh"],
      )

      assert parsed.translations == {"zh": "black fitted dress-zh"}
      assert parsed.style_intent_profile is not None
      assert parsed.style_intent_profile.is_active is True
      assert parsed.product_title_exclusion_profile is not None
      assert parsed.product_title_exclusion_profile.is_active is True
      assert recorded_texts == []
  
  
42024409   tangwang   评估框架-批量打标
143
  def test_parse_ascii_latin_query_uses_language_detector(monkeypatch):
      """Check that an ASCII Latin-script query is passed to the language detector.

      The detector is replaced by a stub that records its argument and returns
      "es" (``list.append`` returns None, so the ``or`` yields "es"). The test
      expects exactly one detector call with the raw query, "es" as the detected
      language, translations for both "en" and "zh", and query_tokens equal to
      the whitespace-separated pieces of the query.
      """
45b39796   tangwang   qp性能优化
144
      parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
42024409   tangwang   评估框架-批量打标
145
146
      detector_calls = []
  
45b39796   tangwang   qp性能优化
147
148
149
      monkeypatch.setattr(
          parser.language_detector,
          "detect",
42024409   tangwang   评估框架-批量打标
150
          lambda text: detector_calls.append(text) or "es",
45b39796   tangwang   qp性能优化
151
152
153
      )
  
      result = parser.parse(
42024409   tangwang   评估框架-批量打标
154
          "falda negra oficina",
45b39796   tangwang   qp性能优化
155
156
157
158
159
          tenant_id="0",
          generate_vector=False,
          target_languages=["en", "zh"],
      )
  
42024409   tangwang   评估框架-批量打标
160
161
162
163
      assert detector_calls == ["falda negra oficina"]
      assert result.detected_language == "es"
      assert result.translations == {"en": "falda negra oficina-en", "zh": "falda negra oficina-zh"}
      assert result.query_tokens == ["falda", "negra", "oficina"]