a8261ece
tangwang
检索效果优化
|
1
2
3
4
5
6
7
8
9
|
from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
from query.query_parser import QueryParser
class _DummyTranslator:
    """Deterministic stand-in for the real translator used in parser tests.

    ``translate`` echoes the input text suffixed with the target language so
    tests can assert exact "translated" strings without any network calls.
    """

    def translate(self, text, target_lang, source_lang, scene, model_name):
        # Produce "<text>-<target_lang>" — identical to the original f-string.
        return "-".join((text, target_lang))
|
ef5baa86
tangwang
混杂语言处理
|
10
11
12
13
|
def _tokenizer(text):
    """Whitespace tokenizer used as the test double for the parser's tokenizer."""
    # str() guards against non-string inputs, matching the original behavior.
    tokens = str(text).split()
    return tokens
|
a8261ece
tangwang
检索效果优化
|
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
|
def _build_config() -> SearchConfig:
    """Build a minimal bilingual (en/zh) SearchConfig for QueryParser tests.

    Embeddings, query rewrite and SPU aggregation are all disabled so the
    parser runs its plain text path only.
    """
    query_cfg = QueryConfig(
        enable_text_embedding=False,
        enable_query_rewrite=False,
        supported_languages=["en", "zh"],
        default_language="zh",
    )
    bilingual_fields = ["title.en", "title.zh"]
    return SearchConfig(
        es_index_name="test_products",
        field_boosts={"title.en": 3.0, "title.zh": 3.0},
        indexes=[IndexConfig(name="default", label="default", fields=bilingual_fields)],
        query_config=query_cfg,
        function_score=FunctionScoreConfig(),
        rerank=RerankConfig(),
        spu_config=SPUConfig(enabled=False),
    )
|
35da3813
tangwang
中英混写query的优化逻辑,不适...
|
31
|
def test_parse_mixed_zh_query_translates_to_en(monkeypatch):
    """A zh-detected mixed query keeps zh as source and gains an en translation."""
    query_parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    # Pin language detection so the test is deterministic.
    monkeypatch.setattr(query_parser.language_detector, "detect", lambda text: "zh")

    parsed = query_parser.parse(
        "法式 dress 连衣裙",
        tenant_id="162",
        generate_vector=False,
        target_languages=["zh", "en"],
    )

    assert parsed.detected_language == "zh"
    assert parsed.translations == {"en": "法式 dress 连衣裙-en"}
    assert parsed.query_tokens == ["法式", "dress", "连衣裙"]
    # Legacy per-language fields must no longer exist on the result object.
    for legacy_attr in ("query_text_by_lang", "search_langs"):
        assert not hasattr(parsed, legacy_attr)
|
a8261ece
tangwang
检索效果优化
|
47
48
|
|
35da3813
tangwang
中英混写query的优化逻辑,不适...
|
49
|
def test_parse_mixed_en_query_translates_to_zh(monkeypatch):
    """An en-detected mixed query keeps en as source and gains a zh translation."""
    query_parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    # Pin language detection so the test is deterministic.
    monkeypatch.setattr(query_parser.language_detector, "detect", lambda text: "en")

    parsed = query_parser.parse(
        "red 连衣裙",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.detected_language == "en"
    assert parsed.translations == {"zh": "red 连衣裙-zh"}
    assert parsed.query_tokens == ["red", "连衣裙"]
|
1556989b
tangwang
query翻译等待超时逻辑
|
63
64
65
66
|
def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
    """Even when "en" is among the index languages, parse should still wait for
    and adopt the en->zh translation (sharing the time budget with the vector
    generation step)."""
    query_parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    # Pin language detection so the test is deterministic.
    monkeypatch.setattr(query_parser.language_detector, "detect", lambda text: "en")

    parsed = query_parser.parse(
        "off shoulder top",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.detected_language == "en"
    assert parsed.translations.get("zh") == "off shoulder top-zh"
    # The transitional flag should no longer be exposed on the result.
    assert not hasattr(parsed, "source_in_index_languages")
|
45b39796
tangwang
qp性能优化
|
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
|
def test_parse_reuses_tokenization_across_tail_stages(monkeypatch):
    """Tail stages (style intent, title exclusions) must reuse earlier
    tokenization instead of invoking the tokenizer again."""
    tokenize_calls = []

    def counting_tokenizer(text):
        # Record every invocation so we can assert the tokenizer is never hit.
        tokenize_calls.append(str(text))
        return str(text).split()

    # Query config with one style-intent dimension and one exclusion rule so
    # both tail stages become active for the query "black fitted dress".
    query_cfg = QueryConfig(
        enable_text_embedding=False,
        enable_query_rewrite=False,
        supported_languages=["en", "zh"],
        default_language="en",
        style_intent_terms={
            "color": [
                {"en_terms": ["black"], "zh_terms": ["黑色"], "attribute_terms": ["black"]}
            ],
        },
        style_intent_dimension_aliases={"color": ["color", "颜色"]},
        product_title_exclusion_rules=[
            {
                "zh_trigger_terms": ["修身"],
                "en_trigger_terms": ["fitted"],
                "zh_title_exclusions": ["宽松"],
                "en_title_exclusions": ["loose"],
            }
        ],
    )
    config = SearchConfig(
        es_index_name="test_products",
        field_boosts={"title.en": 3.0, "title.zh": 3.0},
        indexes=[IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])],
        query_config=query_cfg,
        function_score=FunctionScoreConfig(),
        rerank=RerankConfig(),
        spu_config=SPUConfig(enabled=False),
    )
    query_parser = QueryParser(config, translator=_DummyTranslator(), tokenizer=counting_tokenizer)
    monkeypatch.setattr(query_parser.language_detector, "detect", lambda text: "en")

    parsed = query_parser.parse(
        "black fitted dress",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.translations == {"zh": "black fitted dress-zh"}
    assert parsed.style_intent_profile is not None
    assert parsed.style_intent_profile.is_active is True
    assert parsed.product_title_exclusion_profile is not None
    assert parsed.product_title_exclusion_profile.is_active is True
    # The tokenizer must never have been called during parse.
    assert tokenize_calls == []
|
42024409
tangwang
评估框架-批量打标
|
139
|
def test_parse_ascii_latin_query_uses_language_detector(monkeypatch):
    """A pure-ASCII Latin query must still go through the language detector
    (no ASCII shortcut) so non-en/zh languages like Spanish are detected."""
    query_parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    detector_calls = []
    # Stub detector: record the input and report Spanish.
    monkeypatch.setattr(
        query_parser.language_detector,
        "detect",
        lambda text: detector_calls.append(text) or "es",
    )

    parsed = query_parser.parse(
        "falda negra oficina",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    # The detector was consulted exactly once, with the raw query text.
    assert detector_calls == ["falda negra oficina"]
    assert parsed.detected_language == "es"
    assert parsed.translations == {"en": "falda negra oficina-en", "zh": "falda negra oficina-zh"}
    assert parsed.query_tokens == ["falda", "negra", "oficina"]
|