a8261ece
tangwang
检索效果优化
|
1
2
3
4
5
6
7
8
9
|
from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
from query.query_parser import QueryParser
class _DummyTranslator:
def translate(self, text, target_lang, source_lang, scene, model_name):
return f"{text}-{target_lang}"
|
ef5baa86
tangwang
混杂语言处理
|
10
11
12
13
|
def _tokenizer(text):
return str(text).split()
|
a8261ece
tangwang
检索效果优化
|
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
|
def _build_config() -> SearchConfig:
    """Assemble the ``SearchConfig`` shared by the parser tests in this file.

    The config targets the ``test_products`` index, boosts the ``title.en``
    and ``title.zh`` fields equally, supports ``en`` and ``zh`` with ``zh``
    as the default language, and sets ``enable_text_embedding`` and
    ``enable_query_rewrite`` to ``False``.
    """
    title_fields = ("title.en", "title.zh")
    boosts = {field: 3.0 for field in title_fields}

    # Sub-configs are created in the same order as the keyword arguments below.
    default_index = IndexConfig(name="default", label="default", fields=list(title_fields))
    query_settings = QueryConfig(
        enable_text_embedding=False,
        enable_query_rewrite=False,
        supported_languages=["en", "zh"],
        default_language="zh",
    )
    scoring = FunctionScoreConfig()
    reranking = RerankConfig()
    spu_settings = SPUConfig(enabled=False)

    return SearchConfig(
        es_index_name="test_products",
        field_boosts=boosts,
        indexes=[default_index],
        query_config=query_settings,
        function_score=scoring,
        rerank=reranking,
        spu_config=spu_settings,
    )
|
35da3813
tangwang
中英混写query的优化逻辑,不适...
|
31
|
def test_parse_mixed_zh_query_translates_to_en(monkeypatch):
    """A mixed zh/en query detected as ``zh`` is translated to ``en`` only."""
    query_parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)

    def _detect_zh(text):
        # Pin language detection so the test does not depend on the real detector.
        return "zh"

    monkeypatch.setattr(query_parser.language_detector, "detect", _detect_zh)

    parsed = query_parser.parse(
        "法式 dress 连衣裙",
        tenant_id="162",
        generate_vector=False,
        target_languages=["zh", "en"],
    )

    assert parsed.detected_language == "zh"
    assert parsed.translations == {"en": "法式 dress 连衣裙-en"}
    assert parsed.query_tokens == ["法式", "dress", "连衣裙"]
    # The result object must not expose either of these attributes.
    for absent_attr in ("query_text_by_lang", "search_langs"):
        assert not hasattr(parsed, absent_attr)
|
a8261ece
tangwang
检索效果优化
|
47
48
|
|
35da3813
tangwang
中英混写query的优化逻辑,不适...
|
49
|
def test_parse_mixed_en_query_translates_to_zh(monkeypatch):
    """A mixed en/zh query detected as ``en`` is translated to ``zh`` only."""
    query_parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)

    def _detect_en(text):
        # Pin language detection so the test does not depend on the real detector.
        return "en"

    monkeypatch.setattr(query_parser.language_detector, "detect", _detect_en)

    parsed = query_parser.parse(
        "red 连衣裙",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.detected_language == "en"
    assert parsed.translations == {"zh": "red 连衣裙-zh"}
    assert parsed.query_tokens == ["red", "连衣裙"]
|
1556989b
tangwang
query翻译等待超时逻辑
|
63
64
65
66
|
def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
    """Even when en is within index_languages, parsing should still wait for
    and adopt the en->zh translation result (sharing the budget with vector
    generation)."""
    query_parser = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)

    def _detect_en(text):
        # Pin language detection so the test does not depend on the real detector.
        return "en"

    monkeypatch.setattr(query_parser.language_detector, "detect", _detect_en)

    parsed = query_parser.parse(
        "off shoulder top",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.detected_language == "en"
    zh_translation = parsed.translations.get("zh")
    assert zh_translation == "off shoulder top-zh"
    assert not hasattr(parsed, "source_in_index_languages")
|