from config import (
    FunctionScoreConfig,
    IndexConfig,
    QueryConfig,
    RerankConfig,
    SPUConfig,
    SearchConfig,
)
from query.query_parser import QueryParser


class _DummyTranslator:
    """Stub translator that echoes the input suffixed with the target language."""

    def translate(self, text, target_lang, source_lang, scene, model_name):
        return f"{text}-{target_lang}"


def _tokenizer(text):
    """Whitespace tokenizer used as the default test tokenizer."""
    return str(text).split()


def _build_config() -> SearchConfig:
    """Build a minimal en/zh SearchConfig with embedding, rewrite and SPU disabled."""
    return SearchConfig(
        es_index_name="test_products",
        field_boosts={"title.en": 3.0, "title.zh": 3.0},
        indexes=[
            IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])
        ],
        query_config=QueryConfig(
            enable_text_embedding=False,
            enable_query_rewrite=False,
            supported_languages=["en", "zh"],
            default_language="zh",
        ),
        function_score=FunctionScoreConfig(),
        rerank=RerankConfig(),
        spu_config=SPUConfig(enabled=False),
    )


def test_parse_mixed_zh_query_translates_to_en(monkeypatch):
    """A zh-detected mixed query is translated to en only; tokens come from the raw text."""
    qp = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    monkeypatch.setattr(qp.language_detector, "detect", lambda text: "zh")

    parsed = qp.parse(
        "法式 dress 连衣裙",
        tenant_id="162",
        generate_vector=False,
        target_languages=["zh", "en"],
    )

    assert parsed.detected_language == "zh"
    assert parsed.translations == {"en": "法式 dress 连衣裙-en"}
    assert parsed.query_tokens == ["法式", "dress", "连衣裙"]
    # Legacy attributes must no longer exist on the parse result.
    assert not hasattr(parsed, "query_text_by_lang")
    assert not hasattr(parsed, "search_langs")


def test_parse_mixed_en_query_translates_to_zh(monkeypatch):
    """An en-detected mixed query is translated to zh only."""
    qp = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    monkeypatch.setattr(qp.language_detector, "detect", lambda text: "en")

    parsed = qp.parse(
        "red 连衣裙",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.detected_language == "en"
    assert parsed.translations == {"zh": "red 连衣裙-zh"}
    assert parsed.query_tokens == ["red", "连衣裙"]


def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
    """Even when en is among the index languages, parse still waits for and adopts the
    en->zh translation result (the wait shares the budget with vector generation)."""
    qp = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    monkeypatch.setattr(qp.language_detector, "detect", lambda text: "en")

    parsed = qp.parse(
        "off shoulder top",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.detected_language == "en"
    assert parsed.translations.get("zh") == "off shoulder top-zh"
    # This legacy flag must no longer be exposed on the result.
    assert not hasattr(parsed, "source_in_index_languages")


def test_parse_reuses_tokenization_across_tail_stages(monkeypatch):
    """Style-intent and title-exclusion stages activate without re-invoking the
    injected tokenizer (the recorded call list stays empty)."""
    seen = []

    def counting_tokenizer(text):
        # Record every invocation so we can assert the tokenizer was never called.
        seen.append(str(text))
        return str(text).split()

    cfg = SearchConfig(
        es_index_name="test_products",
        field_boosts={"title.en": 3.0, "title.zh": 3.0},
        indexes=[
            IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])
        ],
        query_config=QueryConfig(
            enable_text_embedding=False,
            enable_query_rewrite=False,
            supported_languages=["en", "zh"],
            default_language="en",
            style_intent_terms={
                "color": [
                    {"en_terms": ["black"], "zh_terms": ["黑色"], "attribute_terms": ["black"]}
                ],
            },
            style_intent_dimension_aliases={"color": ["color", "颜色"]},
            product_title_exclusion_rules=[
                {
                    "zh_trigger_terms": ["修身"],
                    "en_trigger_terms": ["fitted"],
                    "zh_title_exclusions": ["宽松"],
                    "en_title_exclusions": ["loose"],
                }
            ],
        ),
        function_score=FunctionScoreConfig(),
        rerank=RerankConfig(),
        spu_config=SPUConfig(enabled=False),
    )
    qp = QueryParser(cfg, translator=_DummyTranslator(), tokenizer=counting_tokenizer)
    monkeypatch.setattr(qp.language_detector, "detect", lambda text: "en")

    parsed = qp.parse(
        "black fitted dress",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.translations == {"zh": "black fitted dress-zh"}
    assert parsed.style_intent_profile is not None
    assert parsed.style_intent_profile.is_active is True
    assert parsed.product_title_exclusion_profile is not None
    assert parsed.product_title_exclusion_profile.is_active is True
    assert seen == []


def test_parse_ascii_latin_query_uses_language_detector(monkeypatch):
    """A pure-Latin query goes through the language detector; a non-supported detected
    language (es) triggers translation into both index languages."""
    qp = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    detect_inputs = []
    monkeypatch.setattr(
        qp.language_detector,
        "detect",
        lambda text: detect_inputs.append(text) or "es",
    )

    parsed = qp.parse(
        "falda negra oficina",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert detect_inputs == ["falda negra oficina"]
    assert parsed.detected_language == "es"
    assert parsed.translations == {"en": "falda negra oficina-en", "zh": "falda negra oficina-zh"}
    assert parsed.query_tokens == ["falda", "negra", "oficina"]