from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
from query.query_parser import QueryParser


class _DummyTranslator:
    """Stub translator: echoes the input text suffixed with the target language."""

    def translate(self, text, target_lang, source_lang, scene, model_name):
        # Deterministic marker so tests can assert exactly which translation ran.
        return f"{text}-{target_lang}"


def _tokenizer(text):
    """Whitespace-split tokenizer stub used in place of the real one."""
    return str(text).split()


def test_pure_english_word_token_length_and_script():
    """Tokens qualify only when >= 3 chars and purely Latin letters (hyphens OK)."""
    is_en = QueryParser._is_pure_english_word_token
    assert is_en("ab") is False
    assert is_en("abc") is True
    assert is_en("wi-fi") is True
    assert is_en("连衣裙") is False
    assert is_en("ab12") is False


def _build_config() -> SearchConfig:
    """Assemble a minimal bilingual (en/zh) SearchConfig for parser tests."""
    query_cfg = QueryConfig(
        enable_text_embedding=False,
        enable_query_rewrite=False,
        supported_languages=["en", "zh"],
        default_language="zh",
    )
    default_index = IndexConfig(
        name="default",
        label="default",
        fields=["title.en", "title.zh"],
    )
    return SearchConfig(
        es_index_name="test_products",
        field_boosts={"title.en": 3.0, "title.zh": 3.0},
        indexes=[default_index],
        query_config=query_cfg,
        function_score=FunctionScoreConfig(),
        rerank=RerankConfig(),
        spu_config=SPUConfig(enabled=False),
    )


def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch):
    """A zh-detected query with a meaningful English word gets an en translation."""
    qp = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    monkeypatch.setattr(qp.language_detector, "detect", lambda text: "zh")

    parsed = qp.parse(
        "法式 dress 连衣裙",
        tenant_id="162",
        generate_vector=False,
        target_languages=["zh", "en"],
    )

    assert parsed.detected_language == "zh"
    assert parsed.contains_chinese is True
    assert parsed.contains_english is True
    assert parsed.translations == {"en": "法式 dress 连衣裙-en"}
    assert parsed.query_tokens == ["法式", "dress", "连衣裙"]
    # Legacy attributes must not reappear on the parse result.
    assert not hasattr(parsed, "query_text_by_lang")
    assert not hasattr(parsed, "search_langs")


def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
    """An en-detected query that still contains CJK gets a zh translation."""
    qp = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    monkeypatch.setattr(qp.language_detector, "detect", lambda text: "en")

    parsed = qp.parse(
        "red 连衣裙",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.detected_language == "en"
    assert parsed.contains_chinese is True
    assert parsed.contains_english is True
    assert parsed.translations == {"zh": "red 连衣裙-zh"}
    assert parsed.query_tokens == ["red", "连衣裙"]


def test_parse_waits_for_translation_when_source_in_index_languages(monkeypatch):
    """Even when the source language (en) is among the index languages, the parser
    should still wait for and adopt the en->zh translation (sharing the budget
    with vector generation)."""
    qp = QueryParser(_build_config(), translator=_DummyTranslator(), tokenizer=_tokenizer)
    monkeypatch.setattr(qp.language_detector, "detect", lambda text: "en")

    parsed = qp.parse(
        "off shoulder top",
        tenant_id="0",
        generate_vector=False,
        target_languages=["en", "zh"],
    )

    assert parsed.detected_language == "en"
    assert parsed.contains_chinese is False
    assert parsed.contains_english is True
    assert parsed.translations.get("zh") == "off shoulder top-zh"
    # Internal flag must not leak onto the public parse result.
    assert not hasattr(parsed, "source_in_index_languages")