From a8261ece89a755210af8523091ce7922dd3c83ca Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 18 Mar 2026 10:55:57 +0800 Subject: [PATCH] 检索效果优化 --- config/config.yaml | 1 + query/query_parser.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- search/es_query_builder.py | 4 ++-- search/searcher.py | 7 ++++++- tests/test_es_query_builder.py | 2 +- tests/test_query_parser_mixed_language.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 133 insertions(+), 7 deletions(-) create mode 100644 tests/test_query_parser_mixed_language.py diff --git a/config/config.yaml b/config/config.yaml index b54acb4..54114d8 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -18,6 +18,7 @@ field_boosts: title: 3.0 brief: 1.5 description: 1.0 + qanchors: 1.5 vendor: 1.5 category_path: 1.5 category_name_text: 1.5 diff --git a/query/query_parser.py b/query/query_parser.py index b3aa3d0..cc73f9f 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -135,8 +135,8 @@ class QueryParser: cfg = get_translation_config() logger.info( "Initializing translator client at QueryParser construction (service_url=%s, default_model=%s)...", - cfg.service_url, - cfg.default_model, + cfg.get("service_url"), + cfg.get("default_model"), ) self._translator = create_translation_client() self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation") @@ -207,6 +207,55 @@ class QueryParser: return [x[0] for x in tok_result] if tok_result else [] return self._simple_tokenize(query) + @staticmethod + def _contains_cjk(text: str) -> bool: + """Whether query contains any CJK ideograph.""" + return bool(re.search(r"[\u4e00-\u9fff]", text or "")) + + @staticmethod + def _extract_latin_tokens(text: str) -> List[str]: + """Extract latin word tokens from query text.""" + return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "") + + def _infer_supplemental_search_langs( + self, + query_text: str, + detected_lang: str, + index_langs: List[str], + ) -> List[str]: + """ + Infer extra languages to search when the query mixes scripts. + + Rules: + - If any Chinese characters appear, include `zh` when available. + - If the query contains meaningful latin tokens, include `en` when available. + "Meaningful" means either: + 1) at least 2 latin tokens with length >= 4, or + 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars. + """ + supplemental: List[str] = [] + normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs} + normalized_detected = str(detected_lang or "").strip().lower() + query_text = str(query_text or "") + + if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh": + supplemental.append("zh") + + latin_tokens = self._extract_latin_tokens(query_text) + significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4] + latin_chars = sum(len(tok) for tok in latin_tokens) + non_space_chars = len(re.sub(r"\s+", "", query_text)) + latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0 + has_meaningful_english = ( + len(significant_latin_tokens) >= 2 or + (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2) + ) + + if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en": + supplemental.append("en") + + return supplemental + def parse( self, query: str, @@ -482,7 +531,17 @@ class QueryParser: for lang, translated_text in (translations or {}).items(): if translated_text and str(translated_text).strip(): query_text_by_lang[str(lang).strip().lower()] = str(translated_text) - + + supplemental_search_langs = self._infer_supplemental_search_langs( + query_text=query_text, + detected_lang=detected_lang, + index_langs=index_langs, + ) + for lang in supplemental_search_langs: + if lang not in query_text_by_lang and query_text: + # Use the original mixed-script query as a robust fallback probe for that language field set. + query_text_by_lang[lang] = query_text + source_in_index_languages = detected_lang in index_langs ordered_search_langs: List[str] = [] seen_order = set() @@ -501,6 +560,7 @@ class QueryParser: if context: context.store_intermediate_result("search_langs", ordered_search_langs) context.store_intermediate_result("query_text_by_lang", query_text_by_lang) + context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs) # Build result result = ParsedQuery( diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 854db86..12393bc 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -58,7 +58,7 @@ class ESQueryBuilder: self.match_fields = match_fields self.field_boosts = field_boosts or {} self.multilingual_fields = multilingual_fields or [ - "title", "brief", "description", "vendor", "category_path", "category_name_text" + "title", "brief", "description", "qanchors", "vendor", "category_path", "category_name_text" ] self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"] self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"] @@ -276,7 +276,7 @@ class ESQueryBuilder: "k": knn_k, "num_candidates": knn_num_candidates, "boost": knn_boost, - "name": "knn_query", + "_name": "knn_query", } # Top-level knn does not inherit query.bool.filter automatically. # Apply conjunctive + range filters here so vector recall respects hard filters. diff --git a/search/searcher.py b/search/searcher.py index ddd2910..2b82a4a 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -670,9 +670,11 @@ class Searcher: if rerank_debug: debug_entry["doc_id"] = rerank_debug.get("doc_id") # 与 rerank_client 中字段保持一致,便于前端直接使用 - debug_entry["es_score_norm"] = rerank_debug.get("es_score_norm") debug_entry["rerank_score"] = rerank_debug.get("rerank_score") + debug_entry["text_score"] = rerank_debug.get("text_score") + debug_entry["knn_score"] = rerank_debug.get("knn_score") debug_entry["fused_score"] = rerank_debug.get("fused_score") + debug_entry["matched_queries"] = rerank_debug.get("matched_queries") per_result_debug.append(debug_entry) @@ -719,6 +721,9 @@ class Searcher: "rewritten_query": context.query_analysis.rewritten_query, "detected_language": context.query_analysis.detected_language, "translations": context.query_analysis.translations, + "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}), + "search_langs": context.get_intermediate_result("search_langs", []), + "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []), "has_vector": context.query_analysis.query_vector is not None, "is_simple_query": context.query_analysis.is_simple_query, "domain": context.query_analysis.domain diff --git a/tests/test_es_query_builder.py b/tests/test_es_query_builder.py index 5cac1a6..6fe9072 100644 --- a/tests/test_es_query_builder.py +++ b/tests/test_es_query_builder.py @@ -62,4 +62,4 @@ def test_knn_prefilter_not_added_without_filters(): assert "knn" in q assert "filter" not in q["knn"] - assert q["knn"]["name"] == "knn_query" + assert q["knn"]["_name"] == "knn_query" diff --git a/tests/test_query_parser_mixed_language.py b/tests/test_query_parser_mixed_language.py new file mode 100644 index 0000000..a8b6ab0 --- /dev/null +++ b/tests/test_query_parser_mixed_language.py @@ -0,0 +1,60 @@ +from types import SimpleNamespace + +from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig +from query.query_parser import QueryParser + + +class _DummyTranslator: + def translate(self, text, target_lang, source_lang, scene, model_name): + return f"{text}-{target_lang}" + + +def _build_config() -> SearchConfig: + return SearchConfig( + es_index_name="test_products", + field_boosts={"title.en": 3.0, "title.zh": 3.0}, + indexes=[IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])], + query_config=QueryConfig( + enable_text_embedding=False, + enable_query_rewrite=False, + supported_languages=["en", "zh"], + default_language="zh", + ), + function_score=FunctionScoreConfig(), + rerank=RerankConfig(), + spu_config=SPUConfig(enabled=False), + ) + + +def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): + parser = QueryParser(_build_config(), translator=_DummyTranslator()) + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") + monkeypatch.setattr( + "query.query_parser.get_tenant_config_loader", + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}), + raising=False, + ) + + result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) + + assert result.detected_language == "zh" + assert "en" in result.search_langs + assert result.query_text_by_lang["en"] == "法式 dress 连衣裙" + assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙" + + +def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): + parser = QueryParser(_build_config(), translator=_DummyTranslator()) + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") + monkeypatch.setattr( + "query.query_parser.get_tenant_config_loader", + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), + raising=False, + ) + + result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) + + assert result.detected_language == "en" + assert "zh" in result.search_langs + assert result.query_text_by_lang["zh"] == "red 连衣裙" + assert result.query_text_by_lang["en"] == "red 连衣裙" -- libgit2 0.21.2