Commit a8261ece89a755210af8523091ce7922dd3c83ca

Authored by tangwang
1 parent a47416ec

检索效果优化

config/config.yaml
@@ -18,6 +18,7 @@ field_boosts:
18 title: 3.0 18 title: 3.0
19 brief: 1.5 19 brief: 1.5
20 description: 1.0 20 description: 1.0
  21 + qanchors: 1.5
21 vendor: 1.5 22 vendor: 1.5
22 category_path: 1.5 23 category_path: 1.5
23 category_name_text: 1.5 24 category_name_text: 1.5
query/query_parser.py
@@ -135,8 +135,8 @@ class QueryParser:
135 cfg = get_translation_config() 135 cfg = get_translation_config()
136 logger.info( 136 logger.info(
137 "Initializing translator client at QueryParser construction (service_url=%s, default_model=%s)...", 137 "Initializing translator client at QueryParser construction (service_url=%s, default_model=%s)...",
138 - cfg.service_url,  
139 - cfg.default_model, 138 + cfg.get("service_url"),
  139 + cfg.get("default_model"),
140 ) 140 )
141 self._translator = create_translation_client() 141 self._translator = create_translation_client()
142 self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation") 142 self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation")
@@ -207,6 +207,55 @@ class QueryParser:
207 return [x[0] for x in tok_result] if tok_result else [] 207 return [x[0] for x in tok_result] if tok_result else []
208 return self._simple_tokenize(query) 208 return self._simple_tokenize(query)
209 209
  210 + @staticmethod
  211 + def _contains_cjk(text: str) -> bool:
  212 + """Whether query contains any CJK ideograph."""
  213 + return bool(re.search(r"[\u4e00-\u9fff]", text or ""))
  214 +
  215 + @staticmethod
  216 + def _extract_latin_tokens(text: str) -> List[str]:
  217 + """Extract latin word tokens from query text."""
  218 + return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "")
  219 +
  220 + def _infer_supplemental_search_langs(
  221 + self,
  222 + query_text: str,
  223 + detected_lang: str,
  224 + index_langs: List[str],
  225 + ) -> List[str]:
  226 + """
  227 + Infer extra languages to search when the query mixes scripts.
  228 +
  229 + Rules:
  230 + - If any Chinese characters appear, include `zh` when available.
  231 + - If the query contains meaningful latin tokens, include `en` when available.
  232 + "Meaningful" means either:
  233 + 1) at least 2 latin tokens with length >= 4, or
  234 + 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars.
  235 + """
  236 + supplemental: List[str] = []
  237 + normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs}
  238 + normalized_detected = str(detected_lang or "").strip().lower()
  239 + query_text = str(query_text or "")
  240 +
  241 + if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh":
  242 + supplemental.append("zh")
  243 +
  244 + latin_tokens = self._extract_latin_tokens(query_text)
  245 + significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4]
  246 + latin_chars = sum(len(tok) for tok in latin_tokens)
  247 + non_space_chars = len(re.sub(r"\s+", "", query_text))
  248 + latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0
  249 + has_meaningful_english = (
  250 + len(significant_latin_tokens) >= 2 or
  251 + (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2)
  252 + )
  253 +
  254 + if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en":
  255 + supplemental.append("en")
  256 +
  257 + return supplemental
  258 +
210 def parse( 259 def parse(
211 self, 260 self,
212 query: str, 261 query: str,
@@ -482,7 +531,17 @@ class QueryParser:
482 for lang, translated_text in (translations or {}).items(): 531 for lang, translated_text in (translations or {}).items():
483 if translated_text and str(translated_text).strip(): 532 if translated_text and str(translated_text).strip():
484 query_text_by_lang[str(lang).strip().lower()] = str(translated_text) 533 query_text_by_lang[str(lang).strip().lower()] = str(translated_text)
485 - 534 +
  535 + supplemental_search_langs = self._infer_supplemental_search_langs(
  536 + query_text=query_text,
  537 + detected_lang=detected_lang,
  538 + index_langs=index_langs,
  539 + )
  540 + for lang in supplemental_search_langs:
  541 + if lang not in query_text_by_lang and query_text:
  542 + # Use the original mixed-script query as a robust fallback probe for that language field set.
  543 + query_text_by_lang[lang] = query_text
  544 +
486 source_in_index_languages = detected_lang in index_langs 545 source_in_index_languages = detected_lang in index_langs
487 ordered_search_langs: List[str] = [] 546 ordered_search_langs: List[str] = []
488 seen_order = set() 547 seen_order = set()
@@ -501,6 +560,7 @@ class QueryParser:
501 if context: 560 if context:
502 context.store_intermediate_result("search_langs", ordered_search_langs) 561 context.store_intermediate_result("search_langs", ordered_search_langs)
503 context.store_intermediate_result("query_text_by_lang", query_text_by_lang) 562 context.store_intermediate_result("query_text_by_lang", query_text_by_lang)
  563 + context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs)
504 564
505 # Build result 565 # Build result
506 result = ParsedQuery( 566 result = ParsedQuery(
search/es_query_builder.py
@@ -58,7 +58,7 @@ class ESQueryBuilder:
58 self.match_fields = match_fields 58 self.match_fields = match_fields
59 self.field_boosts = field_boosts or {} 59 self.field_boosts = field_boosts or {}
60 self.multilingual_fields = multilingual_fields or [ 60 self.multilingual_fields = multilingual_fields or [
61 - "title", "brief", "description", "vendor", "category_path", "category_name_text" 61 + "title", "brief", "description", "qanchors", "vendor", "category_path", "category_name_text"
62 ] 62 ]
63 self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"] 63 self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"]
64 self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"] 64 self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"]
@@ -276,7 +276,7 @@ class ESQueryBuilder:
276 "k": knn_k, 276 "k": knn_k,
277 "num_candidates": knn_num_candidates, 277 "num_candidates": knn_num_candidates,
278 "boost": knn_boost, 278 "boost": knn_boost,
279 - "name": "knn_query", 279 + "_name": "knn_query",
280 } 280 }
281 # Top-level knn does not inherit query.bool.filter automatically. 281 # Top-level knn does not inherit query.bool.filter automatically.
282 # Apply conjunctive + range filters here so vector recall respects hard filters. 282 # Apply conjunctive + range filters here so vector recall respects hard filters.
search/searcher.py
@@ -670,9 +670,11 @@ class Searcher:
670 if rerank_debug: 670 if rerank_debug:
671 debug_entry["doc_id"] = rerank_debug.get("doc_id") 671 debug_entry["doc_id"] = rerank_debug.get("doc_id")
672 # 与 rerank_client 中字段保持一致,便于前端直接使用 672 # 与 rerank_client 中字段保持一致,便于前端直接使用
673 - debug_entry["es_score_norm"] = rerank_debug.get("es_score_norm")  
674 debug_entry["rerank_score"] = rerank_debug.get("rerank_score") 673 debug_entry["rerank_score"] = rerank_debug.get("rerank_score")
  674 + debug_entry["text_score"] = rerank_debug.get("text_score")
  675 + debug_entry["knn_score"] = rerank_debug.get("knn_score")
675 debug_entry["fused_score"] = rerank_debug.get("fused_score") 676 debug_entry["fused_score"] = rerank_debug.get("fused_score")
  677 + debug_entry["matched_queries"] = rerank_debug.get("matched_queries")
676 678
677 per_result_debug.append(debug_entry) 679 per_result_debug.append(debug_entry)
678 680
@@ -719,6 +721,9 @@ class Searcher:
719 "rewritten_query": context.query_analysis.rewritten_query, 721 "rewritten_query": context.query_analysis.rewritten_query,
720 "detected_language": context.query_analysis.detected_language, 722 "detected_language": context.query_analysis.detected_language,
721 "translations": context.query_analysis.translations, 723 "translations": context.query_analysis.translations,
  724 + "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}),
  725 + "search_langs": context.get_intermediate_result("search_langs", []),
  726 + "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []),
722 "has_vector": context.query_analysis.query_vector is not None, 727 "has_vector": context.query_analysis.query_vector is not None,
723 "is_simple_query": context.query_analysis.is_simple_query, 728 "is_simple_query": context.query_analysis.is_simple_query,
724 "domain": context.query_analysis.domain 729 "domain": context.query_analysis.domain
tests/test_es_query_builder.py
@@ -62,4 +62,4 @@ def test_knn_prefilter_not_added_without_filters():
62 62
63 assert "knn" in q 63 assert "knn" in q
64 assert "filter" not in q["knn"] 64 assert "filter" not in q["knn"]
65 - assert q["knn"]["name"] == "knn_query" 65 + assert q["knn"]["_name"] == "knn_query"
tests/test_query_parser_mixed_language.py 0 → 100644
@@ -0,0 +1,60 @@
  1 +from types import SimpleNamespace
  2 +
  3 +from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
  4 +from query.query_parser import QueryParser
  5 +
  6 +
  7 +class _DummyTranslator:
  8 + def translate(self, text, target_lang, source_lang, scene, model_name):
  9 + return f"{text}-{target_lang}"
  10 +
  11 +
  12 +def _build_config() -> SearchConfig:
  13 + return SearchConfig(
  14 + es_index_name="test_products",
  15 + field_boosts={"title.en": 3.0, "title.zh": 3.0},
  16 + indexes=[IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])],
  17 + query_config=QueryConfig(
  18 + enable_text_embedding=False,
  19 + enable_query_rewrite=False,
  20 + supported_languages=["en", "zh"],
  21 + default_language="zh",
  22 + ),
  23 + function_score=FunctionScoreConfig(),
  24 + rerank=RerankConfig(),
  25 + spu_config=SPUConfig(enabled=False),
  26 + )
  27 +
  28 +
  29 +def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch):
  30 + parser = QueryParser(_build_config(), translator=_DummyTranslator())
  31 + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh")
  32 + monkeypatch.setattr(
  33 + "query.query_parser.get_tenant_config_loader",
  34 + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}),
  35 + raising=False,
  36 + )
  37 +
  38 + result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False)
  39 +
  40 + assert result.detected_language == "zh"
  41 + assert "en" in result.search_langs
  42 + assert result.query_text_by_lang["en"] == "法式 dress 连衣裙"
  43 + assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙"
  44 +
  45 +
  46 +def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
  47 + parser = QueryParser(_build_config(), translator=_DummyTranslator())
  48 + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
  49 + monkeypatch.setattr(
  50 + "query.query_parser.get_tenant_config_loader",
  51 + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),
  52 + raising=False,
  53 + )
  54 +
  55 + result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False)
  56 +
  57 + assert result.detected_language == "en"
  58 + assert "zh" in result.search_langs
  59 + assert result.query_text_by_lang["zh"] == "red 连衣裙"
  60 + assert result.query_text_by_lang["en"] == "red 连衣裙"