Commit a8261ece89a755210af8523091ce7922dd3c83ca

Authored by tangwang
1 parent a47416ec

检索效果优化

config/config.yaml
... ... @@ -18,6 +18,7 @@ field_boosts:
18 18 title: 3.0
19 19 brief: 1.5
20 20 description: 1.0
  21 + qanchors: 1.5
21 22 vendor: 1.5
22 23 category_path: 1.5
23 24 category_name_text: 1.5
... ...
query/query_parser.py
... ... @@ -135,8 +135,8 @@ class QueryParser:
135 135 cfg = get_translation_config()
136 136 logger.info(
137 137 "Initializing translator client at QueryParser construction (service_url=%s, default_model=%s)...",
138   - cfg.service_url,
139   - cfg.default_model,
  138 + cfg.get("service_url"),
  139 + cfg.get("default_model"),
140 140 )
141 141 self._translator = create_translation_client()
142 142 self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation")
... ... @@ -207,6 +207,55 @@ class QueryParser:
207 207 return [x[0] for x in tok_result] if tok_result else []
208 208 return self._simple_tokenize(query)
209 209  
  210 + @staticmethod
  211 + def _contains_cjk(text: str) -> bool:
  212 + """Whether query contains any character in the CJK Unified Ideographs block (U+4E00–U+9FFF)."""
  213 + return bool(re.search(r"[\u4e00-\u9fff]", text or ""))
  214 +
  215 + @staticmethod
  216 + def _extract_latin_tokens(text: str) -> List[str]:
  217 + """Extract Latin word tokens (including hyphenated compounds) from query text."""
  218 + return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "")
  219 +
  220 + def _infer_supplemental_search_langs(
  221 + self,
  222 + query_text: str,
  223 + detected_lang: str,
  224 + index_langs: List[str],
  225 + ) -> List[str]:
  226 + """
  227 + Infer extra languages to search when the query mixes scripts.
  228 +
  229 + Rules:
  230 + - If any Chinese characters appear, include `zh` when available.
  231 + - If the query contains meaningful latin tokens, include `en` when available.
  232 + "Meaningful" means either:
  233 + 1) at least 2 latin tokens with length >= 4, or
  234 + 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars.
  235 + """
  236 + supplemental: List[str] = []
  237 + normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs}
  238 + normalized_detected = str(detected_lang or "").strip().lower()
  239 + query_text = str(query_text or "")
  240 +
  241 + if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh":
  242 + supplemental.append("zh")
  243 +
  244 + latin_tokens = self._extract_latin_tokens(query_text)
  245 + significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4]
  246 + latin_chars = sum(len(tok) for tok in latin_tokens)
  247 + non_space_chars = len(re.sub(r"\s+", "", query_text))
  248 + latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0
  249 + has_meaningful_english = (
  250 + len(significant_latin_tokens) >= 2 or
  251 + (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2)
  252 + )
  253 +
  254 + if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en":
  255 + supplemental.append("en")
  256 +
  257 + return supplemental
  258 +
210 259 def parse(
211 260 self,
212 261 query: str,
... ... @@ -482,7 +531,17 @@ class QueryParser:
482 531 for lang, translated_text in (translations or {}).items():
483 532 if translated_text and str(translated_text).strip():
484 533 query_text_by_lang[str(lang).strip().lower()] = str(translated_text)
485   -
  534 +
  535 + supplemental_search_langs = self._infer_supplemental_search_langs(
  536 + query_text=query_text,
  537 + detected_lang=detected_lang,
  538 + index_langs=index_langs,
  539 + )
  540 + for lang in supplemental_search_langs:
  541 + if lang not in query_text_by_lang and query_text:
  542 + # Use the original mixed-script query as a robust fallback probe for that language field set.
  543 + query_text_by_lang[lang] = query_text
  544 +
486 545 source_in_index_languages = detected_lang in index_langs
487 546 ordered_search_langs: List[str] = []
488 547 seen_order = set()
... ... @@ -501,6 +560,7 @@ class QueryParser:
501 560 if context:
502 561 context.store_intermediate_result("search_langs", ordered_search_langs)
503 562 context.store_intermediate_result("query_text_by_lang", query_text_by_lang)
  563 + context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs)
504 564  
505 565 # Build result
506 566 result = ParsedQuery(
... ...
search/es_query_builder.py
... ... @@ -58,7 +58,7 @@ class ESQueryBuilder:
58 58 self.match_fields = match_fields
59 59 self.field_boosts = field_boosts or {}
60 60 self.multilingual_fields = multilingual_fields or [
61   - "title", "brief", "description", "vendor", "category_path", "category_name_text"
  61 + "title", "brief", "description", "qanchors", "vendor", "category_path", "category_name_text"
62 62 ]
63 63 self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"]
64 64 self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"]
... ... @@ -276,7 +276,7 @@ class ESQueryBuilder:
276 276 "k": knn_k,
277 277 "num_candidates": knn_num_candidates,
278 278 "boost": knn_boost,
279   - "name": "knn_query",
  279 + "_name": "knn_query",
280 280 }
281 281 # Top-level knn does not inherit query.bool.filter automatically.
282 282 # Apply conjunctive + range filters here so vector recall respects hard filters.
... ...
search/searcher.py
... ... @@ -670,9 +670,11 @@ class Searcher:
670 670 if rerank_debug:
671 671 debug_entry["doc_id"] = rerank_debug.get("doc_id")
672 672 # 与 rerank_client 中字段保持一致,便于前端直接使用
673   - debug_entry["es_score_norm"] = rerank_debug.get("es_score_norm")
674 673 debug_entry["rerank_score"] = rerank_debug.get("rerank_score")
  674 + debug_entry["text_score"] = rerank_debug.get("text_score")
  675 + debug_entry["knn_score"] = rerank_debug.get("knn_score")
675 676 debug_entry["fused_score"] = rerank_debug.get("fused_score")
  677 + debug_entry["matched_queries"] = rerank_debug.get("matched_queries")
676 678  
677 679 per_result_debug.append(debug_entry)
678 680  
... ... @@ -719,6 +721,9 @@ class Searcher:
719 721 "rewritten_query": context.query_analysis.rewritten_query,
720 722 "detected_language": context.query_analysis.detected_language,
721 723 "translations": context.query_analysis.translations,
  724 + "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}),
  725 + "search_langs": context.get_intermediate_result("search_langs", []),
  726 + "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []),
722 727 "has_vector": context.query_analysis.query_vector is not None,
723 728 "is_simple_query": context.query_analysis.is_simple_query,
724 729 "domain": context.query_analysis.domain
... ...
tests/test_es_query_builder.py
... ... @@ -62,4 +62,4 @@ def test_knn_prefilter_not_added_without_filters():
62 62  
63 63 assert "knn" in q
64 64 assert "filter" not in q["knn"]
65   - assert q["knn"]["name"] == "knn_query"
  65 + assert q["knn"]["_name"] == "knn_query"
... ...
tests/test_query_parser_mixed_language.py 0 → 100644
... ... @@ -0,0 +1,60 @@
  1 +from types import SimpleNamespace
  2 +
  3 +from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig
  4 +from query.query_parser import QueryParser
  5 +
  6 +
  7 +class _DummyTranslator:
  8 + def translate(self, text, target_lang, source_lang, scene, model_name):
  9 + return f"{text}-{target_lang}"
  10 +
  11 +
  12 +def _build_config() -> SearchConfig:
  13 + return SearchConfig(
  14 + es_index_name="test_products",
  15 + field_boosts={"title.en": 3.0, "title.zh": 3.0},
  16 + indexes=[IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])],
  17 + query_config=QueryConfig(
  18 + enable_text_embedding=False,
  19 + enable_query_rewrite=False,
  20 + supported_languages=["en", "zh"],
  21 + default_language="zh",
  22 + ),
  23 + function_score=FunctionScoreConfig(),
  24 + rerank=RerankConfig(),
  25 + spu_config=SPUConfig(enabled=False),
  26 + )
  27 +
  28 +
  29 +def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch):
  30 + parser = QueryParser(_build_config(), translator=_DummyTranslator())
  31 + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh")
  32 + monkeypatch.setattr(
  33 + "query.query_parser.get_tenant_config_loader",
  34 + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}),
  35 + raising=False,
  36 + )
  37 +
  38 + result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False)
  39 +
  40 + assert result.detected_language == "zh"
  41 + assert "en" in result.search_langs
  42 + assert result.query_text_by_lang["en"] == "法式 dress 连衣裙"
  43 + assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙"
  44 +
  45 +
  46 +def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch):
  47 + parser = QueryParser(_build_config(), translator=_DummyTranslator())
  48 + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en")
  49 + monkeypatch.setattr(
  50 + "query.query_parser.get_tenant_config_loader",
  51 + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}),
  52 + raising=False,
  53 + )
  54 +
  55 + result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False)
  56 +
  57 + assert result.detected_language == "en"
  58 + assert "zh" in result.search_langs
  59 + assert result.query_text_by_lang["zh"] == "red 连衣裙"
  60 + assert result.query_text_by_lang["en"] == "red 连衣裙"
... ...