Commit a8261ece89a755210af8523091ce7922dd3c83ca
1 parent: a47416ec
检索效果优化
Showing 6 changed files with 133 additions and 7 deletions
Show diff stats
config/config.yaml
query/query_parser.py
| ... | ... | @@ -135,8 +135,8 @@ class QueryParser: |
| 135 | 135 | cfg = get_translation_config() |
| 136 | 136 | logger.info( |
| 137 | 137 | "Initializing translator client at QueryParser construction (service_url=%s, default_model=%s)...", |
| 138 | - cfg.service_url, | |
| 139 | - cfg.default_model, | |
| 138 | + cfg.get("service_url"), | |
| 139 | + cfg.get("default_model"), | |
| 140 | 140 | ) |
| 141 | 141 | self._translator = create_translation_client() |
| 142 | 142 | self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation") |
| ... | ... | @@ -207,6 +207,55 @@ class QueryParser: |
| 207 | 207 | return [x[0] for x in tok_result] if tok_result else [] |
| 208 | 208 | return self._simple_tokenize(query) |
| 209 | 209 | |
| 210 | + @staticmethod | |
| 211 | + def _contains_cjk(text: str) -> bool: | |
| 212 | + """Whether query contains any CJK ideograph.""" | |
| 213 | + return bool(re.search(r"[\u4e00-\u9fff]", text or "")) | |
| 214 | + | |
| 215 | + @staticmethod | |
| 216 | + def _extract_latin_tokens(text: str) -> List[str]: | |
| 217 | + """Extract latin word tokens from query text.""" | |
| 218 | + return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "") | |
| 219 | + | |
| 220 | + def _infer_supplemental_search_langs( | |
| 221 | + self, | |
| 222 | + query_text: str, | |
| 223 | + detected_lang: str, | |
| 224 | + index_langs: List[str], | |
| 225 | + ) -> List[str]: | |
| 226 | + """ | |
| 227 | + Infer extra languages to search when the query mixes scripts. | |
| 228 | + | |
| 229 | + Rules: | |
| 230 | + - If any Chinese characters appear, include `zh` when available. | |
| 231 | + - If the query contains meaningful latin tokens, include `en` when available. | |
| 232 | + "Meaningful" means either: | |
| 233 | + 1) at least 2 latin tokens with length >= 4, or | |
| 234 | + 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars. | |
| 235 | + """ | |
| 236 | + supplemental: List[str] = [] | |
| 237 | + normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs} | |
| 238 | + normalized_detected = str(detected_lang or "").strip().lower() | |
| 239 | + query_text = str(query_text or "") | |
| 240 | + | |
| 241 | + if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh": | |
| 242 | + supplemental.append("zh") | |
| 243 | + | |
| 244 | + latin_tokens = self._extract_latin_tokens(query_text) | |
| 245 | + significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4] | |
| 246 | + latin_chars = sum(len(tok) for tok in latin_tokens) | |
| 247 | + non_space_chars = len(re.sub(r"\s+", "", query_text)) | |
| 248 | + latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0 | |
| 249 | + has_meaningful_english = ( | |
| 250 | + len(significant_latin_tokens) >= 2 or | |
| 251 | + (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2) | |
| 252 | + ) | |
| 253 | + | |
| 254 | + if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en": | |
| 255 | + supplemental.append("en") | |
| 256 | + | |
| 257 | + return supplemental | |
| 258 | + | |
| 210 | 259 | def parse( |
| 211 | 260 | self, |
| 212 | 261 | query: str, |
| ... | ... | @@ -482,7 +531,17 @@ class QueryParser: |
| 482 | 531 | for lang, translated_text in (translations or {}).items(): |
| 483 | 532 | if translated_text and str(translated_text).strip(): |
| 484 | 533 | query_text_by_lang[str(lang).strip().lower()] = str(translated_text) |
| 485 | - | |
| 534 | + | |
| 535 | + supplemental_search_langs = self._infer_supplemental_search_langs( | |
| 536 | + query_text=query_text, | |
| 537 | + detected_lang=detected_lang, | |
| 538 | + index_langs=index_langs, | |
| 539 | + ) | |
| 540 | + for lang in supplemental_search_langs: | |
| 541 | + if lang not in query_text_by_lang and query_text: | |
| 542 | + # Use the original mixed-script query as a robust fallback probe for that language field set. | |
| 543 | + query_text_by_lang[lang] = query_text | |
| 544 | + | |
| 486 | 545 | source_in_index_languages = detected_lang in index_langs |
| 487 | 546 | ordered_search_langs: List[str] = [] |
| 488 | 547 | seen_order = set() |
| ... | ... | @@ -501,6 +560,7 @@ class QueryParser: |
| 501 | 560 | if context: |
| 502 | 561 | context.store_intermediate_result("search_langs", ordered_search_langs) |
| 503 | 562 | context.store_intermediate_result("query_text_by_lang", query_text_by_lang) |
| 563 | + context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs) | |
| 504 | 564 | |
| 505 | 565 | # Build result |
| 506 | 566 | result = ParsedQuery( | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -58,7 +58,7 @@ class ESQueryBuilder: |
| 58 | 58 | self.match_fields = match_fields |
| 59 | 59 | self.field_boosts = field_boosts or {} |
| 60 | 60 | self.multilingual_fields = multilingual_fields or [ |
| 61 | - "title", "brief", "description", "vendor", "category_path", "category_name_text" | |
| 61 | + "title", "brief", "description", "qanchors", "vendor", "category_path", "category_name_text" | |
| 62 | 62 | ] |
| 63 | 63 | self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"] |
| 64 | 64 | self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"] |
| ... | ... | @@ -276,7 +276,7 @@ class ESQueryBuilder: |
| 276 | 276 | "k": knn_k, |
| 277 | 277 | "num_candidates": knn_num_candidates, |
| 278 | 278 | "boost": knn_boost, |
| 279 | - "name": "knn_query", | |
| 279 | + "_name": "knn_query", | |
| 280 | 280 | } |
| 281 | 281 | # Top-level knn does not inherit query.bool.filter automatically. |
| 282 | 282 | # Apply conjunctive + range filters here so vector recall respects hard filters. | ... | ... |
search/searcher.py
| ... | ... | @@ -670,9 +670,11 @@ class Searcher: |
| 670 | 670 | if rerank_debug: |
| 671 | 671 | debug_entry["doc_id"] = rerank_debug.get("doc_id") |
| 672 | 672 | # 与 rerank_client 中字段保持一致,便于前端直接使用 |
| 673 | - debug_entry["es_score_norm"] = rerank_debug.get("es_score_norm") | |
| 674 | 673 | debug_entry["rerank_score"] = rerank_debug.get("rerank_score") |
| 674 | + debug_entry["text_score"] = rerank_debug.get("text_score") | |
| 675 | + debug_entry["knn_score"] = rerank_debug.get("knn_score") | |
| 675 | 676 | debug_entry["fused_score"] = rerank_debug.get("fused_score") |
| 677 | + debug_entry["matched_queries"] = rerank_debug.get("matched_queries") | |
| 676 | 678 | |
| 677 | 679 | per_result_debug.append(debug_entry) |
| 678 | 680 | |
| ... | ... | @@ -719,6 +721,9 @@ class Searcher: |
| 719 | 721 | "rewritten_query": context.query_analysis.rewritten_query, |
| 720 | 722 | "detected_language": context.query_analysis.detected_language, |
| 721 | 723 | "translations": context.query_analysis.translations, |
| 724 | + "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}), | |
| 725 | + "search_langs": context.get_intermediate_result("search_langs", []), | |
| 726 | + "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []), | |
| 722 | 727 | "has_vector": context.query_analysis.query_vector is not None, |
| 723 | 728 | "is_simple_query": context.query_analysis.is_simple_query, |
| 724 | 729 | "domain": context.query_analysis.domain | ... | ... |
tests/test_es_query_builder.py
| ... | ... | @@ -0,0 +1,60 @@ |
| 1 | +from types import SimpleNamespace | |
| 2 | + | |
| 3 | +from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig | |
| 4 | +from query.query_parser import QueryParser | |
| 5 | + | |
| 6 | + | |
| 7 | +class _DummyTranslator: | |
| 8 | + def translate(self, text, target_lang, source_lang, scene, model_name): | |
| 9 | + return f"{text}-{target_lang}" | |
| 10 | + | |
| 11 | + | |
| 12 | +def _build_config() -> SearchConfig: | |
| 13 | + return SearchConfig( | |
| 14 | + es_index_name="test_products", | |
| 15 | + field_boosts={"title.en": 3.0, "title.zh": 3.0}, | |
| 16 | + indexes=[IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])], | |
| 17 | + query_config=QueryConfig( | |
| 18 | + enable_text_embedding=False, | |
| 19 | + enable_query_rewrite=False, | |
| 20 | + supported_languages=["en", "zh"], | |
| 21 | + default_language="zh", | |
| 22 | + ), | |
| 23 | + function_score=FunctionScoreConfig(), | |
| 24 | + rerank=RerankConfig(), | |
| 25 | + spu_config=SPUConfig(enabled=False), | |
| 26 | + ) | |
| 27 | + | |
| 28 | + | |
| 29 | +def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): | |
| 30 | + parser = QueryParser(_build_config(), translator=_DummyTranslator()) | |
| 31 | + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") | |
| 32 | + monkeypatch.setattr( | |
| 33 | + "query.query_parser.get_tenant_config_loader", | |
| 34 | + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}), | |
| 35 | + raising=False, | |
| 36 | + ) | |
| 37 | + | |
| 38 | + result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) | |
| 39 | + | |
| 40 | + assert result.detected_language == "zh" | |
| 41 | + assert "en" in result.search_langs | |
| 42 | + assert result.query_text_by_lang["en"] == "法式 dress 连衣裙" | |
| 43 | + assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙" | |
| 44 | + | |
| 45 | + | |
| 46 | +def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): | |
| 47 | + parser = QueryParser(_build_config(), translator=_DummyTranslator()) | |
| 48 | + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") | |
| 49 | + monkeypatch.setattr( | |
| 50 | + "query.query_parser.get_tenant_config_loader", | |
| 51 | + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), | |
| 52 | + raising=False, | |
| 53 | + ) | |
| 54 | + | |
| 55 | + result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) | |
| 56 | + | |
| 57 | + assert result.detected_language == "en" | |
| 58 | + assert "zh" in result.search_langs | |
| 59 | + assert result.query_text_by_lang["zh"] == "red 连衣裙" | |
| 60 | + assert result.query_text_by_lang["en"] == "red 连衣裙" | ... | ... |