Commit a8261ece89a755210af8523091ce7922dd3c83ca
1 parent
a47416ec
检索效果优化 (Retrieval quality optimization)
Showing 6 changed files with 133 additions and 7 deletions.
Show diff stats
config/config.yaml
| @@ -18,6 +18,7 @@ field_boosts: | @@ -18,6 +18,7 @@ field_boosts: | ||
| 18 | title: 3.0 | 18 | title: 3.0 |
| 19 | brief: 1.5 | 19 | brief: 1.5 |
| 20 | description: 1.0 | 20 | description: 1.0 |
| 21 | + qanchors: 1.5 | ||
| 21 | vendor: 1.5 | 22 | vendor: 1.5 |
| 22 | category_path: 1.5 | 23 | category_path: 1.5 |
| 23 | category_name_text: 1.5 | 24 | category_name_text: 1.5 |
query/query_parser.py
| @@ -135,8 +135,8 @@ class QueryParser: | @@ -135,8 +135,8 @@ class QueryParser: | ||
| 135 | cfg = get_translation_config() | 135 | cfg = get_translation_config() |
| 136 | logger.info( | 136 | logger.info( |
| 137 | "Initializing translator client at QueryParser construction (service_url=%s, default_model=%s)...", | 137 | "Initializing translator client at QueryParser construction (service_url=%s, default_model=%s)...", |
| 138 | - cfg.service_url, | ||
| 139 | - cfg.default_model, | 138 | + cfg.get("service_url"), |
| 139 | + cfg.get("default_model"), | ||
| 140 | ) | 140 | ) |
| 141 | self._translator = create_translation_client() | 141 | self._translator = create_translation_client() |
| 142 | self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation") | 142 | self._translation_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="query-translation") |
| @@ -207,6 +207,55 @@ class QueryParser: | @@ -207,6 +207,55 @@ class QueryParser: | ||
| 207 | return [x[0] for x in tok_result] if tok_result else [] | 207 | return [x[0] for x in tok_result] if tok_result else [] |
| 208 | return self._simple_tokenize(query) | 208 | return self._simple_tokenize(query) |
| 209 | 209 | ||
| 210 | + @staticmethod | ||
| 211 | + def _contains_cjk(text: str) -> bool: | ||
| 212 | + """Whether query contains any CJK ideograph.""" | ||
| 213 | + return bool(re.search(r"[\u4e00-\u9fff]", text or "")) | ||
| 214 | + | ||
| 215 | + @staticmethod | ||
| 216 | + def _extract_latin_tokens(text: str) -> List[str]: | ||
| 217 | + """Extract latin word tokens from query text.""" | ||
| 218 | + return re.findall(r"[A-Za-z]+(?:-[A-Za-z]+)*", text or "") | ||
| 219 | + | ||
| 220 | + def _infer_supplemental_search_langs( | ||
| 221 | + self, | ||
| 222 | + query_text: str, | ||
| 223 | + detected_lang: str, | ||
| 224 | + index_langs: List[str], | ||
| 225 | + ) -> List[str]: | ||
| 226 | + """ | ||
| 227 | + Infer extra languages to search when the query mixes scripts. | ||
| 228 | + | ||
| 229 | + Rules: | ||
| 230 | + - If any Chinese characters appear, include `zh` when available. | ||
| 231 | + - If the query contains meaningful latin tokens, include `en` when available. | ||
| 232 | + "Meaningful" means either: | ||
| 233 | + 1) at least 2 latin tokens with length >= 4, or | ||
| 234 | + 2) at least 1 latin token with length >= 4 and latin chars occupy >= 20% of non-space chars. | ||
| 235 | + """ | ||
| 236 | + supplemental: List[str] = [] | ||
| 237 | + normalized_index_langs = {str(lang or "").strip().lower() for lang in index_langs} | ||
| 238 | + normalized_detected = str(detected_lang or "").strip().lower() | ||
| 239 | + query_text = str(query_text or "") | ||
| 240 | + | ||
| 241 | + if "zh" in normalized_index_langs and self._contains_cjk(query_text) and normalized_detected != "zh": | ||
| 242 | + supplemental.append("zh") | ||
| 243 | + | ||
| 244 | + latin_tokens = self._extract_latin_tokens(query_text) | ||
| 245 | + significant_latin_tokens = [tok for tok in latin_tokens if len(tok) >= 4] | ||
| 246 | + latin_chars = sum(len(tok) for tok in latin_tokens) | ||
| 247 | + non_space_chars = len(re.sub(r"\s+", "", query_text)) | ||
| 248 | + latin_ratio = (latin_chars / non_space_chars) if non_space_chars > 0 else 0.0 | ||
| 249 | + has_meaningful_english = ( | ||
| 250 | + len(significant_latin_tokens) >= 2 or | ||
| 251 | + (len(significant_latin_tokens) >= 1 and latin_ratio >= 0.2) | ||
| 252 | + ) | ||
| 253 | + | ||
| 254 | + if "en" in normalized_index_langs and has_meaningful_english and normalized_detected != "en": | ||
| 255 | + supplemental.append("en") | ||
| 256 | + | ||
| 257 | + return supplemental | ||
| 258 | + | ||
| 210 | def parse( | 259 | def parse( |
| 211 | self, | 260 | self, |
| 212 | query: str, | 261 | query: str, |
| @@ -482,7 +531,17 @@ class QueryParser: | @@ -482,7 +531,17 @@ class QueryParser: | ||
| 482 | for lang, translated_text in (translations or {}).items(): | 531 | for lang, translated_text in (translations or {}).items(): |
| 483 | if translated_text and str(translated_text).strip(): | 532 | if translated_text and str(translated_text).strip(): |
| 484 | query_text_by_lang[str(lang).strip().lower()] = str(translated_text) | 533 | query_text_by_lang[str(lang).strip().lower()] = str(translated_text) |
| 485 | - | 534 | + |
| 535 | + supplemental_search_langs = self._infer_supplemental_search_langs( | ||
| 536 | + query_text=query_text, | ||
| 537 | + detected_lang=detected_lang, | ||
| 538 | + index_langs=index_langs, | ||
| 539 | + ) | ||
| 540 | + for lang in supplemental_search_langs: | ||
| 541 | + if lang not in query_text_by_lang and query_text: | ||
| 542 | + # Use the original mixed-script query as a robust fallback probe for that language field set. | ||
| 543 | + query_text_by_lang[lang] = query_text | ||
| 544 | + | ||
| 486 | source_in_index_languages = detected_lang in index_langs | 545 | source_in_index_languages = detected_lang in index_langs |
| 487 | ordered_search_langs: List[str] = [] | 546 | ordered_search_langs: List[str] = [] |
| 488 | seen_order = set() | 547 | seen_order = set() |
| @@ -501,6 +560,7 @@ class QueryParser: | @@ -501,6 +560,7 @@ class QueryParser: | ||
| 501 | if context: | 560 | if context: |
| 502 | context.store_intermediate_result("search_langs", ordered_search_langs) | 561 | context.store_intermediate_result("search_langs", ordered_search_langs) |
| 503 | context.store_intermediate_result("query_text_by_lang", query_text_by_lang) | 562 | context.store_intermediate_result("query_text_by_lang", query_text_by_lang) |
| 563 | + context.store_intermediate_result("supplemental_search_langs", supplemental_search_langs) | ||
| 504 | 564 | ||
| 505 | # Build result | 565 | # Build result |
| 506 | result = ParsedQuery( | 566 | result = ParsedQuery( |
search/es_query_builder.py
| @@ -58,7 +58,7 @@ class ESQueryBuilder: | @@ -58,7 +58,7 @@ class ESQueryBuilder: | ||
| 58 | self.match_fields = match_fields | 58 | self.match_fields = match_fields |
| 59 | self.field_boosts = field_boosts or {} | 59 | self.field_boosts = field_boosts or {} |
| 60 | self.multilingual_fields = multilingual_fields or [ | 60 | self.multilingual_fields = multilingual_fields or [ |
| 61 | - "title", "brief", "description", "vendor", "category_path", "category_name_text" | 61 | + "title", "brief", "description", "qanchors", "vendor", "category_path", "category_name_text" |
| 62 | ] | 62 | ] |
| 63 | self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"] | 63 | self.shared_fields = shared_fields or ["tags", "option1_values", "option2_values", "option3_values"] |
| 64 | self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"] | 64 | self.core_multilingual_fields = core_multilingual_fields or ["title", "brief", "vendor", "category_name_text"] |
| @@ -276,7 +276,7 @@ class ESQueryBuilder: | @@ -276,7 +276,7 @@ class ESQueryBuilder: | ||
| 276 | "k": knn_k, | 276 | "k": knn_k, |
| 277 | "num_candidates": knn_num_candidates, | 277 | "num_candidates": knn_num_candidates, |
| 278 | "boost": knn_boost, | 278 | "boost": knn_boost, |
| 279 | - "name": "knn_query", | 279 | + "_name": "knn_query", |
| 280 | } | 280 | } |
| 281 | # Top-level knn does not inherit query.bool.filter automatically. | 281 | # Top-level knn does not inherit query.bool.filter automatically. |
| 282 | # Apply conjunctive + range filters here so vector recall respects hard filters. | 282 | # Apply conjunctive + range filters here so vector recall respects hard filters. |
search/searcher.py
| @@ -670,9 +670,11 @@ class Searcher: | @@ -670,9 +670,11 @@ class Searcher: | ||
| 670 | if rerank_debug: | 670 | if rerank_debug: |
| 671 | debug_entry["doc_id"] = rerank_debug.get("doc_id") | 671 | debug_entry["doc_id"] = rerank_debug.get("doc_id") |
| 672 | # 与 rerank_client 中字段保持一致,便于前端直接使用 | 672 | # 与 rerank_client 中字段保持一致,便于前端直接使用 |
| 673 | - debug_entry["es_score_norm"] = rerank_debug.get("es_score_norm") | ||
| 674 | debug_entry["rerank_score"] = rerank_debug.get("rerank_score") | 673 | debug_entry["rerank_score"] = rerank_debug.get("rerank_score") |
| 674 | + debug_entry["text_score"] = rerank_debug.get("text_score") | ||
| 675 | + debug_entry["knn_score"] = rerank_debug.get("knn_score") | ||
| 675 | debug_entry["fused_score"] = rerank_debug.get("fused_score") | 676 | debug_entry["fused_score"] = rerank_debug.get("fused_score") |
| 677 | + debug_entry["matched_queries"] = rerank_debug.get("matched_queries") | ||
| 676 | 678 | ||
| 677 | per_result_debug.append(debug_entry) | 679 | per_result_debug.append(debug_entry) |
| 678 | 680 | ||
| @@ -719,6 +721,9 @@ class Searcher: | @@ -719,6 +721,9 @@ class Searcher: | ||
| 719 | "rewritten_query": context.query_analysis.rewritten_query, | 721 | "rewritten_query": context.query_analysis.rewritten_query, |
| 720 | "detected_language": context.query_analysis.detected_language, | 722 | "detected_language": context.query_analysis.detected_language, |
| 721 | "translations": context.query_analysis.translations, | 723 | "translations": context.query_analysis.translations, |
| 724 | + "query_text_by_lang": context.get_intermediate_result("query_text_by_lang", {}), | ||
| 725 | + "search_langs": context.get_intermediate_result("search_langs", []), | ||
| 726 | + "supplemental_search_langs": context.get_intermediate_result("supplemental_search_langs", []), | ||
| 722 | "has_vector": context.query_analysis.query_vector is not None, | 727 | "has_vector": context.query_analysis.query_vector is not None, |
| 723 | "is_simple_query": context.query_analysis.is_simple_query, | 728 | "is_simple_query": context.query_analysis.is_simple_query, |
| 724 | "domain": context.query_analysis.domain | 729 | "domain": context.query_analysis.domain |
tests/test_es_query_builder.py
| @@ -62,4 +62,4 @@ def test_knn_prefilter_not_added_without_filters(): | @@ -62,4 +62,4 @@ def test_knn_prefilter_not_added_without_filters(): | ||
| 62 | 62 | ||
| 63 | assert "knn" in q | 63 | assert "knn" in q |
| 64 | assert "filter" not in q["knn"] | 64 | assert "filter" not in q["knn"] |
| 65 | - assert q["knn"]["name"] == "knn_query" | 65 | + assert q["knn"]["_name"] == "knn_query" |
| @@ -0,0 +1,60 @@ | @@ -0,0 +1,60 @@ | ||
| 1 | +from types import SimpleNamespace | ||
| 2 | + | ||
| 3 | +from config import FunctionScoreConfig, IndexConfig, QueryConfig, RerankConfig, SPUConfig, SearchConfig | ||
| 4 | +from query.query_parser import QueryParser | ||
| 5 | + | ||
| 6 | + | ||
| 7 | +class _DummyTranslator: | ||
| 8 | + def translate(self, text, target_lang, source_lang, scene, model_name): | ||
| 9 | + return f"{text}-{target_lang}" | ||
| 10 | + | ||
| 11 | + | ||
| 12 | +def _build_config() -> SearchConfig: | ||
| 13 | + return SearchConfig( | ||
| 14 | + es_index_name="test_products", | ||
| 15 | + field_boosts={"title.en": 3.0, "title.zh": 3.0}, | ||
| 16 | + indexes=[IndexConfig(name="default", label="default", fields=["title.en", "title.zh"])], | ||
| 17 | + query_config=QueryConfig( | ||
| 18 | + enable_text_embedding=False, | ||
| 19 | + enable_query_rewrite=False, | ||
| 20 | + supported_languages=["en", "zh"], | ||
| 21 | + default_language="zh", | ||
| 22 | + ), | ||
| 23 | + function_score=FunctionScoreConfig(), | ||
| 24 | + rerank=RerankConfig(), | ||
| 25 | + spu_config=SPUConfig(enabled=False), | ||
| 26 | + ) | ||
| 27 | + | ||
| 28 | + | ||
| 29 | +def test_parse_adds_en_fields_for_mixed_chinese_query_with_meaningful_english(monkeypatch): | ||
| 30 | + parser = QueryParser(_build_config(), translator=_DummyTranslator()) | ||
| 31 | + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "zh") | ||
| 32 | + monkeypatch.setattr( | ||
| 33 | + "query.query_parser.get_tenant_config_loader", | ||
| 34 | + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["zh", "en"]}), | ||
| 35 | + raising=False, | ||
| 36 | + ) | ||
| 37 | + | ||
| 38 | + result = parser.parse("法式 dress 连衣裙", tenant_id="162", generate_vector=False) | ||
| 39 | + | ||
| 40 | + assert result.detected_language == "zh" | ||
| 41 | + assert "en" in result.search_langs | ||
| 42 | + assert result.query_text_by_lang["en"] == "法式 dress 连衣裙" | ||
| 43 | + assert result.query_text_by_lang["zh"] == "法式 dress 连衣裙" | ||
| 44 | + | ||
| 45 | + | ||
| 46 | +def test_parse_adds_zh_fields_for_english_query_when_cjk_present(monkeypatch): | ||
| 47 | + parser = QueryParser(_build_config(), translator=_DummyTranslator()) | ||
| 48 | + monkeypatch.setattr(parser.language_detector, "detect", lambda text: "en") | ||
| 49 | + monkeypatch.setattr( | ||
| 50 | + "query.query_parser.get_tenant_config_loader", | ||
| 51 | + lambda: SimpleNamespace(get_tenant_config=lambda tenant_id: {"index_languages": ["en", "zh"]}), | ||
| 52 | + raising=False, | ||
| 53 | + ) | ||
| 54 | + | ||
| 55 | + result = parser.parse("red 连衣裙", tenant_id="0", generate_vector=False) | ||
| 56 | + | ||
| 57 | + assert result.detected_language == "en" | ||
| 58 | + assert "zh" in result.search_langs | ||
| 59 | + assert result.query_text_by_lang["zh"] == "red 连衣裙" | ||
| 60 | + assert result.query_text_by_lang["en"] == "red 连衣裙" |