Commit a5a6bab82022a86b85ccb14b09355c18ee170412
1 parent
11237cf2
多语言查询优化
Showing
6 changed files
with
126 additions
and
29 deletions
Show diff stats
frontend/index.html
| @@ -142,6 +142,6 @@ | @@ -142,6 +142,6 @@ | ||
| 142 | <p>SearchEngine © 2025 | API: <span id="apiUrl">Loading...</span></p> | 142 | <p>SearchEngine © 2025 | API: <span id="apiUrl">Loading...</span></p> |
| 143 | </footer> | 143 | </footer> |
| 144 | 144 | ||
| 145 | - <script src="/static/js/app.js?v=3.7"></script> | 145 | + <script src="/static/js/app.js?v=3.1"></script> |
| 146 | </body> | 146 | </body> |
| 147 | </html> | 147 | </html> |
frontend/static/js/app.js
| @@ -554,7 +554,7 @@ function displayDebugInfo(data) { | @@ -554,7 +554,7 @@ function displayDebugInfo(data) { | ||
| 554 | if (data.query_info) { | 554 | if (data.query_info) { |
| 555 | let html = '<div style="padding: 10px;">'; | 555 | let html = '<div style="padding: 10px;">'; |
| 556 | html += `<div><strong>original_query:</strong> ${escapeHtml(data.query_info.original_query || 'N/A')}</div>`; | 556 | html += `<div><strong>original_query:</strong> ${escapeHtml(data.query_info.original_query || 'N/A')}</div>`; |
| 557 | - html += `<div><strong>detected_language:</strong> ${getLanguageName(data.query_info.detected_language)}</div>`; | 557 | + html += `<div><strong>detected_language:</strong> ${data.query_info.detected_language}</div>`; |
| 558 | html += '</div>'; | 558 | html += '</div>'; |
| 559 | debugInfoDiv.innerHTML = html; | 559 | debugInfoDiv.innerHTML = html; |
| 560 | } else { | 560 | } else { |
| @@ -573,7 +573,7 @@ function displayDebugInfo(data) { | @@ -573,7 +573,7 @@ function displayDebugInfo(data) { | ||
| 573 | html += `<div>original_query: ${escapeHtml(debugInfo.query_analysis.original_query || 'N/A')}</div>`; | 573 | html += `<div>original_query: ${escapeHtml(debugInfo.query_analysis.original_query || 'N/A')}</div>`; |
| 574 | html += `<div>normalized_query: ${escapeHtml(debugInfo.query_analysis.normalized_query || 'N/A')}</div>`; | 574 | html += `<div>normalized_query: ${escapeHtml(debugInfo.query_analysis.normalized_query || 'N/A')}</div>`; |
| 575 | html += `<div>rewritten_query: ${escapeHtml(debugInfo.query_analysis.rewritten_query || 'N/A')}</div>`; | 575 | html += `<div>rewritten_query: ${escapeHtml(debugInfo.query_analysis.rewritten_query || 'N/A')}</div>`; |
| 576 | - html += `<div>detected_language: ${getLanguageName(debugInfo.query_analysis.detected_language)}</div>`; | 576 | + html += `<div>detected_language: ${debugInfo.query_analysis.detected_language}</div>`; |
| 577 | html += `<div>domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}</div>`; | 577 | html += `<div>domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}</div>`; |
| 578 | html += `<div>is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}</div>`; | 578 | html += `<div>is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}</div>`; |
| 579 | 579 | ||
| @@ -581,7 +581,7 @@ function displayDebugInfo(data) { | @@ -581,7 +581,7 @@ function displayDebugInfo(data) { | ||
| 581 | html += '<div>translations: '; | 581 | html += '<div>translations: '; |
| 582 | for (const [lang, translation] of Object.entries(debugInfo.query_analysis.translations)) { | 582 | for (const [lang, translation] of Object.entries(debugInfo.query_analysis.translations)) { |
| 583 | if (translation) { | 583 | if (translation) { |
| 584 | - html += `${getLanguageName(lang)}: ${escapeHtml(translation)}; `; | 584 | + html += `${lang}: ${escapeHtml(translation)}; `; |
| 585 | } | 585 | } |
| 586 | } | 586 | } |
| 587 | html += '</div>'; | 587 | html += '</div>'; |
| @@ -669,14 +669,3 @@ function formatDate(dateStr) { | @@ -669,14 +669,3 @@ function formatDate(dateStr) { | ||
| 669 | } | 669 | } |
| 670 | } | 670 | } |
| 671 | 671 | ||
| 672 | -function getLanguageName(code) { | ||
| 673 | - const names = { | ||
| 674 | - 'zh': '中文', | ||
| 675 | - 'en': 'English', | ||
| 676 | - 'ru': 'Русский', | ||
| 677 | - 'ar': 'العربية', | ||
| 678 | - 'ja': '日本語', | ||
| 679 | - 'unknown': 'Unknown' | ||
| 680 | - }; | ||
| 681 | - return names[code] || code; | ||
| 682 | -} |
query/query_parser.py
| @@ -27,7 +27,7 @@ class ParsedQuery: | @@ -27,7 +27,7 @@ class ParsedQuery: | ||
| 27 | original_query: str, | 27 | original_query: str, |
| 28 | normalized_query: str, | 28 | normalized_query: str, |
| 29 | rewritten_query: Optional[str] = None, | 29 | rewritten_query: Optional[str] = None, |
| 30 | - detected_language: str = "unknown", | 30 | + detected_language: Optional[str] = None, |
| 31 | translations: Dict[str, str] = None, | 31 | translations: Dict[str, str] = None, |
| 32 | query_vector: Optional[np.ndarray] = None, | 32 | query_vector: Optional[np.ndarray] = None, |
| 33 | domain: str = "default", | 33 | domain: str = "default", |
| @@ -210,6 +210,9 @@ class QueryParser: | @@ -210,6 +210,9 @@ class QueryParser: | ||
| 210 | 210 | ||
| 211 | # Stage 3: Language detection | 211 | # Stage 3: Language detection |
| 212 | detected_lang = self.language_detector.detect(query_text) | 212 | detected_lang = self.language_detector.detect(query_text) |
| 213 | + # Use default language if detection failed (None or "unknown") | ||
| 214 | + if not detected_lang or detected_lang == "unknown": | ||
| 215 | + detected_lang = self.config.query_config.default_language | ||
| 213 | log_info(f"语言检测 | 检测到语言: {detected_lang}") | 216 | log_info(f"语言检测 | 检测到语言: {detected_lang}") |
| 214 | if context: | 217 | if context: |
| 215 | context.store_intermediate_result('detected_language', detected_lang) | 218 | context.store_intermediate_result('detected_language', detected_lang) |
query/translator.py
| @@ -12,6 +12,7 @@ https://developers.deepl.com/api-reference/translate/request-translation | @@ -12,6 +12,7 @@ https://developers.deepl.com/api-reference/translate/request-translation | ||
| 12 | """ | 12 | """ |
| 13 | 13 | ||
| 14 | import requests | 14 | import requests |
| 15 | +import re | ||
| 15 | from concurrent.futures import ThreadPoolExecutor | 16 | from concurrent.futures import ThreadPoolExecutor |
| 16 | from typing import Dict, List, Optional | 17 | from typing import Dict, List, Optional |
| 17 | from utils.cache import DictCache | 18 | from utils.cache import DictCache |
| @@ -110,6 +111,15 @@ class Translator: | @@ -110,6 +111,15 @@ class Translator: | ||
| 110 | if source_lang: | 111 | if source_lang: |
| 111 | source_lang = source_lang.lower() | 112 | source_lang = source_lang.lower() |
| 112 | 113 | ||
| 114 | + # Optimization: Skip translation if not needed | ||
| 115 | + if target_lang == 'en' and self._is_english_text(text): | ||
| 116 | + logger.debug(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") | ||
| 117 | + return text | ||
| 118 | + | ||
| 119 | + if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | ||
| 120 | + logger.debug(f"[Translator] Text contains Chinese or is pure number, skipping translation: '{text[:50]}...'") | ||
| 121 | + return text | ||
| 122 | + | ||
| 113 | # Use provided context or default context | 123 | # Use provided context or default context |
| 114 | translation_context = context or self.translation_context | 124 | translation_context = context or self.translation_context |
| 115 | 125 | ||
| @@ -312,8 +322,10 @@ class Translator: | @@ -312,8 +322,10 @@ class Translator: | ||
| 312 | 322 | ||
| 313 | In async_mode=True (default): | 323 | In async_mode=True (default): |
| 314 | - Returns cached translations immediately if available | 324 | - Returns cached translations immediately if available |
| 315 | - - Launches async tasks for missing translations (non-blocking) | ||
| 316 | - - Returns None for missing translations (will be available in cache next time) | 325 | + - For translations that can be optimized (e.g., pure numbers, already in target language), |
| 326 | + returns result immediately via synchronous call | ||
| 327 | + - Launches async tasks for other missing translations (non-blocking) | ||
| 328 | + - Returns None for missing translations that require async processing | ||
| 317 | 329 | ||
| 318 | In async_mode=False: | 330 | In async_mode=False: |
| 319 | - Waits for all translations to complete (blocking) | 331 | - Waits for all translations to complete (blocking) |
| @@ -331,6 +343,7 @@ class Translator: | @@ -331,6 +343,7 @@ class Translator: | ||
| 331 | """ | 343 | """ |
| 332 | results = {} | 344 | results = {} |
| 333 | missing_langs = [] | 345 | missing_langs = [] |
| 346 | + async_langs = [] | ||
| 334 | 347 | ||
| 335 | # First, get cached translations | 348 | # First, get cached translations |
| 336 | for lang in target_langs: | 349 | for lang in target_langs: |
| @@ -340,13 +353,32 @@ class Translator: | @@ -340,13 +353,32 @@ class Translator: | ||
| 340 | else: | 353 | else: |
| 341 | missing_langs.append(lang) | 354 | missing_langs.append(lang) |
| 342 | 355 | ||
| 343 | - # If async mode and there are missing translations, launch async tasks | 356 | + # If async mode and there are missing translations |
| 344 | if async_mode and missing_langs: | 357 | if async_mode and missing_langs: |
| 358 | + # Check if translation can be optimized (immediate return) | ||
| 345 | for lang in missing_langs: | 359 | for lang in missing_langs: |
| 346 | - self._translate_async(text, lang, source_lang, context, prompt) | ||
| 347 | - # Return None for missing translations | ||
| 348 | - for lang in missing_langs: | ||
| 349 | - results[lang] = None | 360 | + target_lang = lang.lower() |
| 361 | + # Check optimization conditions (same as in translate method) | ||
| 362 | + can_optimize = False | ||
| 363 | + if target_lang == 'en' and self._is_english_text(text): | ||
| 364 | + can_optimize = True | ||
| 365 | + elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | ||
| 366 | + can_optimize = True | ||
| 367 | + | ||
| 368 | + if can_optimize: | ||
| 369 | + # Can be optimized, call translate synchronously for immediate result | ||
| 370 | + results[lang] = self.translate(text, lang, source_lang, context, prompt) | ||
| 371 | + else: | ||
| 372 | + # Requires actual translation, add to async list | ||
| 373 | + async_langs.append(lang) | ||
| 374 | + | ||
| 375 | + # Launch async tasks for translations that require actual API calls | ||
| 376 | + if async_langs: | ||
| 377 | + for lang in async_langs: | ||
| 378 | + self._translate_async(text, lang, source_lang, context, prompt) | ||
| 379 | + # Return None for async translations | ||
| 380 | + for lang in async_langs: | ||
| 381 | + results[lang] = None | ||
| 350 | else: | 382 | else: |
| 351 | # Synchronous mode: wait for all translations | 383 | # Synchronous mode: wait for all translations |
| 352 | for lang in missing_langs: | 384 | for lang in missing_langs: |
| @@ -496,3 +528,67 @@ class Translator: | @@ -496,3 +528,67 @@ class Translator: | ||
| 496 | 528 | ||
| 497 | # Otherwise, translate to all supported languages | 529 | # Otherwise, translate to all supported languages |
| 498 | return supported_langs | 530 | return supported_langs |
| 531 | + | ||
| 532 | + def _is_english_text(self, text: str) -> bool: | ||
| 533 | + """ | ||
| 534 | + Check if text is primarily English (ASCII letters, numbers, common punctuation). | ||
| 535 | + | ||
| 536 | + Args: | ||
| 537 | + text: Text to check | ||
| 538 | + | ||
| 539 | + Returns: | ||
| 540 | + True if text appears to be English | ||
| 541 | + """ | ||
| 542 | + if not text or not text.strip(): | ||
| 543 | + return True | ||
| 544 | + | ||
| 545 | + # Remove whitespace and common punctuation | ||
| 546 | + text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) | ||
| 547 | + if not text_clean: | ||
| 548 | + return True | ||
| 549 | + | ||
| 550 | + # Compute the share of remaining characters that are ASCII (letters, numbers) | ||
| 551 | + # This is a simple heuristic: if most characters are ASCII, it's likely English | ||
| 552 | + ascii_count = sum(1 for c in text_clean if ord(c) < 128) | ||
| 553 | + ratio = ascii_count / len(text_clean) if text_clean else 0 | ||
| 554 | + | ||
| 555 | + # If more than 80% are ASCII characters, consider it English | ||
| 556 | + return ratio > 0.8 | ||
| 557 | + | ||
| 558 | + def _contains_chinese(self, text: str) -> bool: | ||
| 559 | + """ | ||
| 560 | + Check if text contains Chinese characters (Han characters). | ||
| 561 | + | ||
| 562 | + Args: | ||
| 563 | + text: Text to check | ||
| 564 | + | ||
| 565 | + Returns: | ||
| 566 | + True if text contains Chinese characters | ||
| 567 | + """ | ||
| 568 | + if not text: | ||
| 569 | + return False | ||
| 570 | + | ||
| 571 | + # Check for Chinese characters (Unicode range: \u4e00-\u9fff) | ||
| 572 | + chinese_pattern = re.compile(r'[\u4e00-\u9fff]') | ||
| 573 | + return bool(chinese_pattern.search(text)) | ||
| 574 | + | ||
| 575 | + def _is_pure_number(self, text: str) -> bool: | ||
| 576 | + """ | ||
| 577 | + Check if text is purely numeric (digits, possibly with spaces, dots, commas). | ||
| 578 | + | ||
| 579 | + Args: | ||
| 580 | + text: Text to check | ||
| 581 | + | ||
| 582 | + Returns: | ||
| 583 | + True if text is purely numeric | ||
| 584 | + """ | ||
| 585 | + if not text or not text.strip(): | ||
| 586 | + return False | ||
| 587 | + | ||
| 588 | + # Remove whitespace, dots, commas (common number separators) | ||
| 589 | + text_clean = re.sub(r'[\s\.,]', '', text.strip()) | ||
| 590 | + if not text_clean: | ||
| 591 | + return False | ||
| 592 | + | ||
| 593 | + # Check if all remaining characters are digits | ||
| 594 | + return text_clean.isdigit() |
search/es_query_builder.py
| @@ -25,7 +25,8 @@ class ESQueryBuilder: | @@ -25,7 +25,8 @@ class ESQueryBuilder: | ||
| 25 | image_embedding_field: Optional[str] = None, | 25 | image_embedding_field: Optional[str] = None, |
| 26 | source_fields: Optional[List[str]] = None, | 26 | source_fields: Optional[List[str]] = None, |
| 27 | function_score_config: Optional[FunctionScoreConfig] = None, | 27 | function_score_config: Optional[FunctionScoreConfig] = None, |
| 28 | - enable_multilang_search: bool = True | 28 | + enable_multilang_search: bool = True, |
| 29 | + default_language: str = "zh" | ||
| 29 | ): | 30 | ): |
| 30 | """ | 31 | """ |
| 31 | Initialize query builder. | 32 | Initialize query builder. |
| @@ -38,6 +39,7 @@ class ESQueryBuilder: | @@ -38,6 +39,7 @@ class ESQueryBuilder: | ||
| 38 | source_fields: Fields to return in search results (_source includes) | 39 | source_fields: Fields to return in search results (_source includes) |
| 39 | function_score_config: Function score configuration | 40 | function_score_config: Function score configuration |
| 40 | enable_multilang_search: Enable multi-language search using translations | 41 | enable_multilang_search: Enable multi-language search using translations |
| 42 | + default_language: Default language to use when detection fails or returns "unknown" | ||
| 41 | """ | 43 | """ |
| 42 | self.index_name = index_name | 44 | self.index_name = index_name |
| 43 | self.match_fields = match_fields | 45 | self.match_fields = match_fields |
| @@ -46,6 +48,7 @@ class ESQueryBuilder: | @@ -46,6 +48,7 @@ class ESQueryBuilder: | ||
| 46 | self.source_fields = source_fields | 48 | self.source_fields = source_fields |
| 47 | self.function_score_config = function_score_config | 49 | self.function_score_config = function_score_config |
| 48 | self.enable_multilang_search = enable_multilang_search | 50 | self.enable_multilang_search = enable_multilang_search |
| 51 | + self.default_language = default_language | ||
| 49 | 52 | ||
| 50 | def _split_filters_for_faceting( | 53 | def _split_filters_for_faceting( |
| 51 | self, | 54 | self, |
| @@ -422,7 +425,7 @@ class ESQueryBuilder: | @@ -422,7 +425,7 @@ class ESQueryBuilder: | ||
| 422 | 425 | ||
| 423 | # Get query analysis from parsed_query | 426 | # Get query analysis from parsed_query |
| 424 | translations = {} | 427 | translations = {} |
| 425 | - language = 'zh' | 428 | + language = self.default_language |
| 426 | keywords = "" | 429 | keywords = "" |
| 427 | token_count = 0 | 430 | token_count = 0 |
| 428 | is_short_query = False | 431 | is_short_query = False |
| @@ -430,7 +433,12 @@ class ESQueryBuilder: | @@ -430,7 +433,12 @@ class ESQueryBuilder: | ||
| 430 | 433 | ||
| 431 | if parsed_query: | 434 | if parsed_query: |
| 432 | translations = parsed_query.translations or {} | 435 | translations = parsed_query.translations or {} |
| 433 | - language = parsed_query.detected_language or 'zh' | 436 | + # Use default language if detected_language is None or "unknown" |
| 437 | + detected_lang = parsed_query.detected_language | ||
| 438 | + if not detected_lang or detected_lang == "unknown": | ||
| 439 | + language = self.default_language | ||
| 440 | + else: | ||
| 441 | + language = detected_lang | ||
| 434 | keywords = getattr(parsed_query, 'keywords', '') or "" | 442 | keywords = getattr(parsed_query, 'keywords', '') or "" |
| 435 | token_count = getattr(parsed_query, 'token_count', 0) or 0 | 443 | token_count = getattr(parsed_query, 'token_count', 0) or 0 |
| 436 | is_short_query = getattr(parsed_query, 'is_short_query', False) | 444 | is_short_query = getattr(parsed_query, 'is_short_query', False) |
| @@ -458,7 +466,7 @@ class ESQueryBuilder: | @@ -458,7 +466,7 @@ class ESQueryBuilder: | ||
| 458 | 466 | ||
| 459 | # 2. Translation queries - lower boost (0.4) for other languages | 467 | # 2. Translation queries - lower boost (0.4) for other languages |
| 460 | if self.enable_multilang_search: | 468 | if self.enable_multilang_search: |
| 461 | - if language != 'zh' and translations.get('zh') and translations['zh'] != query_text: | 469 | + if language != 'zh' and translations.get('zh'): |
| 462 | zh_fields, _ = self._get_match_fields('zh') | 470 | zh_fields, _ = self._get_match_fields('zh') |
| 463 | should_clauses.append({ | 471 | should_clauses.append({ |
| 464 | "multi_match": { | 472 | "multi_match": { |
| @@ -472,7 +480,7 @@ class ESQueryBuilder: | @@ -472,7 +480,7 @@ class ESQueryBuilder: | ||
| 472 | } | 480 | } |
| 473 | }) | 481 | }) |
| 474 | 482 | ||
| 475 | - if language != 'en' and translations.get('en') and translations['en'] != query_text: | 483 | + if language != 'en' and translations.get('en'): |
| 476 | en_fields, _ = self._get_match_fields('en') | 484 | en_fields, _ = self._get_match_fields('en') |
| 477 | should_clauses.append({ | 485 | should_clauses.append({ |
| 478 | "multi_match": { | 486 | "multi_match": { |
search/searcher.py
| @@ -113,7 +113,8 @@ class Searcher: | @@ -113,7 +113,8 @@ class Searcher: | ||
| 113 | image_embedding_field=self.image_embedding_field, | 113 | image_embedding_field=self.image_embedding_field, |
| 114 | source_fields=self.source_fields, | 114 | source_fields=self.source_fields, |
| 115 | function_score_config=self.config.function_score, | 115 | function_score_config=self.config.function_score, |
| 116 | - enable_multilang_search=self.config.query_config.enable_multilang_search | 116 | + enable_multilang_search=self.config.query_config.enable_multilang_search, |
| 117 | + default_language=self.config.query_config.default_language | ||
| 117 | ) | 118 | ) |
| 118 | 119 | ||
| 119 | def search( | 120 | def search( |