Commit a5a6bab82022a86b85ccb14b09355c18ee170412

Authored by tangwang
1 parent 11237cf2

多语言查询优化

frontend/index.html
@@ -142,6 +142,6 @@ @@ -142,6 +142,6 @@
142 <p>SearchEngine © 2025 | API: <span id="apiUrl">Loading...</span></p> 142 <p>SearchEngine © 2025 | API: <span id="apiUrl">Loading...</span></p>
143 </footer> 143 </footer>
144 144
145 - <script src="/static/js/app.js?v=3.7"></script> 145 + <script src="/static/js/app.js?v=3.1"></script>
146 </body> 146 </body>
147 </html> 147 </html>
frontend/static/js/app.js
@@ -554,7 +554,7 @@ function displayDebugInfo(data) { @@ -554,7 +554,7 @@ function displayDebugInfo(data) {
554 if (data.query_info) { 554 if (data.query_info) {
555 let html = '<div style="padding: 10px;">'; 555 let html = '<div style="padding: 10px;">';
556 html += `<div><strong>original_query:</strong> ${escapeHtml(data.query_info.original_query || 'N/A')}</div>`; 556 html += `<div><strong>original_query:</strong> ${escapeHtml(data.query_info.original_query || 'N/A')}</div>`;
557 - html += `<div><strong>detected_language:</strong> ${getLanguageName(data.query_info.detected_language)}</div>`; 557 + html += `<div><strong>detected_language:</strong> ${data.query_info.detected_languag}</div>`;
558 html += '</div>'; 558 html += '</div>';
559 debugInfoDiv.innerHTML = html; 559 debugInfoDiv.innerHTML = html;
560 } else { 560 } else {
@@ -573,7 +573,7 @@ function displayDebugInfo(data) { @@ -573,7 +573,7 @@ function displayDebugInfo(data) {
573 html += `<div>original_query: ${escapeHtml(debugInfo.query_analysis.original_query || 'N/A')}</div>`; 573 html += `<div>original_query: ${escapeHtml(debugInfo.query_analysis.original_query || 'N/A')}</div>`;
574 html += `<div>normalized_query: ${escapeHtml(debugInfo.query_analysis.normalized_query || 'N/A')}</div>`; 574 html += `<div>normalized_query: ${escapeHtml(debugInfo.query_analysis.normalized_query || 'N/A')}</div>`;
575 html += `<div>rewritten_query: ${escapeHtml(debugInfo.query_analysis.rewritten_query || 'N/A')}</div>`; 575 html += `<div>rewritten_query: ${escapeHtml(debugInfo.query_analysis.rewritten_query || 'N/A')}</div>`;
576 - html += `<div>detected_language: ${getLanguageName(debugInfo.query_analysis.detected_language)}</div>`; 576 + html += `<div>detected_language: ${debugInfo.query_analysis.detected_language}</div>`;
577 html += `<div>domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}</div>`; 577 html += `<div>domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}</div>`;
578 html += `<div>is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}</div>`; 578 html += `<div>is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}</div>`;
579 579
@@ -581,7 +581,7 @@ function displayDebugInfo(data) { @@ -581,7 +581,7 @@ function displayDebugInfo(data) {
581 html += '<div>translations: '; 581 html += '<div>translations: ';
582 for (const [lang, translation] of Object.entries(debugInfo.query_analysis.translations)) { 582 for (const [lang, translation] of Object.entries(debugInfo.query_analysis.translations)) {
583 if (translation) { 583 if (translation) {
584 - html += `${getLanguageName(lang)}: ${escapeHtml(translation)}; `; 584 + html += `${lang}: ${escapeHtml(translation)}; `;
585 } 585 }
586 } 586 }
587 html += '</div>'; 587 html += '</div>';
@@ -669,14 +669,3 @@ function formatDate(dateStr) { @@ -669,14 +669,3 @@ function formatDate(dateStr) {
669 } 669 }
670 } 670 }
671 671
672 -function getLanguageName(code) {  
673 - const names = {  
674 - 'zh': '中文',  
675 - 'en': 'English',  
676 - 'ru': 'Русский',  
677 - 'ar': 'العربية',  
678 - 'ja': '日本語',  
679 - 'unknown': 'Unknown'  
680 - };  
681 - return names[code] || code;  
682 -}  
query/query_parser.py
@@ -27,7 +27,7 @@ class ParsedQuery: @@ -27,7 +27,7 @@ class ParsedQuery:
27 original_query: str, 27 original_query: str,
28 normalized_query: str, 28 normalized_query: str,
29 rewritten_query: Optional[str] = None, 29 rewritten_query: Optional[str] = None,
30 - detected_language: str = "unknown", 30 + detected_language: Optional[str] = None,
31 translations: Dict[str, str] = None, 31 translations: Dict[str, str] = None,
32 query_vector: Optional[np.ndarray] = None, 32 query_vector: Optional[np.ndarray] = None,
33 domain: str = "default", 33 domain: str = "default",
@@ -210,6 +210,9 @@ class QueryParser: @@ -210,6 +210,9 @@ class QueryParser:
210 210
211 # Stage 3: Language detection 211 # Stage 3: Language detection
212 detected_lang = self.language_detector.detect(query_text) 212 detected_lang = self.language_detector.detect(query_text)
  213 + # Use default language if detection failed (None or "unknown")
  214 + if not detected_lang or detected_lang == "unknown":
  215 + detected_lang = self.config.query_config.default_language
213 log_info(f"语言检测 | 检测到语言: {detected_lang}") 216 log_info(f"语言检测 | 检测到语言: {detected_lang}")
214 if context: 217 if context:
215 context.store_intermediate_result('detected_language', detected_lang) 218 context.store_intermediate_result('detected_language', detected_lang)
query/translator.py
@@ -12,6 +12,7 @@ https://developers.deepl.com/api-reference/translate/request-translation @@ -12,6 +12,7 @@ https://developers.deepl.com/api-reference/translate/request-translation
12 """ 12 """
13 13
14 import requests 14 import requests
  15 +import re
15 from concurrent.futures import ThreadPoolExecutor 16 from concurrent.futures import ThreadPoolExecutor
16 from typing import Dict, List, Optional 17 from typing import Dict, List, Optional
17 from utils.cache import DictCache 18 from utils.cache import DictCache
@@ -110,6 +111,15 @@ class Translator: @@ -110,6 +111,15 @@ class Translator:
110 if source_lang: 111 if source_lang:
111 source_lang = source_lang.lower() 112 source_lang = source_lang.lower()
112 113
  114 + # Optimization: Skip translation if not needed
  115 + if target_lang == 'en' and self._is_english_text(text):
  116 + logger.debug(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'")
  117 + return text
  118 +
  119 + if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
  120 + logger.debug(f"[Translator] Text contains Chinese or is pure number, skipping translation: '{text[:50]}...'")
  121 + return text
  122 +
113 # Use provided context or default context 123 # Use provided context or default context
114 translation_context = context or self.translation_context 124 translation_context = context or self.translation_context
115 125
@@ -312,8 +322,10 @@ class Translator: @@ -312,8 +322,10 @@ class Translator:
312 322
313 In async_mode=True (default): 323 In async_mode=True (default):
314 - Returns cached translations immediately if available 324 - Returns cached translations immediately if available
315 - - Launches async tasks for missing translations (non-blocking)  
316 - - Returns None for missing translations (will be available in cache next time) 325 + - For translations that can be optimized (e.g., pure numbers, already in target language),
  326 + returns result immediately via synchronous call
  327 + - Launches async tasks for other missing translations (non-blocking)
  328 + - Returns None for missing translations that require async processing
317 329
318 In async_mode=False: 330 In async_mode=False:
319 - Waits for all translations to complete (blocking) 331 - Waits for all translations to complete (blocking)
@@ -331,6 +343,7 @@ class Translator: @@ -331,6 +343,7 @@ class Translator:
331 """ 343 """
332 results = {} 344 results = {}
333 missing_langs = [] 345 missing_langs = []
  346 + async_langs = []
334 347
335 # First, get cached translations 348 # First, get cached translations
336 for lang in target_langs: 349 for lang in target_langs:
@@ -340,13 +353,32 @@ class Translator: @@ -340,13 +353,32 @@ class Translator:
340 else: 353 else:
341 missing_langs.append(lang) 354 missing_langs.append(lang)
342 355
343 - # If async mode and there are missing translations, launch async tasks 356 + # If async mode and there are missing translations
344 if async_mode and missing_langs: 357 if async_mode and missing_langs:
  358 + # Check if translation can be optimized (immediate return)
345 for lang in missing_langs: 359 for lang in missing_langs:
346 - self._translate_async(text, lang, source_lang, context, prompt)  
347 - # Return None for missing translations  
348 - for lang in missing_langs:  
349 - results[lang] = None 360 + target_lang = lang.lower()
  361 + # Check optimization conditions (same as in translate method)
  362 + can_optimize = False
  363 + if target_lang == 'en' and self._is_english_text(text):
  364 + can_optimize = True
  365 + elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
  366 + can_optimize = True
  367 +
  368 + if can_optimize:
  369 + # Can be optimized, call translate synchronously for immediate result
  370 + results[lang] = self.translate(text, lang, source_lang, context, prompt)
  371 + else:
  372 + # Requires actual translation, add to async list
  373 + async_langs.append(lang)
  374 +
  375 + # Launch async tasks for translations that require actual API calls
  376 + if async_langs:
  377 + for lang in async_langs:
  378 + self._translate_async(text, lang, source_lang, context, prompt)
  379 + # Return None for async translations
  380 + for lang in async_langs:
  381 + results[lang] = None
350 else: 382 else:
351 # Synchronous mode: wait for all translations 383 # Synchronous mode: wait for all translations
352 for lang in missing_langs: 384 for lang in missing_langs:
@@ -496,3 +528,67 @@ class Translator: @@ -496,3 +528,67 @@ class Translator:
496 528
497 # Otherwise, translate to all supported languages 529 # Otherwise, translate to all supported languages
498 return supported_langs 530 return supported_langs
  531 +
  532 + def _is_english_text(self, text: str) -> bool:
  533 + """
  534 + Check if text is primarily English (ASCII letters, numbers, common punctuation).
  535 +
  536 + Args:
  537 + text: Text to check
  538 +
  539 + Returns:
  540 + True if text appears to be English
  541 + """
  542 + if not text or not text.strip():
  543 + return True
  544 +
  545 + # Remove whitespace and common punctuation
  546 + text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text)
  547 + if not text_clean:
  548 + return True
  549 +
  550 + # Check if all remaining characters are ASCII (letters, numbers)
  551 + # This is a simple heuristic: if most characters are ASCII, it's likely English
  552 + ascii_count = sum(1 for c in text_clean if ord(c) < 128)
  553 + ratio = ascii_count / len(text_clean) if text_clean else 0
  554 +
  555 + # If more than 80% are ASCII characters, consider it English
  556 + return ratio > 0.8
  557 +
  558 + def _contains_chinese(self, text: str) -> bool:
  559 + """
  560 + Check if text contains Chinese characters (Han characters).
  561 +
  562 + Args:
  563 + text: Text to check
  564 +
  565 + Returns:
  566 + True if text contains Chinese characters
  567 + """
  568 + if not text:
  569 + return False
  570 +
  571 + # Check for Chinese characters (Unicode range: \u4e00-\u9fff)
  572 + chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
  573 + return bool(chinese_pattern.search(text))
  574 +
  575 + def _is_pure_number(self, text: str) -> bool:
  576 + """
  577 + Check if text is purely numeric (digits, possibly with spaces, dots, commas).
  578 +
  579 + Args:
  580 + text: Text to check
  581 +
  582 + Returns:
  583 + True if text is purely numeric
  584 + """
  585 + if not text or not text.strip():
  586 + return False
  587 +
  588 + # Remove whitespace, dots, commas (common number separators)
  589 + text_clean = re.sub(r'[\s\.,]', '', text.strip())
  590 + if not text_clean:
  591 + return False
  592 +
  593 + # Check if all remaining characters are digits
  594 + return text_clean.isdigit()
search/es_query_builder.py
@@ -25,7 +25,8 @@ class ESQueryBuilder: @@ -25,7 +25,8 @@ class ESQueryBuilder:
25 image_embedding_field: Optional[str] = None, 25 image_embedding_field: Optional[str] = None,
26 source_fields: Optional[List[str]] = None, 26 source_fields: Optional[List[str]] = None,
27 function_score_config: Optional[FunctionScoreConfig] = None, 27 function_score_config: Optional[FunctionScoreConfig] = None,
28 - enable_multilang_search: bool = True 28 + enable_multilang_search: bool = True,
  29 + default_language: str = "zh"
29 ): 30 ):
30 """ 31 """
31 Initialize query builder. 32 Initialize query builder.
@@ -38,6 +39,7 @@ class ESQueryBuilder: @@ -38,6 +39,7 @@ class ESQueryBuilder:
38 source_fields: Fields to return in search results (_source includes) 39 source_fields: Fields to return in search results (_source includes)
39 function_score_config: Function score configuration 40 function_score_config: Function score configuration
40 enable_multilang_search: Enable multi-language search using translations 41 enable_multilang_search: Enable multi-language search using translations
  42 + default_language: Default language to use when detection fails or returns "unknown"
41 """ 43 """
42 self.index_name = index_name 44 self.index_name = index_name
43 self.match_fields = match_fields 45 self.match_fields = match_fields
@@ -46,6 +48,7 @@ class ESQueryBuilder: @@ -46,6 +48,7 @@ class ESQueryBuilder:
46 self.source_fields = source_fields 48 self.source_fields = source_fields
47 self.function_score_config = function_score_config 49 self.function_score_config = function_score_config
48 self.enable_multilang_search = enable_multilang_search 50 self.enable_multilang_search = enable_multilang_search
  51 + self.default_language = default_language
49 52
50 def _split_filters_for_faceting( 53 def _split_filters_for_faceting(
51 self, 54 self,
@@ -422,7 +425,7 @@ class ESQueryBuilder: @@ -422,7 +425,7 @@ class ESQueryBuilder:
422 425
423 # Get query analysis from parsed_query 426 # Get query analysis from parsed_query
424 translations = {} 427 translations = {}
425 - language = 'zh' 428 + language = self.default_language
426 keywords = "" 429 keywords = ""
427 token_count = 0 430 token_count = 0
428 is_short_query = False 431 is_short_query = False
@@ -430,7 +433,12 @@ class ESQueryBuilder: @@ -430,7 +433,12 @@ class ESQueryBuilder:
430 433
431 if parsed_query: 434 if parsed_query:
432 translations = parsed_query.translations or {} 435 translations = parsed_query.translations or {}
433 - language = parsed_query.detected_language or 'zh' 436 + # Use default language if detected_language is None or "unknown"
  437 + detected_lang = parsed_query.detected_language
  438 + if not detected_lang or detected_lang == "unknown":
  439 + language = self.default_language
  440 + else:
  441 + language = detected_lang
434 keywords = getattr(parsed_query, 'keywords', '') or "" 442 keywords = getattr(parsed_query, 'keywords', '') or ""
435 token_count = getattr(parsed_query, 'token_count', 0) or 0 443 token_count = getattr(parsed_query, 'token_count', 0) or 0
436 is_short_query = getattr(parsed_query, 'is_short_query', False) 444 is_short_query = getattr(parsed_query, 'is_short_query', False)
@@ -458,7 +466,7 @@ class ESQueryBuilder: @@ -458,7 +466,7 @@ class ESQueryBuilder:
458 466
459 # 2. Translation queries - lower boost (0.4) for other languages 467 # 2. Translation queries - lower boost (0.4) for other languages
460 if self.enable_multilang_search: 468 if self.enable_multilang_search:
461 - if language != 'zh' and translations.get('zh') and translations['zh'] != query_text: 469 + if language != 'zh' and translations.get('zh'):
462 zh_fields, _ = self._get_match_fields('zh') 470 zh_fields, _ = self._get_match_fields('zh')
463 should_clauses.append({ 471 should_clauses.append({
464 "multi_match": { 472 "multi_match": {
@@ -472,7 +480,7 @@ class ESQueryBuilder: @@ -472,7 +480,7 @@ class ESQueryBuilder:
472 } 480 }
473 }) 481 })
474 482
475 - if language != 'en' and translations.get('en') and translations['en'] != query_text: 483 + if language != 'en' and translations.get('en'):
476 en_fields, _ = self._get_match_fields('en') 484 en_fields, _ = self._get_match_fields('en')
477 should_clauses.append({ 485 should_clauses.append({
478 "multi_match": { 486 "multi_match": {
search/searcher.py
@@ -113,7 +113,8 @@ class Searcher: @@ -113,7 +113,8 @@ class Searcher:
113 image_embedding_field=self.image_embedding_field, 113 image_embedding_field=self.image_embedding_field,
114 source_fields=self.source_fields, 114 source_fields=self.source_fields,
115 function_score_config=self.config.function_score, 115 function_score_config=self.config.function_score,
116 - enable_multilang_search=self.config.query_config.enable_multilang_search 116 + enable_multilang_search=self.config.query_config.enable_multilang_search,
  117 + default_language=self.config.query_config.default_language
117 ) 118 )
118 119
119 def search( 120 def search(