Commit a5a6bab82022a86b85ccb14b09355c18ee170412

Authored by tangwang
1 parent 11237cf2

多语言查询优化

frontend/index.html
... ... @@ -142,6 +142,6 @@
142 142 <p>SearchEngine © 2025 | API: <span id="apiUrl">Loading...</span></p>
143 143 </footer>
144 144  
145   - <script src="/static/js/app.js?v=3.7"></script>
  145 + <script src="/static/js/app.js?v=3.1"></script>
146 146 </body>
147 147 </html>
... ...
frontend/static/js/app.js
... ... @@ -554,7 +554,7 @@ function displayDebugInfo(data) {
554 554 if (data.query_info) {
555 555 let html = '<div style="padding: 10px;">';
556 556 html += `<div><strong>original_query:</strong> ${escapeHtml(data.query_info.original_query || 'N/A')}</div>`;
557   - html += `<div><strong>detected_language:</strong> ${getLanguageName(data.query_info.detected_language)}</div>`;
  557 + html += `<div><strong>detected_language:</strong> ${data.query_info.detected_language}</div>`;
558 558 html += '</div>';
559 559 debugInfoDiv.innerHTML = html;
560 560 } else {
... ... @@ -573,7 +573,7 @@ function displayDebugInfo(data) {
573 573 html += `<div>original_query: ${escapeHtml(debugInfo.query_analysis.original_query || 'N/A')}</div>`;
574 574 html += `<div>normalized_query: ${escapeHtml(debugInfo.query_analysis.normalized_query || 'N/A')}</div>`;
575 575 html += `<div>rewritten_query: ${escapeHtml(debugInfo.query_analysis.rewritten_query || 'N/A')}</div>`;
576   - html += `<div>detected_language: ${getLanguageName(debugInfo.query_analysis.detected_language)}</div>`;
  576 + html += `<div>detected_language: ${debugInfo.query_analysis.detected_language}</div>`;
577 577 html += `<div>domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}</div>`;
578 578 html += `<div>is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}</div>`;
579 579  
... ... @@ -581,7 +581,7 @@ function displayDebugInfo(data) {
581 581 html += '<div>translations: ';
582 582 for (const [lang, translation] of Object.entries(debugInfo.query_analysis.translations)) {
583 583 if (translation) {
584   - html += `${getLanguageName(lang)}: ${escapeHtml(translation)}; `;
  584 + html += `${lang}: ${escapeHtml(translation)}; `;
585 585 }
586 586 }
587 587 html += '</div>';
... ... @@ -669,14 +669,3 @@ function formatDate(dateStr) {
669 669 }
670 670 }
671 671  
672   -function getLanguageName(code) {
673   - const names = {
674   - 'zh': '中文',
675   - 'en': 'English',
676   - 'ru': 'Русский',
677   - 'ar': 'العربية',
678   - 'ja': '日本語',
679   - 'unknown': 'Unknown'
680   - };
681   - return names[code] || code;
682   -}
... ...
query/query_parser.py
... ... @@ -27,7 +27,7 @@ class ParsedQuery:
27 27 original_query: str,
28 28 normalized_query: str,
29 29 rewritten_query: Optional[str] = None,
30   - detected_language: str = "unknown",
  30 + detected_language: Optional[str] = None,
31 31 translations: Dict[str, str] = None,
32 32 query_vector: Optional[np.ndarray] = None,
33 33 domain: str = "default",
... ... @@ -210,6 +210,9 @@ class QueryParser:
210 210  
211 211 # Stage 3: Language detection
212 212 detected_lang = self.language_detector.detect(query_text)
  213 + # Use default language if detection failed (None or "unknown")
  214 + if not detected_lang or detected_lang == "unknown":
  215 + detected_lang = self.config.query_config.default_language
213 216 log_info(f"语言检测 | 检测到语言: {detected_lang}")
214 217 if context:
215 218 context.store_intermediate_result('detected_language', detected_lang)
... ...
query/translator.py
... ... @@ -12,6 +12,7 @@ https://developers.deepl.com/api-reference/translate/request-translation
12 12 """
13 13  
14 14 import requests
  15 +import re
15 16 from concurrent.futures import ThreadPoolExecutor
16 17 from typing import Dict, List, Optional
17 18 from utils.cache import DictCache
... ... @@ -110,6 +111,15 @@ class Translator:
110 111 if source_lang:
111 112 source_lang = source_lang.lower()
112 113  
  114 + # Optimization: Skip translation if not needed
  115 + if target_lang == 'en' and self._is_english_text(text):
  116 + logger.debug(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'")
  117 + return text
  118 +
  119 + if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
  120 + logger.debug(f"[Translator] Text contains Chinese or is pure number, skipping translation: '{text[:50]}...'")
  121 + return text
  122 +
113 123 # Use provided context or default context
114 124 translation_context = context or self.translation_context
115 125  
... ... @@ -312,8 +322,10 @@ class Translator:
312 322  
313 323 In async_mode=True (default):
314 324 - Returns cached translations immediately if available
315   - - Launches async tasks for missing translations (non-blocking)
316   - - Returns None for missing translations (will be available in cache next time)
  325 + - For translations that can be optimized (e.g., pure numbers, already in target language),
  326 + returns result immediately via synchronous call
  327 + - Launches async tasks for other missing translations (non-blocking)
  328 + - Returns None for missing translations that require async processing
317 329  
318 330 In async_mode=False:
319 331 - Waits for all translations to complete (blocking)
... ... @@ -331,6 +343,7 @@ class Translator:
331 343 """
332 344 results = {}
333 345 missing_langs = []
  346 + async_langs = []
334 347  
335 348 # First, get cached translations
336 349 for lang in target_langs:
... ... @@ -340,13 +353,32 @@ class Translator:
340 353 else:
341 354 missing_langs.append(lang)
342 355  
343   - # If async mode and there are missing translations, launch async tasks
  356 + # If async mode and there are missing translations
344 357 if async_mode and missing_langs:
  358 + # Check if translation can be optimized (immediate return)
345 359 for lang in missing_langs:
346   - self._translate_async(text, lang, source_lang, context, prompt)
347   - # Return None for missing translations
348   - for lang in missing_langs:
349   - results[lang] = None
  360 + target_lang = lang.lower()
  361 + # Check optimization conditions (same as in translate method)
  362 + can_optimize = False
  363 + if target_lang == 'en' and self._is_english_text(text):
  364 + can_optimize = True
  365 + elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
  366 + can_optimize = True
  367 +
  368 + if can_optimize:
  369 + # Can be optimized, call translate synchronously for immediate result
  370 + results[lang] = self.translate(text, lang, source_lang, context, prompt)
  371 + else:
  372 + # Requires actual translation, add to async list
  373 + async_langs.append(lang)
  374 +
  375 + # Launch async tasks for translations that require actual API calls
  376 + if async_langs:
  377 + for lang in async_langs:
  378 + self._translate_async(text, lang, source_lang, context, prompt)
  379 + # Return None for async translations
  380 + for lang in async_langs:
  381 + results[lang] = None
350 382 else:
351 383 # Synchronous mode: wait for all translations
352 384 for lang in missing_langs:
... ... @@ -496,3 +528,67 @@ class Translator:
496 528  
497 529 # Otherwise, translate to all supported languages
498 530 return supported_langs
  531 +
  532 + def _is_english_text(self, text: str) -> bool:
  533 + """
  534 + Check if text is primarily English (ASCII letters, numbers, common punctuation).
  535 +
  536 + Args:
  537 + text: Text to check
  538 +
  539 + Returns:
  540 + True if text appears to be English
  541 + """
  542 + if not text or not text.strip():
  543 + return True
  544 +
  545 + # Remove whitespace and common punctuation
  546 + text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text)
  547 + if not text_clean:
  548 + return True
  549 +
  550 + # Compute the share of remaining characters that are ASCII (letters, numbers)
  551 + # This is a simple heuristic: if most characters are ASCII, it's likely English
  552 + ascii_count = sum(1 for c in text_clean if ord(c) < 128)
  553 + ratio = ascii_count / len(text_clean) if text_clean else 0
  554 +
  555 + # If more than 80% are ASCII characters, consider it English
  556 + return ratio > 0.8
  557 +
  558 + def _contains_chinese(self, text: str) -> bool:
  559 + """
  560 + Check if text contains Chinese characters (Han characters).
  561 +
  562 + Args:
  563 + text: Text to check
  564 +
  565 + Returns:
  566 + True if text contains Chinese characters
  567 + """
  568 + if not text:
  569 + return False
  570 +
  571 + # Check for Chinese characters (Unicode range: \u4e00-\u9fff)
  572 + chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
  573 + return bool(chinese_pattern.search(text))
  574 +
  575 + def _is_pure_number(self, text: str) -> bool:
  576 + """
  577 + Check if text is purely numeric (digits, possibly with spaces, dots, commas).
  578 +
  579 + Args:
  580 + text: Text to check
  581 +
  582 + Returns:
  583 + True if text is purely numeric
  584 + """
  585 + if not text or not text.strip():
  586 + return False
  587 +
  588 + # Remove whitespace, dots, commas (common number separators)
  589 + text_clean = re.sub(r'[\s\.,]', '', text.strip())
  590 + if not text_clean:
  591 + return False
  592 +
  593 + # Check if all remaining characters are digits
  594 + return text_clean.isdigit()
... ...
search/es_query_builder.py
... ... @@ -25,7 +25,8 @@ class ESQueryBuilder:
25 25 image_embedding_field: Optional[str] = None,
26 26 source_fields: Optional[List[str]] = None,
27 27 function_score_config: Optional[FunctionScoreConfig] = None,
28   - enable_multilang_search: bool = True
  28 + enable_multilang_search: bool = True,
  29 + default_language: str = "zh"
29 30 ):
30 31 """
31 32 Initialize query builder.
... ... @@ -38,6 +39,7 @@ class ESQueryBuilder:
38 39 source_fields: Fields to return in search results (_source includes)
39 40 function_score_config: Function score configuration
40 41 enable_multilang_search: Enable multi-language search using translations
  42 + default_language: Default language to use when detection fails or returns "unknown"
41 43 """
42 44 self.index_name = index_name
43 45 self.match_fields = match_fields
... ... @@ -46,6 +48,7 @@ class ESQueryBuilder:
46 48 self.source_fields = source_fields
47 49 self.function_score_config = function_score_config
48 50 self.enable_multilang_search = enable_multilang_search
  51 + self.default_language = default_language
49 52  
50 53 def _split_filters_for_faceting(
51 54 self,
... ... @@ -422,7 +425,7 @@ class ESQueryBuilder:
422 425  
423 426 # Get query analysis from parsed_query
424 427 translations = {}
425   - language = 'zh'
  428 + language = self.default_language
426 429 keywords = ""
427 430 token_count = 0
428 431 is_short_query = False
... ... @@ -430,7 +433,12 @@ class ESQueryBuilder:
430 433  
431 434 if parsed_query:
432 435 translations = parsed_query.translations or {}
433   - language = parsed_query.detected_language or 'zh'
  436 + # Use default language if detected_language is None or "unknown"
  437 + detected_lang = parsed_query.detected_language
  438 + if not detected_lang or detected_lang == "unknown":
  439 + language = self.default_language
  440 + else:
  441 + language = detected_lang
434 442 keywords = getattr(parsed_query, 'keywords', '') or ""
435 443 token_count = getattr(parsed_query, 'token_count', 0) or 0
436 444 is_short_query = getattr(parsed_query, 'is_short_query', False)
... ... @@ -458,7 +466,7 @@ class ESQueryBuilder:
458 466  
459 467 # 2. Translation queries - lower boost (0.4) for other languages
460 468 if self.enable_multilang_search:
461   - if language != 'zh' and translations.get('zh') and translations['zh'] != query_text:
  469 + if language != 'zh' and translations.get('zh'):
462 470 zh_fields, _ = self._get_match_fields('zh')
463 471 should_clauses.append({
464 472 "multi_match": {
... ... @@ -472,7 +480,7 @@ class ESQueryBuilder:
472 480 }
473 481 })
474 482  
475   - if language != 'en' and translations.get('en') and translations['en'] != query_text:
  483 + if language != 'en' and translations.get('en'):
476 484 en_fields, _ = self._get_match_fields('en')
477 485 should_clauses.append({
478 486 "multi_match": {
... ...
search/searcher.py
... ... @@ -113,7 +113,8 @@ class Searcher:
113 113 image_embedding_field=self.image_embedding_field,
114 114 source_fields=self.source_fields,
115 115 function_score_config=self.config.function_score,
116   - enable_multilang_search=self.config.query_config.enable_multilang_search
  116 + enable_multilang_search=self.config.query_config.enable_multilang_search,
  117 + default_language=self.config.query_config.default_language
117 118 )
118 119  
119 120 def search(
... ...