Commit a5a6bab82022a86b85ccb14b09355c18ee170412
1 parent
11237cf2
多语言查询优化
Showing
6 changed files
with
126 additions
and
29 deletions
Show diff stats
frontend/index.html
frontend/static/js/app.js
| ... | ... | @@ -554,7 +554,7 @@ function displayDebugInfo(data) { |
| 554 | 554 | if (data.query_info) { |
| 555 | 555 | let html = '<div style="padding: 10px;">'; |
| 556 | 556 | html += `<div><strong>original_query:</strong> ${escapeHtml(data.query_info.original_query || 'N/A')}</div>`; |
| 557 | - html += `<div><strong>detected_language:</strong> ${getLanguageName(data.query_info.detected_language)}</div>`; | |
| 557 | + html += `<div><strong>detected_language:</strong> ${data.query_info.detected_language}</div>`; | |
| 558 | 558 | html += '</div>'; |
| 559 | 559 | debugInfoDiv.innerHTML = html; |
| 560 | 560 | } else { |
| ... | ... | @@ -573,7 +573,7 @@ function displayDebugInfo(data) { |
| 573 | 573 | html += `<div>original_query: ${escapeHtml(debugInfo.query_analysis.original_query || 'N/A')}</div>`; |
| 574 | 574 | html += `<div>normalized_query: ${escapeHtml(debugInfo.query_analysis.normalized_query || 'N/A')}</div>`; |
| 575 | 575 | html += `<div>rewritten_query: ${escapeHtml(debugInfo.query_analysis.rewritten_query || 'N/A')}</div>`; |
| 576 | - html += `<div>detected_language: ${getLanguageName(debugInfo.query_analysis.detected_language)}</div>`; | |
| 576 | + html += `<div>detected_language: ${debugInfo.query_analysis.detected_language}</div>`; | |
| 577 | 577 | html += `<div>domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}</div>`; |
| 578 | 578 | html += `<div>is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}</div>`; |
| 579 | 579 | |
| ... | ... | @@ -581,7 +581,7 @@ function displayDebugInfo(data) { |
| 581 | 581 | html += '<div>translations: '; |
| 582 | 582 | for (const [lang, translation] of Object.entries(debugInfo.query_analysis.translations)) { |
| 583 | 583 | if (translation) { |
| 584 | - html += `${getLanguageName(lang)}: ${escapeHtml(translation)}; `; | |
| 584 | + html += `${lang}: ${escapeHtml(translation)}; `; | |
| 585 | 585 | } |
| 586 | 586 | } |
| 587 | 587 | html += '</div>'; |
| ... | ... | @@ -669,14 +669,3 @@ function formatDate(dateStr) { |
| 669 | 669 | } |
| 670 | 670 | } |
| 671 | 671 | |
| 672 | -function getLanguageName(code) { | |
| 673 | - const names = { | |
| 674 | - 'zh': '中文', | |
| 675 | - 'en': 'English', | |
| 676 | - 'ru': 'Русский', | |
| 677 | - 'ar': 'العربية', | |
| 678 | - 'ja': '日本語', | |
| 679 | - 'unknown': 'Unknown' | |
| 680 | - }; | |
| 681 | - return names[code] || code; | |
| 682 | -} | ... | ... |
query/query_parser.py
| ... | ... | @@ -27,7 +27,7 @@ class ParsedQuery: |
| 27 | 27 | original_query: str, |
| 28 | 28 | normalized_query: str, |
| 29 | 29 | rewritten_query: Optional[str] = None, |
| 30 | - detected_language: str = "unknown", | |
| 30 | + detected_language: Optional[str] = None, | |
| 31 | 31 | translations: Dict[str, str] = None, |
| 32 | 32 | query_vector: Optional[np.ndarray] = None, |
| 33 | 33 | domain: str = "default", |
| ... | ... | @@ -210,6 +210,9 @@ class QueryParser: |
| 210 | 210 | |
| 211 | 211 | # Stage 3: Language detection |
| 212 | 212 | detected_lang = self.language_detector.detect(query_text) |
| 213 | + # Use default language if detection failed (None or "unknown") | |
| 214 | + if not detected_lang or detected_lang == "unknown": | |
| 215 | + detected_lang = self.config.query_config.default_language | |
| 213 | 216 | log_info(f"语言检测 | 检测到语言: {detected_lang}") |
| 214 | 217 | if context: |
| 215 | 218 | context.store_intermediate_result('detected_language', detected_lang) | ... | ... |
query/translator.py
| ... | ... | @@ -12,6 +12,7 @@ https://developers.deepl.com/api-reference/translate/request-translation |
| 12 | 12 | """ |
| 13 | 13 | |
| 14 | 14 | import requests |
| 15 | +import re | |
| 15 | 16 | from concurrent.futures import ThreadPoolExecutor |
| 16 | 17 | from typing import Dict, List, Optional |
| 17 | 18 | from utils.cache import DictCache |
| ... | ... | @@ -110,6 +111,15 @@ class Translator: |
| 110 | 111 | if source_lang: |
| 111 | 112 | source_lang = source_lang.lower() |
| 112 | 113 | |
| 114 | + # Optimization: Skip translation if not needed | |
| 115 | + if target_lang == 'en' and self._is_english_text(text): | |
| 116 | + logger.debug(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") | |
| 117 | + return text | |
| 118 | + | |
| 119 | + if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | |
| 120 | + logger.debug(f"[Translator] Text contains Chinese or is pure number, skipping translation: '{text[:50]}...'") | |
| 121 | + return text | |
| 122 | + | |
| 113 | 123 | # Use provided context or default context |
| 114 | 124 | translation_context = context or self.translation_context |
| 115 | 125 | |
| ... | ... | @@ -312,8 +322,10 @@ class Translator: |
| 312 | 322 | |
| 313 | 323 | In async_mode=True (default): |
| 314 | 324 | - Returns cached translations immediately if available |
| 315 | - - Launches async tasks for missing translations (non-blocking) | |
| 316 | - - Returns None for missing translations (will be available in cache next time) | |
| 325 | + - For translations that can be optimized (e.g., pure numbers, already in target language), | |
| 326 | + returns result immediately via synchronous call | |
| 327 | + - Launches async tasks for other missing translations (non-blocking) | |
| 328 | + - Returns None for missing translations that require async processing | |
| 317 | 329 | |
| 318 | 330 | In async_mode=False: |
| 319 | 331 | - Waits for all translations to complete (blocking) |
| ... | ... | @@ -331,6 +343,7 @@ class Translator: |
| 331 | 343 | """ |
| 332 | 344 | results = {} |
| 333 | 345 | missing_langs = [] |
| 346 | + async_langs = [] | |
| 334 | 347 | |
| 335 | 348 | # First, get cached translations |
| 336 | 349 | for lang in target_langs: |
| ... | ... | @@ -340,13 +353,32 @@ class Translator: |
| 340 | 353 | else: |
| 341 | 354 | missing_langs.append(lang) |
| 342 | 355 | |
| 343 | - # If async mode and there are missing translations, launch async tasks | |
| 356 | + # If async mode and there are missing translations | |
| 344 | 357 | if async_mode and missing_langs: |
| 358 | + # Check if translation can be optimized (immediate return) | |
| 345 | 359 | for lang in missing_langs: |
| 346 | - self._translate_async(text, lang, source_lang, context, prompt) | |
| 347 | - # Return None for missing translations | |
| 348 | - for lang in missing_langs: | |
| 349 | - results[lang] = None | |
| 360 | + target_lang = lang.lower() | |
| 361 | + # Check optimization conditions (same as in translate method) | |
| 362 | + can_optimize = False | |
| 363 | + if target_lang == 'en' and self._is_english_text(text): | |
| 364 | + can_optimize = True | |
| 365 | + elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): | |
| 366 | + can_optimize = True | |
| 367 | + | |
| 368 | + if can_optimize: | |
| 369 | + # Can be optimized, call translate synchronously for immediate result | |
| 370 | + results[lang] = self.translate(text, lang, source_lang, context, prompt) | |
| 371 | + else: | |
| 372 | + # Requires actual translation, add to async list | |
| 373 | + async_langs.append(lang) | |
| 374 | + | |
| 375 | + # Launch async tasks for translations that require actual API calls | |
| 376 | + if async_langs: | |
| 377 | + for lang in async_langs: | |
| 378 | + self._translate_async(text, lang, source_lang, context, prompt) | |
| 379 | + # Return None for async translations | |
| 380 | + for lang in async_langs: | |
| 381 | + results[lang] = None | |
| 350 | 382 | else: |
| 351 | 383 | # Synchronous mode: wait for all translations |
| 352 | 384 | for lang in missing_langs: |
| ... | ... | @@ -496,3 +528,67 @@ class Translator: |
| 496 | 528 | |
| 497 | 529 | # Otherwise, translate to all supported languages |
| 498 | 530 | return supported_langs |
| 531 | + | |
| 532 | + def _is_english_text(self, text: str) -> bool: | |
| 533 | + """ | |
| 534 | + Check if text is primarily English (ASCII letters, numbers, common punctuation). | |
| 535 | + | |
| 536 | + Args: | |
| 537 | + text: Text to check | |
| 538 | + | |
| 539 | + Returns: | |
| 540 | + True if text appears to be English | |
| 541 | + """ | |
| 542 | + if not text or not text.strip(): | |
| 543 | + return True | |
| 544 | + | |
| 545 | + # Remove whitespace and common punctuation | |
| 546 | + text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) | |
| 547 | + if not text_clean: | |
| 548 | + return True | |
| 549 | + | |
| 550 | + # Check if all remaining characters are ASCII (letters, numbers) | |
| 551 | + # This is a simple heuristic: if most characters are ASCII, it's likely English | |
| 552 | + ascii_count = sum(1 for c in text_clean if ord(c) < 128) | |
| 553 | + ratio = ascii_count / len(text_clean) if text_clean else 0 | |
| 554 | + | |
| 555 | + # If more than 80% are ASCII characters, consider it English | |
| 556 | + return ratio > 0.8 | |
| 557 | + | |
| 558 | + def _contains_chinese(self, text: str) -> bool: | |
| 559 | + """ | |
| 560 | + Check if text contains Chinese characters (Han characters). | |
| 561 | + | |
| 562 | + Args: | |
| 563 | + text: Text to check | |
| 564 | + | |
| 565 | + Returns: | |
| 566 | + True if text contains Chinese characters | |
| 567 | + """ | |
| 568 | + if not text: | |
| 569 | + return False | |
| 570 | + | |
| 571 | + # Match CJK Unified Ideographs (Unicode range: \u4e00-\u9fff); note this range also covers Japanese kanji | |
| 572 | + chinese_pattern = re.compile(r'[\u4e00-\u9fff]') | |
| 573 | + return bool(chinese_pattern.search(text)) | |
| 574 | + | |
| 575 | + def _is_pure_number(self, text: str) -> bool: | |
| 576 | + """ | |
| 577 | + Check if text is purely numeric (digits, possibly with spaces, dots, commas). | |
| 578 | + | |
| 579 | + Args: | |
| 580 | + text: Text to check | |
| 581 | + | |
| 582 | + Returns: | |
| 583 | + True if text is purely numeric | |
| 584 | + """ | |
| 585 | + if not text or not text.strip(): | |
| 586 | + return False | |
| 587 | + | |
| 588 | + # Remove whitespace, dots, commas (common number separators) | |
| 589 | + text_clean = re.sub(r'[\s\.,]', '', text.strip()) | |
| 590 | + if not text_clean: | |
| 591 | + return False | |
| 592 | + | |
| 593 | + # Check if all remaining characters are digits | |
| 594 | + return text_clean.isdigit() | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -25,7 +25,8 @@ class ESQueryBuilder: |
| 25 | 25 | image_embedding_field: Optional[str] = None, |
| 26 | 26 | source_fields: Optional[List[str]] = None, |
| 27 | 27 | function_score_config: Optional[FunctionScoreConfig] = None, |
| 28 | - enable_multilang_search: bool = True | |
| 28 | + enable_multilang_search: bool = True, | |
| 29 | + default_language: str = "zh" | |
| 29 | 30 | ): |
| 30 | 31 | """ |
| 31 | 32 | Initialize query builder. |
| ... | ... | @@ -38,6 +39,7 @@ class ESQueryBuilder: |
| 38 | 39 | source_fields: Fields to return in search results (_source includes) |
| 39 | 40 | function_score_config: Function score configuration |
| 40 | 41 | enable_multilang_search: Enable multi-language search using translations |
| 42 | + default_language: Default language to use when detection fails or returns "unknown" | |
| 41 | 43 | """ |
| 42 | 44 | self.index_name = index_name |
| 43 | 45 | self.match_fields = match_fields |
| ... | ... | @@ -46,6 +48,7 @@ class ESQueryBuilder: |
| 46 | 48 | self.source_fields = source_fields |
| 47 | 49 | self.function_score_config = function_score_config |
| 48 | 50 | self.enable_multilang_search = enable_multilang_search |
| 51 | + self.default_language = default_language | |
| 49 | 52 | |
| 50 | 53 | def _split_filters_for_faceting( |
| 51 | 54 | self, |
| ... | ... | @@ -422,7 +425,7 @@ class ESQueryBuilder: |
| 422 | 425 | |
| 423 | 426 | # Get query analysis from parsed_query |
| 424 | 427 | translations = {} |
| 425 | - language = 'zh' | |
| 428 | + language = self.default_language | |
| 426 | 429 | keywords = "" |
| 427 | 430 | token_count = 0 |
| 428 | 431 | is_short_query = False |
| ... | ... | @@ -430,7 +433,12 @@ class ESQueryBuilder: |
| 430 | 433 | |
| 431 | 434 | if parsed_query: |
| 432 | 435 | translations = parsed_query.translations or {} |
| 433 | - language = parsed_query.detected_language or 'zh' | |
| 436 | + # Use default language if detected_language is None or "unknown" | |
| 437 | + detected_lang = parsed_query.detected_language | |
| 438 | + if not detected_lang or detected_lang == "unknown": | |
| 439 | + language = self.default_language | |
| 440 | + else: | |
| 441 | + language = detected_lang | |
| 434 | 442 | keywords = getattr(parsed_query, 'keywords', '') or "" |
| 435 | 443 | token_count = getattr(parsed_query, 'token_count', 0) or 0 |
| 436 | 444 | is_short_query = getattr(parsed_query, 'is_short_query', False) |
| ... | ... | @@ -458,7 +466,7 @@ class ESQueryBuilder: |
| 458 | 466 | |
| 459 | 467 | # 2. Translation queries - lower boost (0.4) for other languages |
| 460 | 468 | if self.enable_multilang_search: |
| 461 | - if language != 'zh' and translations.get('zh') and translations['zh'] != query_text: | |
| 469 | + if language != 'zh' and translations.get('zh'): | |
| 462 | 470 | zh_fields, _ = self._get_match_fields('zh') |
| 463 | 471 | should_clauses.append({ |
| 464 | 472 | "multi_match": { |
| ... | ... | @@ -472,7 +480,7 @@ class ESQueryBuilder: |
| 472 | 480 | } |
| 473 | 481 | }) |
| 474 | 482 | |
| 475 | - if language != 'en' and translations.get('en') and translations['en'] != query_text: | |
| 483 | + if language != 'en' and translations.get('en'): | |
| 476 | 484 | en_fields, _ = self._get_match_fields('en') |
| 477 | 485 | should_clauses.append({ |
| 478 | 486 | "multi_match": { | ... | ... |
search/searcher.py
| ... | ... | @@ -113,7 +113,8 @@ class Searcher: |
| 113 | 113 | image_embedding_field=self.image_embedding_field, |
| 114 | 114 | source_fields=self.source_fields, |
| 115 | 115 | function_score_config=self.config.function_score, |
| 116 | - enable_multilang_search=self.config.query_config.enable_multilang_search | |
| 116 | + enable_multilang_search=self.config.query_config.enable_multilang_search, | |
| 117 | + default_language=self.config.query_config.default_language | |
| 117 | 118 | ) |
| 118 | 119 | |
| 119 | 120 | def search( | ... | ... |