diff --git a/frontend/index.html b/frontend/index.html index 81583f1..58e9b27 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -142,6 +142,6 @@

SearchEngine © 2025 | API: Loading...

- + diff --git a/frontend/static/js/app.js b/frontend/static/js/app.js index 1f287b8..2cc50da 100644 --- a/frontend/static/js/app.js +++ b/frontend/static/js/app.js @@ -554,7 +554,7 @@ function displayDebugInfo(data) { if (data.query_info) { let html = '
'; html += `
original_query: ${escapeHtml(data.query_info.original_query || 'N/A')}
`; - html += `
detected_language: ${getLanguageName(data.query_info.detected_language)}
`; + html += `
detected_language: ${data.query_info.detected_language}
`; html += '
'; debugInfoDiv.innerHTML = html; } else { @@ -573,7 +573,7 @@ function displayDebugInfo(data) { html += `
original_query: ${escapeHtml(debugInfo.query_analysis.original_query || 'N/A')}
`; html += `
normalized_query: ${escapeHtml(debugInfo.query_analysis.normalized_query || 'N/A')}
`; html += `
rewritten_query: ${escapeHtml(debugInfo.query_analysis.rewritten_query || 'N/A')}
`; - html += `
detected_language: ${getLanguageName(debugInfo.query_analysis.detected_language)}
`; + html += `
detected_language: ${debugInfo.query_analysis.detected_language}
`; html += `
domain: ${escapeHtml(debugInfo.query_analysis.domain || 'default')}
`; html += `
is_simple_query: ${debugInfo.query_analysis.is_simple_query ? 'yes' : 'no'}
`; @@ -581,7 +581,7 @@ function displayDebugInfo(data) { html += '
translations: '; for (const [lang, translation] of Object.entries(debugInfo.query_analysis.translations)) { if (translation) { - html += `${getLanguageName(lang)}: ${escapeHtml(translation)}; `; + html += `${lang}: ${escapeHtml(translation)}; `; } } html += '
'; @@ -669,14 +669,3 @@ function formatDate(dateStr) { } } -function getLanguageName(code) { - const names = { - 'zh': '中文', - 'en': 'English', - 'ru': 'Русский', - 'ar': 'العربية', - 'ja': '日本語', - 'unknown': 'Unknown' - }; - return names[code] || code; -} diff --git a/query/query_parser.py b/query/query_parser.py index 6e176fc..898a6fe 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -27,7 +27,7 @@ class ParsedQuery: original_query: str, normalized_query: str, rewritten_query: Optional[str] = None, - detected_language: str = "unknown", + detected_language: Optional[str] = None, translations: Dict[str, str] = None, query_vector: Optional[np.ndarray] = None, domain: str = "default", @@ -210,6 +210,9 @@ class QueryParser: # Stage 3: Language detection detected_lang = self.language_detector.detect(query_text) + # Use default language if detection failed (None or "unknown") + if not detected_lang or detected_lang == "unknown": + detected_lang = self.config.query_config.default_language log_info(f"语言检测 | 检测到语言: {detected_lang}") if context: context.store_intermediate_result('detected_language', detected_lang) diff --git a/query/translator.py b/query/translator.py index 040f892..f323c53 100644 --- a/query/translator.py +++ b/query/translator.py @@ -12,6 +12,7 @@ https://developers.deepl.com/api-reference/translate/request-translation """ import requests +import re from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional from utils.cache import DictCache @@ -110,6 +111,15 @@ class Translator: if source_lang: source_lang = source_lang.lower() + # Optimization: Skip translation if not needed + if target_lang == 'en' and self._is_english_text(text): + logger.debug(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") + return text + + if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): + logger.debug(f"[Translator] Text contains Chinese or is pure number, 
skipping translation: '{text[:50]}...'") + return text + # Use provided context or default context translation_context = context or self.translation_context @@ -312,8 +322,10 @@ class Translator: In async_mode=True (default): - Returns cached translations immediately if available - - Launches async tasks for missing translations (non-blocking) - - Returns None for missing translations (will be available in cache next time) + - For translations that can be optimized (e.g., pure numbers, already in target language), + returns result immediately via synchronous call + - Launches async tasks for other missing translations (non-blocking) + - Returns None for missing translations that require async processing In async_mode=False: - Waits for all translations to complete (blocking) @@ -331,6 +343,7 @@ class Translator: """ results = {} missing_langs = [] + async_langs = [] # First, get cached translations for lang in target_langs: @@ -340,13 +353,32 @@ class Translator: else: missing_langs.append(lang) - # If async mode and there are missing translations, launch async tasks + # If async mode and there are missing translations if async_mode and missing_langs: + # Check if translation can be optimized (immediate return) for lang in missing_langs: - self._translate_async(text, lang, source_lang, context, prompt) - # Return None for missing translations - for lang in missing_langs: - results[lang] = None + target_lang = lang.lower() + # Check optimization conditions (same as in translate method) + can_optimize = False + if target_lang == 'en' and self._is_english_text(text): + can_optimize = True + elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): + can_optimize = True + + if can_optimize: + # Can be optimized, call translate synchronously for immediate result + results[lang] = self.translate(text, lang, source_lang, context, prompt) + else: + # Requires actual translation, add to async list + async_langs.append(lang) + + # Launch 
async tasks for translations that require actual API calls + if async_langs: + for lang in async_langs: + self._translate_async(text, lang, source_lang, context, prompt) + # Return None for async translations + for lang in async_langs: + results[lang] = None else: # Synchronous mode: wait for all translations for lang in missing_langs: @@ -496,3 +528,67 @@ class Translator: # Otherwise, translate to all supported languages return supported_langs + + def _is_english_text(self, text: str) -> bool: + """ + Check if text is primarily English (ASCII letters, numbers, common punctuation). + + Args: + text: Text to check + + Returns: + True if text appears to be English + """ + if not text or not text.strip(): + return True + + # Remove whitespace and common punctuation + text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) + if not text_clean: + return True + + # Check if all remaining characters are ASCII (letters, numbers) + # This is a simple heuristic: if most characters are ASCII, it's likely English + ascii_count = sum(1 for c in text_clean if ord(c) < 128) + ratio = ascii_count / len(text_clean) if text_clean else 0 + + # If more than 80% are ASCII characters, consider it English + return ratio > 0.8 + + def _contains_chinese(self, text: str) -> bool: + """ + Check if text contains Chinese characters (Han characters). + + Args: + text: Text to check + + Returns: + True if text contains Chinese characters + """ + if not text: + return False + + # Check for Chinese characters (Unicode range: \u4e00-\u9fff) + chinese_pattern = re.compile(r'[\u4e00-\u9fff]') + return bool(chinese_pattern.search(text)) + + def _is_pure_number(self, text: str) -> bool: + """ + Check if text is purely numeric (digits, possibly with spaces, dots, commas). 
+ + Args: + text: Text to check + + Returns: + True if text is purely numeric + """ + if not text or not text.strip(): + return False + + # Remove whitespace, dots, commas (common number separators) + text_clean = re.sub(r'[\s\.,]', '', text.strip()) + if not text_clean: + return False + + # Check if all remaining characters are digits + return text_clean.isdigit() diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 3338585..1405416 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -25,7 +25,8 @@ class ESQueryBuilder: image_embedding_field: Optional[str] = None, source_fields: Optional[List[str]] = None, function_score_config: Optional[FunctionScoreConfig] = None, - enable_multilang_search: bool = True + enable_multilang_search: bool = True, + default_language: str = "zh" ): """ Initialize query builder. @@ -38,6 +39,7 @@ class ESQueryBuilder: source_fields: Fields to return in search results (_source includes) function_score_config: Function score configuration enable_multilang_search: Enable multi-language search using translations + default_language: Default language to use when detection fails or returns "unknown" """ self.index_name = index_name self.match_fields = match_fields @@ -46,6 +48,7 @@ class ESQueryBuilder: self.source_fields = source_fields self.function_score_config = function_score_config self.enable_multilang_search = enable_multilang_search + self.default_language = default_language def _split_filters_for_faceting( self, @@ -422,7 +425,7 @@ class ESQueryBuilder: # Get query analysis from parsed_query translations = {} - language = 'zh' + language = self.default_language keywords = "" token_count = 0 is_short_query = False @@ -430,7 +433,12 @@ class ESQueryBuilder: if parsed_query: translations = parsed_query.translations or {} - language = parsed_query.detected_language or 'zh' + # Use default language if detected_language is None or "unknown" + detected_lang = parsed_query.detected_language + if 
not detected_lang or detected_lang == "unknown": + language = self.default_language + else: + language = detected_lang keywords = getattr(parsed_query, 'keywords', '') or "" token_count = getattr(parsed_query, 'token_count', 0) or 0 is_short_query = getattr(parsed_query, 'is_short_query', False) @@ -458,7 +466,7 @@ class ESQueryBuilder: # 2. Translation queries - lower boost (0.4) for other languages if self.enable_multilang_search: - if language != 'zh' and translations.get('zh') and translations['zh'] != query_text: + if language != 'zh' and translations.get('zh'): zh_fields, _ = self._get_match_fields('zh') should_clauses.append({ "multi_match": { @@ -472,7 +480,7 @@ class ESQueryBuilder: } }) - if language != 'en' and translations.get('en') and translations['en'] != query_text: + if language != 'en' and translations.get('en'): en_fields, _ = self._get_match_fields('en') should_clauses.append({ "multi_match": { diff --git a/search/searcher.py b/search/searcher.py index 9c3b217..cd262f8 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -113,7 +113,8 @@ class Searcher: image_embedding_field=self.image_embedding_field, source_fields=self.source_fields, function_score_config=self.config.function_score, - enable_multilang_search=self.config.query_config.enable_multilang_search + enable_multilang_search=self.config.query_config.enable_multilang_search, + default_language=self.config.query_config.default_language ) def search( -- libgit2 0.21.2