""" Translation service for multi-language query support. Supports multiple translation models: - Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model - DeepL: DeepL API for high-quality translations 使用方法 (Usage): ```python from query.translator import Translator # 使用默认的 qwen 模型(推荐) translator = Translator() # 默认使用 qwen 模型 # 或显式指定模型 translator = Translator(model='qwen') # 使用 qwen 模型 translator = Translator(model='deepl') # 使用 DeepL 模型 # 翻译文本 result = translator.translate( text="我看到这个视频后没有笑", target_lang="en", source_lang="auto" # 自动检测源语言 ) ``` 配置说明 (Configuration): - Qwen 模型需要设置 DASHSCOPE_API_KEY 环境变量(在 .env 文件中) - DeepL 模型需要设置 DEEPL_AUTH_KEY 环境变量(在 .env 文件中) Qwen 模型参考文档: - 官方文档:https://help.aliyun.com/zh/model-studio/get-api-key - 模型:qwen-mt-flash(快速翻译模型) DeepL 官方文档: https://developers.deepl.com/api-reference/translate/request-translation """ import os import requests import re import redis from concurrent.futures import ThreadPoolExecutor, Future from datetime import timedelta from typing import Dict, List, Optional, Union import logging import time logger = logging.getLogger(__name__) from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG from openai import OpenAI class Translator: """ Multi-language translator supporting Qwen and DeepL APIs. Default model is 'qwen' which uses Alibaba Cloud DashScope API. """ DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier QWEN_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1" # 北京地域 # 如果使用新加坡地域的模型,需要将base_url替换为:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 QWEN_MODEL = "qwen-mt-flash" # 快速翻译模型 # Language code mapping LANG_CODE_MAP = { 'zh': 'ZH', 'en': 'EN', 'ru': 'RU', 'ar': 'AR', 'ja': 'JA', 'es': 'ES', 'de': 'DE', 'fr': 'FR', 'it': 'IT', 'pt': 'PT', } def __init__( self, model: str = "qwen", api_key: Optional[str] = None, use_cache: bool = True, timeout: int = 10, glossary_id: Optional[str] = None, translation_context: Optional[str] = None ): """ Initialize translator. Args: model: Translation model to use. Options: 'qwen' (default) or 'deepl' api_key: API key for the selected model (or None to use from config/env) use_cache: Whether to cache translations timeout: Request timeout in seconds glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL) translation_context: Context hint for translation (e.g., "e-commerce", "product search") """ self.model = model.lower() if self.model not in ['qwen', 'deepl']: raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'") # Get API key from config if not provided if api_key is None: if self.model == 'qwen': api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY") else: # deepl api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY") self.api_key = api_key self.timeout = timeout self.use_cache = use_cache self.glossary_id = glossary_id self.translation_context = translation_context or "e-commerce product search" # Initialize OpenAI client for Qwen if needed self.qwen_client = None if self.model == 'qwen': if not self.api_key: logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.") else: self.qwen_client = OpenAI( api_key=self.api_key, base_url=self.QWEN_BASE_URL, ) # Initialize Redis cache if enabled if use_cache: try: self.redis_client = redis.Redis( host=REDIS_CONFIG.get('host', 'localhost'), port=REDIS_CONFIG.get('port', 6479), password=REDIS_CONFIG.get('password'), decode_responses=True, # Return str instead of bytes socket_timeout=REDIS_CONFIG.get('socket_timeout', 1), socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1), retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False), health_check_interval=10, # 避免复用坏连接 ) # Test connection self.redis_client.ping() expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360) self.expire_time = timedelta(days=expire_days) self.expire_seconds = int(self.expire_time.total_seconds()) # Redis 需要秒数 self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans') logger.info("Redis cache initialized for translations") except Exception as e: logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache") self.redis_client = None self.cache = None else: self.redis_client = None self.cache = None # Thread pool for async translation self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator") def translate( self, text: str, target_lang: str, source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None ) -> Optional[str]: """ Translate text to target language (synchronous mode). Args: text: Text to translate target_lang: Target language code ('zh', 'en', 'ru', etc.) source_lang: Source language code (option al, auto-detect if None) context: Additional context for translation (overrides default context) prompt: Translation prompt/instruction (optional, for better translation quality) Returns: Translated text or None if translation fails """ if not text or not text.strip(): return text # Normalize language codes target_lang = target_lang.lower() if source_lang: source_lang = source_lang.lower() # Optimization: Skip translation if not needed if target_lang == 'en' and self._is_english_text(text): logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'") return text if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): logger.info( f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)" ) return text # Use provided context or default context translation_context = context or self.translation_context # Build cache key (include prompt in cache key if provided) cache_key_parts = [source_lang or 'auto', target_lang, translation_context] if prompt: cache_key_parts.append(prompt) cache_key_parts.append(text) cache_key = ':'.join(cache_key_parts) # Check cache (include context and prompt in cache key for accuracy) if self.use_cache and self.redis_client: cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt) if cached: logger.info( f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit" ) return cached # If no API key, return mock translation (for testing) if not self.api_key: logger.info( f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)" ) return text # Translate using selected model logger.info( f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | " f"Source language: {source_lang or 'auto'} | Context: {translation_context} | " f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation" ) if self.model == 'qwen': result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt) else: # deepl result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) # If still failed, return original text with warning if result is None: logger.warning( f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " f"Source language: {source_lang or 'auto'} | Result: '{text}' | Status: Translation failed, returning original" ) result = text else: logger.info( f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" ) # Cache result if result and self.use_cache and self.redis_client: self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt) return result def _translate_qwen( self, text: str, target_lang: str, source_lang: Optional[str], context: Optional[str] = None, prompt: Optional[str] = None ) -> Optional[str]: """ Translate using Qwen MT Flash model via Alibaba Cloud DashScope API. Args: text: Text to translate target_lang: Target language code ('zh', 'en', 'ru', etc.) source_lang: Source language code (optional, 'auto' if None) context: Context hint for translation (optional) prompt: Translation prompt/instruction (optional) Returns: Translated text or None if translation fails """ if not self.qwen_client: logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.") return None # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping # 标准来自:你提供的“语言 / 英文名 / 代码”表 qwen_lang_map = { "en": "English", "zh": "Chinese", "zh_tw": "Traditional Chinese", "ru": "Russian", "ja": "Japanese", "ko": "Korean", "es": "Spanish", "fr": "French", "pt": "Portuguese", "de": "German", "it": "Italian", "th": "Thai", "vi": "Vietnamese", "id": "Indonesian", "ms": "Malay", "ar": "Arabic", "hi": "Hindi", "he": "Hebrew", "my": "Burmese", "ta": "Tamil", "ur": "Urdu", "bn": "Bengali", "pl": "Polish", "nl": "Dutch", "ro": "Romanian", "tr": "Turkish", "km": "Khmer", "lo": "Lao", "yue": "Cantonese", "cs": "Czech", "el": "Greek", "sv": "Swedish", "hu": "Hungarian", "da": "Danish", "fi": "Finnish", "uk": "Ukrainian", "bg": "Bulgarian", } # Convert target language target_lang_normalized = target_lang.lower() target_lang_qwen = qwen_lang_map.get(target_lang_normalized, target_lang.capitalize()) # Convert source language source_lang_normalized = (source_lang or "").strip().lower() if not source_lang_normalized or source_lang_normalized == "auto": source_lang_qwen = "auto" else: source_lang_qwen = qwen_lang_map.get(source_lang_normalized, source_lang.capitalize()) # Prepare translation options translation_options = { "source_lang": source_lang_qwen, "target_lang": target_lang_qwen, } # Prepare messages messages = [ { "role": "user", "content": text } ] start_time = time.time() try: completion = self.qwen_client.chat.completions.create( model=self.QWEN_MODEL, messages=messages, extra_body={ "translation_options": translation_options } ) translated_text = completion.choices[0].message.content.strip() duration_ms = (time.time() - start_time) * 1000 logger.info( f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | " f"Translation result: '{translated_text}' | Duration: {duration_ms:.2f} ms" ) return translated_text except Exception as e: duration_ms = (time.time() - start_time) * 1000 logger.error( f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | " f"Duration: {duration_ms:.2f} ms | Error: {e}", exc_info=True ) return None def _translate_deepl( self, text: str, target_lang: str, source_lang: Optional[str], context: Optional[str] = None, prompt: Optional[str] = None ) -> Optional[str]: """ Translate using DeepL API with context and glossary support. Args: text: Text to translate target_lang: Target language code source_lang: Source language code (optional) context: Context hint for translation (e.g., "e-commerce product search") """ # Map to DeepL language codes target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper()) headers = { "Authorization": f"DeepL-Auth-Key {self.api_key}", "Content-Type": "application/json", } # Use prompt as context parameter for DeepL API (not as text prefix) # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself" # If prompt is provided, use it as context; otherwise use the default context api_context = prompt if prompt else context # For e-commerce, add context words to help DeepL understand the domain # This is especially important for single-word ambiguous terms like "车" (car vs rook) text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context) payload = { "text": [text_to_translate], "target_lang": target_code, } if source_lang: source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper()) payload["source_lang"] = source_code # Add context parameter (prompt or default context) # Context influences translation but is not translated itself if api_context: payload["context"] = api_context # Add glossary if configured if self.glossary_id: payload["glossary_id"] = self.glossary_id # Note: DeepL API v2 supports "context" parameter for additional context # that influences translation but is not translated itself. # We use prompt as context parameter when provided. try: response = requests.post( self.DEEPL_API_URL, headers=headers, json=payload, timeout=self.timeout ) if response.status_code == 200: data = response.json() if "translations" in data and len(data["translations"]) > 0: translated_text = data["translations"][0]["text"] # If we added context, extract just the term from the result if needs_extraction: translated_text = self._extract_term_from_translation( translated_text, text, target_code ) logger.debug( f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | " f"Translation result: '{translated_text}'" ) return translated_text else: logger.error( f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | " f"Status code: {response.status_code} | Error message: {response.text}" ) return None except requests.Timeout: logger.warning( f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | " f"Timeout: {self.timeout}s" ) return None except Exception as e: logger.error( f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | " f"Error: {e}", exc_info=True ) return None # NOTE: _translate_deepl_free is intentionally not implemented. # We do not support automatic fallback to the free endpoint, to avoid # mixing Pro keys with https://api-free.deepl.com and related 403 errors. def translate_multi( self, text: str, target_langs: List[str], source_lang: Optional[str] = None, context: Optional[str] = None, async_mode: bool = True, prompt: Optional[str] = None ) -> Dict[str, Optional[str]]: """ Translate text to multiple target languages. In async_mode=True (default): - Returns cached translations immediately if available - For translations that can be optimized (e.g., pure numbers, already in target language), returns result immediately via synchronous call - Launches async tasks for other missing translations (non-blocking) - Returns None for missing translations that require async processing In async_mode=False: - Waits for all translations to complete (blocking) Args: text: Text to translate target_langs: List of target language codes source_lang: Source language code (optional) context: Context hint for translation (optional) async_mode: If True, return cached results immediately and translate missing ones async prompt: Translation prompt/instruction (optional) Returns: Dictionary mapping language code to translated text (only cached results in async mode) """ results = {} missing_langs = [] async_langs = [] # First, get cached translations for lang in target_langs: cached = self._get_cached_translation(text, lang, source_lang, context, prompt) if cached is not None: results[lang] = cached else: missing_langs.append(lang) # If async mode and there are missing translations if async_mode and missing_langs: # Check if translation can be optimized (immediate return) for lang in missing_langs: target_lang = lang.lower() # Check optimization conditions (same as in translate method) can_optimize = False if target_lang == 'en' and self._is_english_text(text): can_optimize = True elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): can_optimize = True if can_optimize: # Can be optimized, call translate synchronously for immediate result results[lang] = self.translate(text, lang, source_lang, context, prompt) else: # Requires actual translation, add to async list async_langs.append(lang) # Launch async tasks for translations that require actual API calls if async_langs: for lang in async_langs: self._translate_async(text, lang, source_lang, context, prompt) # Return None for async translations for lang in async_langs: results[lang] = None else: # Synchronous mode: wait for all translations for lang in missing_langs: results[lang] = self.translate(text, lang, source_lang, context, prompt) return results def translate_multi_async( self, text: str, target_langs: List[str], source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None ) -> Dict[str, Union[str, Future]]: """ Translate text to multiple target languages asynchronously, returning Futures that can be awaited. This method returns a dictionary where: - If translation is cached, the value is the translation string (immediate) - If translation needs to be done, the value is a Future object that can be awaited Args: text: Text to translate target_langs: List of target language codes source_lang: Source language code (optional) context: Context hint for translation (optional) prompt: Translation prompt/instruction (optional) Returns: Dictionary mapping language code to either translation string (cached) or Future object """ results = {} missing_langs = [] # First, get cached translations for lang in target_langs: cached = self._get_cached_translation(text, lang, source_lang, context, prompt) if cached is not None: results[lang] = cached else: missing_langs.append(lang) # For missing translations, submit async tasks and return Futures for lang in missing_langs: future = self.executor.submit( self.translate, text, lang, source_lang, context, prompt ) results[lang] = future return results def _get_cached_translation( self, text: str, target_lang: str, source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None ) -> Optional[str]: """Get translation from cache if available.""" if not self.redis_client: return None return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) def _get_cached_translation_redis( self, text: str, target_lang: str, source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None ) -> Optional[str]: """ Get translation from Redis cache with sliding expiration. 滑动过期机制:每次访问缓存时,重置过期时间为配置的过期时间(默认720天)。 这样缓存会在最后一次访问后的720天才过期,而不是写入后的720天。 这确保了常用的翻译缓存不会被过早删除。 """ if not self.redis_client: return None try: # Build cache key: prefix:target_lang:text # For simplicity, we use target_lang and text as key # Context and prompt are not included in key to maximize cache hits cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" value = self.redis_client.get(cache_key) if value: # Sliding expiration: reset expiration time on access # 每次读取缓存时,重置过期时间为配置的过期时间(最后一次访问后的N天才过期) try: self.redis_client.expire(cache_key, self.expire_seconds) except Exception as expire_error: # 即使 expire 失败,也返回缓存值(不影响功能) logger.warning( f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}" ) logger.debug( f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | " f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s" ) return value logger.debug( f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | " f"Cache key: {cache_key}" ) return None except Exception as e: logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}") return None def _set_cached_translation_redis( self, text: str, target_lang: str, translation: str, source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None ) -> None: """Store translation in Redis cache.""" if not self.redis_client: return try: cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" self.redis_client.setex(cache_key, self.expire_seconds, translation) logger.info( f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | " f"Cache key: {cache_key} | Translation result: '{translation}'" ) except Exception as e: logger.error( f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | " f"Error: {e}" ) def _translate_async( self, text: str, target_lang: str, source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None ): """Launch async translation task.""" def _do_translate(): try: result = self.translate(text, target_lang, source_lang, context, prompt) if result: logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}") except Exception as e: logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}") self.executor.submit(_do_translate) def _add_ecommerce_context( self, text: str, source_lang: Optional[str], context: Optional[str] ) -> tuple: """ Add e-commerce context to text for better disambiguation. For single-word ambiguous Chinese terms, we add context words that help DeepL understand this is an e-commerce/product search context. Args: text: Original text to translate source_lang: Source language code context: Context hint Returns: Tuple of (text_with_context, needs_extraction) - text_with_context: Text to send to DeepL - needs_extraction: Whether we need to extract the term from the result """ # Only apply for e-commerce context and Chinese source if not context or "e-commerce" not in context.lower(): return text, False if not source_lang or source_lang.lower() != 'zh': return text, False # For single-word queries, add context to help disambiguation text_stripped = text.strip() if len(text_stripped.split()) == 1 and len(text_stripped) <= 2: # Common ambiguous Chinese e-commerce terms like "车" (car vs rook) # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term]) # This helps DeepL understand the e-commerce context # We'll need to extract just the term from the translation result context_phrase = f"购买 {text_stripped}" return context_phrase, True # For multi-word queries, DeepL usually has enough context return text, False def _extract_term_from_translation( self, translated_text: str, original_text: str, target_lang_code: str ) -> str: """ Extract the actual term from a translation that included context. For example, if we translated "购买 车" (buy car) and got "buy car", we want to extract just "car". Args: translated_text: Full translation result original_text: Original single-word query target_lang_code: Target language code (EN, ZH, etc.) Returns: Extracted term or original translation if extraction fails """ # For English target, try to extract the last word (the actual term) if target_lang_code == "EN": words = translated_text.strip().split() if len(words) > 1: # Usually the last word is the term we want # But we need to be smart - if it's "buy car", we want "car" # Common context words to skip: buy, purchase, product, item, etc. context_words = {"buy", "purchase", "product", "item", "commodity", "goods"} # Try to find the term (not a context word) for word in reversed(words): word_lower = word.lower().rstrip('.,!?;:') if word_lower not in context_words: return word_lower # If all words are context words, return the last one return words[-1].lower().rstrip('.,!?;:') # For other languages or if extraction fails, return as-is # The user can configure a glossary for better results return translated_text def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool: """True if shop language matches index language (use source, no translate).""" if not shop_lang_lower or not lang_code: return False if shop_lang_lower == lang_code: return True if lang_code == "zh" and "zh" in shop_lang_lower: return True if lang_code == "en" and "en" in shop_lang_lower: return True return False def translate_for_indexing( self, text: str, shop_language: str, source_lang: Optional[str] = None, context: Optional[str] = None, prompt: Optional[str] = None, index_languages: Optional[List[str]] = None, ) -> Dict[str, Optional[str]]: """ Translate text for indexing based on shop language and tenant index_languages. For each language in index_languages: use source text if shop language matches, otherwise translate to that language. Args: text: Text to translate shop_language: Shop primary language (e.g. 'zh', 'en', 'ru') source_lang: Source language code (optional) context: Additional context for translation (optional) prompt: Translation prompt (optional) index_languages: Languages to index (from tenant_config). Default ["en", "zh"]. Returns: Dict keyed by each index_language with translated or source text (or None). """ langs = index_languages if index_languages else ["en", "zh"] results = {lang: None for lang in langs} if not text or not text.strip(): return results if re.match(r'^[\d\s_-]+$', text): logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'") return results shop_lang_lower = (shop_language or "").strip().lower() targets = [] for lang in langs: if self._shop_lang_matches(shop_lang_lower, lang): results[lang] = text else: targets.append(lang) for target_lang in targets: cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt) if cached: results[target_lang] = cached logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}") continue translated = self.translate( text, target_lang=target_lang, source_lang=source_lang or shop_language, context=context, prompt=prompt, ) results[target_lang] = translated return results def get_translation_needs( self, detected_lang: str, supported_langs: List[str] ) -> List[str]: """ Determine which languages need translation. Args: detected_lang: Detected query language supported_langs: List of supported languages Returns: List of language codes to translate to """ # If detected language is in supported list, translate to others if detected_lang in supported_langs: return [lang for lang in supported_langs if detected_lang != lang] # Otherwise, translate to all supported languages return supported_langs def _is_english_text(self, text: str) -> bool: """ Check if text is primarily English (ASCII letters, numbers, common punctuation). Args: text: Text to check Returns: True if text appears to be English """ if not text or not text.strip(): return True # Remove whitespace and common punctuation text_clean = re.sub(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]', '', text) if not text_clean: return True # Check if all remaining characters are ASCII (letters, numbers) # This is a simple heuristic: if most characters are ASCII, it's likely English ascii_count = sum(1 for c in text_clean if ord(c) < 128) ratio = ascii_count / len(text_clean) if text_clean else 0 # If more than 80% are ASCII characters, consider it English return ratio > 0.8 def _contains_chinese(self, text: str) -> bool: """ Check if text contains Chinese characters (Han characters). Args: text: Text to check Returns: True if text contains Chinese characters """ if not text: return False # Check for Chinese characters (Unicode range: \u4e00-\u9fff) chinese_pattern = re.compile(r'[\u4e00-\u9fff]') return bool(chinese_pattern.search(text)) def _is_pure_number(self, text: str) -> bool: """ Check if text is purely numeric (digits, possibly with spaces, dots, commas). Args: text: Text to check Returns: True if text is purely numeric """ if not text or not text.strip(): return False # Remove whitespace, dots, commas (common number separators) text_clean = re.sub(r'[\s\.,]', '', text.strip()) if not text_clean: return False # Check if all remaining characters are digits return text_clean.isdigit()