"""
Translation service for multi-language query support.

Supports multiple translation models:
- Qwen (default): Alibaba Cloud DashScope API using qwen-mt-flash model
- DeepL: DeepL API for high-quality translations

Usage:

```python
from query.translator import Translator

# Use the default qwen model (recommended)
translator = Translator()  # defaults to the qwen model

# Or select the model explicitly
translator = Translator(model='qwen')   # use the qwen model
translator = Translator(model='deepl')  # use the DeepL model

# Translate text
result = translator.translate(
    text="我看到这个视频后没有笑",
    target_lang="en",
    source_lang="auto"  # auto-detect the source language
)
```

Configuration:
- The Qwen model requires the DASHSCOPE_API_KEY environment variable (in the .env file)
- The DeepL model requires the DEEPL_AUTH_KEY environment variable (in the .env file)

Qwen model references:
- Official documentation: https://help.aliyun.com/zh/model-studio/get-api-key
- Model: qwen-mt-flash (fast translation model)

DeepL official documentation:
https://developers.deepl.com/api-reference/translate/request-translation
"""

import logging
import os
import re
from concurrent.futures import Future, ThreadPoolExecutor
from datetime import timedelta
from typing import Dict, List, Optional, Tuple, Union

import redis
import requests
from openai import OpenAI

from config.env_config import DEEPL_AUTH_KEY, DASHSCOPE_API_KEY, REDIS_CONFIG

logger = logging.getLogger(__name__)

# Patterns are compiled once at import time instead of on every call.
# Whitespace and common punctuation ignored by the "is this English?" heuristic.
_IGNORABLE_CHARS_RE = re.compile(r'[\s\.,!?;:\-\'\"\(\)\[\]{}]')
# Han characters in the \u4e00-\u9fff range.
_CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')
# Whitespace, dots and commas (common number separators).
_NUMBER_SEPARATORS_RE = re.compile(r'[\s\.,]')
# Text made only of digits, whitespace, underscores and hyphens.
_SYMBOL_ONLY_RE = re.compile(r'^[\d\s_-]+$')


class Translator:
    """
    Multi-language translator supporting Qwen and DeepL APIs.

    Default model is 'qwen' which uses Alibaba Cloud DashScope API.
    Translations are optionally cached in Redis, keyed by target language
    and source text.
    """

    DEEPL_API_URL = "https://api.deepl.com/v2/translate"  # Pro tier
    # Beijing region endpoint. For models hosted in the Singapore region,
    # replace the base URL with
    # https://dashscope-intl.aliyuncs.com/compatible-mode/v1
    QWEN_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    QWEN_MODEL = "qwen-mt-flash"  # fast translation model

    # Language code mapping (internal lowercase code -> DeepL code)
    LANG_CODE_MAP = {
        'zh': 'ZH',
        'en': 'EN',
        'ru': 'RU',
        'ar': 'AR',
        'ja': 'JA',
        'es': 'ES',
        'de': 'DE',
        'fr': 'FR',
        'it': 'IT',
        'pt': 'PT',
    }

    # Qwen (qwen-mt-plus/flash/turbo) supported languages mapping
    # (internal lowercase code -> English language name sent to the API).
    # Taken from the "language / English name / code" reference table.
    QWEN_LANG_MAP = {
        "en": "English",
        "zh": "Chinese",
        "zh_tw": "Traditional Chinese",
        "ru": "Russian",
        "ja": "Japanese",
        "ko": "Korean",
        "es": "Spanish",
        "fr": "French",
        "pt": "Portuguese",
        "de": "German",
        "it": "Italian",
        "th": "Thai",
        "vi": "Vietnamese",
        "id": "Indonesian",
        "ms": "Malay",
        "ar": "Arabic",
        "hi": "Hindi",
        "he": "Hebrew",
        "my": "Burmese",
        "ta": "Tamil",
        "ur": "Urdu",
        "bn": "Bengali",
        "pl": "Polish",
        "nl": "Dutch",
        "ro": "Romanian",
        "tr": "Turkish",
        "km": "Khmer",
        "lo": "Lao",
        "yue": "Cantonese",
        "cs": "Czech",
        "el": "Greek",
        "sv": "Swedish",
        "hu": "Hungarian",
        "da": "Danish",
        "fi": "Finnish",
        "uk": "Ukrainian",
        "bg": "Bulgarian",
    }

    def __init__(
        self,
        model: str = "qwen",
        api_key: Optional[str] = None,
        use_cache: bool = True,
        timeout: int = 10,
        glossary_id: Optional[str] = None,
        translation_context: Optional[str] = None
    ):
        """
        Initialize translator.

        Args:
            model: Translation model to use. Options: 'qwen' (default) or 'deepl'
            api_key: API key for the selected model (or None to use from config/env)
            use_cache: Whether to cache translations
            timeout: Request timeout in seconds (applied to both DeepL and Qwen requests)
            glossary_id: DeepL glossary ID for custom terminology (optional, only for DeepL)
            translation_context: Context hint for translation (e.g., "e-commerce", "product search")

        Raises:
            ValueError: If ``model`` is neither 'qwen' nor 'deepl'.
        """
        self.model = model.lower()
        if self.model not in ['qwen', 'deepl']:
            raise ValueError(f"Unsupported model: {model}. Supported models: 'qwen', 'deepl'")

        # Get API key from config if not provided
        if api_key is None:
            if self.model == 'qwen':
                api_key = DASHSCOPE_API_KEY or os.getenv("DASHSCOPE_API_KEY")
            else:  # deepl
                api_key = DEEPL_AUTH_KEY or os.getenv("DEEPL_AUTH_KEY")

        self.api_key = api_key
        self.timeout = timeout
        self.use_cache = use_cache
        self.glossary_id = glossary_id
        self.translation_context = translation_context or "e-commerce product search"

        # Initialize OpenAI client for Qwen if needed
        self.qwen_client = None
        if self.model == 'qwen':
            if not self.api_key:
                logger.warning("DASHSCOPE_API_KEY not set. Qwen translation will not work.")
            else:
                self.qwen_client = OpenAI(
                    api_key=self.api_key,
                    base_url=self.QWEN_BASE_URL,
                    # Honor the configured request timeout; without it the
                    # SDK default applies and a hung request can occupy an
                    # executor worker for minutes.
                    timeout=self.timeout,
                )

        # Cache state defaults to "disabled"; it is switched on below only
        # after Redis has been reached and fully configured.
        self.redis_client = None
        self.cache = None

        # Initialize Redis cache if enabled
        if use_cache:
            try:
                client = redis.Redis(
                    host=REDIS_CONFIG.get('host', 'localhost'),
                    # NOTE(review): the fallback port is 6479, not Redis'
                    # standard 6379 — confirm this is intentional.
                    port=REDIS_CONFIG.get('port', 6479),
                    password=REDIS_CONFIG.get('password'),
                    decode_responses=True,  # Return str instead of bytes
                    socket_timeout=REDIS_CONFIG.get('socket_timeout', 1),
                    socket_connect_timeout=REDIS_CONFIG.get('socket_connect_timeout', 1),
                    retry_on_timeout=REDIS_CONFIG.get('retry_on_timeout', False),
                    health_check_interval=10,  # avoid reusing broken connections
                )
                # Test connection
                client.ping()
                expire_days = REDIS_CONFIG.get('translation_cache_expire_days', 360)
                self.expire_time = timedelta(days=expire_days)
                self.expire_seconds = int(self.expire_time.total_seconds())  # Redis expects seconds
                self.cache_prefix = REDIS_CONFIG.get('translation_cache_prefix', 'trans')
                # Publish the client last so a failure above never leaves a
                # half-configured cache behind.
                self.redis_client = client
                logger.info("Redis cache initialized for translations")
            except Exception as e:
                logger.warning(f"Failed to initialize Redis cache: {e}, falling back to no cache")
                self.redis_client = None

        # Thread pool for async translation
        self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="translator")

    def translate(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate text to target language (synchronous mode).

        Args:
            text: Text to translate
            target_lang: Target language code ('zh', 'en', 'ru', etc.)
            source_lang: Source language code (optional, auto-detect if None)
            context: Additional context for translation (overrides default context)
            prompt: Translation prompt/instruction (optional, for better translation quality)

        Returns:
            The translated text. The input text is returned unchanged when it
            is empty/blank, when translation is skipped (text already looks
            like the target language), when no API key is configured, or when
            the translation request fails.
        """
        if not text or not text.strip():
            return text

        # Normalize language codes
        target_lang = target_lang.lower()
        if source_lang:
            source_lang = source_lang.lower()

        # Optimization: Skip translation if not needed
        if target_lang == 'en' and self._is_english_text(text):
            logger.info(f"[Translator] Text is already English, skipping translation: '{text[:50]}...'")
            return text

        if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
            logger.info(
                f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
                f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)"
            )
            return text

        # Use provided context or default context
        translation_context = context or self.translation_context

        # Check cache. The Redis key is built from the target language and
        # the text only; context and prompt are passed along but do not
        # take part in the key.
        if self.use_cache and self.redis_client:
            cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt)
            if cached:
                logger.info(
                    f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
                    f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit"
                )
                return cached

        # If no API key, return mock translation (for testing)
        if not self.api_key:
            logger.info(
                f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
                f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)"
            )
            return text

        # Translate using selected model
        logger.info(
            f"[Translator] Translation request | Model: {self.model} | Original text: '{text}' | Target language: {target_lang} | "
            f"Source language: {source_lang or 'auto'} | Context: {translation_context} | "
            f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation"
        )

        if self.model == 'qwen':
            result = self._translate_qwen(text, target_lang, source_lang, translation_context, prompt)
        else:  # deepl
            result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)

        # If the request failed, fall back to the original text. The fallback
        # is deliberately NOT cached: storing the untranslated text would
        # make a transient API error stick for the whole cache lifetime.
        if result is None:
            logger.warning(
                f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
                f"Source language: {source_lang or 'auto'} | Result: '{text}' | Status: Translation failed, returning original"
            )
            return text

        logger.info(
            f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
            f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful"
        )

        # Cache successful, non-empty results only
        if result and self.use_cache and self.redis_client:
            self._set_cached_translation_redis(text, target_lang, result, source_lang, translation_context, prompt)

        return result

    def _translate_qwen(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using Qwen MT Flash model via Alibaba Cloud DashScope API.

        Args:
            text: Text to translate
            target_lang: Target language code ('zh', 'en', 'ru', etc.)
            source_lang: Source language code (optional, 'auto' if None)
            context: Context hint for translation (accepted for signature
                parity with the DeepL path; not sent to the API)
            prompt: Translation prompt/instruction (accepted for signature
                parity with the DeepL path; not sent to the API)

        Returns:
            Translated text or None if translation fails
        """
        if not self.qwen_client:
            logger.error("[Translator] Qwen client not initialized. Check DASHSCOPE_API_KEY.")
            return None

        # Convert target language; unknown codes are passed through capitalized
        target_lang_normalized = target_lang.lower()
        target_lang_qwen = self.QWEN_LANG_MAP.get(target_lang_normalized, target_lang.capitalize())

        # Convert source language; missing/blank/'auto' means auto-detect
        source_lang_normalized = (source_lang or "").strip().lower()
        if not source_lang_normalized or source_lang_normalized == "auto":
            source_lang_qwen = "auto"
        else:
            source_lang_qwen = self.QWEN_LANG_MAP.get(source_lang_normalized, source_lang.capitalize())

        # Prepare translation options
        translation_options = {
            "source_lang": source_lang_qwen,
            "target_lang": target_lang_qwen,
        }

        # Prepare messages: the text to translate is the sole user message
        messages = [
            {
                "role": "user",
                "content": text
            }
        ]

        try:
            completion = self.qwen_client.chat.completions.create(
                model=self.QWEN_MODEL,
                messages=messages,
                extra_body={
                    "translation_options": translation_options
                }
            )

            # A response without content raises here and is handled below
            translated_text = completion.choices[0].message.content.strip()

            logger.debug(
                f"[Translator] Qwen API response success | Original text: '{text}' | Target language: {target_lang_qwen} | "
                f"Translation result: '{translated_text}'"
            )
            return translated_text

        except Exception as e:
            logger.error(
                f"[Translator] Qwen API request exception | Original text: '{text}' | Target language: {target_lang_qwen} | "
                f"Error: {e}", exc_info=True
            )
            return None

    def _translate_deepl(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str],
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Translate using DeepL API with context and glossary support.

        Args:
            text: Text to translate
            target_lang: Target language code
            source_lang: Source language code (optional)
            context: Context hint for translation (e.g., "e-commerce product search")
            prompt: Translation prompt; when given it is sent as the DeepL
                ``context`` parameter instead of ``context``

        Returns:
            Translated text or None if translation fails
        """
        # Map to DeepL language codes
        target_code = self.LANG_CODE_MAP.get(target_lang, target_lang.upper())

        headers = {
            "Authorization": f"DeepL-Auth-Key {self.api_key}",
            "Content-Type": "application/json",
        }

        # Use prompt as context parameter for DeepL API (not as text prefix)
        # According to DeepL API: context is "Additional context that can influence a translation but is not translated itself"
        # If prompt is provided, use it as context; otherwise use the default context
        api_context = prompt if prompt else context

        # For e-commerce, add context words to help DeepL understand the domain
        # This is especially important for single-word ambiguous terms like "车" (car vs rook)
        text_to_translate, needs_extraction = self._add_ecommerce_context(text, source_lang, api_context)

        payload = {
            "text": [text_to_translate],
            "target_lang": target_code,
        }

        if source_lang:
            source_code = self.LANG_CODE_MAP.get(source_lang, source_lang.upper())
            payload["source_lang"] = source_code

        # Add context parameter (prompt or default context)
        # Context influences translation but is not translated itself
        if api_context:
            payload["context"] = api_context

        # Add glossary if configured
        if self.glossary_id:
            payload["glossary_id"] = self.glossary_id

        # Note: DeepL API v2 supports "context" parameter for additional context
        # that influences translation but is not translated itself.
        # We use prompt as context parameter when provided.

        try:
            response = requests.post(
                self.DEEPL_API_URL,
                headers=headers,
                json=payload,
                timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()
                if "translations" in data and len(data["translations"]) > 0:
                    translated_text = data["translations"][0]["text"]
                    # If we added context, extract just the term from the result
                    if needs_extraction:
                        translated_text = self._extract_term_from_translation(
                            translated_text, text, target_code
                        )
                    logger.debug(
                        f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "
                        f"Translation result: '{translated_text}'"
                    )
                    return translated_text
                # HTTP 200 without any translation: treat as a failure and
                # say so, rather than falling through silently.
                logger.error(
                    f"[Translator] DeepL API returned no translations | Original text: '{text}' | "
                    f"Target language: {target_code}"
                )
                return None
            else:
                logger.error(
                    f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "
                    f"Status code: {response.status_code} | Error message: {response.text}"
                )
                return None

        except requests.Timeout:
            logger.warning(
                f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "
                f"Timeout: {self.timeout}s"
            )
            return None
        except Exception as e:
            logger.error(
                f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "
                f"Error: {e}", exc_info=True
            )
            return None

    # NOTE: _translate_deepl_free is intentionally not implemented.
    # We do not support automatic fallback to the free endpoint, to avoid
    # mixing Pro keys with https://api-free.deepl.com and related 403 errors.

    def translate_multi(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        async_mode: bool = True,
        prompt: Optional[str] = None
    ) -> Dict[str, Optional[str]]:
        """
        Translate text to multiple target languages.

        In async_mode=True (default):
        - Returns cached translations immediately if available
        - For translations that can be optimized (e.g., pure numbers, already in target language),
          returns result immediately via synchronous call
        - Launches async tasks for other missing translations (non-blocking)
        - Returns None for missing translations that require async processing

        In async_mode=False:
        - Waits for all translations to complete (blocking)

        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)
            async_mode: If True, return cached results immediately and translate missing ones async
            prompt: Translation prompt/instruction (optional)

        Returns:
            Dictionary mapping language code to translated text
            (in async mode, languages still being translated map to None)
        """
        results: Dict[str, Optional[str]] = {}
        missing_langs = []
        async_langs = []

        # First, get cached translations
        for lang in target_langs:
            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
            if cached is not None:
                results[lang] = cached
            else:
                missing_langs.append(lang)

        # If async mode and there are missing translations
        if async_mode and missing_langs:
            # Check if translation can be optimized (immediate return)
            for lang in missing_langs:
                target_lang = lang.lower()
                # Check optimization conditions (same as in translate method)
                can_optimize = False
                if target_lang == 'en' and self._is_english_text(text):
                    can_optimize = True
                elif target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
                    can_optimize = True

                if can_optimize:
                    # Can be optimized, call translate synchronously for immediate result
                    results[lang] = self.translate(text, lang, source_lang, context, prompt)
                else:
                    # Requires actual translation, add to async list
                    async_langs.append(lang)

            # Launch async tasks for translations that require actual API
            # calls; the caller gets None for these until the cache is filled
            for lang in async_langs:
                self._translate_async(text, lang, source_lang, context, prompt)
                results[lang] = None
        else:
            # Synchronous mode: wait for all translations
            for lang in missing_langs:
                results[lang] = self.translate(text, lang, source_lang, context, prompt)

        return results

    def translate_multi_async(
        self,
        text: str,
        target_langs: List[str],
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Dict[str, Union[str, Future]]:
        """
        Translate text to multiple target languages asynchronously, returning Futures that can be awaited.

        This method returns a dictionary where:
        - If translation is cached, the value is the translation string (immediate)
        - If translation needs to be done, the value is a Future object that can be awaited

        Args:
            text: Text to translate
            target_langs: List of target language codes
            source_lang: Source language code (optional)
            context: Context hint for translation (optional)
            prompt: Translation prompt/instruction (optional)

        Returns:
            Dictionary mapping language code to either translation string (cached) or Future object
        """
        results: Dict[str, Union[str, Future]] = {}
        missing_langs = []

        # First, get cached translations
        for lang in target_langs:
            cached = self._get_cached_translation(text, lang, source_lang, context, prompt)
            if cached is not None:
                results[lang] = cached
            else:
                missing_langs.append(lang)

        # For missing translations, submit async tasks and return Futures
        for lang in missing_langs:
            results[lang] = self.executor.submit(
                self.translate,
                text,
                lang,
                source_lang,
                context,
                prompt
            )

        return results

    def _get_cached_translation(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """Get translation from cache if available."""
        if not self.redis_client:
            return None
        return self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)

    def _get_cached_translation_redis(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> Optional[str]:
        """
        Get translation from Redis cache with sliding expiration.

        Sliding expiration: every cache hit resets the key's TTL to the
        configured expiry (``translation_cache_expire_days``, default 360
        days). An entry therefore expires that long after its last access
        rather than after it was written, so frequently used translations
        are not evicted prematurely.

        Args:
            text: Original text (part of the cache key)
            target_lang: Target language code (part of the cache key, upper-cased)
            source_lang: Not used for the lookup
            context: Not used for the lookup
            prompt: Not used for the lookup

        Returns:
            Cached translation, or None on a miss, an empty value, or a Redis error
        """
        if not self.redis_client:
            return None

        try:
            # Build cache key: prefix:target_lang:text
            # For simplicity, we use target_lang and text as key
            # Context and prompt are not included in key to maximize cache hits
            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
            value = self.redis_client.get(cache_key)
            if value:
                # Sliding expiration: reset expiration time on access
                try:
                    self.redis_client.expire(cache_key, self.expire_seconds)
                except Exception as expire_error:
                    # A failed TTL refresh must not hide the cached value
                    logger.warning(
                        f"[Translator] Failed to update cache expiration for key {cache_key}: {expire_error}"
                    )

                logger.debug(
                    f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "
                    f"Cache key: {cache_key} | Translation result: '{value}' | TTL reset to {self.expire_seconds}s"
                )
                return value
            logger.debug(
                f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "
                f"Cache key: {cache_key}"
            )
            return None
        except Exception as e:
            logger.error(
                f"[Translator] Redis error during get translation cache | Original text: '{text}' | "
                f"Target language: {target_lang} | Error: {e}"
            )
            return None

    def _set_cached_translation_redis(
        self,
        text: str,
        target_lang: str,
        translation: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ) -> None:
        """
        Store translation in Redis cache.

        The key matches the one used for lookups (prefix, upper-cased target
        language, text); ``source_lang``, ``context`` and ``prompt`` are not
        part of it. Redis errors are logged and swallowed.
        """
        if not self.redis_client:
            return

        try:
            cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
            self.redis_client.setex(cache_key, self.expire_seconds, translation)
            logger.info(
                f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | "
                f"Cache key: {cache_key} | Translation result: '{translation}'"
            )
        except Exception as e:
            logger.error(
                f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | "
                f"Error: {e}"
            )

    def _translate_async(
        self,
        text: str,
        target_lang: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None
    ):
        """
        Launch async translation task.

        The task runs ``translate`` on the internal executor; its result is
        only logged here (``translate`` itself writes successful results to
        the cache). Exceptions are logged, never propagated.
        """
        def _do_translate():
            try:
                result = self.translate(text, target_lang, source_lang, context, prompt)
                if result:
                    logger.debug(f"Async translation completed: {text} -> {target_lang}: {result}")
            except Exception as e:
                logger.warning(f"Async translation failed: {text} -> {target_lang}: {e}")

        self.executor.submit(_do_translate)

    def _add_ecommerce_context(
        self,
        text: str,
        source_lang: Optional[str],
        context: Optional[str]
    ) -> Tuple[str, bool]:
        """
        Add e-commerce context to text for better disambiguation.

        For single-word ambiguous Chinese terms, we add context words that help
        DeepL understand this is an e-commerce/product search context.

        Args:
            text: Original text to translate
            source_lang: Source language code
            context: Context hint

        Returns:
            Tuple of (text_with_context, needs_extraction)
            - text_with_context: Text to send to DeepL
            - needs_extraction: Whether we need to extract the term from the result
        """
        # Only apply for e-commerce context and Chinese source
        if not context or "e-commerce" not in context.lower():
            return text, False

        if not source_lang or source_lang.lower() != 'zh':
            return text, False

        # For single-word queries, add context to help disambiguation
        text_stripped = text.strip()
        if len(text_stripped.split()) == 1 and len(text_stripped) <= 2:
            # Common ambiguous Chinese e-commerce terms like "车" (car vs rook)
            # We add a context phrase: "购买 [term]" (buy [term]) or "商品 [term]" (product [term])
            # This helps DeepL understand the e-commerce context
            # We'll need to extract just the term from the translation result
            context_phrase = f"购买 {text_stripped}"
            return context_phrase, True

        # For multi-word queries, DeepL usually has enough context
        return text, False

    def _extract_term_from_translation(
        self,
        translated_text: str,
        original_text: str,
        target_lang_code: str
    ) -> str:
        """
        Extract the actual term from a translation that included context.

        For example, if we translated "购买 车" (buy car) and got "buy car",
        we want to extract just "car".

        Args:
            translated_text: Full translation result
            original_text: Original single-word query
            target_lang_code: Target language code (EN, ZH, etc.)

        Returns:
            Extracted term (lower-cased, trailing punctuation removed) or the
            translation as-is if extraction does not apply
        """
        # For English target, try to extract the last word (the actual term)
        if target_lang_code == "EN":
            words = translated_text.strip().split()
            if len(words) > 1:
                # Usually the last word is the term we want
                # But we need to be smart - if it's "buy car", we want "car"
                # Common context words to skip: buy, purchase, product, item, etc.
                context_words = {"buy", "purchase", "product", "item", "commodity", "goods"}
                # Try to find the term (not a context word)
                for word in reversed(words):
                    word_lower = word.lower().rstrip('.,!?;:')
                    if word_lower not in context_words:
                        return word_lower
                # If all words are context words, return the last one
                return words[-1].lower().rstrip('.,!?;:')

        # For other languages or if extraction fails, return as-is
        # The user can configure a glossary for better results
        return translated_text

    def _shop_lang_matches(self, shop_lang_lower: str, lang_code: str) -> bool:
        """True if shop language matches index language (use source, no translate)."""
        if not shop_lang_lower or not lang_code:
            return False
        if shop_lang_lower == lang_code:
            return True
        # Substring match for the two primary languages
        if lang_code == "zh" and "zh" in shop_lang_lower:
            return True
        if lang_code == "en" and "en" in shop_lang_lower:
            return True
        return False

    def translate_for_indexing(
        self,
        text: str,
        shop_language: str,
        source_lang: Optional[str] = None,
        context: Optional[str] = None,
        prompt: Optional[str] = None,
        index_languages: Optional[List[str]] = None,
    ) -> Dict[str, Optional[str]]:
        """
        Translate text for indexing based on shop language and tenant index_languages.

        For each language in index_languages: use source text if shop language matches,
        otherwise translate to that language.

        Args:
            text: Text to translate
            shop_language: Shop primary language (e.g. 'zh', 'en', 'ru')
            source_lang: Source language code (optional)
            context: Additional context for translation (optional)
            prompt: Translation prompt (optional)
            index_languages: Languages to index (from tenant_config). Default ["en", "zh"].

        Returns:
            Dict keyed by each index_language with translated or source text (or None).
            All values are None for blank text and for text made only of
            digits, whitespace, underscores and hyphens.
        """
        langs = index_languages if index_languages else ["en", "zh"]
        results: Dict[str, Optional[str]] = {lang: None for lang in langs}
        if not text or not text.strip():
            return results
        if _SYMBOL_ONLY_RE.match(text):
            logger.info(f"[Translator] Skip translation for symbol-only query: '{text}'")
            return results

        shop_lang_lower = (shop_language or "").strip().lower()
        targets = []
        for lang in langs:
            if self._shop_lang_matches(shop_lang_lower, lang):
                results[lang] = text
            else:
                targets.append(lang)

        for target_lang in targets:
            cached = self._get_cached_translation_redis(text, target_lang, source_lang, context, prompt)
            if cached:
                results[target_lang] = cached
                logger.debug(f"[Translator] Cache hit for indexing: '{text}' -> {target_lang}: {cached}")
                continue
            translated = self.translate(
                text,
                target_lang=target_lang,
                source_lang=source_lang or shop_language,
                context=context,
                prompt=prompt,
            )
            results[target_lang] = translated
        return results

    def get_translation_needs(
        self,
        detected_lang: str,
        supported_langs: List[str]
    ) -> List[str]:
        """
        Determine which languages need translation.

        Args:
            detected_lang: Detected query language
            supported_langs: List of supported languages

        Returns:
            List of language codes to translate to
        """
        # If detected language is in supported list, translate to others
        if detected_lang in supported_langs:
            return [lang for lang in supported_langs if detected_lang != lang]

        # Otherwise, translate to all supported languages
        return supported_langs

    def _is_english_text(self, text: str) -> bool:
        """
        Check if text is primarily English (ASCII letters, numbers, common punctuation).

        Args:
            text: Text to check

        Returns:
            True if text appears to be English (blank or punctuation-only
            text also counts as English)
        """
        if not text or not text.strip():
            return True

        # Remove whitespace and common punctuation
        text_clean = _IGNORABLE_CHARS_RE.sub('', text)
        if not text_clean:
            return True

        # Simple heuristic: if most remaining characters are ASCII,
        # the text is likely English
        ascii_count = sum(1 for c in text_clean if ord(c) < 128)
        ratio = ascii_count / len(text_clean)

        # If more than 80% are ASCII characters, consider it English
        return ratio > 0.8

    def _contains_chinese(self, text: str) -> bool:
        """
        Check if text contains Chinese characters (Han characters).

        Args:
            text: Text to check

        Returns:
            True if text contains Chinese characters
        """
        if not text:
            return False

        # Check for Chinese characters (Unicode range: \u4e00-\u9fff)
        return bool(_CHINESE_CHAR_RE.search(text))

    def _is_pure_number(self, text: str) -> bool:
        """
        Check if text is purely numeric (digits, possibly with spaces, dots, commas).

        Args:
            text: Text to check

        Returns:
            True if text is purely numeric
        """
        if not text or not text.strip():
            return False

        # Remove whitespace, dots, commas (common number separators)
        text_clean = _NUMBER_SEPARATORS_RE.sub('', text.strip())
        if not text_clean:
            return False

        # Check if all remaining characters are digits
        return text_clean.isdigit()