""" Language detection utility. Detects the language of a query string. """ from typing import Optional import re class LanguageDetector: """Simple rule-based language detector for common e-commerce languages.""" # Unicode ranges for different scripts CJK_RANGES = [ (0x4E00, 0x9FFF), # CJK Unified Ideographs (0x3400, 0x4DBF), # CJK Extension A (0x20000, 0x2A6DF), # CJK Extension B (0x3040, 0x309F), # Hiragana (0x30A0, 0x30FF), # Katakana ] CYRILLIC_RANGE = (0x0400, 0x04FF) ARABIC_RANGE = (0x0600, 0x06FF) LATIN_RANGE = (0x0041, 0x007A) def __init__(self): """Initialize language detector.""" self.chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') self.russian_pattern = re.compile(r'[\u0400-\u04ff]+') self.arabic_pattern = re.compile(r'[\u0600-\u06ff]+') self.japanese_pattern = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]+') def detect(self, text: str) -> str: """ Detect language of text. Args: text: Input text Returns: Language code: 'zh', 'en', 'ru', 'ar', 'ja', or 'unknown' """ if not text or not text.strip(): return 'unknown' text = text.strip() # Count characters in each script char_counts = { 'chinese': 0, 'russian': 0, 'arabic': 0, 'japanese': 0, 'latin': 0 } for char in text: code_point = ord(char) # Check CJK (Chinese/Japanese) is_cjk = any(start <= code_point <= end for start, end in self.CJK_RANGES) if is_cjk: char_counts['chinese'] += 1 # Check Hiragana/Katakana (Japanese) if 0x3040 <= code_point <= 0x30FF: char_counts['japanese'] += 1 # Check Cyrillic (Russian) if self.CYRILLIC_RANGE[0] <= code_point <= self.CYRILLIC_RANGE[1]: char_counts['russian'] += 1 # Check Arabic if self.ARABIC_RANGE[0] <= code_point <= self.ARABIC_RANGE[1]: char_counts['arabic'] += 1 # Check Latin if (0x0041 <= code_point <= 0x005A) or (0x0061 <= code_point <= 0x007A): char_counts['latin'] += 1 # Determine dominant script total_chars = sum(char_counts.values()) if total_chars == 0: return 'unknown' # Calculate percentages percentages = { script: count / total_chars for script, count in char_counts.items() } # Japanese has both Hiragana/Katakana and CJK if percentages['japanese'] > 0.1: return 'ja' # Russian (Cyrillic) if percentages['russian'] > 0.5: return 'ru' # Arabic if percentages['arabic'] > 0.5: return 'ar' # Chinese (CJK without Japanese kana) if percentages['chinese'] > 0.3: return 'zh' # English/Latin if percentages['latin'] > 0.5: return 'en' return 'unknown' def is_chinese(self, text: str) -> bool: """Check if text is primarily Chinese.""" return self.detect(text) == 'zh' def is_english(self, text: str) -> bool: """Check if text is primarily English.""" return self.detect(text) == 'en' def is_russian(self, text: str) -> bool: """Check if text is primarily Russian.""" return self.detect(text) == 'ru' def is_arabic(self, text: str) -> bool: """Check if text is primarily Arabic.""" return self.detect(text) == 'ar' def is_japanese(self, text: str) -> bool: """Check if text is primarily Japanese.""" return self.detect(text) == 'ja'