""" Language detection utility. Script-first rules for CJK and other non-Latin scripts, then Lingua (lingua-language-detector) for Latin text and Romance/Germanic disambiguation. """ from __future__ import annotations from typing import Dict, Optional import re from lingua import Language, LanguageDetectorBuilder _LINGUA_TO_CODE: Dict[Language, str] = { Language.CHINESE: "zh", Language.ENGLISH: "en", Language.JAPANESE: "ja", Language.KOREAN: "ko", Language.GERMAN: "de", Language.FRENCH: "fr", Language.SPANISH: "es", Language.ITALIAN: "it", Language.PORTUGUESE: "pt", Language.DUTCH: "nl", Language.RUSSIAN: "ru", Language.ARABIC: "ar", Language.HINDI: "hi", Language.HEBREW: "he", Language.THAI: "th", } _LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys()) _lingua_detector: Optional[object] = None def _get_lingua_detector(): global _lingua_detector if _lingua_detector is None: _lingua_detector = LanguageDetectorBuilder.from_languages( *_LINGUA_LANGUAGES ).build() return _lingua_detector class LanguageDetector: """Language detector: script hints + Lingua for Latin-family queries.""" def __init__(self): self._re_zh = re.compile(r"[\u4e00-\u9fff]") self._re_ja_kana = re.compile(r"[\u3040-\u30ff]") self._re_ko = re.compile(r"[\uac00-\ud7af]") self._re_ru = re.compile(r"[\u0400-\u04ff]") self._re_ar = re.compile(r"[\u0600-\u06ff]") self._re_hi = re.compile(r"[\u0900-\u097f]") self._re_he = re.compile(r"[\u0590-\u05ff]") self._re_th = re.compile(r"[\u0e00-\u0e7f]") def detect(self, text: str) -> str: """ Detect language code for text. Returns one of: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown """ if not text or not text.strip(): return "unknown" q = text.strip() # Script-first: unambiguous blocks before Latin/Romance Lingua pass. if self._re_ja_kana.search(q): return "ja" if self._re_ko.search(q): return "ko" if self._re_zh.search(q): return "zh" if self._re_ru.search(q): return "ru" if self._re_ar.search(q): return "ar" if self._re_hi.search(q): return "hi" if self._re_he.search(q): return "he" if self._re_th.search(q): return "th" detected = _get_lingua_detector().detect_language_of(q) if detected is None: return "unknown" code = _LINGUA_TO_CODE.get(detected) return code if code is not None else "unknown" def is_chinese(self, text: str) -> bool: return self.detect(text) == "zh" def is_english(self, text: str) -> bool: return self.detect(text) == "en" def is_russian(self, text: str) -> bool: return self.detect(text) == "ru" def is_arabic(self, text: str) -> bool: return self.detect(text) == "ar" def is_japanese(self, text: str) -> bool: return self.detect(text) == "ja"