# language_detector.py
"""
Language detection utility.

Script-first rules for CJK and other non-Latin scripts, then Lingua
(lingua-language-detector) for Latin text and Romance/Germanic disambiguation.
"""

from __future__ import annotations

from typing import Dict, Optional
import re

from lingua import Language, LanguageDetectorBuilder

# Lingua languages this module works with, mapped to the short codes that
# LanguageDetector.detect() returns.
_LINGUA_TO_CODE: Dict[Language, str] = {
    Language.CHINESE: "zh",
    Language.ENGLISH: "en",
    Language.JAPANESE: "ja",
    Language.KOREAN: "ko",
    Language.GERMAN: "de",
    Language.FRENCH: "fr",
    Language.SPANISH: "es",
    Language.ITALIAN: "it",
    Language.PORTUGUESE: "pt",
    Language.DUTCH: "nl",
    Language.RUSSIAN: "ru",
    Language.ARABIC: "ar",
    Language.HINDI: "hi",
    Language.HEBREW: "he",
    Language.THAI: "th",
}

# Every mapped language, in mapping order; handed to the Lingua builder.
_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE)

# Module-wide Lingua detector instance; remains None until it is first built.
_lingua_detector: Optional[object] = None


def _get_lingua_detector():
    """Return the module-wide Lingua detector, building it on first call.

    The detector is restricted to the languages in ``_LINGUA_LANGUAGES`` and
    is stored in the ``_lingua_detector`` global so later calls reuse it.
    """
    global _lingua_detector
    if _lingua_detector is not None:
        return _lingua_detector

    builder = LanguageDetectorBuilder.from_languages(*_LINGUA_LANGUAGES)
    _lingua_detector = builder.build()
    return _lingua_detector


class LanguageDetector:
    """Detect a text's language: script-range rules first, Lingua second."""

    def __init__(self):
        # One character class per script. A single matching character
        # anywhere in the text is enough for detect() to pick that language.
        self._re_zh = re.compile(r"[\u4e00-\u9fff]")
        self._re_ja_kana = re.compile(r"[\u3040-\u30ff]")
        self._re_ko = re.compile(r"[\uac00-\ud7af]")
        self._re_ru = re.compile(r"[\u0400-\u04ff]")
        self._re_ar = re.compile(r"[\u0600-\u06ff]")
        self._re_hi = re.compile(r"[\u0900-\u097f]")
        self._re_he = re.compile(r"[\u0590-\u05ff]")
        self._re_th = re.compile(r"[\u0e00-\u0e7f]")

    def detect(self, text: str) -> str:
        """
        Return the language code detected for ``text``.

        Possible results: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown.
        Empty, ``None`` or whitespace-only input yields "unknown".
        """
        stripped = text.strip() if text else ""
        if not stripped:
            return "unknown"

        # Script rules in priority order: the first pattern found anywhere in
        # the text decides the result. Kana and Hangul are tested ahead of
        # Han ideographs, so text mixing them with Han characters is not
        # reported as "zh".
        script_rules = (
            (self._re_ja_kana, "ja"),
            (self._re_ko, "ko"),
            (self._re_zh, "zh"),
            (self._re_ru, "ru"),
            (self._re_ar, "ar"),
            (self._re_hi, "hi"),
            (self._re_he, "he"),
            (self._re_th, "th"),
        )
        for pattern, code in script_rules:
            if pattern.search(stripped):
                return code

        # No script rule matched: defer to the shared Lingua detector.
        language = _get_lingua_detector().detect_language_of(stripped)
        if language is None:
            return "unknown"
        return _LINGUA_TO_CODE.get(language, "unknown")

    def is_chinese(self, text: str) -> bool:
        """Return True when detect() reports "zh" for ``text``."""
        return self.detect(text) == "zh"

    def is_english(self, text: str) -> bool:
        """Return True when detect() reports "en" for ``text``."""
        return self.detect(text) == "en"

    def is_russian(self, text: str) -> bool:
        """Return True when detect() reports "ru" for ``text``."""
        return self.detect(text) == "ru"

    def is_arabic(self, text: str) -> bool:
        """Return True when detect() reports "ar" for ``text``."""
        return self.detect(text) == "ar"

    def is_japanese(self, text: str) -> bool:
        """Return True when detect() reports "ja" for ``text``."""
        return self.detect(text) == "ja"