language_detector.py 5.46 KB
"""
Language detection utility.

Detects language of short e-commerce queries with script checks + lightweight
Latin-language scoring (de/fr/es/it/pt/nl/en).
"""

import re
import unicodedata
from typing import Dict, List


class LanguageDetector:
    """Rule-based language detector for short e-commerce queries.

    Detection runs in two stages:

    1. Script check: if the text contains at least one character from a
       known non-Latin script range, the language mapped to that range is
       returned immediately.
    2. Latin scoring: otherwise the Latin-letter tokens are scored per
       language (de/fr/es/it/pt/nl/en) from lexicon hits, diacritic or
       orthographic markers and word suffixes. The highest score wins, with
       a small constant bias towards English.
    """

    # Constant added to the English score so that queries without any other
    # evidence resolve to English.
    _ENGLISH_BIAS = 0.2

    # Score contributed by every distinct token found in a language lexicon.
    _LEXICON_HIT_WEIGHT = 2.0

    def __init__(self):
        # One character class per supported non-Latin script.
        self._re_zh = re.compile(r"[\u4e00-\u9fff]")
        self._re_ja_kana = re.compile(r"[\u3040-\u30ff]")
        self._re_ko = re.compile(r"[\uac00-\ud7af]")
        self._re_ru = re.compile(r"[\u0400-\u04ff]")
        self._re_ar = re.compile(r"[\u0600-\u06ff]")
        self._re_hi = re.compile(r"[\u0900-\u097f]")
        self._re_he = re.compile(r"[\u0590-\u05ff]")
        self._re_th = re.compile(r"[\u0e00-\u0e7f]")
        # Runs of ASCII letters plus the accented letters of Latin-1.
        self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")

        # Evaluation order matters: kana is tested before the ideograph
        # range so that text mixing kana and ideographs is reported as
        # Japanese rather than Chinese.
        self._script_checks = (
            (self._re_ja_kana, "ja"),
            (self._re_ko, "ko"),
            (self._re_zh, "zh"),
            (self._re_ru, "ru"),
            (self._re_ar, "ar"),
            (self._re_hi, "hi"),
            (self._re_he, "he"),
            (self._re_th, "th"),
        )

        # Stopwords + e-commerce terms for Latin-family disambiguation.
        # The key order also defines the tie-break order in detect().
        self._latin_lexicons: Dict[str, set] = {
            "en": {
                "the", "and", "for", "with", "new", "women", "men", "kids",
                "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless",
            },
            "de": {
                "der", "die", "das", "und", "mit", "für", "damen", "herren",
                "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",
            },
            "fr": {
                "le", "la", "les", "et", "avec", "pour", "femme", "homme",
                "enfant", "chaussures", "robe", "chemise", "veste", "sac",
            },
            "es": {
                "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre",
                "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso",
            },
            "it": {
                "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo",
                "bambino", "scarpe", "abito", "camicia", "giacca", "borsa",
            },
            "pt": {
                "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",
                "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa",
            },
            "nl": {
                "de", "het", "en", "met", "voor", "dames", "heren", "kinderen",
                "schoenen", "jurk", "overhemd", "jas", "tas",
            },
        }
        # Substring markers: each marker adds its weight once if it occurs
        # anywhere in the (lower-cased, NFC-normalized) query.
        self._diacritic_weights: Dict[str, Dict[str, int]] = {
            "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},
            "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},
            "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},
            "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},
            "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},
            "nl": {"ij": 2},
        }
        # Light suffix hints for common product words:
        # (language, suffixes, weight added per token ending in any of them).
        self._suffix_hints = (
            ("de", ("ung", "chen"), 0.6),
            ("es", ("ción", "ado", "ada"), 0.6),
            ("it", ("zione", "etto", "ella"), 0.6),
            ("pt", ("ção", "mente"), 0.6),
            ("fr", ("ment", "eau"), 0.5),
        )

    def detect(self, text: str) -> str:
        """
        Detect language code for text.

        The text is stripped, lower-cased and normalized to Unicode NFC
        before any matching, so canonically equivalent spellings (for
        example a precomposed "ñ" versus "n" followed by a combining tilde)
        produce the same result.

        Text containing ideographs from the CJK range but no kana is
        reported as "zh". Script-based results are keyed on the script only;
        any text containing a character from a script range gets that
        script's code.

        Args:
            text: The query to classify.

        Returns:
            One of: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown.
            "unknown" is returned for empty or whitespace-only input and for
            text that contains neither a supported script nor Latin letters.
        """
        if not text or not text.strip():
            return "unknown"

        # NFC composes base letter + combining mark sequences into single
        # code points; without it decomposed input would be split by the
        # Latin word pattern and miss the diacritic markers and script ranges.
        q = unicodedata.normalize("NFC", text.strip().lower())

        # Script-first detection for non-Latin languages.
        for pattern, lang in self._script_checks:
            if pattern.search(q):
                return lang

        # Latin-family scoring.
        tokens = self._re_latin_word.findall(q)
        if not tokens:
            return "unknown"

        scores = self._score_latin(q, tokens)
        # max() keeps the first maximal entry, so ties are resolved by the
        # key order of the lexicon table. The English bias makes the top
        # score strictly positive, so no extra fallback is required.
        return max(scores, key=lambda lang: scores[lang])

    def _score_latin(self, q: str, tokens) -> Dict[str, float]:
        """Return the per-language score table for a Latin-script query."""
        scores: Dict[str, float] = {lang: 0.0 for lang in self._latin_lexicons}
        distinct_tokens = set(tokens)

        # Lexicon matches: every distinct token counts once per language.
        for lang, lexicon in self._latin_lexicons.items():
            hits = len(distinct_tokens & lexicon)
            if hits:
                scores[lang] += hits * self._LEXICON_HIT_WEIGHT

        # Diacritics / orthographic hints.
        for lang, markers in self._diacritic_weights.items():
            for marker, weight in markers.items():
                if marker in q:
                    scores[lang] += weight

        # Suffix hints are counted per token occurrence (duplicates included).
        for token in tokens:
            for lang, suffixes, weight in self._suffix_hints:
                if token.endswith(suffixes):
                    scores[lang] += weight

        # Fallback preference: English for pure Latin short tokens.
        scores["en"] += self._ENGLISH_BIAS
        return scores

    def is_chinese(self, text: str) -> bool:
        """Return True if detect() classifies the text as "zh"."""
        return self.detect(text) == "zh"

    def is_english(self, text: str) -> bool:
        """Return True if detect() classifies the text as "en"."""
        return self.detect(text) == "en"

    def is_russian(self, text: str) -> bool:
        """Return True if detect() classifies the text as "ru"."""
        return self.detect(text) == "ru"

    def is_arabic(self, text: str) -> bool:
        """Return True if detect() classifies the text as "ar"."""
        return self.detect(text) == "ar"

    def is_japanese(self, text: str) -> bool:
        """Return True if detect() classifies the text as "ja"."""
        return self.detect(text) == "ja"