"""
Language detection utility.

Detects the language of short e-commerce queries with script checks plus
lightweight Latin-language scoring (de/fr/es/it/pt/nl/en).
"""

from typing import Dict, List, Optional
import re
import unicodedata


class LanguageDetector:
    """Rule-based language detector for common e-commerce query languages.

    Detection runs in two stages:

    1. Script check: the first matching non-Latin script pattern decides
       the language (kana, Hangul, CJK ideographs, Cyrillic, Arabic,
       Devanagari, Hebrew, Thai -- tried in that order).
    2. Latin scoring: for Latin-script text, each candidate language
       collects points from lexicon hits, diacritic/orthographic markers
       and word-suffix hints; the highest score wins.
    """

    def __init__(self):
        # One pattern per non-Latin script; a single matching character is
        # enough to decide the language.
        self._re_zh = re.compile(r"[\u4e00-\u9fff]")
        self._re_ja_kana = re.compile(r"[\u3040-\u30ff]")
        self._re_ko = re.compile(r"[\uac00-\ud7af]")
        self._re_ru = re.compile(r"[\u0400-\u04ff]")
        self._re_ar = re.compile(r"[\u0600-\u06ff]")
        self._re_hi = re.compile(r"[\u0900-\u097f]")
        self._re_he = re.compile(r"[\u0590-\u05ff]")
        self._re_th = re.compile(r"[\u0e00-\u0e7f]")
        self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")

        # Order matters: kana is tested before CJK ideographs so that text
        # mixing kana with ideographs is reported as Japanese, not Chinese.
        self._script_patterns: List[tuple] = [
            ("ja", self._re_ja_kana),
            ("ko", self._re_ko),
            ("zh", self._re_zh),
            ("ru", self._re_ru),
            ("ar", self._re_ar),
            ("hi", self._re_hi),
            ("he", self._re_he),
            ("th", self._re_th),
        ]

        # Stopwords + e-commerce terms for Latin-family disambiguation.
        # Dict insertion order doubles as the tie-break order in detect().
        self._latin_lexicons: Dict[str, set] = {
            "en": {
                "the", "and", "for", "with", "new", "women", "men", "kids",
                "shoe", "shoes", "dress", "shirt", "jacket", "bag",
                "wireless",
            },
            "de": {
                "der", "die", "das", "und", "mit", "für", "damen", "herren",
                "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",
            },
            "fr": {
                "le", "la", "les", "et", "avec", "pour", "femme", "homme",
                "enfant", "chaussures", "robe", "chemise", "veste", "sac",
            },
            "es": {
                "el", "la", "los", "las", "y", "con", "para", "mujer",
                "hombre", "niño", "niña", "zapatos", "vestido", "camisa",
                "chaqueta", "bolso",
            },
            "it": {
                "il", "lo", "la", "gli", "le", "e", "con", "per", "donna",
                "uomo", "bambino", "scarpe", "abito", "camicia", "giacca",
                "borsa",
            },
            "pt": {
                "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",
                "criança", "sapatos", "vestido", "camisa", "jaqueta",
                "bolsa",
            },
            "nl": {
                "de", "het", "en", "met", "voor", "dames", "heren",
                "kinderen", "schoenen", "jurk", "overhemd", "jas", "tas",
            },
        }

        # Substring markers (searched in the whole query, not per token)
        # and the score each one adds to its language.
        self._diacritic_weights: Dict[str, Dict[str, int]] = {
            "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},
            "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},
            "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},
            "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},
            "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},
            "nl": {"ij": 2},
        }

        # (language, suffixes, weight): a token ending in any of the
        # suffixes adds `weight` once to that language.
        self._suffix_hints: List[tuple] = [
            ("de", ("ung", "chen"), 0.6),
            ("es", ("ción", "ado", "ada"), 0.6),
            ("it", ("zione", "etto", "ella"), 0.6),
            ("pt", ("ção", "mente"), 0.6),
            ("fr", ("ment", "eau"), 0.5),
        ]

    def detect(self, text: str) -> str:
        """
        Detect language code for text.

        The text is stripped, lowercased and NFC-normalized before matching,
        so decomposed input (a base letter followed by a combining mark, or
        conjoining Hangul jamo) is treated the same as its precomposed form.

        Returns one of: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown.
        "unknown" is returned for empty/whitespace-only text and for text
        that contains neither a recognised script nor any Latin word.
        """
        if not text or not text.strip():
            return "unknown"

        # The lexicons, diacritic markers and the Hangul range are all in
        # precomposed form; without NFC, "u" + U+0308 would never match "ü"
        # and would also split the surrounding word into two tokens.
        q = unicodedata.normalize("NFC", text.strip().lower())

        # Script-first detection for non-Latin languages.
        script_lang = self._detect_script(q)
        if script_lang is not None:
            return script_lang

        # Latin-family scoring.
        tokens = self._re_latin_word.findall(q)
        if not tokens:
            return "unknown"

        scores = self._score_latin(q, tokens)
        # max() keeps the first maximal key, so ties resolve in lexicon
        # order (en, de, fr, es, it, pt, nl). English always carries a
        # small positive bias, so the best score is never <= 0.
        return max(scores, key=scores.get)

    def _detect_script(self, q: str) -> Optional[str]:
        """Return the language of the first non-Latin script found in q, or None."""
        for lang, pattern in self._script_patterns:
            if pattern.search(q):
                return lang
        return None

    def _score_latin(self, q: str, tokens: List[str]) -> Dict[str, float]:
        """Score every Latin-family language for the normalized query q.

        Args:
            q: the stripped, lowercased, NFC-normalized query.
            tokens: Latin words extracted from q.

        Returns:
            Mapping of language code to accumulated score, in lexicon order.
        """
        scores: Dict[str, float] = {lang: 0.0 for lang in self._latin_lexicons}
        token_set = set(tokens)

        # Lexicon matches: 2 points per distinct known word.
        for lang, lexicon in self._latin_lexicons.items():
            scores[lang] += len(token_set & lexicon) * 2.0

        # Diacritics / orthographic hints: each marker counts once per
        # query, no matter how often it occurs.
        for lang, hints in self._diacritic_weights.items():
            for marker, weight in hints.items():
                if marker in q:
                    scores[lang] += weight

        # Light suffix hints for common product words (counted per token,
        # duplicates included).
        for token in tokens:
            for lang, suffixes, weight in self._suffix_hints:
                if token.endswith(suffixes):
                    scores[lang] += weight

        # Fallback preference: English for pure Latin short tokens.
        scores["en"] += 0.2
        return scores

    def is_chinese(self, text: str) -> bool:
        """Return True when detect(text) reports "zh"."""
        return self.detect(text) == "zh"

    def is_english(self, text: str) -> bool:
        """Return True when detect(text) reports "en"."""
        return self.detect(text) == "en"

    def is_russian(self, text: str) -> bool:
        """Return True when detect(text) reports "ru"."""
        return self.detect(text) == "ru"

    def is_arabic(self, text: str) -> bool:
        """Return True when detect(text) reports "ar"."""
        return self.detect(text) == "ar"

    def is_japanese(self, text: str) -> bool:
        """Return True when detect(text) reports "ja"."""
        return self.detect(text) == "ja"