Commit 0ea456b2f14531db776ab4ad3d3aec2423cb11e0
1 parent: 272aeabe
+lingua-language-detector
Showing 3 changed files with 50 additions and 88 deletions
Show diff stats
docs/TODO.txt
query/language_detector.py
| 1 | 1 | """ |
| 2 | 2 | Language detection utility. |
| 3 | 3 | |
| 4 | -Detects language of short e-commerce queries with script checks + lightweight | |
| 5 | -Latin-language scoring (de/fr/es/it/pt/nl/en). | |
| 4 | +Script-first rules for CJK and other non-Latin scripts, then Lingua | |
| 5 | +(lingua-language-detector) for Latin text and Romance/Germanic disambiguation. | |
| 6 | 6 | """ |
| 7 | 7 | |
| 8 | -from typing import Dict, List | |
| 8 | +from __future__ import annotations | |
| 9 | + | |
| 10 | +from typing import Dict, Optional | |
| 9 | 11 | import re |
| 10 | 12 | |
| 13 | +from lingua import Language, LanguageDetectorBuilder | |
| 14 | + | |
| 15 | +_LINGUA_TO_CODE: Dict[Language, str] = { | |
| 16 | + Language.CHINESE: "zh", | |
| 17 | + Language.ENGLISH: "en", | |
| 18 | + Language.JAPANESE: "ja", | |
| 19 | + Language.KOREAN: "ko", | |
| 20 | + Language.GERMAN: "de", | |
| 21 | + Language.FRENCH: "fr", | |
| 22 | + Language.SPANISH: "es", | |
| 23 | + Language.ITALIAN: "it", | |
| 24 | + Language.PORTUGUESE: "pt", | |
| 25 | + Language.DUTCH: "nl", | |
| 26 | + Language.RUSSIAN: "ru", | |
| 27 | + Language.ARABIC: "ar", | |
| 28 | + Language.HINDI: "hi", | |
| 29 | + Language.HEBREW: "he", | |
| 30 | + Language.THAI: "th", | |
| 31 | +} | |
| 32 | + | |
| 33 | +_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys()) | |
| 34 | + | |
| 35 | +_lingua_detector: Optional[object] = None | |
| 36 | + | |
| 37 | + | |
| 38 | +def _get_lingua_detector(): | |
| 39 | + global _lingua_detector | |
| 40 | + if _lingua_detector is None: | |
| 41 | + _lingua_detector = LanguageDetectorBuilder.from_languages( | |
| 42 | + *_LINGUA_LANGUAGES | |
| 43 | + ).build() | |
| 44 | + return _lingua_detector | |
| 45 | + | |
| 11 | 46 | |
| 12 | 47 | class LanguageDetector: |
| 13 | - """Rule-based language detector for common e-commerce query languages.""" | |
| 48 | + """Language detector: script hints + Lingua for Latin-family queries.""" | |
| 14 | 49 | |
| 15 | 50 | def __init__(self): |
| 16 | 51 | self._re_zh = re.compile(r"[\u4e00-\u9fff]") |
| ... | ... | @@ -21,47 +56,6 @@ class LanguageDetector: |
| 21 | 56 | self._re_hi = re.compile(r"[\u0900-\u097f]") |
| 22 | 57 | self._re_he = re.compile(r"[\u0590-\u05ff]") |
| 23 | 58 | self._re_th = re.compile(r"[\u0e00-\u0e7f]") |
| 24 | - self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+") | |
| 25 | - | |
| 26 | - # Stopwords + e-commerce terms for Latin-family disambiguation. | |
| 27 | - self._latin_lexicons: Dict[str, set] = { | |
| 28 | - "en": { | |
| 29 | - "the", "and", "for", "with", "new", "women", "men", "kids", | |
| 30 | - "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless", | |
| 31 | - }, | |
| 32 | - "de": { | |
| 33 | - "der", "die", "das", "und", "mit", "für", "damen", "herren", | |
| 34 | - "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche", | |
| 35 | - }, | |
| 36 | - "fr": { | |
| 37 | - "le", "la", "les", "et", "avec", "pour", "femme", "homme", | |
| 38 | - "enfant", "chaussures", "robe", "chemise", "veste", "sac", | |
| 39 | - }, | |
| 40 | - "es": { | |
| 41 | - "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre", | |
| 42 | - "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso", | |
| 43 | - }, | |
| 44 | - "it": { | |
| 45 | - "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo", | |
| 46 | - "bambino", "scarpe", "abito", "camicia", "giacca", "borsa", | |
| 47 | - }, | |
| 48 | - "pt": { | |
| 49 | - "o", "a", "os", "as", "e", "com", "para", "mulher", "homem", | |
| 50 | - "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa", | |
| 51 | - }, | |
| 52 | - "nl": { | |
| 53 | - "de", "het", "en", "met", "voor", "dames", "heren", "kinderen", | |
| 54 | - "schoenen", "jurk", "overhemd", "jas", "tas", | |
| 55 | - }, | |
| 56 | - } | |
| 57 | - self._diacritic_weights: Dict[str, Dict[str, int]] = { | |
| 58 | - "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4}, | |
| 59 | - "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2}, | |
| 60 | - "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2}, | |
| 61 | - "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2}, | |
| 62 | - "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2}, | |
| 63 | - "nl": {"ij": 2}, | |
| 64 | - } | |
| 65 | 59 | |
| 66 | 60 | def detect(self, text: str) -> str: |
| 67 | 61 | """ |
| ... | ... | @@ -71,9 +65,9 @@ class LanguageDetector: |
| 71 | 65 | """ |
| 72 | 66 | if not text or not text.strip(): |
| 73 | 67 | return "unknown" |
| 74 | - q = text.strip().lower() | |
| 68 | + q = text.strip() | |
| 75 | 69 | |
| 76 | - # Script-first detection for non-Latin languages. | |
| 70 | + # Script-first: unambiguous blocks before Latin/Romance Lingua pass. | |
| 77 | 71 | if self._re_ja_kana.search(q): |
| 78 | 72 | return "ja" |
| 79 | 73 | if self._re_ko.search(q): |
| ... | ... | @@ -91,48 +85,11 @@ class LanguageDetector: |
| 91 | 85 | if self._re_th.search(q): |
| 92 | 86 | return "th" |
| 93 | 87 | |
| 94 | - # Latin-family scoring. | |
| 95 | - tokens = self._re_latin_word.findall(q) | |
| 96 | - if not tokens: | |
| 88 | + detected = _get_lingua_detector().detect_language_of(q) | |
| 89 | + if detected is None: | |
| 97 | 90 | return "unknown" |
| 98 | - | |
| 99 | - scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()} | |
| 100 | - scores["en"] = scores.get("en", 0.0) | |
| 101 | - token_set = set(tokens) | |
| 102 | - | |
| 103 | - # Lexicon matches | |
| 104 | - for lang, lex in self._latin_lexicons.items(): | |
| 105 | - overlap = len(token_set & lex) | |
| 106 | - if overlap: | |
| 107 | - scores[lang] += overlap * 2.0 | |
| 108 | - | |
| 109 | - # Diacritics / orthographic hints | |
| 110 | - for lang, hints in self._diacritic_weights.items(): | |
| 111 | - for marker, weight in hints.items(): | |
| 112 | - if marker in q: | |
| 113 | - scores[lang] += weight | |
| 114 | - | |
| 115 | - # Light suffix hints for common product words | |
| 116 | - for t in tokens: | |
| 117 | - if t.endswith("ung") or t.endswith("chen"): | |
| 118 | - scores["de"] += 0.6 | |
| 119 | - if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"): | |
| 120 | - scores["es"] += 0.6 | |
| 121 | - if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"): | |
| 122 | - scores["it"] += 0.6 | |
| 123 | - if t.endswith("ção") or t.endswith("mente"): | |
| 124 | - scores["pt"] += 0.6 | |
| 125 | - if t.endswith("ment") or t.endswith("eau"): | |
| 126 | - scores["fr"] += 0.5 | |
| 127 | - | |
| 128 | - # Fallback preference: English for pure Latin short tokens. | |
| 129 | - scores["en"] += 0.2 | |
| 130 | - | |
| 131 | - best_lang = max(scores.items(), key=lambda x: x[1])[0] | |
| 132 | - best_score = scores[best_lang] | |
| 133 | - if best_score <= 0: | |
| 134 | - return "en" | |
| 135 | - return best_lang | |
| 91 | + code = _LINGUA_TO_CODE.get(detected) | |
| 92 | + return code if code is not None else "unknown" | |
| 136 | 93 | |
| 137 | 94 | def is_chinese(self, text: str) -> bool: |
| 138 | 95 | return self.detect(text) == "zh" | ... | ... |