Commit 0ea456b2f14531db776ab4ad3d3aec2423cb11e0
1 parent: 272aeabe
+lingua-language-detector
Showing 3 changed files with 50 additions and 88 deletions
Show diff stats
docs/TODO.txt
query/language_detector.py
| 1 | """ | 1 | """ |
| 2 | Language detection utility. | 2 | Language detection utility. |
| 3 | 3 | ||
| 4 | -Detects language of short e-commerce queries with script checks + lightweight | ||
| 5 | -Latin-language scoring (de/fr/es/it/pt/nl/en). | 4 | +Script-first rules for CJK and other non-Latin scripts, then Lingua |
| 5 | +(lingua-language-detector) for Latin text and Romance/Germanic disambiguation. | ||
| 6 | """ | 6 | """ |
| 7 | 7 | ||
| 8 | -from typing import Dict, List | 8 | +from __future__ import annotations |
| 9 | + | ||
| 10 | +from typing import Dict, Optional | ||
| 9 | import re | 11 | import re |
| 10 | 12 | ||
| 13 | +from lingua import Language, LanguageDetectorBuilder | ||
| 14 | + | ||
| 15 | +_LINGUA_TO_CODE: Dict[Language, str] = { | ||
| 16 | + Language.CHINESE: "zh", | ||
| 17 | + Language.ENGLISH: "en", | ||
| 18 | + Language.JAPANESE: "ja", | ||
| 19 | + Language.KOREAN: "ko", | ||
| 20 | + Language.GERMAN: "de", | ||
| 21 | + Language.FRENCH: "fr", | ||
| 22 | + Language.SPANISH: "es", | ||
| 23 | + Language.ITALIAN: "it", | ||
| 24 | + Language.PORTUGUESE: "pt", | ||
| 25 | + Language.DUTCH: "nl", | ||
| 26 | + Language.RUSSIAN: "ru", | ||
| 27 | + Language.ARABIC: "ar", | ||
| 28 | + Language.HINDI: "hi", | ||
| 29 | + Language.HEBREW: "he", | ||
| 30 | + Language.THAI: "th", | ||
| 31 | +} | ||
| 32 | + | ||
| 33 | +_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys()) | ||
| 34 | + | ||
| 35 | +_lingua_detector: Optional[object] = None | ||
| 36 | + | ||
| 37 | + | ||
| 38 | +def _get_lingua_detector(): | ||
| 39 | + global _lingua_detector | ||
| 40 | + if _lingua_detector is None: | ||
| 41 | + _lingua_detector = LanguageDetectorBuilder.from_languages( | ||
| 42 | + *_LINGUA_LANGUAGES | ||
| 43 | + ).build() | ||
| 44 | + return _lingua_detector | ||
| 45 | + | ||
| 11 | 46 | ||
| 12 | class LanguageDetector: | 47 | class LanguageDetector: |
| 13 | - """Rule-based language detector for common e-commerce query languages.""" | 48 | + """Language detector: script hints + Lingua for Latin-family queries.""" |
| 14 | 49 | ||
| 15 | def __init__(self): | 50 | def __init__(self): |
| 16 | self._re_zh = re.compile(r"[\u4e00-\u9fff]") | 51 | self._re_zh = re.compile(r"[\u4e00-\u9fff]") |
| @@ -21,47 +56,6 @@ class LanguageDetector: | @@ -21,47 +56,6 @@ class LanguageDetector: | ||
| 21 | self._re_hi = re.compile(r"[\u0900-\u097f]") | 56 | self._re_hi = re.compile(r"[\u0900-\u097f]") |
| 22 | self._re_he = re.compile(r"[\u0590-\u05ff]") | 57 | self._re_he = re.compile(r"[\u0590-\u05ff]") |
| 23 | self._re_th = re.compile(r"[\u0e00-\u0e7f]") | 58 | self._re_th = re.compile(r"[\u0e00-\u0e7f]") |
| 24 | - self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+") | ||
| 25 | - | ||
| 26 | - # Stopwords + e-commerce terms for Latin-family disambiguation. | ||
| 27 | - self._latin_lexicons: Dict[str, set] = { | ||
| 28 | - "en": { | ||
| 29 | - "the", "and", "for", "with", "new", "women", "men", "kids", | ||
| 30 | - "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless", | ||
| 31 | - }, | ||
| 32 | - "de": { | ||
| 33 | - "der", "die", "das", "und", "mit", "für", "damen", "herren", | ||
| 34 | - "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche", | ||
| 35 | - }, | ||
| 36 | - "fr": { | ||
| 37 | - "le", "la", "les", "et", "avec", "pour", "femme", "homme", | ||
| 38 | - "enfant", "chaussures", "robe", "chemise", "veste", "sac", | ||
| 39 | - }, | ||
| 40 | - "es": { | ||
| 41 | - "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre", | ||
| 42 | - "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso", | ||
| 43 | - }, | ||
| 44 | - "it": { | ||
| 45 | - "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo", | ||
| 46 | - "bambino", "scarpe", "abito", "camicia", "giacca", "borsa", | ||
| 47 | - }, | ||
| 48 | - "pt": { | ||
| 49 | - "o", "a", "os", "as", "e", "com", "para", "mulher", "homem", | ||
| 50 | - "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa", | ||
| 51 | - }, | ||
| 52 | - "nl": { | ||
| 53 | - "de", "het", "en", "met", "voor", "dames", "heren", "kinderen", | ||
| 54 | - "schoenen", "jurk", "overhemd", "jas", "tas", | ||
| 55 | - }, | ||
| 56 | - } | ||
| 57 | - self._diacritic_weights: Dict[str, Dict[str, int]] = { | ||
| 58 | - "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4}, | ||
| 59 | - "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2}, | ||
| 60 | - "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2}, | ||
| 61 | - "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2}, | ||
| 62 | - "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2}, | ||
| 63 | - "nl": {"ij": 2}, | ||
| 64 | - } | ||
| 65 | 59 | ||
| 66 | def detect(self, text: str) -> str: | 60 | def detect(self, text: str) -> str: |
| 67 | """ | 61 | """ |
| @@ -71,9 +65,9 @@ class LanguageDetector: | @@ -71,9 +65,9 @@ class LanguageDetector: | ||
| 71 | """ | 65 | """ |
| 72 | if not text or not text.strip(): | 66 | if not text or not text.strip(): |
| 73 | return "unknown" | 67 | return "unknown" |
| 74 | - q = text.strip().lower() | 68 | + q = text.strip() |
| 75 | 69 | ||
| 76 | - # Script-first detection for non-Latin languages. | 70 | + # Script-first: unambiguous blocks before Latin/Romance Lingua pass. |
| 77 | if self._re_ja_kana.search(q): | 71 | if self._re_ja_kana.search(q): |
| 78 | return "ja" | 72 | return "ja" |
| 79 | if self._re_ko.search(q): | 73 | if self._re_ko.search(q): |
| @@ -91,48 +85,11 @@ class LanguageDetector: | @@ -91,48 +85,11 @@ class LanguageDetector: | ||
| 91 | if self._re_th.search(q): | 85 | if self._re_th.search(q): |
| 92 | return "th" | 86 | return "th" |
| 93 | 87 | ||
| 94 | - # Latin-family scoring. | ||
| 95 | - tokens = self._re_latin_word.findall(q) | ||
| 96 | - if not tokens: | 88 | + detected = _get_lingua_detector().detect_language_of(q) |
| 89 | + if detected is None: | ||
| 97 | return "unknown" | 90 | return "unknown" |
| 98 | - | ||
| 99 | - scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()} | ||
| 100 | - scores["en"] = scores.get("en", 0.0) | ||
| 101 | - token_set = set(tokens) | ||
| 102 | - | ||
| 103 | - # Lexicon matches | ||
| 104 | - for lang, lex in self._latin_lexicons.items(): | ||
| 105 | - overlap = len(token_set & lex) | ||
| 106 | - if overlap: | ||
| 107 | - scores[lang] += overlap * 2.0 | ||
| 108 | - | ||
| 109 | - # Diacritics / orthographic hints | ||
| 110 | - for lang, hints in self._diacritic_weights.items(): | ||
| 111 | - for marker, weight in hints.items(): | ||
| 112 | - if marker in q: | ||
| 113 | - scores[lang] += weight | ||
| 114 | - | ||
| 115 | - # Light suffix hints for common product words | ||
| 116 | - for t in tokens: | ||
| 117 | - if t.endswith("ung") or t.endswith("chen"): | ||
| 118 | - scores["de"] += 0.6 | ||
| 119 | - if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"): | ||
| 120 | - scores["es"] += 0.6 | ||
| 121 | - if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"): | ||
| 122 | - scores["it"] += 0.6 | ||
| 123 | - if t.endswith("ção") or t.endswith("mente"): | ||
| 124 | - scores["pt"] += 0.6 | ||
| 125 | - if t.endswith("ment") or t.endswith("eau"): | ||
| 126 | - scores["fr"] += 0.5 | ||
| 127 | - | ||
| 128 | - # Fallback preference: English for pure Latin short tokens. | ||
| 129 | - scores["en"] += 0.2 | ||
| 130 | - | ||
| 131 | - best_lang = max(scores.items(), key=lambda x: x[1])[0] | ||
| 132 | - best_score = scores[best_lang] | ||
| 133 | - if best_score <= 0: | ||
| 134 | - return "en" | ||
| 135 | - return best_lang | 91 | + code = _LINGUA_TO_CODE.get(detected) |
| 92 | + return code if code is not None else "unknown" | ||
| 136 | 93 | ||
| 137 | def is_chinese(self, text: str) -> bool: | 94 | def is_chinese(self, text: str) -> bool: |
| 138 | return self.detect(text) == "zh" | 95 | return self.detect(text) == "zh" |
requirements.txt
| @@ -42,3 +42,6 @@ click>=8.1.0 | @@ -42,3 +42,6 @@ click>=8.1.0 | ||
| 42 | pytest>=7.4.0 | 42 | pytest>=7.4.0 |
| 43 | pytest-asyncio>=0.21.0 | 43 | pytest-asyncio>=0.21.0 |
| 44 | httpx>=0.24.0 | 44 | httpx>=0.24.0 |
| 45 | + | ||
| 46 | +# language detector | ||
| 47 | +lingua-language-detector | ||
| 45 | \ No newline at end of file | 48 | \ No newline at end of file |