language_detector.py
5.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
Language detection utility.
Detects language of short e-commerce queries with script checks + lightweight
Latin-language scoring (de/fr/es/it/pt/nl/en).
"""
import re
import unicodedata
from typing import Dict, List
class LanguageDetector:
    """Rule-based language detector for common e-commerce query languages.

    Detection runs in two stages:

    1. Script check: the first matching Unicode block decides the language
       for non-Latin scripts (ja, ko, zh, ru, ar, hi, he, th).
    2. Latin scoring: Latin-script queries are scored per language from
       lexicon hits, diacritic/orthographic markers and word suffixes; the
       highest score wins, with a small bias towards English.
    """

    # (suffixes, language, weight) applied once per token that ends with any
    # of the suffixes.
    # NOTE(review): several of these endings also occur in other languages
    # (e.g. English words ending in "ment"); with no lexicon hit they outweigh
    # the 0.2 English bias. Kept as in the original heuristics.
    _SUFFIX_HINTS = (
        (("ung", "chen"), "de", 0.6),
        (("ción", "ado", "ada"), "es", 0.6),
        (("zione", "etto", "ella"), "it", 0.6),
        (("ção", "mente"), "pt", 0.6),
        (("ment", "eau"), "fr", 0.5),
    )

    def __init__(self):
        """Compile the script patterns and build the Latin-language tables."""
        self._re_zh = re.compile(r"[\u4e00-\u9fff]")
        self._re_ja_kana = re.compile(r"[\u3040-\u30ff]")
        self._re_ko = re.compile(r"[\uac00-\ud7af]")
        self._re_ru = re.compile(r"[\u0400-\u04ff]")
        self._re_ar = re.compile(r"[\u0600-\u06ff]")
        self._re_hi = re.compile(r"[\u0900-\u097f]")
        self._re_he = re.compile(r"[\u0590-\u05ff]")
        self._re_th = re.compile(r"[\u0e00-\u0e7f]")
        # Runs of ASCII letters plus Latin-1 letters (× and ÷ are excluded).
        self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")

        # Stopwords + e-commerce terms for Latin-family disambiguation.
        self._latin_lexicons: Dict[str, set] = {
            "en": {
                "the", "and", "for", "with", "new", "women", "men", "kids",
                "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless",
            },
            "de": {
                "der", "die", "das", "und", "mit", "für", "damen", "herren",
                "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",
            },
            "fr": {
                "le", "la", "les", "et", "avec", "pour", "femme", "homme",
                "enfant", "chaussures", "robe", "chemise", "veste", "sac",
            },
            "es": {
                "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre",
                "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso",
            },
            "it": {
                "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo",
                "bambino", "scarpe", "abito", "camicia", "giacca", "borsa",
            },
            "pt": {
                "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",
                "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa",
            },
            "nl": {
                "de", "het", "en", "met", "voor", "dames", "heren", "kinderen",
                "schoenen", "jurk", "overhemd", "jas", "tas",
            },
        }

        # Substring markers: each marker found anywhere in the query adds its
        # weight once to the language's score.
        self._diacritic_weights: Dict[str, Dict[str, int]] = {
            "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},
            "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},
            "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},
            "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},
            "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},
            "nl": {"ij": 2},
        }

    def detect(self, text: str) -> str:
        """Detect the language code for ``text``.

        Args:
            text: The query to classify. ``None``, empty and whitespace-only
                values are accepted.

        Returns:
            One of zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown.
            "unknown" is returned for empty input and for input that contains
            neither a recognised script nor any Latin letters.
        """
        if not text or not text.strip():
            return "unknown"

        # Compose to NFC so that decomposed input ("n" + U+0303, conjoining
        # Hangul jamo, ...) matches the precomposed characters used by the
        # patterns, lexicons and diacritic markers below.
        q = unicodedata.normalize("NFC", text.strip().lower())

        # Script-first detection for non-Latin languages; the first match wins.
        # Kana is tested before CJK ideographs so text mixing kana and kanji is
        # reported as "ja"; text with ideographs only is reported as "zh".
        script_checks = (
            (self._re_ja_kana, "ja"),
            (self._re_ko, "ko"),
            (self._re_zh, "zh"),
            (self._re_ru, "ru"),
            (self._re_ar, "ar"),
            (self._re_hi, "hi"),
            (self._re_he, "he"),
            (self._re_th, "th"),
        )
        for pattern, code in script_checks:
            if pattern.search(q):
                return code

        # Latin-family scoring.
        tokens = self._re_latin_word.findall(q)
        if not tokens:
            return "unknown"

        scores: Dict[str, float] = {lang: 0.0 for lang in self._latin_lexicons}

        # Lexicon matches: 2 points per distinct query word found in a lexicon.
        token_set = set(tokens)
        for lang, lexicon in self._latin_lexicons.items():
            scores[lang] += len(token_set & lexicon) * 2.0

        # Diacritics / orthographic hints.
        for lang, hints in self._diacritic_weights.items():
            for marker, weight in hints.items():
                if marker in q:
                    scores[lang] += weight

        # Light suffix hints for common product words.
        for token in tokens:
            for suffixes, lang, weight in self._SUFFIX_HINTS:
                if token.endswith(suffixes):
                    scores[lang] += weight

        # Fallback preference: English for pure Latin short tokens. This also
        # guarantees a positive maximum, so no separate zero-score fallback is
        # needed.
        scores["en"] += 0.2

        # Ties resolve to the language listed first in the lexicon table.
        return max(scores, key=scores.get)

    def is_chinese(self, text: str) -> bool:
        """Return True when ``detect(text)`` is "zh"."""
        return self.detect(text) == "zh"

    def is_english(self, text: str) -> bool:
        """Return True when ``detect(text)`` is "en"."""
        return self.detect(text) == "en"

    def is_russian(self, text: str) -> bool:
        """Return True when ``detect(text)`` is "ru"."""
        return self.detect(text) == "ru"

    def is_arabic(self, text: str) -> bool:
        """Return True when ``detect(text)`` is "ar"."""
        return self.detect(text) == "ar"

    def is_japanese(self, text: str) -> bool:
        """Return True when ``detect(text)`` is "ja"."""
        return self.detect(text) == "ja"