Commit 0ea456b2f14531db776ab4ad3d3aec2423cb11e0

Authored by tangwang
1 parent 272aeabe

+lingua-language-detector

@@ -247,6 +247,8 @@ config/environments/<env>.yaml @@ -247,6 +247,8 @@ config/environments/<env>.yaml
247 247
248 248
249 249
  250 +无结果重查
  251 +稀有语言,翻译可能超时(因为zh-en互译之外的翻译耗时更长)
250 252
251 253
252 254
query/language_detector.py
1 """ 1 """
2 Language detection utility. 2 Language detection utility.
3 3
4 -Detects language of short e-commerce queries with script checks + lightweight  
5 -Latin-language scoring (de/fr/es/it/pt/nl/en). 4 +Script-first rules for CJK and other non-Latin scripts, then Lingua
  5 +(lingua-language-detector) for Latin text and Romance/Germanic disambiguation.
6 """ 6 """
7 7
8 -from typing import Dict, List 8 +from __future__ import annotations
  9 +
  10 +from typing import Dict, Optional
9 import re 11 import re
10 12
  13 +from lingua import Language, LanguageDetectorBuilder
  14 +
  15 +_LINGUA_TO_CODE: Dict[Language, str] = {
  16 + Language.CHINESE: "zh",
  17 + Language.ENGLISH: "en",
  18 + Language.JAPANESE: "ja",
  19 + Language.KOREAN: "ko",
  20 + Language.GERMAN: "de",
  21 + Language.FRENCH: "fr",
  22 + Language.SPANISH: "es",
  23 + Language.ITALIAN: "it",
  24 + Language.PORTUGUESE: "pt",
  25 + Language.DUTCH: "nl",
  26 + Language.RUSSIAN: "ru",
  27 + Language.ARABIC: "ar",
  28 + Language.HINDI: "hi",
  29 + Language.HEBREW: "he",
  30 + Language.THAI: "th",
  31 +}
  32 +
  33 +_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys())
  34 +
  35 +_lingua_detector: Optional[object] = None
  36 +
  37 +
  38 +def _get_lingua_detector():
  39 + global _lingua_detector
  40 + if _lingua_detector is None:
  41 + _lingua_detector = LanguageDetectorBuilder.from_languages(
  42 + *_LINGUA_LANGUAGES
  43 + ).build()
  44 + return _lingua_detector
  45 +
11 46
12 class LanguageDetector: 47 class LanguageDetector:
13 - """Rule-based language detector for common e-commerce query languages.""" 48 + """Language detector: script hints + Lingua for Latin-family queries."""
14 49
15 def __init__(self): 50 def __init__(self):
16 self._re_zh = re.compile(r"[\u4e00-\u9fff]") 51 self._re_zh = re.compile(r"[\u4e00-\u9fff]")
@@ -21,47 +56,6 @@ class LanguageDetector: @@ -21,47 +56,6 @@ class LanguageDetector:
21 self._re_hi = re.compile(r"[\u0900-\u097f]") 56 self._re_hi = re.compile(r"[\u0900-\u097f]")
22 self._re_he = re.compile(r"[\u0590-\u05ff]") 57 self._re_he = re.compile(r"[\u0590-\u05ff]")
23 self._re_th = re.compile(r"[\u0e00-\u0e7f]") 58 self._re_th = re.compile(r"[\u0e00-\u0e7f]")
24 - self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")  
25 -  
26 - # Stopwords + e-commerce terms for Latin-family disambiguation.  
27 - self._latin_lexicons: Dict[str, set] = {  
28 - "en": {  
29 - "the", "and", "for", "with", "new", "women", "men", "kids",  
30 - "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless",  
31 - },  
32 - "de": {  
33 - "der", "die", "das", "und", "mit", "für", "damen", "herren",  
34 - "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",  
35 - },  
36 - "fr": {  
37 - "le", "la", "les", "et", "avec", "pour", "femme", "homme",  
38 - "enfant", "chaussures", "robe", "chemise", "veste", "sac",  
39 - },  
40 - "es": {  
41 - "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre",  
42 - "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso",  
43 - },  
44 - "it": {  
45 - "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo",  
46 - "bambino", "scarpe", "abito", "camicia", "giacca", "borsa",  
47 - },  
48 - "pt": {  
49 - "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",  
50 - "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa",  
51 - },  
52 - "nl": {  
53 - "de", "het", "en", "met", "voor", "dames", "heren", "kinderen",  
54 - "schoenen", "jurk", "overhemd", "jas", "tas",  
55 - },  
56 - }  
57 - self._diacritic_weights: Dict[str, Dict[str, int]] = {  
58 - "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},  
59 - "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},  
60 - "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},  
61 - "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},  
62 - "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},  
63 - "nl": {"ij": 2},  
64 - }  
65 59
66 def detect(self, text: str) -> str: 60 def detect(self, text: str) -> str:
67 """ 61 """
@@ -71,9 +65,9 @@ class LanguageDetector: @@ -71,9 +65,9 @@ class LanguageDetector:
71 """ 65 """
72 if not text or not text.strip(): 66 if not text or not text.strip():
73 return "unknown" 67 return "unknown"
74 - q = text.strip().lower() 68 + q = text.strip()
75 69
76 - # Script-first detection for non-Latin languages. 70 + # Script-first: unambiguous blocks before Latin/Romance Lingua pass.
77 if self._re_ja_kana.search(q): 71 if self._re_ja_kana.search(q):
78 return "ja" 72 return "ja"
79 if self._re_ko.search(q): 73 if self._re_ko.search(q):
@@ -91,48 +85,11 @@ class LanguageDetector: @@ -91,48 +85,11 @@ class LanguageDetector:
91 if self._re_th.search(q): 85 if self._re_th.search(q):
92 return "th" 86 return "th"
93 87
94 - # Latin-family scoring.  
95 - tokens = self._re_latin_word.findall(q)  
96 - if not tokens: 88 + detected = _get_lingua_detector().detect_language_of(q)
  89 + if detected is None:
97 return "unknown" 90 return "unknown"
98 -  
99 - scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()}  
100 - scores["en"] = scores.get("en", 0.0)  
101 - token_set = set(tokens)  
102 -  
103 - # Lexicon matches  
104 - for lang, lex in self._latin_lexicons.items():  
105 - overlap = len(token_set & lex)  
106 - if overlap:  
107 - scores[lang] += overlap * 2.0  
108 -  
109 - # Diacritics / orthographic hints  
110 - for lang, hints in self._diacritic_weights.items():  
111 - for marker, weight in hints.items():  
112 - if marker in q:  
113 - scores[lang] += weight  
114 -  
115 - # Light suffix hints for common product words  
116 - for t in tokens:  
117 - if t.endswith("ung") or t.endswith("chen"):  
118 - scores["de"] += 0.6  
119 - if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"):  
120 - scores["es"] += 0.6  
121 - if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"):  
122 - scores["it"] += 0.6  
123 - if t.endswith("ção") or t.endswith("mente"):  
124 - scores["pt"] += 0.6  
125 - if t.endswith("ment") or t.endswith("eau"):  
126 - scores["fr"] += 0.5  
127 -  
128 - # Fallback preference: English for pure Latin short tokens.  
129 - scores["en"] += 0.2  
130 -  
131 - best_lang = max(scores.items(), key=lambda x: x[1])[0]  
132 - best_score = scores[best_lang]  
133 - if best_score <= 0:  
134 - return "en"  
135 - return best_lang 91 + code = _LINGUA_TO_CODE.get(detected)
  92 + return code if code is not None else "unknown"
136 93
137 def is_chinese(self, text: str) -> bool: 94 def is_chinese(self, text: str) -> bool:
138 return self.detect(text) == "zh" 95 return self.detect(text) == "zh"
@@ -42,3 +42,6 @@ click>=8.1.0 @@ -42,3 +42,6 @@ click>=8.1.0
42 pytest>=7.4.0 42 pytest>=7.4.0
43 pytest-asyncio>=0.21.0 43 pytest-asyncio>=0.21.0
44 httpx>=0.24.0 44 httpx>=0.24.0
  45 +
  46 +# language detector
  47 +lingua-language-detector
45 \ No newline at end of file 48 \ No newline at end of file