Commit 0ea456b2f14531db776ab4ad3d3aec2423cb11e0

Authored by tangwang
1 parent 272aeabe

+lingua-language-detector

docs/TODO.txt
... ... @@ -247,6 +247,8 @@ config/environments/<env>.yaml
247 247  
248 248  
249 249  
  250 +无结果重查
  251 +稀有语言,翻译可能超时(因为zh-en互译之外的翻译耗时更长)
250 252  
251 253  
252 254  
... ...
query/language_detector.py
1 1 """
2 2 Language detection utility.
3 3  
4   -Detects language of short e-commerce queries with script checks + lightweight
5   -Latin-language scoring (de/fr/es/it/pt/nl/en).
  4 +Script-first rules for CJK and other non-Latin scripts, then Lingua
  5 +(lingua-language-detector) for Latin text and Romance/Germanic disambiguation.
6 6 """
7 7  
8   -from typing import Dict, List
  8 +from __future__ import annotations
  9 +
  10 +from typing import Dict, Optional
9 11 import re
10 12  
  13 +from lingua import Language, LanguageDetectorBuilder
  14 +
  15 +_LINGUA_TO_CODE: Dict[Language, str] = {
  16 + Language.CHINESE: "zh",
  17 + Language.ENGLISH: "en",
  18 + Language.JAPANESE: "ja",
  19 + Language.KOREAN: "ko",
  20 + Language.GERMAN: "de",
  21 + Language.FRENCH: "fr",
  22 + Language.SPANISH: "es",
  23 + Language.ITALIAN: "it",
  24 + Language.PORTUGUESE: "pt",
  25 + Language.DUTCH: "nl",
  26 + Language.RUSSIAN: "ru",
  27 + Language.ARABIC: "ar",
  28 + Language.HINDI: "hi",
  29 + Language.HEBREW: "he",
  30 + Language.THAI: "th",
  31 +}
  32 +
  33 +_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys())
  34 +
  35 +_lingua_detector: Optional[object] = None
  36 +
  37 +
  38 +def _get_lingua_detector():
  39 + global _lingua_detector
  40 + if _lingua_detector is None:
  41 + _lingua_detector = LanguageDetectorBuilder.from_languages(
  42 + *_LINGUA_LANGUAGES
  43 + ).build()
  44 + return _lingua_detector
  45 +
11 46  
12 47 class LanguageDetector:
13   - """Rule-based language detector for common e-commerce query languages."""
  48 + """Language detector: script hints + Lingua for Latin-family queries."""
14 49  
15 50 def __init__(self):
16 51 self._re_zh = re.compile(r"[\u4e00-\u9fff]")
... ... @@ -21,47 +56,6 @@ class LanguageDetector:
21 56 self._re_hi = re.compile(r"[\u0900-\u097f]")
22 57 self._re_he = re.compile(r"[\u0590-\u05ff]")
23 58 self._re_th = re.compile(r"[\u0e00-\u0e7f]")
24   - self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")
25   -
26   - # Stopwords + e-commerce terms for Latin-family disambiguation.
27   - self._latin_lexicons: Dict[str, set] = {
28   - "en": {
29   - "the", "and", "for", "with", "new", "women", "men", "kids",
30   - "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless",
31   - },
32   - "de": {
33   - "der", "die", "das", "und", "mit", "für", "damen", "herren",
34   - "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",
35   - },
36   - "fr": {
37   - "le", "la", "les", "et", "avec", "pour", "femme", "homme",
38   - "enfant", "chaussures", "robe", "chemise", "veste", "sac",
39   - },
40   - "es": {
41   - "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre",
42   - "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso",
43   - },
44   - "it": {
45   - "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo",
46   - "bambino", "scarpe", "abito", "camicia", "giacca", "borsa",
47   - },
48   - "pt": {
49   - "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",
50   - "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa",
51   - },
52   - "nl": {
53   - "de", "het", "en", "met", "voor", "dames", "heren", "kinderen",
54   - "schoenen", "jurk", "overhemd", "jas", "tas",
55   - },
56   - }
57   - self._diacritic_weights: Dict[str, Dict[str, int]] = {
58   - "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},
59   - "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},
60   - "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},
61   - "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},
62   - "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},
63   - "nl": {"ij": 2},
64   - }
65 59  
66 60 def detect(self, text: str) -> str:
67 61 """
... ... @@ -71,9 +65,9 @@ class LanguageDetector:
71 65 """
72 66 if not text or not text.strip():
73 67 return "unknown"
74   - q = text.strip().lower()
  68 + q = text.strip()
75 69  
76   - # Script-first detection for non-Latin languages.
  70 + # Script-first: unambiguous blocks before Latin/Romance Lingua pass.
77 71 if self._re_ja_kana.search(q):
78 72 return "ja"
79 73 if self._re_ko.search(q):
... ... @@ -91,48 +85,11 @@ class LanguageDetector:
91 85 if self._re_th.search(q):
92 86 return "th"
93 87  
94   - # Latin-family scoring.
95   - tokens = self._re_latin_word.findall(q)
96   - if not tokens:
  88 + detected = _get_lingua_detector().detect_language_of(q)
  89 + if detected is None:
97 90 return "unknown"
98   -
99   - scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()}
100   - scores["en"] = scores.get("en", 0.0)
101   - token_set = set(tokens)
102   -
103   - # Lexicon matches
104   - for lang, lex in self._latin_lexicons.items():
105   - overlap = len(token_set & lex)
106   - if overlap:
107   - scores[lang] += overlap * 2.0
108   -
109   - # Diacritics / orthographic hints
110   - for lang, hints in self._diacritic_weights.items():
111   - for marker, weight in hints.items():
112   - if marker in q:
113   - scores[lang] += weight
114   -
115   - # Light suffix hints for common product words
116   - for t in tokens:
117   - if t.endswith("ung") or t.endswith("chen"):
118   - scores["de"] += 0.6
119   - if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"):
120   - scores["es"] += 0.6
121   - if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"):
122   - scores["it"] += 0.6
123   - if t.endswith("ção") or t.endswith("mente"):
124   - scores["pt"] += 0.6
125   - if t.endswith("ment") or t.endswith("eau"):
126   - scores["fr"] += 0.5
127   -
128   - # Fallback preference: English for pure Latin short tokens.
129   - scores["en"] += 0.2
130   -
131   - best_lang = max(scores.items(), key=lambda x: x[1])[0]
132   - best_score = scores[best_lang]
133   - if best_score <= 0:
134   - return "en"
135   - return best_lang
  91 + code = _LINGUA_TO_CODE.get(detected)
  92 + return code if code is not None else "unknown"
136 93  
137 94 def is_chinese(self, text: str) -> bool:
138 95 return self.detect(text) == "zh"
... ...
requirements.txt
... ... @@ -42,3 +42,6 @@ click>=8.1.0
42 42 pytest>=7.4.0
43 43 pytest-asyncio>=0.21.0
44 44 httpx>=0.24.0
  45 +
  46 +# language detector
  47 +lingua-language-detector
45 48 \ No newline at end of file
... ...