+lingua-language-detector

tangwang
1 parent 272aeabe
Showing 3 changed files with 50 additions and 88 deletions Show diff stats
docs/TODO.txt
query/language_detector.py
requirements.txt
@@ -247,6 +247,8 @@ config/environments/<env>.yaml
  
  
  
+无结果重查
+稀有语言，翻译可能超时（因为zh-en互译之外的翻译耗时更长）
  
  
  
 """
 Language detection utility.
  
-Detects language of short e-commerce queries with script checks + lightweight
-Latin-language scoring (de/fr/es/it/pt/nl/en).
+Script-first rules for CJK and other non-Latin scripts, then Lingua
+(lingua-language-detector) for Latin text and Romance/Germanic disambiguation.
 """
  
-from typing import Dict, List
+from __future__ import annotations
+
+from typing import Dict, Optional
 import re
  
+from lingua import Language, LanguageDetectorBuilder
+
+_LINGUA_TO_CODE: Dict[Language, str] = {  # Lingua enum member -> short code returned by LanguageDetector.detect
+    Language.CHINESE: "zh",
+    Language.ENGLISH: "en",
+    Language.JAPANESE: "ja",
+    Language.KOREAN: "ko",
+    Language.GERMAN: "de",
+    Language.FRENCH: "fr",
+    Language.SPANISH: "es",
+    Language.ITALIAN: "it",
+    Language.PORTUGUESE: "pt",
+    Language.DUTCH: "nl",
+    Language.RUSSIAN: "ru",
+    Language.ARABIC: "ar",
+    Language.HINDI: "hi",
+    Language.HEBREW: "he",
+    Language.THAI: "th",
+}
+
+_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys())  # candidate languages handed to LanguageDetectorBuilder.from_languages
+
+_lingua_detector: Optional[object] = None  # module-level cache; filled on the first _get_lingua_detector() call
+
+
+def _get_lingua_detector():
+    global _lingua_detector
+    if _lingua_detector is None:
+        _lingua_detector = LanguageDetectorBuilder.from_languages(
+            *_LINGUA_LANGUAGES
+        ).build()
+    return _lingua_detector
+
  
 class LanguageDetector:
-    """Rule-based language detector for common e-commerce query languages."""
+    """Language detector: script hints + Lingua for Latin-family queries."""
  
     def __init__(self):
         self._re_zh = re.compile(r"[\u4e00-\u9fff]")
@@ -21,47 +56,6 @@ class LanguageDetector:
         self._re_hi = re.compile(r"[\u0900-\u097f]")
         self._re_he = re.compile(r"[\u0590-\u05ff]")
         self._re_th = re.compile(r"[\u0e00-\u0e7f]")
-        self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")
-
-        # Stopwords + e-commerce terms for Latin-family disambiguation.
-        self._latin_lexicons: Dict[str, set] = {
-            "en": {
-                "the", "and", "for", "with", "new", "women", "men", "kids",
-                "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless",
-            },
-            "de": {
-                "der", "die", "das", "und", "mit", "für", "damen", "herren",
-                "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",
-            },
-            "fr": {
-                "le", "la", "les", "et", "avec", "pour", "femme", "homme",
-                "enfant", "chaussures", "robe", "chemise", "veste", "sac",
-            },
-            "es": {
-                "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre",
-                "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso",
-            },
-            "it": {
-                "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo",
-                "bambino", "scarpe", "abito", "camicia", "giacca", "borsa",
-            },
-            "pt": {
-                "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",
-                "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa",
-            },
-            "nl": {
-                "de", "het", "en", "met", "voor", "dames", "heren", "kinderen",
-                "schoenen", "jurk", "overhemd", "jas", "tas",
-            },
-        }
-        self._diacritic_weights: Dict[str, Dict[str, int]] = {
-            "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},
-            "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},
-            "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},
-            "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},
-            "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},
-            "nl": {"ij": 2},
-        }
  
     def detect(self, text: str) -> str:
         """
@@ -71,9 +65,9 @@ class LanguageDetector:
         """
         if not text or not text.strip():
             return "unknown"
-        q = text.strip().lower()
+        q = text.strip()
  
-        # Script-first detection for non-Latin languages.
+        # Script-first: unambiguous blocks before Latin/Romance Lingua pass.
         if self._re_ja_kana.search(q):
             return "ja"
         if self._re_ko.search(q):
@@ -91,48 +85,11 @@ class LanguageDetector:
         if self._re_th.search(q):
             return "th"
  
-        # Latin-family scoring.
-        tokens = self._re_latin_word.findall(q)
-        if not tokens:
+        detected = _get_lingua_detector().detect_language_of(q)
+        if detected is None:
             return "unknown"
-
-        scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()}
-        scores["en"] = scores.get("en", 0.0)
-        token_set = set(tokens)
-
-        # Lexicon matches
-        for lang, lex in self._latin_lexicons.items():
-            overlap = len(token_set & lex)
-            if overlap:
-                scores[lang] += overlap * 2.0
-
-        # Diacritics / orthographic hints
-        for lang, hints in self._diacritic_weights.items():
-            for marker, weight in hints.items():
-                if marker in q:
-                    scores[lang] += weight
-
-        # Light suffix hints for common product words
-        for t in tokens:
-            if t.endswith("ung") or t.endswith("chen"):
-                scores["de"] += 0.6
-            if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"):
-                scores["es"] += 0.6
-            if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"):
-                scores["it"] += 0.6
-            if t.endswith("ção") or t.endswith("mente"):
-                scores["pt"] += 0.6
-            if t.endswith("ment") or t.endswith("eau"):
-                scores["fr"] += 0.5
-
-        # Fallback preference: English for pure Latin short tokens.
-        scores["en"] += 0.2
-
-        best_lang = max(scores.items(), key=lambda x: x[1])[0]
-        best_score = scores[best_lang]
-        if best_score <= 0:
-            return "en"
-        return best_lang
+        code = _LINGUA_TO_CODE.get(detected)
+        return code if code is not None else "unknown"
  
     def is_chinese(self, text: str) -> bool:
         return self.detect(text) == "zh"
@@ -42,3 +42,6 @@ click>=8.1.0
 pytest>=7.4.0
 pytest-asyncio>=0.21.0
 httpx>=0.24.0
+
+# language detector
+lingua-language-detector
 \ No newline at end of file