From 0ea456b2f14531db776ab4ad3d3aec2423cb11e0 Mon Sep 17 00:00:00 2001 From: tangwang Date: Fri, 20 Mar 2026 20:54:17 +0800 Subject: [PATCH] +lingua-language-detector --- docs/TODO.txt | 2 ++ query/language_detector.py | 133 +++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------------------------------- requirements.txt | 3 +++ 3 files changed, 50 insertions(+), 88 deletions(-) diff --git a/docs/TODO.txt b/docs/TODO.txt index f628ff9..a66f3a5 100644 --- a/docs/TODO.txt +++ b/docs/TODO.txt @@ -247,6 +247,8 @@ config/environments/.yaml +无结果重查 +稀有语言,翻译可能超时(因为zh-en互译之外的翻译耗时更长) diff --git a/query/language_detector.py b/query/language_detector.py index 44fc041..493618a 100644 --- a/query/language_detector.py +++ b/query/language_detector.py @@ -1,16 +1,51 @@ """ Language detection utility. -Detects language of short e-commerce queries with script checks + lightweight -Latin-language scoring (de/fr/es/it/pt/nl/en). +Script-first rules for CJK and other non-Latin scripts, then Lingua +(lingua-language-detector) for Latin text and Romance/Germanic disambiguation. """ -from typing import Dict, List +from __future__ import annotations + +from typing import Dict, Optional import re +from lingua import Language, LanguageDetectorBuilder + +_LINGUA_TO_CODE: Dict[Language, str] = { + Language.CHINESE: "zh", + Language.ENGLISH: "en", + Language.JAPANESE: "ja", + Language.KOREAN: "ko", + Language.GERMAN: "de", + Language.FRENCH: "fr", + Language.SPANISH: "es", + Language.ITALIAN: "it", + Language.PORTUGUESE: "pt", + Language.DUTCH: "nl", + Language.RUSSIAN: "ru", + Language.ARABIC: "ar", + Language.HINDI: "hi", + Language.HEBREW: "he", + Language.THAI: "th", +} + +_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys()) + +_lingua_detector: Optional[object] = None + + +def _get_lingua_detector(): + global _lingua_detector + if _lingua_detector is None: + _lingua_detector = LanguageDetectorBuilder.from_languages( + *_LINGUA_LANGUAGES + ).build() + return _lingua_detector + class LanguageDetector: - """Rule-based language detector for common e-commerce query languages.""" + """Language detector: script hints + Lingua for Latin-family queries.""" def __init__(self): self._re_zh = re.compile(r"[\u4e00-\u9fff]") @@ -21,47 +56,6 @@ class LanguageDetector: self._re_hi = re.compile(r"[\u0900-\u097f]") self._re_he = re.compile(r"[\u0590-\u05ff]") self._re_th = re.compile(r"[\u0e00-\u0e7f]") - self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+") - - # Stopwords + e-commerce terms for Latin-family disambiguation. - self._latin_lexicons: Dict[str, set] = { - "en": { - "the", "and", "for", "with", "new", "women", "men", "kids", - "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless", - }, - "de": { - "der", "die", "das", "und", "mit", "für", "damen", "herren", - "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche", - }, - "fr": { - "le", "la", "les", "et", "avec", "pour", "femme", "homme", - "enfant", "chaussures", "robe", "chemise", "veste", "sac", - }, - "es": { - "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre", - "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso", - }, - "it": { - "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo", - "bambino", "scarpe", "abito", "camicia", "giacca", "borsa", - }, - "pt": { - "o", "a", "os", "as", "e", "com", "para", "mulher", "homem", - "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa", - }, - "nl": { - "de", "het", "en", "met", "voor", "dames", "heren", "kinderen", - "schoenen", "jurk", "overhemd", "jas", "tas", - }, - } - self._diacritic_weights: Dict[str, Dict[str, int]] = { - "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4}, - "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2}, - "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2}, - "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2}, - "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2}, - "nl": {"ij": 2}, - } def detect(self, text: str) -> str: """ @@ -71,9 +65,9 @@ class LanguageDetector: """ if not text or not text.strip(): return "unknown" - q = text.strip().lower() + q = text.strip() - # Script-first detection for non-Latin languages. + # Script-first: unambiguous blocks before Latin/Romance Lingua pass. if self._re_ja_kana.search(q): return "ja" if self._re_ko.search(q): @@ -91,48 +85,11 @@ class LanguageDetector: if self._re_th.search(q): return "th" - # Latin-family scoring. - tokens = self._re_latin_word.findall(q) - if not tokens: + detected = _get_lingua_detector().detect_language_of(q) + if detected is None: return "unknown" - - scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()} - scores["en"] = scores.get("en", 0.0) - token_set = set(tokens) - - # Lexicon matches - for lang, lex in self._latin_lexicons.items(): - overlap = len(token_set & lex) - if overlap: - scores[lang] += overlap * 2.0 - - # Diacritics / orthographic hints - for lang, hints in self._diacritic_weights.items(): - for marker, weight in hints.items(): - if marker in q: - scores[lang] += weight - - # Light suffix hints for common product words - for t in tokens: - if t.endswith("ung") or t.endswith("chen"): - scores["de"] += 0.6 - if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"): - scores["es"] += 0.6 - if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"): - scores["it"] += 0.6 - if t.endswith("ção") or t.endswith("mente"): - scores["pt"] += 0.6 - if t.endswith("ment") or t.endswith("eau"): - scores["fr"] += 0.5 - - # Fallback preference: English for pure Latin short tokens. - scores["en"] += 0.2 - - best_lang = max(scores.items(), key=lambda x: x[1])[0] - best_score = scores[best_lang] - if best_score <= 0: - return "en" - return best_lang + code = _LINGUA_TO_CODE.get(detected) + return code if code is not None else "unknown" def is_chinese(self, text: str) -> bool: return self.detect(text) == "zh" diff --git a/requirements.txt b/requirements.txt index 449fd35..8599b37 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,3 +42,6 @@ click>=8.1.0 pytest>=7.4.0 pytest-asyncio>=0.21.0 httpx>=0.24.0 + +# language detector +lingua-language-detector \ No newline at end of file -- libgit2 0.21.2