language_detector.py
3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
Language detection utility.
Script-first rules for CJK and other non-Latin scripts, then Lingua
(lingua-language-detector) for Latin text and Romance/Germanic disambiguation.
"""
from __future__ import annotations
from typing import Dict, Optional
import re
from lingua import Language, LanguageDetectorBuilder
# Lingua ``Language`` member -> short code returned by
# ``LanguageDetector.detect``. Only languages listed here can be reported by
# the Lingua stage; anything else maps to "unknown" in ``detect``.
_LINGUA_TO_CODE: Dict[Language, str] = {
    Language.CHINESE: "zh",
    Language.ENGLISH: "en",
    Language.JAPANESE: "ja",
    Language.KOREAN: "ko",
    Language.GERMAN: "de",
    Language.FRENCH: "fr",
    Language.SPANISH: "es",
    Language.ITALIAN: "it",
    Language.PORTUGUESE: "pt",
    Language.DUTCH: "nl",
    Language.RUSSIAN: "ru",
    Language.ARABIC: "ar",
    Language.HINDI: "hi",
    Language.HEBREW: "he",
    Language.THAI: "th",
}

# Candidate languages passed to the Lingua builder: exactly the mapping's
# keys, in insertion order.
_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE)

# Module-wide detector slot; stays None until _get_lingua_detector() fills it.
_lingua_detector: Optional[object] = None
def _get_lingua_detector():
    """Return the module-wide Lingua detector, building it on first call.

    The detector is built from ``_LINGUA_LANGUAGES`` and stored in the
    module global ``_lingua_detector``; later calls return that same object.
    """
    global _lingua_detector
    # Fast path: a previous call already built the detector.
    if _lingua_detector is not None:
        return _lingua_detector
    builder = LanguageDetectorBuilder.from_languages(*_LINGUA_LANGUAGES)
    _lingua_detector = builder.build()
    return _lingua_detector
class LanguageDetector:
    """Language detector: script hints + Lingua for Latin-family queries.

    Detection runs in two stages:

    1. Script rules. If the text contains at least one character from a
       known non-Latin range, the code for that script is returned at once.
       Rules are tried in a fixed priority order (kana, Hangul, Han,
       Cyrillic, Arabic, Devanagari, Hebrew, Thai).
    2. Lingua. Text that matches no script rule is handed to the shared
       Lingua detector and its answer is translated via ``_LINGUA_TO_CODE``.
    """

    def __init__(self):
        self._re_zh = re.compile(r"[\u4e00-\u9fff]")
        # Hiragana + katakana, minus U+30FB KATAKANA MIDDLE DOT. That dot is
        # script-neutral punctuation (Unicode Script=Common) and also appears
        # as a name separator in Chinese text, so it must not count as
        # evidence of Japanese on its own.
        self._re_ja_kana = re.compile(r"[\u3040-\u30fa\u30fc-\u30ff]")
        self._re_ko = re.compile(r"[\uac00-\ud7af]")
        self._re_ru = re.compile(r"[\u0400-\u04ff]")
        self._re_ar = re.compile(r"[\u0600-\u06ff]")
        self._re_hi = re.compile(r"[\u0900-\u097f]")
        self._re_he = re.compile(r"[\u0590-\u05ff]")
        self._re_th = re.compile(r"[\u0e00-\u0e7f]")
        # Priority order matters: kana is tested before Han so that Japanese
        # text mixing kanji with kana is reported as "ja"; Han-only text is
        # reported as "zh".
        self._script_rules = (
            ("ja", self._re_ja_kana),
            ("ko", self._re_ko),
            ("zh", self._re_zh),
            ("ru", self._re_ru),
            ("ar", self._re_ar),
            ("hi", self._re_hi),
            ("he", self._re_he),
            ("th", self._re_th),
        )

    def detect(self, text: str) -> str:
        """
        Detect language code for text.

        Args:
            text: Input text; surrounding whitespace is ignored.

        Returns:
            One of: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown.
            "unknown" is returned for empty or whitespace-only input, when
            Lingua yields no language, or when Lingua yields a language that
            has no entry in ``_LINGUA_TO_CODE``.
        """
        q = text.strip() if text else ""
        if not q:
            return "unknown"

        # Script-first: unambiguous blocks before Latin/Romance Lingua pass.
        for code, pattern in self._script_rules:
            if pattern.search(q):
                return code

        detected = _get_lingua_detector().detect_language_of(q)
        if detected is None:
            return "unknown"
        return _LINGUA_TO_CODE.get(detected, "unknown")

    def is_chinese(self, text: str) -> bool:
        """Return True when ``detect(text)`` is "zh"."""
        return self.detect(text) == "zh"

    def is_english(self, text: str) -> bool:
        """Return True when ``detect(text)`` is "en"."""
        return self.detect(text) == "en"

    def is_russian(self, text: str) -> bool:
        """Return True when ``detect(text)`` is "ru"."""
        return self.detect(text) == "ru"

    def is_arabic(self, text: str) -> bool:
        """Return True when ``detect(text)`` is "ar"."""
        return self.detect(text) == "ar"

    def is_japanese(self, text: str) -> bool:
        """Return True when ``detect(text)`` is "ja"."""
        return self.detect(text) == "ja"