be52af70
tangwang
first commit
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
"""
Language detection utility.
Detects the language of a query string.
"""
from typing import Optional
import re
class LanguageDetector:
    """Simple rule-based language detector for common e-commerce languages.

    Detection works by counting how many characters of the input belong to
    each supported script and comparing each script's share of the counted
    characters against a fixed threshold. Characters that belong to none of
    the scripts (digits, punctuation, whitespace, ...) are ignored.
    """

    # Unicode ranges for different scripts
    CJK_RANGES = [
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Extension A
        (0x20000, 0x2A6DF),  # CJK Extension B
        (0x3040, 0x309F),    # Hiragana
        (0x30A0, 0x30FF),    # Katakana
    ]
    # Hiragana and Katakana are adjacent blocks, so one span covers both.
    KANA_RANGE = (0x3040, 0x30FF)
    CYRILLIC_RANGE = (0x0400, 0x04FF)
    ARABIC_RANGE = (0x0600, 0x06FF)
    # NOTE: this single span also covers the punctuation between 'Z' and 'a'
    # ([ \ ] ^ _ `). detect() tests A-Z and a-z explicitly instead of using it.
    LATIN_RANGE = (0x0041, 0x007A)

    # (script bucket, minimum share that must be exceeded, language code),
    # evaluated in order; the first rule that matches wins.
    _DECISION_RULES = (
        ('japanese', 0.1, 'ja'),  # a little kana is enough to mean Japanese
        ('russian', 0.5, 'ru'),
        ('arabic', 0.5, 'ar'),
        ('chinese', 0.3, 'zh'),   # ideographs without enough kana
        ('latin', 0.5, 'en'),
    )

    def __init__(self) -> None:
        """Initialize language detector."""
        # These compiled patterns are not used by detect(); they are kept as
        # instance attributes so existing code that reads them keeps working.
        self.chinese_pattern = re.compile(r'[\u4e00-\u9fff]+')
        self.russian_pattern = re.compile(r'[\u0400-\u04ff]+')
        self.arabic_pattern = re.compile(r'[\u0600-\u06ff]+')
        self.japanese_pattern = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]+')

    @classmethod
    def _script_of(cls, code_point: int) -> Optional[str]:
        """Return the script bucket for a code point, or None if unsupported.

        Kana is tested before the CJK ranges because CJK_RANGES also lists
        the Hiragana/Katakana blocks; checking it first guarantees that a
        kana character is counted once (as Japanese) rather than in both the
        'japanese' and 'chinese' buckets.
        """
        if cls.KANA_RANGE[0] <= code_point <= cls.KANA_RANGE[1]:
            return 'japanese'
        if any(start <= code_point <= end for start, end in cls.CJK_RANGES):
            return 'chinese'
        if cls.CYRILLIC_RANGE[0] <= code_point <= cls.CYRILLIC_RANGE[1]:
            return 'russian'
        if cls.ARABIC_RANGE[0] <= code_point <= cls.ARABIC_RANGE[1]:
            return 'arabic'
        # ASCII letters only: A-Z and a-z.
        if (0x0041 <= code_point <= 0x005A) or (0x0061 <= code_point <= 0x007A):
            return 'latin'
        return None

    def detect(self, text: str) -> str:
        """
        Detect language of text.

        Args:
            text: Input text

        Returns:
            Language code: 'zh', 'en', 'ru', 'ar', 'ja', or 'unknown'.
            'unknown' is returned for empty/whitespace-only input, for input
            containing no character of a supported script, and when no
            script exceeds its threshold.
        """
        if not text or not text.strip():
            return 'unknown'

        # Count characters in each script; every character lands in at most
        # one bucket, so the total equals the number of recognised characters.
        char_counts = {
            'chinese': 0,
            'russian': 0,
            'arabic': 0,
            'japanese': 0,
            'latin': 0,
        }
        for char in text:
            script = self._script_of(ord(char))
            if script is not None:
                char_counts[script] += 1

        total_chars = sum(char_counts.values())
        if total_chars == 0:
            return 'unknown'

        # Determine the dominant script by share of recognised characters.
        for script, threshold, language in self._DECISION_RULES:
            if char_counts[script] / total_chars > threshold:
                return language
        return 'unknown'

    def is_chinese(self, text: str) -> bool:
        """Check if text is primarily Chinese."""
        return self.detect(text) == 'zh'

    def is_english(self, text: str) -> bool:
        """Check if text is primarily English."""
        return self.detect(text) == 'en'

    def is_russian(self, text: str) -> bool:
        """Check if text is primarily Russian."""
        return self.detect(text) == 'ru'

    def is_arabic(self, text: str) -> bool:
        """Check if text is primarily Arabic."""
        return self.detect(text) == 'ar'

    def is_japanese(self, text: str) -> bool:
        """Check if text is primarily Japanese."""
        return self.detect(text) == 'ja'
|