Blame view

query/language_detector.py 3.78 KB
be52af70   tangwang   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
  """
  Language detection utility.
  
  Detects the language of a query string.
  """
  
  from typing import Optional
  import re
  
  
  class LanguageDetector:
      """Simple rule-based language detector for common e-commerce languages.

      Detection counts how many characters of the input fall into each
      supported script's Unicode range and compares every script's share of
      the counted characters against fixed thresholds. Characters outside the
      supported ranges (digits, punctuation, whitespace, non-ASCII Latin
      letters, ...) are not counted at all.
      """

      # Unicode ranges for different scripts (inclusive code point bounds).
      # NOTE: this list also contains the kana blocks; detect() tests kana
      # first so that a kana character is never counted as Chinese as well.
      CJK_RANGES = [
          (0x4E00, 0x9FFF),   # CJK Unified Ideographs
          (0x3400, 0x4DBF),   # CJK Extension A
          (0x20000, 0x2A6DF), # CJK Extension B
          (0x3040, 0x309F),   # Hiragana
          (0x30A0, 0x30FF),   # Katakana
      ]

      CYRILLIC_RANGE = (0x0400, 0x04FF)
      ARABIC_RANGE = (0x0600, 0x06FF)
      # Spans 'A'..'z', which also includes the punctuation between 'Z' and
      # 'a'. detect() does not use it; it tests the two letter ranges directly.
      LATIN_RANGE = (0x0041, 0x007A)

      def __init__(self):
          """Initialize language detector.

          Compiles one regular expression per script and stores it on the
          instance. The methods of this class do not use these patterns.
          """
          self.chinese_pattern = re.compile(r'[\u4e00-\u9fff]+')
          self.russian_pattern = re.compile(r'[\u0400-\u04ff]+')
          self.arabic_pattern = re.compile(r'[\u0600-\u06ff]+')
          self.japanese_pattern = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]+')

      def detect(self, text: str) -> str:
          """
          Detect language of text.

          Each character is attributed to at most one script, so the shares
          below are true fractions of the counted characters. The checks run
          in a fixed order and the first one that matches wins:

          1. kana share > 10%      -> 'ja'
          2. Cyrillic share > 50%  -> 'ru'
          3. Arabic share > 50%    -> 'ar'
          4. Han share > 30%       -> 'zh'
          5. ASCII letters > 50%   -> 'en'

          Args:
              text: Input text

          Returns:
              Language code: 'zh', 'en', 'ru', 'ar', 'ja', or 'unknown'.
              'unknown' is returned for empty/whitespace-only input, for input
              with no character from a supported script, and when no
              threshold is exceeded.
          """
          if not text or not text.strip():
              return 'unknown'

          # Count characters in each script
          char_counts = {
              'chinese': 0,
              'russian': 0,
              'arabic': 0,
              'japanese': 0,
              'latin': 0
          }

          cyrillic_start, cyrillic_end = self.CYRILLIC_RANGE
          arabic_start, arabic_end = self.ARABIC_RANGE

          for char in text:
              code_point = ord(char)

              # Kana must be tested before CJK_RANGES: that list includes the
              # kana blocks, and counting a kana character under both
              # 'japanese' and 'chinese' would inflate the total and push the
              # kana share below its real value.
              if 0x3040 <= code_point <= 0x30FF:
                  char_counts['japanese'] += 1
              elif any(start <= code_point <= end for start, end in self.CJK_RANGES):
                  # Han ideographs (shared by Chinese and Japanese text)
                  char_counts['chinese'] += 1
              elif cyrillic_start <= code_point <= cyrillic_end:
                  char_counts['russian'] += 1
              elif arabic_start <= code_point <= arabic_end:
                  char_counts['arabic'] += 1
              elif (0x0041 <= code_point <= 0x005A) or (0x0061 <= code_point <= 0x007A):
                  # ASCII letters only (A-Z, a-z)
                  char_counts['latin'] += 1

          # Determine dominant script
          total_chars = sum(char_counts.values())
          if total_chars == 0:
              return 'unknown'

          # Calculate percentages
          percentages = {
              script: count / total_chars
              for script, count in char_counts.items()
          }

          # Japanese mixes kana with Han ideographs, so a small kana share is
          # already enough to decide for Japanese.
          if percentages['japanese'] > 0.1:
              return 'ja'

          # Russian (Cyrillic)
          if percentages['russian'] > 0.5:
              return 'ru'

          # Arabic
          if percentages['arabic'] > 0.5:
              return 'ar'

          # Chinese (Han ideographs; kana-heavy text was handled above)
          if percentages['chinese'] > 0.3:
              return 'zh'

          # English/Latin
          if percentages['latin'] > 0.5:
              return 'en'

          return 'unknown'

      def is_chinese(self, text: str) -> bool:
          """Check if text is primarily Chinese (detect() returns 'zh')."""
          return self.detect(text) == 'zh'

      def is_english(self, text: str) -> bool:
          """Check if text is primarily English (detect() returns 'en')."""
          return self.detect(text) == 'en'

      def is_russian(self, text: str) -> bool:
          """Check if text is primarily Russian (detect() returns 'ru')."""
          return self.detect(text) == 'ru'

      def is_arabic(self, text: str) -> bool:
          """Check if text is primarily Arabic (detect() returns 'ar')."""
          return self.detect(text) == 'ar'

      def is_japanese(self, text: str) -> bool:
          """Check if text is primarily Japanese (detect() returns 'ja')."""
          return self.detect(text) == 'ja'