Blame view

query/language_detector.py 3.07 KB
be52af70   tangwang   first commit
1
2
3
  """
  Language detection utility.
  
0ea456b2   tangwang   +lingua-language-...
4
5
  Script-first rules for CJK and other non-Latin scripts, then Lingua
  (lingua-language-detector) for Latin text and Romance/Germanic disambiguation.
be52af70   tangwang   first commit
6
7
  """
  
0ea456b2   tangwang   +lingua-language-...
8
9
10
  from __future__ import annotations
  
  from typing import Dict, Optional
be52af70   tangwang   first commit
11
12
  import re
  
0ea456b2   tangwang   +lingua-language-...
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
  from lingua import Language, LanguageDetectorBuilder
  
  # Maps each Lingua ``Language`` member to the short language code that
  # ``LanguageDetector.detect`` returns for it. The keys of this mapping are
  # also the full set of languages the Lingua detector is built with.
  _LINGUA_TO_CODE: Dict[Language, str] = {
      Language.CHINESE: "zh",
      Language.ENGLISH: "en",
      Language.JAPANESE: "ja",
      Language.KOREAN: "ko",
      Language.GERMAN: "de",
      Language.FRENCH: "fr",
      Language.SPANISH: "es",
      Language.ITALIAN: "it",
      Language.PORTUGUESE: "pt",
      Language.DUTCH: "nl",
      Language.RUSSIAN: "ru",
      Language.ARABIC: "ar",
      Language.HINDI: "hi",
      Language.HEBREW: "he",
      Language.THAI: "th",
  }

  # Languages handed to the Lingua builder; derived from the mapping keys so
  # the two cannot drift apart.
  _LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys())

  # Module-level cache for the Lingua detector. Stays None until
  # _get_lingua_detector() builds it on first use.
  _lingua_detector: Optional[object] = None
  
  
  def _get_lingua_detector():
      """Return the shared Lingua detector, building it on first use.

      The detector is built from ``_LINGUA_LANGUAGES`` and cached in the
      module-level ``_lingua_detector`` so later calls reuse the same object.
      """
      global _lingua_detector
      # Fast path: a detector was already built by an earlier call.
      if _lingua_detector is not None:
          return _lingua_detector
      builder = LanguageDetectorBuilder.from_languages(*_LINGUA_LANGUAGES)
      _lingua_detector = builder.build()
      return _lingua_detector
  
be52af70   tangwang   first commit
46
47
  
  class LanguageDetector:
      """Language detector: script hints + Lingua for Latin-family queries.

      Detection is script-first. If the text contains any character from one
      of the non-Latin Unicode blocks below, the matching language code is
      returned immediately. Only text without such characters is passed to
      the Lingua detector.
      """

      def __init__(self):
          """Compile the per-script patterns and fix their evaluation order."""
          self._re_zh = re.compile(r"[\u4e00-\u9fff]")  # CJK Unified Ideographs
          self._re_ja_kana = re.compile(r"[\u3040-\u30ff]")  # Hiragana + Katakana
          self._re_ko = re.compile(r"[\uac00-\ud7af]")  # Hangul Syllables
          self._re_ru = re.compile(r"[\u0400-\u04ff]")  # Cyrillic
          self._re_ar = re.compile(r"[\u0600-\u06ff]")  # Arabic
          self._re_hi = re.compile(r"[\u0900-\u097f]")  # Devanagari
          self._re_he = re.compile(r"[\u0590-\u05ff]")  # Hebrew
          self._re_th = re.compile(r"[\u0e00-\u0e7f]")  # Thai

          # Ordered (pattern, code) rules; the first match wins. Kana is tested
          # before the CJK ideograph range because Japanese text mixes kanji
          # with kana, and kanji alone would otherwise be reported as "zh".
          self._script_rules = (
              (self._re_ja_kana, "ja"),
              (self._re_ko, "ko"),
              (self._re_zh, "zh"),
              (self._re_ru, "ru"),
              (self._re_ar, "ar"),
              (self._re_hi, "hi"),
              (self._re_he, "he"),
              (self._re_th, "th"),
          )

      def detect(self, text: str) -> str:
          """
          Detect language code for text.

          Args:
              text: Text to classify. Surrounding whitespace is ignored.

          Returns one of: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown

          "unknown" is returned for empty or whitespace-only input, when
          Lingua yields no language, or when Lingua's result has no entry in
          ``_LINGUA_TO_CODE``.
          """
          q = text.strip() if text else ""
          if not q:
              return "unknown"

          # Script-first: unambiguous blocks before Latin/Romance Lingua pass.
          # A single character from a block is enough to decide, so mixed
          # queries (e.g. a Latin brand name plus Chinese) follow the script.
          for pattern, code in self._script_rules:
              if pattern.search(q):
                  return code

          detected = _get_lingua_detector().detect_language_of(q)
          if detected is None:
              return "unknown"
          return _LINGUA_TO_CODE.get(detected, "unknown")

      def is_chinese(self, text: str) -> bool:
          """Return True when ``detect(text)`` yields "zh"."""
          return self.detect(text) == "zh"

      def is_english(self, text: str) -> bool:
          """Return True when ``detect(text)`` yields "en"."""
          return self.detect(text) == "en"

      def is_russian(self, text: str) -> bool:
          """Return True when ``detect(text)`` yields "ru"."""
          return self.detect(text) == "ru"

      def is_arabic(self, text: str) -> bool:
          """Return True when ``detect(text)`` yields "ar"."""
          return self.detect(text) == "ar"

      def is_japanese(self, text: str) -> bool:
          """Return True when ``detect(text)`` yields "ja"."""
          return self.detect(text) == "ja"