Blame view

query/language_detector.py 5.46 KB
be52af70   tangwang   first commit
1
2
3
  """
  Language detection utility.
  
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
4
5
  Detects language of short e-commerce queries with script checks + lightweight
  Latin-language scoring (de/fr/es/it/pt/nl/en).
be52af70   tangwang   first commit
6
7
  """
  
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
8
  from typing import Dict, List
be52af70   tangwang   first commit
9
10
11
12
  import re
  
  
  class LanguageDetector:
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
13
      """Rule-based language detector for common e-commerce query languages."""
be52af70   tangwang   first commit
14
15
  
      def __init__(self):
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
          self._re_zh = re.compile(r"[\u4e00-\u9fff]")
          self._re_ja_kana = re.compile(r"[\u3040-\u30ff]")
          self._re_ko = re.compile(r"[\uac00-\ud7af]")
          self._re_ru = re.compile(r"[\u0400-\u04ff]")
          self._re_ar = re.compile(r"[\u0600-\u06ff]")
          self._re_hi = re.compile(r"[\u0900-\u097f]")
          self._re_he = re.compile(r"[\u0590-\u05ff]")
          self._re_th = re.compile(r"[\u0e00-\u0e7f]")
          self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")
  
          # Stopwords + e-commerce terms for Latin-family disambiguation.
          self._latin_lexicons: Dict[str, set] = {
              "en": {
                  "the", "and", "for", "with", "new", "women", "men", "kids",
                  "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless",
              },
              "de": {
                  "der", "die", "das", "und", "mit", "für", "damen", "herren",
                  "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",
              },
              "fr": {
                  "le", "la", "les", "et", "avec", "pour", "femme", "homme",
                  "enfant", "chaussures", "robe", "chemise", "veste", "sac",
              },
              "es": {
                  "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre",
                  "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso",
              },
              "it": {
                  "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo",
                  "bambino", "scarpe", "abito", "camicia", "giacca", "borsa",
              },
              "pt": {
                  "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",
                  "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa",
              },
              "nl": {
                  "de", "het", "en", "met", "voor", "dames", "heren", "kinderen",
                  "schoenen", "jurk", "overhemd", "jas", "tas",
              },
          }
          self._diacritic_weights: Dict[str, Dict[str, int]] = {
              "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},
              "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},
              "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},
              "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},
              "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},
              "nl": {"ij": 2},
          }
be52af70   tangwang   first commit
65
66
67
  
      def detect(self, text: str) -> str:
          """
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
68
          Detect language code for text.
be52af70   tangwang   first commit
69
  
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
70
          Returns one of: zh/en/ru/ar/ja/ko/de/fr/es/it/pt/nl/hi/he/th/unknown
be52af70   tangwang   first commit
71
72
          """
          if not text or not text.strip():
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
              return "unknown"
          q = text.strip().lower()
  
          # Script-first detection for non-Latin languages.
          if self._re_ja_kana.search(q):
              return "ja"
          if self._re_ko.search(q):
              return "ko"
          if self._re_zh.search(q):
              return "zh"
          if self._re_ru.search(q):
              return "ru"
          if self._re_ar.search(q):
              return "ar"
          if self._re_hi.search(q):
              return "hi"
          if self._re_he.search(q):
              return "he"
          if self._re_th.search(q):
              return "th"
  
          # Latin-family scoring.
          tokens = self._re_latin_word.findall(q)
          if not tokens:
              return "unknown"
  
          scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()}
          scores["en"] = scores.get("en", 0.0)
          token_set = set(tokens)
  
          # Lexicon matches
          for lang, lex in self._latin_lexicons.items():
              overlap = len(token_set & lex)
              if overlap:
                  scores[lang] += overlap * 2.0
  
          # Diacritics / orthographic hints
          for lang, hints in self._diacritic_weights.items():
              for marker, weight in hints.items():
                  if marker in q:
                      scores[lang] += weight
  
          # Light suffix hints for common product words
          for t in tokens:
              if t.endswith("ung") or t.endswith("chen"):
                  scores["de"] += 0.6
              if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"):
                  scores["es"] += 0.6
              if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"):
                  scores["it"] += 0.6
              if t.endswith("ção") or t.endswith("mente"):
                  scores["pt"] += 0.6
              if t.endswith("ment") or t.endswith("eau"):
                  scores["fr"] += 0.5
  
          # Fallback preference: English for pure Latin short tokens.
          scores["en"] += 0.2
  
          best_lang = max(scores.items(), key=lambda x: x[1])[0]
          best_score = scores[best_lang]
          if best_score <= 0:
              return "en"
          return best_lang
be52af70   tangwang   first commit
136
137
  
      def is_chinese(self, text: str) -> bool:
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
138
          return self.detect(text) == "zh"
be52af70   tangwang   first commit
139
140
  
      def is_english(self, text: str) -> bool:
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
141
          return self.detect(text) == "en"
be52af70   tangwang   first commit
142
143
  
      def is_russian(self, text: str) -> bool:
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
144
          return self.detect(text) == "ru"
be52af70   tangwang   first commit
145
146
  
      def is_arabic(self, text: str) -> bool:
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
147
          return self.detect(text) == "ar"
be52af70   tangwang   first commit
148
149
  
      def is_japanese(self, text: str) -> bool:
bd96cead   tangwang   1. 动态多语言字段与统一策略配置
150
          return self.detect(text) == "ja"