Blame view

translation/languages.py 2.67 KB
0fd2f875   tangwang   translate
1
2
3
4
  """Translation-internal language metadata."""
  
  from __future__ import annotations
  
14e67b71   tangwang   分句后的 batching 现在是...
5
  from typing import Dict, Mapping, Optional, Tuple
0fd2f875   tangwang   translate
6
7
8
9
10
  
  
  # English display names keyed by lowercase language code.
  LANGUAGE_LABELS: Dict[str, str] = {
      "zh": "Chinese",
      "en": "English",
14e67b71   tangwang   分句后的 batching 现在是...
11
      "fi": "Finnish",
0fd2f875   tangwang   translate
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
      "ru": "Russian",
      "ar": "Arabic",
      "ja": "Japanese",
      "es": "Spanish",
      "de": "German",
      "fr": "French",
      "it": "Italian",
      "pt": "Portuguese",
  }
  
  
  # Lowercase language code -> English language name.
  # NOTE(review): presumably the language names handed to the Qwen backend;
  # the consumer is not visible in this file — confirm against the caller.
  QWEN_LANGUAGE_CODES: Dict[str, str] = {
      "zh": "Chinese",
      "en": "English",
      "ru": "Russian",
      "ar": "Arabic",
      "ja": "Japanese",
      "es": "Spanish",
      "de": "German",
      "fr": "French",
      "it": "Italian",
      "pt": "Portuguese",
  }
  
  
  # Lowercase language code -> upper-case code.
  # NOTE(review): presumably the language identifiers expected by the DeepL
  # API; the consumer is not visible in this file — confirm against the caller.
  DEEPL_LANGUAGE_CODES: Dict[str, str] = {
      "zh": "ZH",
      "en": "EN",
      "ru": "RU",
      "ar": "AR",
      "ja": "JA",
      "es": "ES",
      "de": "DE",
      "fr": "FR",
      "it": "IT",
      "pt": "PT",
  }
  
  
  # Lowercase language code -> NLLB language tag (e.g. "eng_Latn").
  # Serves as the base table that build_nllb_language_catalog() copies before
  # applying caller overrides.
  NLLB_LANGUAGE_CODES: Dict[str, str] = {
      "en": "eng_Latn",
14e67b71   tangwang   分句后的 batching 现在是...
53
      "fi": "fin_Latn",
0fd2f875   tangwang   translate
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
      "zh": "zho_Hans",
      "ru": "rus_Cyrl",
      "ar": "arb_Arab",
      "ja": "jpn_Jpan",
      "es": "spa_Latn",
      "de": "deu_Latn",
      "fr": "fra_Latn",
      "it": "ita_Latn",
      "pt": "por_Latn",
  }
  
  
  # Model name -> pair of language codes.
  # NOTE(review): the pair looks like (source, target), matching the order of
  # the codes in the model name — confirm against the code that reads it.
  MARIAN_LANGUAGE_DIRECTIONS: Dict[str, Tuple[str, str]] = {
      "opus-mt-zh-en": ("zh", "en"),
      "opus-mt-en-zh": ("en", "zh"),
  }
14e67b71   tangwang   分句后的 batching 现在是...
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
  
  
  # Alternative spellings -> canonical language code.
  # Keys are written in the form normalize_language_key() produces (lowercase,
  # underscores) because resolve_nllb_language_code() looks them up with an
  # already-normalized key after a direct catalog lookup misses.
  NLLB_LANGUAGE_ALIASES: Dict[str, str] = {
      "fi_fi": "fi",
      "fin": "fi",
      "fin_fin": "fi",
      "zh_cn": "zh",
      "zh_hans": "zh",
  }
  
  
  def normalize_language_key(language: Optional[str]) -> str:
      """Return *language* as a canonical lookup key.

      Falsy input (``None`` or ``""``) yields ``""``. Any other value is
      converted to ``str``, stripped of surrounding whitespace, lower-cased,
      and has every ``-`` replaced by ``_``.
      """
      if not language:
          return ""
      text = str(language)
      return text.strip().lower().replace("-", "_")
  
  
  def build_nllb_language_catalog(
      overrides: Optional[Mapping[str, str]] = None,
  ) -> Dict[str, str]:
      """Build the effective language-key -> NLLB code mapping.

      Starts from ``NLLB_LANGUAGE_CODES`` and layers *overrides* on top, so an
      override replaces a built-in entry that normalizes to the same key.
      Keys are passed through ``normalize_language_key``; values are
      stringified and stripped of surrounding whitespace.

      Args:
          overrides: Optional extra or replacement entries. Entries whose key
              normalizes to an empty string are ignored.

      Returns:
          A new dict; the module-level table is not modified.
      """
      merged: Dict[str, str] = {}

      for raw_key, raw_code in NLLB_LANGUAGE_CODES.items():
          # Skip built-in entries whose key is blank once stripped.
          if not str(raw_key).strip():
              continue
          lookup_key = normalize_language_key(raw_key)
          merged[lookup_key] = str(raw_code).strip()

      if overrides:
          for raw_key, raw_code in overrides.items():
              lookup_key = normalize_language_key(raw_key)
              if lookup_key:
                  merged[lookup_key] = str(raw_code).strip()

      return merged
  
  
  def resolve_nllb_language_code(
      language: Optional[str],
      language_codes: Optional[Mapping[str, str]] = None,
  ) -> Optional[str]:
      """Resolve *language* to an NLLB code, or ``None`` if it is unknown.

      The lookup runs against the catalog produced by
      ``build_nllb_language_catalog(language_codes)`` and tries, in order:

      1. the normalized input as a catalog key;
      2. the target of a matching ``NLLB_LANGUAGE_ALIASES`` entry as a
         catalog key;
      3. a catalog value whose normalized form equals the normalized input,
         so an NLLB code given as input resolves to itself.

      Args:
          language: Language identifier to resolve; blank or ``None`` gives
              ``None``.
          language_codes: Optional overrides forwarded to the catalog builder.

      Returns:
          The matching catalog value, or ``None`` when nothing matches.
      """
      wanted = normalize_language_key(language)
      if not wanted:
          return None

      catalog = build_nllb_language_catalog(language_codes)

      # Catalog values are always strings, so membership is equivalent to a
      # "get(...) is not None" check.
      if wanted in catalog:
          return catalog[wanted]

      alias_target = NLLB_LANGUAGE_ALIASES.get(wanted)
      if alias_target is not None:
          alias_key = normalize_language_key(alias_target)
          if alias_key in catalog:
              return catalog[alias_key]

      # Last resort: the caller may already have supplied an NLLB code.
      return next(
          (
              candidate
              for candidate in catalog.values()
              if normalize_language_key(candidate) == wanted
          ),
          None,
      )