Blame view

translation/languages.py 3.64 KB
0fd2f875   tangwang   translate
1
2
3
4
  """Translation-internal language metadata."""
  
  from __future__ import annotations
  
39306492   tangwang   fix(translation):...
5
  from functools import lru_cache
14e67b71   tangwang   分句后的 batching 现在是...
6
  from typing import Dict, Mapping, Optional, Tuple
0fd2f875   tangwang   translate
7
  
39306492   tangwang   fix(translation):...
8
9
10
11
12
  from translation.nllb_flores_short_map import (
      NLLB_FLORES_SHORT_TO_CODE,
      NLLB_TOKENIZER_LANGUAGE_CODES,
  )
  
0fd2f875   tangwang   translate
13
14
15
16
  
  # Human-readable English names keyed by short language code.
  LANGUAGE_LABELS: Dict[str, str] = {
      "zh": "Chinese",
      "en": "English",
      "fi": "Finnish",
      "ru": "Russian",
      "ar": "Arabic",
      "ja": "Japanese",
      "es": "Spanish",
      "de": "German",
      "fr": "French",
      "it": "Italian",
      "pt": "Portuguese",
  }
  
  
  # Short language code -> language name used for the Qwen backend.
  QWEN_LANGUAGE_CODES: Dict[str, str] = dict(
      zh="Chinese",
      en="English",
      ru="Russian",
      ar="Arabic",
      ja="Japanese",
      es="Spanish",
      de="German",
      fr="French",
      it="Italian",
      pt="Portuguese",
  )
  
  
  # Short language code -> upper-case code used for the DeepL backend.
  DEEPL_LANGUAGE_CODES: Dict[str, str] = dict(
      zh="ZH",
      en="EN",
      ru="RU",
      ar="AR",
      ja="JA",
      es="ES",
      de="DE",
      fr="FR",
      it="IT",
      pt="PT",
  )
  
  
39306492   tangwang   fix(translation):...
57
58
  # Sparse overrides layered on top of ``NLLB_FLORES_SHORT_TO_CODE``: when both
  # tables define the same short key, the entry here wins because
  # ``build_nllb_language_catalog`` applies this table second.
  # Kept for backward compatibility and to make the default choices explicit.
  NLLB_LANGUAGE_CODES: Dict[str, str] = {
      "en": "eng_Latn",
      "fi": "fin_Latn",
      "zh": "zho_Hans",
      "ru": "rus_Cyrl",
      "ar": "arb_Arab",
      "ja": "jpn_Jpan",
      "es": "spa_Latn",
      "de": "deu_Latn",
      "fr": "fra_Latn",
      "it": "ita_Latn",
      "pt": "por_Latn",
  }
  
  
  # Marian model name -> (source language, target language) pair.
  MARIAN_LANGUAGE_DIRECTIONS: Dict[str, Tuple[str, str]] = dict(
      [
          ("opus-mt-zh-en", ("zh", "en")),
          ("opus-mt-en-zh", ("en", "zh")),
      ]
  )
14e67b71   tangwang   分句后的 batching 现在是...
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
  
  
  # Alternate spellings (already in normalized lower-case/underscore form)
  # mapped to the short language key they stand for.
  NLLB_LANGUAGE_ALIASES: Dict[str, str] = dict(
      fi_fi="fi",
      fin="fi",
      fin_fin="fi",
      zh_cn="zh",
      zh_hans="zh",
  )
  
  
  def normalize_language_key(language: Optional[str]) -> str:
      """Return *language* as a canonical lookup key.

      The value is converted to ``str``, stripped of surrounding whitespace,
      lower-cased, and has every ``-`` replaced by ``_``. ``None`` and any
      other falsy value yield the empty string.
      """
      if not language:
          return ""
      cleaned = str(language).strip().lower()
      return cleaned.replace("-", "_")
  
  
39306492   tangwang   fix(translation):...
93
94
95
96
97
98
  @lru_cache(maxsize=1)
  def _nllb_tokenizer_code_by_normalized_key() -> Dict[str, str]:
      """Index tokenizer language codes by their normalized form.

      Each entry of ``NLLB_TOKENIZER_LANGUAGE_CODES`` is keyed by
      ``normalize_language_key(code)`` (e.g. ``deu_latn``) and maps back to the
      canonical tokenizer string (e.g. ``deu_Latn``). The result is memoized,
      so every call returns the same dict object.
      """
      index: Dict[str, str] = {}
      for tokenizer_code in NLLB_TOKENIZER_LANGUAGE_CODES:
          index[normalize_language_key(tokenizer_code)] = tokenizer_code
      return index
  
  
14e67b71   tangwang   分句后的 batching 现在是...
99
100
101
  def build_nllb_language_catalog(
      overrides: Optional[Mapping[str, str]] = None,
  ) -> Dict[str, str]:
      """Build the merged short-key -> NLLB language-code catalog.

      Three layers are applied in order, and a later layer replaces an earlier
      entry with the same normalized key:

      1. ``NLLB_FLORES_SHORT_TO_CODE``
      2. ``NLLB_LANGUAGE_CODES``
      3. ``overrides`` (optional; ``None`` or an empty mapping adds nothing)

      Keys are passed through ``normalize_language_key`` and entries whose
      normalized key is empty are skipped. Values are converted to ``str`` and
      stripped of surrounding whitespace.

      Returns:
          A new dict on every call.
      """
      layers = (NLLB_FLORES_SHORT_TO_CODE, NLLB_LANGUAGE_CODES, overrides or {})
      catalog: Dict[str, str] = {}
      for layer in layers:
          for key, value in layer.items():
              normalized_key = normalize_language_key(key)
              if normalized_key:
                  catalog[normalized_key] = str(value).strip()
      return catalog
  
  
  def resolve_nllb_language_code(
      language: Optional[str],
      language_codes: Optional[Mapping[str, str]] = None,
  ) -> Optional[str]:
      """Resolve a user-supplied language identifier to an NLLB language code.

      The input is normalized with ``normalize_language_key`` and then looked
      up in this order; the first hit is returned:

      1. As a key of the catalog from ``build_nllb_language_catalog(language_codes)``.
      2. Through ``NLLB_LANGUAGE_ALIASES`` to a catalog key.
      3. As a normalized tokenizer code (e.g. ``deu_latn`` -> ``deu_Latn``).
      4. As the normalized form of any catalog value.

      Args:
          language: Language identifier to resolve; may be ``None``.
          language_codes: Optional extra key -> code mapping forwarded to
              ``build_nllb_language_catalog`` as overrides.

      Returns:
          The resolved code, or ``None`` when the input normalizes to an empty
          string or nothing matches.
      """
      normalized = normalize_language_key(language)
      if not normalized:
          return None

      catalog = build_nllb_language_catalog(language_codes)

      # 1. Direct hit on a catalog key.
      direct = catalog.get(normalized)
      if direct is not None:
          return direct

      # 2. Known alias that points at a catalog key.
      alias = NLLB_LANGUAGE_ALIASES.get(normalized)
      if alias is not None:
          aliased = catalog.get(normalize_language_key(alias))
          if aliased is not None:
              return aliased

      # 3. The caller passed a tokenizer code in some other casing.
      tokenizer_hit = _nllb_tokenizer_code_by_normalized_key().get(normalized)
      if tokenizer_hit is not None:
          return tokenizer_hit

      # 4. The caller passed a code that only appears as a catalog value
      #    (for example one introduced through ``language_codes``).
      for code in catalog.values():
          if normalize_language_key(code) == normalized:
              return code
      return None