0fd2f875
tangwang
translate
|
1
2
3
4
|
"""Translation-internal language metadata."""
from __future__ import annotations
|
39306492
tangwang
fix(translation):...
|
5
|
from functools import lru_cache
|
14e67b71
tangwang
分句后的 batching 现在是...
|
6
|
from typing import Dict, Mapping, Optional, Tuple
|
0fd2f875
tangwang
translate
|
7
|
|
39306492
tangwang
fix(translation):...
|
8
9
10
11
12
|
from translation.nllb_flores_short_map import (
NLLB_FLORES_SHORT_TO_CODE,
NLLB_TOKENIZER_LANGUAGE_CODES,
)
|
0fd2f875
tangwang
translate
|
13
14
15
16
|
# Short language code -> English display name of the language.
LANGUAGE_LABELS: Dict[str, str] = {
    "zh": "Chinese",
    "en": "English",
    "fi": "Finnish",
    "ru": "Russian",
    "ar": "Arabic",
    "ja": "Japanese",
    "es": "Spanish",
    "de": "German",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
}
# Short language code -> English language name.
# NOTE(review): the name suggests these are the language names handed to a
# Qwen translation backend -- confirm against the caller.
QWEN_LANGUAGE_CODES: Dict[str, str] = {
    "zh": "Chinese",
    "en": "English",
    "ru": "Russian",
    "ar": "Arabic",
    "ja": "Japanese",
    "es": "Spanish",
    "de": "German",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
}
# Short (lowercase) language code -> upper-case two-letter code.
# NOTE(review): presumably the language identifiers sent to DeepL -- confirm
# against the caller.
DEEPL_LANGUAGE_CODES: Dict[str, str] = {
    "zh": "ZH",
    "en": "EN",
    "ru": "RU",
    "ar": "AR",
    "ja": "JA",
    "es": "ES",
    "de": "DE",
    "fr": "FR",
    "it": "IT",
    "pt": "PT",
}
|
39306492
tangwang
fix(translation):...
|
57
58
|
# Sparse overrides on top of ``NLLB_FLORES_SHORT_TO_CODE`` (same keys win later in
# ``build_nllb_language_catalog``). Kept for backward compatibility and explicit defaults.
|
0fd2f875
tangwang
translate
|
59
60
|
# Short language code -> NLLB language code in ``<lang>_<Script>`` form
# (e.g. ``eng_Latn``). ``build_nllb_language_catalog`` applies these after
# ``NLLB_FLORES_SHORT_TO_CODE``, so an entry here wins on a shared key.
NLLB_LANGUAGE_CODES: Dict[str, str] = {
    "en": "eng_Latn",
    "fi": "fin_Latn",
    "zh": "zho_Hans",
    "ru": "rus_Cyrl",
    "ar": "arb_Arab",
    "ja": "jpn_Jpan",
    "es": "spa_Latn",
    "de": "deu_Latn",
    "fr": "fra_Latn",
    "it": "ita_Latn",
    "pt": "por_Latn",
}
# Model identifier -> (source language, target language) short codes.
# NOTE(review): keys look like Marian / OPUS-MT model names -- confirm where
# they are matched against configured model names.
MARIAN_LANGUAGE_DIRECTIONS: Dict[str, Tuple[str, str]] = {
    "opus-mt-zh-en": ("zh", "en"),
    "opus-mt-en-zh": ("en", "zh"),
}
|
14e67b71
tangwang
分句后的 batching 现在是...
|
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
# Alternate spellings -> the short language code they stand for. Keys are
# written in ``normalize_language_key`` form (lowercase, ``_`` instead of
# ``-``). ``resolve_nllb_language_code`` consults this table only after a
# direct catalog lookup has missed.
NLLB_LANGUAGE_ALIASES: Dict[str, str] = {
    "fi_fi": "fi",
    "fin": "fi",
    "fin_fin": "fi",
    "zh_cn": "zh",
    "zh_hans": "zh",
}
def normalize_language_key(language: Optional[str]) -> str:
    """Return *language* in canonical lookup-key form.

    Falsy input (``None``, ``""``) yields ``""``. Anything else is
    stringified, stripped of surrounding whitespace, lowercased, and has
    every ``-`` replaced by ``_`` (``"zh-Hans"`` -> ``"zh_hans"``).
    """
    if not language:
        return ""
    trimmed = str(language).strip()
    return trimmed.lower().replace("-", "_")
|
39306492
tangwang
fix(translation):...
|
93
94
95
96
97
98
|
@lru_cache(maxsize=1)
def _nllb_tokenizer_code_by_normalized_key() -> Dict[str, str]:
    """Index the tokenizer language codes by their normalized form.

    Every entry of ``NLLB_TOKENIZER_LANGUAGE_CODES`` is stored under its
    ``normalize_language_key`` spelling, so a lowercased key such as
    ``deu_latn`` maps back to the canonical ``deu_Latn``.

    The result is memoized with ``lru_cache``: every caller receives the same
    dict object, so it must be treated as read-only.
    """
    index: Dict[str, str] = {}
    for tokenizer_code in NLLB_TOKENIZER_LANGUAGE_CODES:
        index[normalize_language_key(tokenizer_code)] = tokenizer_code
    return index
|
14e67b71
tangwang
分句后的 batching 现在是...
|
99
100
101
|
def build_nllb_language_catalog(
    overrides: Optional[Mapping[str, str]] = None,
) -> Dict[str, str]:
    """Build a fresh ``normalized key -> NLLB code`` catalog.

    Three layers are merged in order; on a shared normalized key a later
    layer overwrites an earlier one:

    1. ``NLLB_FLORES_SHORT_TO_CODE``
    2. ``NLLB_LANGUAGE_CODES``
    3. ``overrides`` (skipped when ``None`` or empty)

    Args:
        overrides: Optional caller-supplied entries applied last.

    Returns:
        A new dict. Keys have been passed through ``normalize_language_key``;
        entries whose key normalizes to ``""`` are dropped. Values are
        converted with ``str()`` and stripped of surrounding whitespace.
    """
    # Order matters: it defines which layer wins on a duplicate key.
    layers = (NLLB_FLORES_SHORT_TO_CODE, NLLB_LANGUAGE_CODES, overrides or {})
    catalog: Dict[str, str] = {}
    for layer in layers:
        for key, value in layer.items():
            normalized_key = normalize_language_key(key)
            if normalized_key:
                catalog[normalized_key] = str(value).strip()
    return catalog
def resolve_nllb_language_code(
    language: Optional[str],
    language_codes: Optional[Mapping[str, str]] = None,
) -> Optional[str]:
    """Resolve a language identifier to an NLLB language code.

    The input is normalized with ``normalize_language_key`` and then looked
    up in this order; the first hit is returned:

    1. As a key of the catalog from ``build_nllb_language_catalog``.
    2. Through ``NLLB_LANGUAGE_ALIASES``, then as a catalog key.
    3. In the tokenizer-code index, so a full code in any letter case
       (e.g. ``deu_latn``) yields its canonical spelling.
    4. Against the catalog *values*: a value whose normalized form equals
       the input is returned as stored.

    Args:
        language: Identifier to resolve; ``None`` or blank resolves to ``None``.
        language_codes: Optional overrides forwarded to
            ``build_nllb_language_catalog``.

    Returns:
        The resolved code, or ``None`` when nothing matches.
    """
    normalized = normalize_language_key(language)
    if not normalized:
        return None

    catalog = build_nllb_language_catalog(language_codes)

    direct = catalog.get(normalized)
    if direct is not None:
        return direct

    alias = NLLB_LANGUAGE_ALIASES.get(normalized)
    if alias is not None:
        aliased = catalog.get(normalize_language_key(alias))
        if aliased is not None:
            return aliased

    tokenizer_hit = _nllb_tokenizer_code_by_normalized_key().get(normalized)
    if tokenizer_hit is not None:
        return tokenizer_hit

    # Last resort: the caller passed something that is already a catalog
    # value (possibly an override not known to the tokenizer list).
    for code in catalog.values():
        if normalize_language_key(code) == normalized:
            return code
    return None
|