Commit 39306492208b850d441ed8a6eaaae3898f8990a5

Authored by tangwang
1 parent 41856690

fix(translation): 补全 NLLB 本地翻译的语言码解析(FLORES 短码 + 完整 tokenizer 码)

问题描述
----------
使用 facebook/nllb-200-distilled-600M(CTranslate2 后端)时,若 API 传入 ISO 639-1
或 FLORES 短标签(如 ca、da、nl、sv、no、tr 等),会触发
「Unsupported NLLB source/target language」。模型与 tokenizer 实际支持这些语言;
根因是 resolve_nllb_language_code 仅依赖 translation/languages.py 里十余条
NLLB_LANGUAGE_CODES 映射,大量合法短码未注册,校验误报为不支持。

修改内容
----------
1. 新增 translation/nllb_flores_short_map.py
   - NLLB_FLORES_SHORT_TO_CODE:与 HF 模型卡 language 列表对齐的短标签 ->
     NLLB 强制 BOS/src_lang 形式(<ISO639-3>_<ISO15924>,如 cat_Latn)。
   - NLLB_TOKENIZER_LANGUAGE_CODES:从 tokenizer.json 提取的 202 个语言 token
     全集,供直接传入 deu_Latn 等形式时做规范化解析。
   - 额外约定:ISO 639-1「no」映射 nob_Latn(书面挪威语 Bokmål);nb/nn 分别
     对应 nob_Latn / nno_Latn;「ar」显式指向 arb_Arab(与 NLLB 一致)。

2. 调整 translation/languages.py
   - build_nllb_language_catalog:合并顺序为 FLORES 全表 -> NLLB_LANGUAGE_CODES
    (保留少量显式覆盖,如 zh->zho_Hans)-> 调用方 overrides。
   - resolve_nllb_language_code:在目录与别名之后,增加基于
     NLLB_TOKENIZER_LANGUAGE_CODES 的大小写不敏感匹配(如 eng_latn -> eng_Latn),
     覆盖「已传完整 NLLB 码」的场景。

3. tests/test_translation_local_backends.py
   - 新增 test_nllb_resolves_flores_short_tags_and_iso_no,覆盖用户关心的短码及
     deu_Latn 直通解析。

方案说明
----------
NLLB 接口语义以 Hugging Face NllbTokenizer 为准:语言标识为 FLORES-200 风格
三字母语种码 + 下划线 + 四字母脚本子标签(ISO 15924)。业务侧常用 ISO 639-1
(de、sv)或模型卡短列表(ca、nl),需在服务内统一映射到 tokenizer 特殊 token。
本实现以模型卡 language 字段与 tokenizer 词表这两份官方数据为事实来源生成静态表,
避免运行时依赖额外库;同时保留原有 NLLB_LANGUAGE_CODES 作为薄覆盖层以兼容
既有配置与测试。

Refs: https://huggingface.co/facebook/nllb-200-distilled-600M
Made-with: Cursor
tests/test_translation_local_backends.py
... ... @@ -5,6 +5,7 @@ import torch
5 5  
6 6 from translation.backends.local_seq2seq import MarianMTTranslationBackend, NLLBTranslationBackend
7 7 from translation.backends.local_ctranslate2 import NLLBCTranslate2TranslationBackend
  8 +from translation.languages import build_nllb_language_catalog, resolve_nllb_language_code
8 9 from translation.service import TranslationService
9 10 from translation.text_splitter import compute_safe_input_token_limit, split_text_for_translation
10 11  
... ... @@ -200,6 +201,22 @@ def test_nllb_ctranslate2_accepts_finnish_short_code(monkeypatch):
200 201 assert backend.translator.last_translate_batch_kwargs["target_prefix"] == [["zho_Hans"]]
201 202  
202 203  
def test_nllb_resolves_flores_short_tags_and_iso_no():
    """FLORES short tags, ISO 639-1 codes (incl. uppercase and ``no``) and
    already-qualified NLLB codes must all resolve against the default catalog."""
    catalog = build_nllb_language_catalog(None)
    expected = {
        "ca": "cat_Latn",
        "da": "dan_Latn",
        "eu": "eus_Latn",
        "gl": "glg_Latn",
        "hu": "hun_Latn",
        "id": "ind_Latn",
        "nl": "nld_Latn",
        "no": "nob_Latn",
        "ro": "ron_Latn",
        "SV": "swe_Latn",
        "tr": "tur_Latn",
        "deu_Latn": "deu_Latn",
    }
    for tag, code in expected.items():
        assert resolve_nllb_language_code(tag, catalog) == code
  218 +
  219 +
203 220 def test_translation_service_preloads_enabled_backends(monkeypatch):
204 221 created = []
205 222  
... ...
translation/languages.py
... ... @@ -2,8 +2,14 @@
2 2  
3 3 from __future__ import annotations
4 4  
  5 +from functools import lru_cache
5 6 from typing import Dict, Mapping, Optional, Tuple
6 7  
  8 +from translation.nllb_flores_short_map import (
  9 + NLLB_FLORES_SHORT_TO_CODE,
  10 + NLLB_TOKENIZER_LANGUAGE_CODES,
  11 +)
  12 +
7 13  
8 14 LANGUAGE_LABELS: Dict[str, str] = {
9 15 "zh": "Chinese",
... ... @@ -48,6 +54,8 @@ DEEPL_LANGUAGE_CODES: Dict[str, str] = {
48 54 }
49 55  
50 56  
  57 +# Sparse overrides on top of ``NLLB_FLORES_SHORT_TO_CODE`` (same keys win later in
  58 +# ``build_nllb_language_catalog``). Kept for backward compatibility and explicit defaults.
51 59 NLLB_LANGUAGE_CODES: Dict[str, str] = {
52 60 "en": "eng_Latn",
53 61 "fi": "fin_Latn",
... ... @@ -82,14 +90,24 @@ def normalize_language_key(language: Optional[str]) -&gt; str:
82 90 return str(language or "").strip().lower().replace("-", "_")
83 91  
84 92  
@lru_cache(maxsize=1)
def _nllb_tokenizer_code_by_normalized_key() -> Dict[str, str]:
    """Return a lookup from normalized keys to canonical tokenizer codes.

    Keys are produced by ``normalize_language_key`` (e.g. ``deu_latn``), values
    are the exact tokenizer spellings (``deu_Latn``). Cached once: the source
    set is a static module constant.
    """
    lookup: Dict[str, str] = {}
    for canonical in NLLB_TOKENIZER_LANGUAGE_CODES:
        lookup[normalize_language_key(canonical)] = canonical
    return lookup
  97 +
  98 +
85 99 def build_nllb_language_catalog(
86 100 overrides: Optional[Mapping[str, str]] = None,
87 101 ) -> Dict[str, str]:
88   - catalog = {
89   - normalize_language_key(key): str(value).strip()
90   - for key, value in NLLB_LANGUAGE_CODES.items()
91   - if str(key).strip()
92   - }
  102 + catalog: Dict[str, str] = {}
  103 + for key, value in NLLB_FLORES_SHORT_TO_CODE.items():
  104 + normalized_key = normalize_language_key(key)
  105 + if normalized_key:
  106 + catalog[normalized_key] = str(value).strip()
  107 + for key, value in NLLB_LANGUAGE_CODES.items():
  108 + normalized_key = normalize_language_key(key)
  109 + if normalized_key:
  110 + catalog[normalized_key] = str(value).strip()
93 111 for key, value in (overrides or {}).items():
94 112 normalized_key = normalize_language_key(key)
95 113 if normalized_key:
... ... @@ -116,6 +134,10 @@ def resolve_nllb_language_code(
116 134 if aliased is not None:
117 135 return aliased
118 136  
  137 + tokenizer_hit = _nllb_tokenizer_code_by_normalized_key().get(normalized)
  138 + if tokenizer_hit is not None:
  139 + return tokenizer_hit
  140 +
119 141 for code in catalog.values():
120 142 if normalize_language_key(code) == normalized:
121 143 return code
... ...
translation/nllb_flores_short_map.py 0 → 100644
... ... @@ -0,0 +1,416 @@
  1 +"""FLORES short language tags and canonical NLLB tokenizer codes.
  2 +
  3 +``NLLB_FLORES_SHORT_TO_CODE`` maps model-card short tags (ISO 639-1 / FLORES ids)
  4 +to NLLB ``src_lang`` tokens: ``<iso639-3>_<Script>`` (ISO 15924 script).
  5 +
  6 +``NLLB_TOKENIZER_LANGUAGE_CODES`` lists every language token in the tokenizer.
  7 +"""
  8 +from __future__ import annotations
  9 +
  10 +from typing import Dict, FrozenSet
  11 +
# All 202 language tokens carried by the NLLB-200 tokenizer vocabulary, in
# FLORES-200 form ``<iso639-3>_<Script>``. Kept as one whitespace-separated
# block and split at import time; the resulting frozenset is identical to an
# explicit literal but far more compact to read and maintain.
NLLB_TOKENIZER_LANGUAGE_CODES: FrozenSet[str] = frozenset(
    """
    ace_Arab ace_Latn acm_Arab acq_Arab aeb_Arab afr_Latn ajp_Arab aka_Latn
    als_Latn amh_Ethi apc_Arab arb_Arab ars_Arab ary_Arab arz_Arab asm_Beng
    ast_Latn awa_Deva ayr_Latn azb_Arab azj_Latn bak_Cyrl bam_Latn ban_Latn
    bel_Cyrl bem_Latn ben_Beng bho_Deva bjn_Arab bjn_Latn bod_Tibt bos_Latn
    bug_Latn bul_Cyrl cat_Latn ceb_Latn ces_Latn cjk_Latn ckb_Arab crh_Latn
    cym_Latn dan_Latn deu_Latn dik_Latn dyu_Latn dzo_Tibt ell_Grek eng_Latn
    epo_Latn est_Latn eus_Latn ewe_Latn fao_Latn fij_Latn fin_Latn fon_Latn
    fra_Latn fur_Latn fuv_Latn gaz_Latn gla_Latn gle_Latn glg_Latn grn_Latn
    guj_Gujr hat_Latn hau_Latn heb_Hebr hin_Deva hne_Deva hrv_Latn hun_Latn
    hye_Armn ibo_Latn ilo_Latn ind_Latn isl_Latn ita_Latn jav_Latn jpn_Jpan
    kab_Latn kac_Latn kam_Latn kan_Knda kas_Arab kas_Deva kat_Geor kaz_Cyrl
    kbp_Latn kea_Latn khk_Cyrl khm_Khmr kik_Latn kin_Latn kir_Cyrl kmb_Latn
    kmr_Latn knc_Arab knc_Latn kon_Latn kor_Hang lao_Laoo lij_Latn lim_Latn
    lin_Latn lit_Latn lmo_Latn ltg_Latn ltz_Latn lua_Latn lug_Latn luo_Latn
    lus_Latn lvs_Latn mag_Deva mai_Deva mal_Mlym mar_Deva min_Latn mkd_Cyrl
    mlt_Latn mni_Beng mos_Latn mri_Latn mya_Mymr nld_Latn nno_Latn nob_Latn
    npi_Deva nso_Latn nus_Latn nya_Latn oci_Latn ory_Orya pag_Latn pan_Guru
    pap_Latn pbt_Arab pes_Arab plt_Latn pol_Latn por_Latn prs_Arab quy_Latn
    ron_Latn run_Latn rus_Cyrl sag_Latn san_Deva sat_Beng scn_Latn shn_Mymr
    sin_Sinh slk_Latn slv_Latn smo_Latn sna_Latn snd_Arab som_Latn sot_Latn
    spa_Latn srd_Latn srp_Cyrl ssw_Latn sun_Latn swe_Latn swh_Latn szl_Latn
    tam_Taml taq_Latn taq_Tfng tat_Cyrl tel_Telu tgk_Cyrl tgl_Latn tha_Thai
    tir_Ethi tpi_Latn tsn_Latn tso_Latn tuk_Latn tum_Latn tur_Latn twi_Latn
    tzm_Tfng uig_Arab ukr_Cyrl umb_Latn urd_Arab uzn_Latn vec_Latn vie_Latn
    war_Latn wol_Latn xho_Latn ydd_Hebr yor_Latn yue_Hant zho_Hans zho_Hant
    zsm_Latn zul_Latn
    """.split()
)
  216 +
# Short language tags -> canonical NLLB codes (``<iso639-3>_<Script>``).
# Base entries mirror the model-card ``language`` list; entries marked
# "639-1 alias" are added here so common two-letter codes resolve even where
# the model card only lists the three-letter FLORES id (e.g. ``mar`` but not
# ``mr``). Conventions follow the module contract: ``no`` -> ``nob_Latn``,
# ``ar`` -> ``arb_Arab``, ``zh`` -> ``zho_Hans``. Keys are kept sorted.
NLLB_FLORES_SHORT_TO_CODE: Dict[str, str] = {
    "ace": "ace_Latn",
    "acm": "acm_Arab",
    "acq": "acq_Arab",
    "aeb": "aeb_Arab",
    "af": "afr_Latn",
    "ajp": "ajp_Arab",
    "ak": "aka_Latn",
    "als": "als_Latn",
    "am": "amh_Ethi",
    "apc": "apc_Arab",
    "ar": "arb_Arab",
    "ars": "ars_Arab",
    "ary": "ary_Arab",
    "arz": "arz_Arab",
    "as": "asm_Beng",
    "ast": "ast_Latn",
    "awa": "awa_Deva",
    "ay": "ayr_Latn",  # 639-1 alias (Aymara -> Central Aymara)
    "ayr": "ayr_Latn",
    "az": "azj_Latn",  # 639-1 alias (Azerbaijani -> North Azerbaijani)
    "azb": "azb_Arab",
    "azj": "azj_Latn",
    "ba": "bak_Cyrl",
    "ban": "ban_Latn",
    "be": "bel_Cyrl",
    "bem": "bem_Latn",
    "bg": "bul_Cyrl",
    "bho": "bho_Deva",
    "bjn": "bjn_Latn",
    "bm": "bam_Latn",
    "bn": "ben_Beng",
    "bo": "bod_Tibt",
    "bs": "bos_Latn",
    "bug": "bug_Latn",
    "ca": "cat_Latn",
    "ceb": "ceb_Latn",
    "cjk": "cjk_Latn",
    "ckb": "ckb_Arab",
    "crh": "crh_Latn",
    "cs": "ces_Latn",
    "cy": "cym_Latn",
    "da": "dan_Latn",
    "de": "deu_Latn",
    "dik": "dik_Latn",
    "dyu": "dyu_Latn",
    "dz": "dzo_Tibt",
    "ee": "ewe_Latn",
    "el": "ell_Grek",
    "en": "eng_Latn",
    "eo": "epo_Latn",
    "es": "spa_Latn",
    "et": "est_Latn",
    "eu": "eus_Latn",
    "fa": "pes_Arab",  # 639-1 alias (Persian -> Western Persian)
    "fi": "fin_Latn",
    "fj": "fij_Latn",
    "fo": "fao_Latn",
    "fon": "fon_Latn",
    "fr": "fra_Latn",
    "fur": "fur_Latn",
    "fuv": "fuv_Latn",
    "ga": "gle_Latn",
    "gaz": "gaz_Latn",
    "gd": "gla_Latn",
    "gl": "glg_Latn",
    "gn": "grn_Latn",
    "gu": "guj_Gujr",
    "ha": "hau_Latn",
    "he": "heb_Hebr",
    "hi": "hin_Deva",
    "hne": "hne_Deva",
    "hr": "hrv_Latn",
    "ht": "hat_Latn",
    "hu": "hun_Latn",
    "hy": "hye_Armn",
    "id": "ind_Latn",
    "ig": "ibo_Latn",
    "ilo": "ilo_Latn",
    "is": "isl_Latn",
    "it": "ita_Latn",
    "ja": "jpn_Jpan",
    "jv": "jav_Latn",
    "ka": "kat_Geor",
    "kab": "kab_Latn",
    "kac": "kac_Latn",
    "kam": "kam_Latn",
    "kbp": "kbp_Latn",
    "kea": "kea_Latn",
    "kg": "kon_Latn",
    "khk": "khk_Cyrl",
    "ki": "kik_Latn",
    "kk": "kaz_Cyrl",
    "km": "khm_Khmr",
    "kmb": "kmb_Latn",
    "kmr": "kmr_Latn",
    "kn": "kan_Knda",
    "knc": "knc_Latn",
    "ko": "kor_Hang",
    "ks": "kas_Arab",
    "ku": "kmr_Latn",  # 639-1 alias (Kurdish -> Northern Kurdish / Kurmanji)
    "ky": "kir_Cyrl",
    "lb": "ltz_Latn",
    "lg": "lug_Latn",
    "li": "lim_Latn",
    "lij": "lij_Latn",
    "lmo": "lmo_Latn",
    "ln": "lin_Latn",
    "lo": "lao_Laoo",
    "lt": "lit_Latn",
    "ltg": "ltg_Latn",
    "lua": "lua_Latn",
    "luo": "luo_Latn",
    "lus": "lus_Latn",
    "lv": "lvs_Latn",  # 639-1 alias (Latvian -> Standard Latvian)
    "lvs": "lvs_Latn",
    "mag": "mag_Deva",
    "mai": "mai_Deva",
    "mar": "mar_Deva",
    "mg": "plt_Latn",  # 639-1 alias (Malagasy -> Plateau Malagasy)
    "mi": "mri_Latn",
    "min": "min_Latn",
    "mk": "mkd_Cyrl",
    "ml": "mal_Mlym",
    "mn": "khk_Cyrl",  # 639-1 alias (Mongolian -> Halh Mongolian)
    "mni": "mni_Beng",
    "mos": "mos_Latn",
    "mr": "mar_Deva",  # 639-1 alias (Marathi)
    "ms": "zsm_Latn",  # 639-1 alias (Malay -> Standard Malay)
    "mt": "mlt_Latn",
    "my": "mya_Mymr",
    "nb": "nob_Latn",
    "ne": "npi_Deva",  # 639-1 alias (Nepali)
    "nl": "nld_Latn",
    "nn": "nno_Latn",
    "no": "nob_Latn",
    "npi": "npi_Deva",
    "nso": "nso_Latn",
    "nus": "nus_Latn",
    "ny": "nya_Latn",
    "oc": "oci_Latn",
    "om": "gaz_Latn",  # 639-1 alias (Oromo -> West Central Oromo)
    "or": "ory_Orya",  # 639-1 alias (Odia)
    "ory": "ory_Orya",
    "pa": "pan_Guru",
    "pag": "pag_Latn",
    "pap": "pap_Latn",
    "pbt": "pbt_Arab",
    "pes": "pes_Arab",
    "pl": "pol_Latn",
    "plt": "plt_Latn",
    "prs": "prs_Arab",
    "ps": "pbt_Arab",  # 639-1 alias (Pashto -> Southern Pashto)
    "pt": "por_Latn",
    "qu": "quy_Latn",  # 639-1 alias (Quechua -> Ayacucho Quechua)
    "quy": "quy_Latn",
    "rn": "run_Latn",
    "ro": "ron_Latn",
    "ru": "rus_Cyrl",
    "rw": "kin_Latn",
    "sa": "san_Deva",
    "sat": "sat_Beng",
    "sc": "srd_Latn",
    "scn": "scn_Latn",
    "sd": "snd_Arab",
    "sg": "sag_Latn",
    "shn": "shn_Mymr",
    "si": "sin_Sinh",
    "sk": "slk_Latn",
    "sl": "slv_Latn",
    "sm": "smo_Latn",
    "sn": "sna_Latn",
    "so": "som_Latn",
    "sq": "als_Latn",  # 639-1 alias (Albanian -> Tosk Albanian)
    "sr": "srp_Cyrl",
    "ss": "ssw_Latn",
    "st": "sot_Latn",
    "su": "sun_Latn",
    "sv": "swe_Latn",
    "sw": "swh_Latn",  # 639-1 alias (Swahili)
    "swh": "swh_Latn",
    "szl": "szl_Latn",
    "ta": "tam_Taml",
    "taq": "taq_Latn",
    "te": "tel_Telu",
    "tg": "tgk_Cyrl",
    "th": "tha_Thai",
    "ti": "tir_Ethi",
    "tk": "tuk_Latn",
    "tl": "tgl_Latn",
    "tn": "tsn_Latn",
    "tpi": "tpi_Latn",
    "tr": "tur_Latn",
    "ts": "tso_Latn",
    "tt": "tat_Cyrl",
    "tum": "tum_Latn",
    "tw": "twi_Latn",
    "tzm": "tzm_Tfng",
    "ug": "uig_Arab",
    "uk": "ukr_Cyrl",
    "umb": "umb_Latn",
    "ur": "urd_Arab",
    "uz": "uzn_Latn",  # 639-1 alias (Uzbek -> Northern Uzbek)
    "uzn": "uzn_Latn",
    "vec": "vec_Latn",
    "vi": "vie_Latn",
    "war": "war_Latn",
    "wo": "wol_Latn",
    "xh": "xho_Latn",
    "ydd": "ydd_Hebr",
    "yi": "ydd_Hebr",  # 639-1 alias (Yiddish -> Eastern Yiddish)
    "yo": "yor_Latn",
    "yue": "yue_Hant",
    "zh": "zho_Hans",
    "zsm": "zsm_Latn",
    "zu": "zul_Latn",
}
  416 +
... ...