"""Translation-internal language metadata.""" from __future__ import annotations from functools import lru_cache from typing import Dict, Mapping, Optional, Tuple from translation.nllb_flores_short_map import ( NLLB_FLORES_SHORT_TO_CODE, NLLB_TOKENIZER_LANGUAGE_CODES, ) LANGUAGE_LABELS: Dict[str, str] = { "zh": "Chinese", "en": "English", "fi": "Finnish", "ru": "Russian", "ar": "Arabic", "ja": "Japanese", "es": "Spanish", "de": "German", "fr": "French", "it": "Italian", "pt": "Portuguese", } QWEN_LANGUAGE_CODES: Dict[str, str] = { "zh": "Chinese", "en": "English", "ru": "Russian", "ar": "Arabic", "ja": "Japanese", "es": "Spanish", "de": "German", "fr": "French", "it": "Italian", "pt": "Portuguese", } DEEPL_LANGUAGE_CODES: Dict[str, str] = { "zh": "ZH", "en": "EN", "ru": "RU", "ar": "AR", "ja": "JA", "es": "ES", "de": "DE", "fr": "FR", "it": "IT", "pt": "PT", } # Sparse overrides on top of ``NLLB_FLORES_SHORT_TO_CODE`` (same keys win later in # ``build_nllb_language_catalog``). Kept for backward compatibility and explicit defaults. NLLB_LANGUAGE_CODES: Dict[str, str] = { "en": "eng_Latn", "fi": "fin_Latn", "zh": "zho_Hans", "ru": "rus_Cyrl", "ar": "arb_Arab", "ja": "jpn_Jpan", "es": "spa_Latn", "de": "deu_Latn", "fr": "fra_Latn", "it": "ita_Latn", "pt": "por_Latn", } MARIAN_LANGUAGE_DIRECTIONS: Dict[str, Tuple[str, str]] = { "opus-mt-zh-en": ("zh", "en"), "opus-mt-en-zh": ("en", "zh"), } NLLB_LANGUAGE_ALIASES: Dict[str, str] = { "fi_fi": "fi", "fin": "fi", "fin_fin": "fi", "zh_cn": "zh", "zh_hans": "zh", } def normalize_language_key(language: Optional[str]) -> str: return str(language or "").strip().lower().replace("-", "_") @lru_cache(maxsize=1) def _nllb_tokenizer_code_by_normalized_key() -> Dict[str, str]: """Map lowercased ``deu_latn``-style keys to canonical tokenizer strings (e.g. ``deu_Latn``).""" return {normalize_language_key(code): code for code in NLLB_TOKENIZER_LANGUAGE_CODES} def build_nllb_language_catalog( overrides: Optional[Mapping[str, str]] = None, ) -> Dict[str, str]: catalog: Dict[str, str] = {} for key, value in NLLB_FLORES_SHORT_TO_CODE.items(): normalized_key = normalize_language_key(key) if normalized_key: catalog[normalized_key] = str(value).strip() for key, value in NLLB_LANGUAGE_CODES.items(): normalized_key = normalize_language_key(key) if normalized_key: catalog[normalized_key] = str(value).strip() for key, value in (overrides or {}).items(): normalized_key = normalize_language_key(key) if normalized_key: catalog[normalized_key] = str(value).strip() return catalog def resolve_nllb_language_code( language: Optional[str], language_codes: Optional[Mapping[str, str]] = None, ) -> Optional[str]: normalized = normalize_language_key(language) if not normalized: return None catalog = build_nllb_language_catalog(language_codes) direct = catalog.get(normalized) if direct is not None: return direct alias = NLLB_LANGUAGE_ALIASES.get(normalized) if alias is not None: aliased = catalog.get(normalize_language_key(alias)) if aliased is not None: return aliased tokenizer_hit = _nllb_tokenizer_code_by_normalized_key().get(normalized) if tokenizer_hit is not None: return tokenizer_hit for code in catalog.values(): if normalize_language_key(code) == normalized: return code return None