languages.py 2.67 KB
"""Translation-internal language metadata."""

from __future__ import annotations

from typing import Dict, Mapping, Optional, Tuple


# English display names keyed by lowercase two-letter language code.
# Kept as ordered pairs so the resulting dict preserves this listing order.
_LANGUAGE_LABEL_PAIRS = (
    ("zh", "Chinese"),
    ("en", "English"),
    ("fi", "Finnish"),
    ("ru", "Russian"),
    ("ar", "Arabic"),
    ("ja", "Japanese"),
    ("es", "Spanish"),
    ("de", "German"),
    ("fr", "French"),
    ("it", "Italian"),
    ("pt", "Portuguese"),
)
LANGUAGE_LABELS: Dict[str, str] = dict(_LANGUAGE_LABEL_PAIRS)


# Two-letter language code -> English language name for the Qwen table.
# NOTE(review): presumably these names are what the Qwen backend is given as
# source/target language -- confirm against the caller. There is no "fi" entry.
_QWEN_LANGUAGE_PAIRS = (
    ("zh", "Chinese"),
    ("en", "English"),
    ("ru", "Russian"),
    ("ar", "Arabic"),
    ("ja", "Japanese"),
    ("es", "Spanish"),
    ("de", "German"),
    ("fr", "French"),
    ("it", "Italian"),
    ("pt", "Portuguese"),
)
QWEN_LANGUAGE_CODES: Dict[str, str] = dict(_QWEN_LANGUAGE_PAIRS)


# Lowercase two-letter language code -> upper-case code for the DeepL table.
# NOTE(review): presumably the values are the identifiers sent to the DeepL
# API -- confirm against the caller. There is no "fi" entry.
DEEPL_LANGUAGE_CODES: Dict[str, str] = dict(
    zh="ZH",
    en="EN",
    ru="RU",
    ar="AR",
    ja="JA",
    es="ES",
    de="DE",
    fr="FR",
    it="IT",
    pt="PT",
)


# Default NLLB catalog: two-letter language code -> "<language>_<Script>"
# identifier (for example "eng_Latn"). build_nllb_language_catalog() starts
# from this table before applying caller overrides.
_NLLB_DEFAULT_PAIRS = (
    ("en", "eng_Latn"),
    ("fi", "fin_Latn"),
    ("zh", "zho_Hans"),
    ("ru", "rus_Cyrl"),
    ("ar", "arb_Arab"),
    ("ja", "jpn_Jpan"),
    ("es", "spa_Latn"),
    ("de", "deu_Latn"),
    ("fr", "fra_Latn"),
    ("it", "ita_Latn"),
    ("pt", "por_Latn"),
)
NLLB_LANGUAGE_CODES: Dict[str, str] = dict(_NLLB_DEFAULT_PAIRS)


# Model name -> (first language code, second language code).
# NOTE(review): the tuple order mirrors the model name and presumably means
# (source, target) -- confirm against the caller.
_MARIAN_MODELS = (
    ("opus-mt-zh-en", "zh", "en"),
    ("opus-mt-en-zh", "en", "zh"),
)
MARIAN_LANGUAGE_DIRECTIONS: Dict[str, Tuple[str, str]] = {
    model_name: (first_code, second_code)
    for model_name, first_code, second_code in _MARIAN_MODELS
}


# Alternate spellings (already in normalize_language_key() form: lowercase,
# underscores) -> the canonical two-letter key used in the NLLB catalog.
# Grouped by canonical key; the merged dict keeps the listing order.
NLLB_LANGUAGE_ALIASES: Dict[str, str] = {
    **dict.fromkeys(("fi_fi", "fin", "fin_fin"), "fi"),
    **dict.fromkeys(("zh_cn", "zh_hans"), "zh"),
}


def normalize_language_key(language: Optional[str]) -> str:
    """Return *language* in canonical lookup form.

    Falsy input (``None``, ``""``) becomes the empty string. Anything else is
    converted with ``str()``, stripped of surrounding whitespace, lower-cased,
    and has every ``-`` replaced by ``_``.
    """
    text = str(language) if language else ""
    trimmed = text.strip()
    lowered = trimmed.lower()
    return lowered.replace("-", "_")


def build_nllb_language_catalog(
    overrides: Optional[Mapping[str, str]] = None,
) -> Dict[str, str]:
    """Return a fresh NLLB catalog: the defaults with *overrides* layered on top.

    Keys are passed through ``normalize_language_key``; values are converted
    with ``str()`` and stripped. An override whose normalized key matches a
    default entry replaces it; override keys that normalize to the empty
    string are ignored.

    Args:
        overrides: Optional extra or replacement ``key -> code`` entries.

    Returns:
        A new dict; ``NLLB_LANGUAGE_CODES`` itself is not modified.
    """
    merged: Dict[str, str] = {}

    # Seed with the built-in defaults, skipping keys that are blank.
    for default_key, default_code in NLLB_LANGUAGE_CODES.items():
        if not str(default_key).strip():
            continue
        merged[normalize_language_key(default_key)] = str(default_code).strip()

    # Caller-supplied entries win over the defaults.
    if overrides:
        for override_key, override_code in overrides.items():
            canonical_key = normalize_language_key(override_key)
            if canonical_key:
                merged[canonical_key] = str(override_code).strip()

    return merged


def resolve_nllb_language_code(
    language: Optional[str],
    language_codes: Optional[Mapping[str, str]] = None,
) -> Optional[str]:
    """Resolve *language* to an NLLB code, or ``None`` if it is unknown.

    The input is normalized with ``normalize_language_key`` and then matched,
    in order, against:

    1. the catalog keys (defaults plus *language_codes* overrides);
    2. the catalog entry of the key that ``NLLB_LANGUAGE_ALIASES`` maps the
       input to;
    3. the catalog values themselves, compared in normalized form, so an
       input that already is a code (in any letter case) is echoed back as
       stored in the catalog.

    Args:
        language: Language key, alias, or NLLB code; may be ``None``.
        language_codes: Optional overrides forwarded to
            ``build_nllb_language_catalog``.

    Returns:
        The matching catalog value, or ``None`` for empty or unknown input.
    """
    wanted = normalize_language_key(language)
    if wanted == "":
        return None

    known_codes = build_nllb_language_catalog(language_codes)

    # Direct key first, then the alias target (if the input is an alias).
    lookup_keys = [wanted]
    alias_target = NLLB_LANGUAGE_ALIASES.get(wanted)
    if alias_target is not None:
        lookup_keys.append(normalize_language_key(alias_target))
    for lookup_key in lookup_keys:
        found = known_codes.get(lookup_key)
        if found is not None:
            return found

    # Last resort: the caller may have passed an NLLB code rather than a key.
    return next(
        (
            stored_code
            for stored_code in known_codes.values()
            if normalize_language_key(stored_code) == wanted
        ),
        None,
    )