# mapping.py 3.3 KB
"""
Mapping generator for suggestion indices.
"""

from typing import Dict, Any, List


# Maps a language key (as passed in ``index_languages``) to the name of the
# analyzer used for that language's suggestion sub-fields.
#
# - "zh" points at "index_ik", the custom analyzer declared in the index
#   settings produced by build_suggestion_mapping().
# - Every other value is expected to be the name of a built-in analyzer
#   (presumably Elasticsearch's language analyzers — confirm against the
#   target cluster version).
# - Keys absent from this table are not an error: the field builders look
#   them up with a "standard" fallback.
ANALYZER_BY_LANG: Dict[str, str] = {
    "zh": "index_ik",
    "en": "english",
    "ar": "arabic",
    "hy": "armenian",
    "eu": "basque",
    "pt_br": "brazilian",
    "bg": "bulgarian",
    "ca": "catalan",
    "cjk": "cjk",
    "cs": "czech",
    "da": "danish",
    "nl": "dutch",
    "fi": "finnish",
    "fr": "french",
    "gl": "galician",
    "de": "german",
    "el": "greek",
    "hi": "hindi",
    "hu": "hungarian",
    "id": "indonesian",
    "it": "italian",
    "no": "norwegian",
    "fa": "persian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "es": "spanish",
    "sv": "swedish",
    "tr": "turkish",
    "th": "thai",
}


def _completion_field(lang: str) -> Dict[str, Any]:
    """Return the ``completion`` field definition for one language.

    Args:
        lang: Language key; looked up in ``ANALYZER_BY_LANG``.

    Returns:
        A mapping with ``type`` and ``analyzer``. The analyzer is the table
        entry for ``lang`` or ``"standard"`` when the key is unknown. For
        ``"zh"`` a ``search_analyzer`` of ``"query_ik"`` is added as well, so
        query text is analyzed differently from indexed text.
    """
    field: Dict[str, Any] = {
        "type": "completion",
        "analyzer": ANALYZER_BY_LANG.get(lang, "standard"),
    }
    # Only Chinese gets a dedicated search-time analyzer.
    if lang == "zh":
        field["search_analyzer"] = "query_ik"
    return field


def _sat_field(lang: str) -> Dict[str, Any]:
    """Return the ``search_as_you_type`` field definition for one language.

    Args:
        lang: Language key; looked up in ``ANALYZER_BY_LANG``.

    Returns:
        A mapping with ``type`` set to ``"search_as_you_type"`` and
        ``analyzer`` set to the table entry for ``lang``, or ``"standard"``
        when the key is unknown.
    """
    # NOTE(review): unlike _completion_field, no search_analyzer is set for
    # "zh" here — confirm whether that asymmetry is intentional.
    return dict(
        type="search_as_you_type",
        analyzer=ANALYZER_BY_LANG.get(lang, "standard"),
    )


def build_suggestion_mapping(index_languages: List[str]) -> Dict[str, Any]:
    """Build the index body (settings and mappings) for a suggestion index.

    Args:
        index_languages: Language keys to create per-language sub-fields for.
            Falsy entries (``None``, ``""``) are dropped. If nothing remains,
            or the argument itself is falsy, ``["en", "zh"]`` is used.

    Returns:
        A dict with two top-level keys:

        * ``"settings"`` — one shard, zero replicas, a 30s refresh interval
          and the two custom analyzers ``index_ik`` / ``query_ik``. These
          analyzers are declared unconditionally, even when ``"zh"`` is not
          requested (presumably this requires the IK analysis plugin on the
          cluster — confirm).
        * ``"mappings"`` — the fixed document fields plus two object fields,
          ``completion`` and ``sat``, each holding one sub-field per
          requested language.
    """
    requested = [code for code in (index_languages or []) if code] or ["en", "zh"]

    def _ik_analyzer(tokenizer: str) -> Dict[str, Any]:
        # Build a fresh dict and filter list per analyzer so nothing is shared.
        return {
            "type": "custom",
            "tokenizer": tokenizer,
            "filter": ["lowercase", "asciifolding"],
        }

    settings: Dict[str, Any] = {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "refresh_interval": "30s",
        "analysis": {
            "analyzer": {
                "index_ik": _ik_analyzer("ik_max_word"),
                "query_ik": _ik_analyzer("ik_smart"),
            }
        },
    }

    # Fixed (language-independent) document fields, grouped by field type.
    properties: Dict[str, Any] = {
        "tenant_id": {"type": "keyword"},
        "lang": {"type": "keyword"},
        "text": {"type": "keyword"},
        "text_norm": {"type": "keyword"},
        "sources": {"type": "keyword"},
        "title_doc_count": {"type": "integer"},
        "qanchor_doc_count": {"type": "integer"},
        "tag_doc_count": {"type": "integer"},
        "query_count_7d": {"type": "integer"},
        "query_count_30d": {"type": "integer"},
        "rank_score": {"type": "float"},
        "lang_confidence": {"type": "float"},
        "lang_source": {"type": "keyword"},
        "lang_conflict": {"type": "boolean"},
        "status": {"type": "byte"},
        "updated_at": {"type": "date"},
    }

    # Per-language sub-fields; a repeated language key simply overwrites
    # its earlier entry.
    properties["completion"] = {
        "properties": {code: _completion_field(code) for code in requested}
    }
    properties["sat"] = {
        "properties": {code: _sat_field(code) for code in requested}
    }

    return {"settings": settings, "mappings": {"properties": properties}}