# generate_search_products_mapping.py (8.94 KB)
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any

# Every language code that gets its own analyzed sub-field on "all"-group
# text fields. Each code has a matching analyzer in ANALYZERS below.
ALL_LANGUAGE_CODES = [
    "zh",
    "en",
    "ar",
    "hy",
    "eu",
    "pt_br",
    "bg",
    "ca",
    "cjk",
    "cs",
    "da",
    "nl",
    "fi",
    "fr",
    "gl",
    "de",
    "el",
    "hi",
    "hu",
    "id",
    "it",
    "no",
    "fa",
    "pt",
    "ro",
    "ru",
    "es",
    "sv",
    "tr",
    "th",
]

# Reduced language set used by the "core"-group text-field templates.
CORE_INDEX_LANGUAGES = ["zh", "en"]

# Group names referenced by TEXT_FIELD_TEMPLATES["language_group"]
# (resolved in render_field).
LANGUAGE_GROUPS = {
    "all": ALL_LANGUAGE_CODES,
    "core": CORE_INDEX_LANGUAGES,
}

# Elasticsearch analyzer assigned to each language sub-field. Most entries
# are built-in language analyzers; "zh" uses the custom "index_ik" analyzer
# declared in SETTINGS["analysis"] (with a separate query-time analyzer
# applied in build_text_field).
ANALYZERS = {
    "zh": "index_ik",
    "en": "english",
    "ar": "arabic",
    "hy": "armenian",
    "eu": "basque",
    "pt_br": "brazilian",
    "bg": "bulgarian",
    "ca": "catalan",
    "cjk": "cjk",
    "cs": "czech",
    "da": "danish",
    "nl": "dutch",
    "fi": "finnish",
    "fr": "french",
    "gl": "galician",
    "de": "german",
    "el": "greek",
    "hi": "hindi",
    "hu": "hungarian",
    "id": "indonesian",
    "it": "italian",
    "no": "norwegian",
    "fa": "persian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "es": "spanish",
    "sv": "swedish",
    "tr": "turkish",
    "th": "thai",
}

# Index-level settings emitted verbatim into the generated mapping:
# custom IK analyzers for Chinese (fine-grained "ik_max_word" at index time,
# coarser "ik_smart" at query time), a lowercase normalizer for keyword
# sub-fields, and a BM25 similarity override.
# NOTE(review): b=0.0 disables length normalization and k1=0.0 disables
# term-frequency saturation, which effectively reduces BM25 scoring to
# IDF-only — presumably intentional for product search; confirm before
# changing.
SETTINGS = {
    "number_of_shards": 4,
    "number_of_replicas": 0,
    "refresh_interval": "30s",
    "analysis": {
        "analyzer": {
            "index_ik": {
                "type": "custom",
                "tokenizer": "ik_max_word",
                "filter": ["lowercase", "asciifolding"],
            },
            "query_ik": {
                "type": "custom",
                "tokenizer": "ik_smart",
                "filter": ["lowercase", "asciifolding"],
            },
        },
        "normalizer": {
            "lowercase": {
                "type": "custom",
                "filter": ["lowercase"],
            }
        },
    },
    "similarity": {
        "default": {
            "type": "BM25",
            "b": 0.0,
            "k1": 0.0,
        }
    },
}

# Named templates for text fields: which language group to render per-language
# sub-fields for (see LANGUAGE_GROUPS) and whether each sub-field also gets a
# lowercase-normalized "keyword" variant (see build_keyword_fields).
TEXT_FIELD_TEMPLATES = {
    "all_language_text": {
        "language_group": "all",
        "with_keyword": False,
    },
    "all_language_text_with_keyword": {
        "language_group": "all",
        "with_keyword": True,
    },
    "core_language_text": {
        "language_group": "core",
        "with_keyword": False,
    },
    "core_language_text_with_keyword": {
        "language_group": "core",
        "with_keyword": True,
    },
}


def scalar_field(name: str, field_type: str, **extra: Any) -> dict[str, Any]:
    """Describe a plain (non-text, non-nested) field.

    Extra keyword arguments (e.g. ``index=False``, ``dims=...``) are stored
    under ``"extra"`` and merged into the rendered mapping by render_field.
    """
    base: dict[str, Any] = {"name": name, "kind": "scalar", "type": field_type}
    return {**base, "extra": extra} if extra else base


def text_field(name: str, template: str) -> dict[str, Any]:
    """Describe a multi-language text field rendered from a named template."""
    return dict(name=name, kind="text", template=template)


def nested_field(name: str, *fields: dict[str, Any]) -> dict[str, Any]:
    """Describe a nested field containing the given child field specs."""
    return {"name": name, "kind": "nested", "fields": [*fields]}

# Dimensions of the dense-vector embedding fields declared in FIELD_SPECS.
TEXT_EMBEDDING_SIZE = 1024
IMAGE_EMBEDDING_SIZE = 768

# Declarative list of every field in the index; render_field turns each spec
# into its Elasticsearch mapping fragment.
FIELD_SPECS = [
    # Identity and timestamps.
    scalar_field("spu_id", "keyword"),
    scalar_field("create_time", "date"),
    scalar_field("update_time", "date"),
    # Free-text product content, analyzed per language.
    text_field("title", "all_language_text"),
    text_field("keywords", "all_language_text_with_keyword"),
    text_field("brief", "all_language_text"),
    text_field("description", "all_language_text"),
    text_field("vendor", "all_language_text_with_keyword"),
    scalar_field("image_url", "keyword", index=False),  # stored only, not searchable
    # Vector fields for semantic / image search.
    scalar_field(
        "title_embedding",
        "dense_vector",
        dims=TEXT_EMBEDDING_SIZE,
        index=True,
        similarity="dot_product",
        element_type="bfloat16",
    ),
    nested_field(
        "image_embedding",
        scalar_field(
            "vector",
            "dense_vector",
            dims=IMAGE_EMBEDDING_SIZE,
            index=True,
            similarity="dot_product",
            element_type="bfloat16",
        ),
        scalar_field("url", "text"),
    ),
    # Category and tag fields (text variants for search, keywords for filters).
    text_field("category_path", "all_language_text_with_keyword"),
    text_field("category_name_text", "all_language_text_with_keyword"),
    text_field("tags", "all_language_text_with_keyword"),
    scalar_field("category_id", "keyword"),
    scalar_field("category_name", "keyword"),
    scalar_field("category_level", "integer"),
    scalar_field("category1_name", "keyword"),
    scalar_field("category2_name", "keyword"),
    scalar_field("category3_name", "keyword"),
    # Per-SKU specifications, queried independently via nested.
    nested_field(
        "specifications",
        scalar_field("sku_id", "keyword"),
        scalar_field("name", "keyword"),
        scalar_field("value_keyword", "keyword"),
        text_field("value_text", "core_language_text_with_keyword"),
    ),
    # Enrichment fields (core languages only).
    text_field("qanchors", "core_language_text"),
    text_field("enriched_tags", "core_language_text_with_keyword"),
    nested_field(
        "enriched_attributes",
        scalar_field("name", "keyword"),
        text_field("value", "core_language_text_with_keyword"),
    ),
    # Product options and their values.
    scalar_field("option1_name", "keyword"),
    scalar_field("option2_name", "keyword"),
    scalar_field("option3_name", "keyword"),
    text_field("option1_values", "core_language_text_with_keyword"),
    text_field("option2_values", "core_language_text_with_keyword"),
    text_field("option3_values", "core_language_text_with_keyword"),
    # Aggregated pricing / inventory / sales figures.
    scalar_field("min_price", "float"),
    scalar_field("max_price", "float"),
    scalar_field("compare_at_price", "float"),
    scalar_field("sku_prices", "float"),
    scalar_field("sku_weights", "long"),
    scalar_field("sku_weight_units", "keyword"),
    scalar_field("total_inventory", "long"),
    scalar_field("sales", "long"),
    # Full per-SKU detail records.
    nested_field(
        "skus",
        scalar_field("sku_id", "keyword"),
        scalar_field("price", "float"),
        scalar_field("compare_at_price", "float"),
        scalar_field("sku_code", "keyword"),
        scalar_field("stock", "long"),
        scalar_field("weight", "float"),
        scalar_field("weight_unit", "keyword"),
        scalar_field("option1_value", "keyword"),
        scalar_field("option2_value", "keyword"),
        scalar_field("option3_value", "keyword"),
        scalar_field("image_src", "keyword", index=False),
    ),
]


def build_keyword_fields() -> dict[str, Any]:
    """Return the "fields" sub-mapping: a lowercase-normalized keyword variant."""
    keyword_variant = {"type": "keyword", "normalizer": "lowercase"}
    return {"keyword": keyword_variant}


def build_text_field(language: str, *, add_keyword: bool) -> dict[str, Any]:
    """Render the mapping for one language sub-field of a text field.

    Chinese indexes with "index_ik" (via ANALYZERS) but queries with the
    coarser "query_ik" analyzer; all other languages use one analyzer for
    both. Raises KeyError if *language* has no ANALYZERS entry.
    """
    mapping: dict[str, Any] = {
        "type": "text",
        "analyzer": ANALYZERS[language],
    }
    if language == "zh":
        mapping["search_analyzer"] = "query_ik"
    if add_keyword:
        mapping["fields"] = build_keyword_fields()
    return mapping


def render_field(spec: dict[str, Any]) -> dict[str, Any]:
    """Render one field spec (from scalar_field / text_field / nested_field)
    into its Elasticsearch mapping fragment.

    Raises ValueError for an unrecognized "kind".
    """
    kind = spec["kind"]

    if kind == "scalar":
        # Merge any extra mapping options (index, dims, ...) over the type.
        return {"type": spec["type"], **spec.get("extra", {})}

    if kind == "text":
        template = TEXT_FIELD_TEMPLATES[spec["template"]]
        group = LANGUAGE_GROUPS[template["language_group"]]
        keyword = template["with_keyword"]
        return {
            "type": "object",
            "properties": {
                code: build_text_field(code, add_keyword=keyword)
                for code in group
            },
        }

    if kind == "nested":
        return {
            "type": "nested",
            "properties": {
                child["name"]: render_field(child) for child in spec["fields"]
            },
        }

    raise ValueError(f"Unknown field kind: {kind}")


def build_mapping() -> dict[str, Any]:
    """Assemble the full index body: SETTINGS plus every rendered field."""
    rendered = {spec["name"]: render_field(spec) for spec in FIELD_SPECS}
    return {
        "settings": SETTINGS,
        "mappings": {"properties": rendered},
    }


def render_mapping() -> str:
    """Serialize the mapping as pretty-printed JSON, keeping non-ASCII as-is."""
    mapping = build_mapping()
    return json.dumps(mapping, indent=2, ensure_ascii=False)


def main() -> int:
    """CLI entry point.

    By default the rendered JSON is printed to stdout. ``--output`` writes
    it to a file instead; ``--check`` compares the rendered text against an
    existing file and returns 1 on mismatch (useful in CI to keep the
    generated mapping file fresh).
    """
    parser = argparse.ArgumentParser(
        description="Generate mappings/search_products.json from a compact Python spec.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        help="Write the generated mapping to this file. Defaults to stdout.",
    )
    parser.add_argument(
        "--check",
        type=Path,
        help="Fail if the generated output does not exactly match this file.",
    )
    args = parser.parse_args()

    rendered = render_mapping()

    if args.check is not None:
        existing = args.check.read_text(encoding="utf-8")
        if existing != rendered:
            # Diagnostics go to stderr so stdout stays clean for piped JSON.
            print(f"Generated mapping does not match {args.check}", file=sys.stderr)
            return 1
        print(f"Generated mapping matches {args.check}")

    if args.output is not None:
        args.output.write_text(rendered, encoding="utf-8")
    elif args.check is None:
        # No trailing newline so stdout matches write_text output exactly.
        print(rendered, end="")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())