#!/usr/bin/env python3
"""Generate a search-index definition (settings + mappings) as JSON.

The index is described by a compact Python spec (``FIELD_SPECS``) built from
three kinds of field:

* ``scalar`` - a single mapping entry such as ``keyword`` or ``dense_vector``.
* ``text``   - an object with one analysed sub-field per language code.
* ``nested`` - a ``nested`` object whose children are themselves field specs.

Run the script to print the JSON, write it to a file with ``--output``, or
verify an existing file with ``--check``.
"""

from __future__ import annotations

import argparse
import copy
import json
import sys
from pathlib import Path
from typing import Any

# Every language code a multi-language text field can carry. The order is
# significant: it is the order of the sub-fields in the generated JSON.
ALL_LANGUAGE_CODES = [
    "zh",
    "en",
    "ar",
    "hy",
    "eu",
    "pt_br",
    "bg",
    "ca",
    "cjk",
    "cs",
    "da",
    "nl",
    "fi",
    "fr",
    "gl",
    "de",
    "el",
    "hi",
    "hu",
    "id",
    "it",
    "no",
    "fa",
    "pt",
    "ro",
    "ru",
    "es",
    "sv",
    "tr",
    "th",
]

# Reduced language set used by the "core" text templates.
CORE_INDEX_LANGUAGES = ["zh", "en"]

# Maps the ``language_group`` name used in TEXT_FIELD_TEMPLATES to its codes.
LANGUAGE_GROUPS = {
    "all": ALL_LANGUAGE_CODES,
    "core": CORE_INDEX_LANGUAGES,
}

# Index-time analyzer name for each language code.
ANALYZERS = {
    "zh": "index_ik",
    "en": "english",
    "ar": "arabic",
    "hy": "armenian",
    "eu": "basque",
    "pt_br": "brazilian",
    "bg": "bulgarian",
    "ca": "catalan",
    "cjk": "cjk",
    "cs": "czech",
    "da": "danish",
    "nl": "dutch",
    "fi": "finnish",
    "fr": "french",
    "gl": "galician",
    "de": "german",
    "el": "greek",
    "hi": "hindi",
    "hu": "hungarian",
    "id": "indonesian",
    "it": "italian",
    "no": "norwegian",
    "fa": "persian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "es": "spanish",
    "sv": "swedish",
    "tr": "turkish",
    "th": "thai",
}

# Index-level settings emitted under the top-level "settings" key.
SETTINGS = {
    "number_of_shards": 4,
    "number_of_replicas": 0,
    "refresh_interval": "30s",
    "analysis": {
        "analyzer": {
            # Index-time and query-time analyzers for "zh"; they differ only
            # in tokenizer. NOTE(review): the ik_* tokenizers presumably come
            # from an analysis plugin on the cluster - confirm.
            "index_ik": {
                "type": "custom",
                "tokenizer": "ik_max_word",
                "filter": ["lowercase", "asciifolding"],
            },
            "query_ik": {
                "type": "custom",
                "tokenizer": "ik_smart",
                "filter": ["lowercase", "asciifolding"],
            },
        },
        "normalizer": {
            # Referenced by the ".keyword" sub-field of text fields.
            "lowercase": {
                "type": "custom",
                "filter": ["lowercase"],
            }
        },
    },
    "similarity": {
        # NOTE(review): b and k1 are both zero - presumably a deliberate
        # choice to flatten BM25 scoring; confirm before changing.
        "default": {
            "type": "BM25",
            "b": 0.0,
            "k1": 0.0,
        }
    },
}

# Named presets for text fields: which language group to expand, and whether
# each per-language sub-field also gets a ".keyword" multi-field.
TEXT_FIELD_TEMPLATES = {
    "all_language_text": {
        "language_group": "all",
        "with_keyword": False,
    },
    "all_language_text_with_keyword": {
        "language_group": "all",
        "with_keyword": True,
    },
    "core_language_text": {
        "language_group": "core",
        "with_keyword": False,
    },
    "core_language_text_with_keyword": {
        "language_group": "core",
        "with_keyword": True,
    },
}


def scalar_field(name: str, field_type: str, **extra: Any) -> dict[str, Any]:
    """Describe a single-valued mapping field.

    Args:
        name: Property name in the mapping.
        field_type: Value emitted as the field's ``"type"``.
        **extra: Additional mapping parameters, copied verbatim after
            ``"type"`` when the field is rendered.

    Returns:
        A spec dict with ``kind == "scalar"``. The ``"extra"`` key is present
        only when keyword arguments were supplied.
    """
    spec: dict[str, Any] = {
        "name": name,
        "kind": "scalar",
        "type": field_type,
    }
    if extra:
        spec["extra"] = extra
    return spec


def text_field(name: str, template: str) -> dict[str, Any]:
    """Describe a multi-language text field.

    Args:
        name: Property name in the mapping.
        template: Key into ``TEXT_FIELD_TEMPLATES``. It is not validated
            here; an unknown key raises ``KeyError`` at render time.

    Returns:
        A spec dict with ``kind == "text"``.
    """
    return {
        "name": name,
        "kind": "text",
        "template": template,
    }


def nested_field(name: str, *fields: dict[str, Any]) -> dict[str, Any]:
    """Describe a ``nested`` field made of child field specs.

    Args:
        name: Property name in the mapping.
        *fields: Child specs as returned by ``scalar_field``, ``text_field``
            or ``nested_field``.

    Returns:
        A spec dict with ``kind == "nested"`` holding the children as a list.
    """
    return {
        "name": name,
        "kind": "nested",
        "fields": list(fields),
    }


# Vector sizes used as ``dims`` of the dense_vector fields below.
TEXT_EMBEDDING_SIZE = 1024
IMAGE_EMBEDDING_SIZE = 768

# The index schema. Order here is the property order of the generated JSON.
FIELD_SPECS = [
    scalar_field("spu_id", "keyword"),
    scalar_field("create_time", "date"),
    scalar_field("update_time", "date"),
    text_field("title", "all_language_text"),
    text_field("keywords", "all_language_text_with_keyword"),
    text_field("brief", "all_language_text"),
    text_field("description", "all_language_text"),
    text_field("vendor", "all_language_text_with_keyword"),
    scalar_field("image_url", "keyword", index=False),
    scalar_field(
        "title_embedding",
        "dense_vector",
        dims=TEXT_EMBEDDING_SIZE,
        index=True,
        similarity="dot_product",
        element_type="bfloat16",
    ),
    nested_field(
        "image_embedding",
        scalar_field(
            "vector",
            "dense_vector",
            dims=IMAGE_EMBEDDING_SIZE,
            index=True,
            similarity="dot_product",
            element_type="bfloat16",
        ),
        scalar_field("url", "text"),
    ),
    text_field("category_path", "all_language_text_with_keyword"),
    text_field("category_name_text", "all_language_text_with_keyword"),
    text_field("tags", "all_language_text_with_keyword"),
    scalar_field("category_id", "keyword"),
    scalar_field("category_name", "keyword"),
    scalar_field("category_level", "integer"),
    scalar_field("category1_name", "keyword"),
    scalar_field("category2_name", "keyword"),
    scalar_field("category3_name", "keyword"),
    nested_field(
        "specifications",
        scalar_field("sku_id", "keyword"),
        scalar_field("name", "keyword"),
        scalar_field("value_keyword", "keyword"),
        text_field("value_text", "core_language_text_with_keyword"),
    ),
    text_field("qanchors", "core_language_text"),
    text_field("enriched_tags", "core_language_text_with_keyword"),
    nested_field(
        "enriched_attributes",
        scalar_field("name", "keyword"),
        text_field("value", "core_language_text_with_keyword"),
    ),
    scalar_field("option1_name", "keyword"),
    scalar_field("option2_name", "keyword"),
    scalar_field("option3_name", "keyword"),
    text_field("option1_values", "core_language_text_with_keyword"),
    text_field("option2_values", "core_language_text_with_keyword"),
    text_field("option3_values", "core_language_text_with_keyword"),
    scalar_field("min_price", "float"),
    scalar_field("max_price", "float"),
    scalar_field("compare_at_price", "float"),
    scalar_field("sku_prices", "float"),
    scalar_field("sku_weights", "long"),
    scalar_field("sku_weight_units", "keyword"),
    scalar_field("total_inventory", "long"),
    scalar_field("sales", "long"),
    nested_field(
        "skus",
        scalar_field("sku_id", "keyword"),
        scalar_field("price", "float"),
        scalar_field("compare_at_price", "float"),
        scalar_field("sku_code", "keyword"),
        scalar_field("stock", "long"),
        scalar_field("weight", "float"),
        scalar_field("weight_unit", "keyword"),
        scalar_field("option1_value", "keyword"),
        scalar_field("option2_value", "keyword"),
        scalar_field("option3_value", "keyword"),
        scalar_field("image_src", "keyword", index=False),
    ),
]


def build_keyword_fields() -> dict[str, Any]:
    """Return a fresh ``fields`` block holding the ``keyword`` sub-field.

    A new dict is built on every call so rendered fields never share state.
    """
    return {
        "keyword": {
            "type": "keyword",
            "normalizer": "lowercase",
        }
    }


def build_text_field(language: str, *, add_keyword: bool) -> dict[str, Any]:
    """Render the mapping of one per-language text sub-field.

    Args:
        language: Language code; must be a key of ``ANALYZERS``.
        add_keyword: When true, attach the ``keyword`` multi-field.

    Returns:
        The mapping dict for that language.

    Raises:
        KeyError: If ``language`` has no entry in ``ANALYZERS``.
    """
    field: dict[str, Any] = {
        "type": "text",
        "analyzer": ANALYZERS[language],
    }
    # "zh" is the only language with a distinct query-time analyzer.
    if language == "zh":
        field["search_analyzer"] = "query_ik"
    if add_keyword:
        field["fields"] = build_keyword_fields()
    return field


def render_field(spec: dict[str, Any]) -> dict[str, Any]:
    """Turn one field spec into its mapping dict.

    Args:
        spec: A spec produced by ``scalar_field``, ``text_field`` or
            ``nested_field``.

    Returns:
        The mapping for the field; nested specs are rendered recursively.

    Raises:
        ValueError: If ``spec["kind"]`` is not a known kind.
        KeyError: If a text spec names an unknown template.
    """
    kind = spec["kind"]

    if kind == "scalar":
        # "type" comes first, then the extras in the order they were given.
        return {"type": spec["type"], **spec.get("extra", {})}

    if kind == "text":
        template = TEXT_FIELD_TEMPLATES[spec["template"]]
        with_keyword = template["with_keyword"]
        return {
            "type": "object",
            "properties": {
                language: build_text_field(language, add_keyword=with_keyword)
                for language in LANGUAGE_GROUPS[template["language_group"]]
            },
        }

    if kind == "nested":
        return {
            "type": "nested",
            "properties": {
                child["name"]: render_field(child) for child in spec["fields"]
            },
        }

    raise ValueError(f"Unknown field kind: {kind}")


def build_mapping() -> dict[str, Any]:
    """Build the complete index definition from ``FIELD_SPECS``.

    Returns:
        A dict with ``"settings"`` and ``"mappings"`` keys. The settings are
        a deep copy of ``SETTINGS``, so callers may modify the result without
        affecting the module-level constant or later calls.
    """
    return {
        "settings": copy.deepcopy(SETTINGS),
        "mappings": {
            "properties": {
                spec["name"]: render_field(spec) for spec in FIELD_SPECS
            },
        },
    }


def render_mapping() -> str:
    """Serialise the index definition as 2-space-indented JSON.

    The returned text has no trailing newline.
    """
    return json.dumps(build_mapping(), indent=2, ensure_ascii=False)


def main() -> int:
    """Command-line entry point.

    With no options the JSON is printed to stdout. ``--output`` writes it to
    a file. ``--check`` compares it with an existing file and suppresses the
    stdout dump; when combined with ``--output`` the file is written only if
    the check passes.

    Returns:
        ``0`` on success, ``1`` when the ``--check`` file differs from the
        generated output or does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Generate mappings/search_products.json from a compact Python spec.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        help="Write the generated mapping to this file. Defaults to stdout.",
    )
    parser.add_argument(
        "--check",
        type=Path,
        help="Fail if the generated output does not exactly match this file.",
    )
    args = parser.parse_args()

    rendered = render_mapping()

    if args.check is not None:
        try:
            existing = args.check.read_text(encoding="utf-8")
        except FileNotFoundError:
            # A missing file cannot match; report it instead of a traceback.
            print(f"Check file does not exist: {args.check}", file=sys.stderr)
            return 1
        if existing != rendered:
            print(f"Generated mapping does not match {args.check}")
            return 1
        print(f"Generated mapping matches {args.check}")

    if args.output is not None:
        args.output.write_text(rendered, encoding="utf-8")
    elif args.check is None:
        # end="" keeps stdout byte-identical to what --output would write.
        print(rendered, end="")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())