From fca871fb17bf0366c5e2c324a76c4629c32dd729 Mon Sep 17 00:00:00 2001 From: tangwang Date: Mon, 30 Mar 2026 17:25:33 +0800 Subject: [PATCH] 索引字段修改 --- mappings/README.md | 268 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------- mappings/generate_search_products_mapping.py | 355 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mappings/search_products.json | 779 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------- mappings/search_products.json.bak | 629 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1959 insertions(+), 72 deletions(-) create mode 100644 mappings/generate_search_products_mapping.py create mode 100644 mappings/search_products.json.bak diff --git a/mappings/README.md b/mappings/README.md index 32cdfa1..ea70518 100644 --- a/mappings/README.md +++ b/mappings/README.md @@ -2,32 +2,280 @@ ## 概述 -所有租户共享同一个ES mapping结构,直接使用手写的JSON文件,无需通过config.yaml生成。 +所有租户共享同一个 Elasticsearch mapping 结构。 -## Mapping文件 +当前目录采用“声明式 Python 规格 + 字段模板 + 最终 JSON 产物”的方式维护 `search_products` 的索引定义: -- `search_products.json`: 完整的ES索引配置,包括settings和mappings +- `generate_search_products_mapping.py`: 唯一的生成源,包含字段模板、语言列表、分析器配置和递归生成逻辑 +- `search_products.json`: 由脚本生成的完整 ES 索引配置,包括 `settings` 和 `mappings` +- `search_suggestions.json`: 搜索建议索引配置 -## 使用方式 +默认应修改生成脚本中的规格定义,而不是手工编辑 `search_products.json`。 -### 创建索引 +## 字段抽象 + +脚本从业务语义上抽象出 4 类文本模板: + +- `all_language_text`: 全语言字段,不带 `keyword` +- `all_language_text_with_keyword`: 全语言字段,所有受支持语言都带 `keyword` +- `core_language_text`: 核心索引语言字段,不带 `keyword` +- `core_language_text_with_keyword`: 核心索引语言字段,核心语言都带 `keyword` + +这里的“核心索引语言”不是因为系统只支持两种语言,而是因为所有店铺、所有商品都必须至少产出这两种语言的索引内容。目前核心索引语言固定为: + +- `zh` +- `en` + +“全语言”表示 mapping 为原始商品语言预留了更多语言槽位。商品实际灌入时,不要求每个字段把所有语言都填满,只要求: + +- 核心索引语言字段必须填充 `zh` 和 `en` +- 全语言字段必须填充 `zh` 和 `en` +- 如果商品原始语言属于受支持语言,还应额外填充对应的原始语言字段,例如 `ru` + +当前字段大致分为几类: + +- 
全语言字段:`title`、`keywords`、`brief`、`description`、`vendor`、`category_path`、`category_name_text`、`specifications.value_text` +- 核心索引语言字段:`qanchors`、`tags`、`option1_values`、`option2_values`、`option3_values`、`enriched_attributes.value` +- 复合嵌套字段:`image_embedding`、`specifications`、`enriched_attributes`、`skus` +- 其他标量字段:`tenant_id`、`spu_id`、价格、库存、类目等 + +生成规则里的几个基础约束: + +- 中文字段使用 `index_ik`,并额外设置 `search_analyzer: query_ik` +- 非中文语言使用各自的 Elasticsearch 内置 analyzer +- 带 `with_keyword` 的模板会为对应语言增加 `.keyword` +- `settings.analysis`、`normalizer`、`similarity` 也属于生成结果的一部分,不能只维护 `mappings.properties` + +## 索引灌入指引 + +### 基本原则 + +1. 所有商品都必须生成核心索引语言版本,也就是 `zh` 和 `en`。 +2. 全语言字段除了必须有 `zh` 和 `en`,还应尽量保留商品原始语言版本。 +3. 如果商品原始语言本身就是 `zh` 或 `en`,则原文直接写入对应字段,另一种核心语言通过翻译补齐。 +4. 如果商品原始语言是 `ru` 这类受支持的非核心语言,则应同时写入原始语言字段和 `zh/en` 翻译结果。 +5. 如果某个值为空,不应写入伪造内容;应在上游清洗后决定是否跳过该字段。 + +### 核心索引语言字段 + +这类字段的目标是保证所有商品都至少能被中文和英文检索到。无论商品原始语言是什么,都应通过翻译或标准化得到 `zh` 和 `en` 两份结果。 + +典型字段: + +- `qanchors` +- `tags` +- `option1_values` +- `option2_values` +- `option3_values` +- `enriched_attributes.value` + +以 `category_path` 和 `option*_values` 为例,核心语言灌入结果应至少包含: + +- `category_path.zh` +- `category_path.en` +- `option1_values.zh` +- `option1_values.en` +- `option2_values.zh` +- `option2_values.en` +- `option3_values.zh` +- `option3_values.en` + +示例:原始商品语言为俄语,原始 `option1_values` 为 `красный, синий` + +```json +{ + "option1_values": { + "zh": "红色, 蓝色", + "en": "red, blue" + } +} +``` + +示例:原始商品语言为俄语,类目路径为 `Одежда > Женская одежда > Куртки` + +```json +{ + "category_path": { + "zh": "服饰 > 女装 > 夹克", + "en": "Apparel > Women's Clothing > Jackets", + "ru": "Одежда > Женская одежда > Куртки" + } +} +``` + +注意:`category_path` 在 mapping 上属于全语言字段,但在灌入规范上依然要求 `zh/en` 必填。 + +### 全语言字段 + +这类字段既要保证 `zh/en` 两个核心索引语言可用,也要尽量保留商品原始语言,以便原语种召回和更自然的检索。 + +典型字段: + +- `title` +- `keywords` +- `brief` +- `description` +- `vendor` +- `category_path` +- `category_name_text` +- `specifications.value_text` + +灌入规则: + +1. 找到商品原始语言,例如 `ru` +2. 
原文写入对应语言字段,例如 `title.ru` +3. 将原文翻译成 `zh` 和 `en` +4. 分别写入 `title.zh` 和 `title.en` + +示例:原始商品语言为俄语,标题为 `Женская зимняя куртка` + +```json +{ + "title": { + "zh": "女士冬季夹克", + "en": "Women's winter jacket", + "ru": "Женская зимняя куртка" + } +} +``` + +示例:原始商品语言为俄语,类目名称为 `Женские куртки` + +```json +{ + "category_name_text": { + "zh": "女式夹克", + "en": "Women's jackets", + "ru": "Женские куртки" + } +} +``` + +示例:规格值 `specifications.value_text` + +```json +{ + "specifications": [ + { + "sku_id": "sku-red-s", + "name": "color", + "value_text": { + "zh": "红色", + "en": "red", + "ru": "красный" + } + } + ] +} +``` + +### 原始语言为中文或英文时 + +如果原始语言就是核心索引语言之一,不需要额外再写第三份语言字段。 + +示例:原始语言为中文 + +```json +{ + "title": { + "zh": "女士冬季夹克", + "en": "Women's winter jacket" + }, + "option1_values": { + "zh": "红色, 蓝色", + "en": "red, blue" + } +} +``` + +示例:原始语言为英文 + +```json +{ + "title": { + "zh": "女士冬季夹克", + "en": "Women's winter jacket" + }, + "vendor": { + "zh": "北境服饰", + "en": "Northern Apparel" + } +} +``` + +### 不同字段的灌入方式 + +可以按下面的方式理解和实现: + +- 标量字段:直接写固定值,例如 `tenant_id`、`spu_id`、`min_price` +- 核心索引语言字段:只生成 `zh/en` +- 全语言字段:生成 `zh/en`,再按原始语言补一个对应语种字段 +- 嵌套字段:对每个元素内部重复应用同样规则,例如 `specifications[].value_text` + +### 推荐灌入流程 + +1. 识别商品原始语言 +2. 提取原文标题、描述、类目、规格、属性、选项值等字段 +3. 生成 `zh` 和 `en` 两份核心索引语言内容 +4. 对全语言字段,如果原始语言受支持,则额外写入原始语言字段 +5. 
组装最终 ES 文档并写入索引 + +## 生成 Mapping + +在仓库根目录执行: + +```bash +source activate.sh +python mappings/generate_search_products_mapping.py > mappings/search_products.json +``` + +如果只想查看输出而不覆盖文件: + +```bash +source activate.sh +python mappings/generate_search_products_mapping.py +``` + +如果想先生成到临时文件: + +```bash +source activate.sh +python mappings/generate_search_products_mapping.py > mappings/search_products.generated.json +``` + +## 校验 Mapping + +确认当前 `search_products.json` 是否与生成规则完全一致: + +```bash +source activate.sh +python mappings/generate_search_products_mapping.py --check mappings/search_products.json +``` + +## 创建索引 ```python from indexer.mapping_generator import load_mapping, create_index_if_not_exists from utils.es_client import ESClient es_client = ESClient(hosts=["http://localhost:9200"]) -mapping = load_mapping() # 从mappings/search_products.json加载 +mapping = load_mapping() create_index_if_not_exists(es_client, "search_products", mapping) ``` -### 修改Mapping +## 修改 Mapping + +推荐流程: + +1. 修改 `mappings/generate_search_products_mapping.py` +2. 重新生成 `mappings/search_products.json` +3. 用 `--check` 或 diff 确认变更符合预期 +4. 重新创建索引并导入数据 -直接编辑 `mappings/search_products.json` 文件,然后重新创建索引。 +注意:Elasticsearch 不支持直接修改已有字段的 mapping 类型,只能新增字段。如需修改字段类型,需要: -注意:ES不支持修改已有字段的mapping类型,只能添加新字段。如需修改字段类型,需要: 1. 删除旧索引 -2. 使用新mapping创建索引 +2. 使用新 mapping 创建索引 3. 
重新导入数据 ## 字段说明 diff --git a/mappings/generate_search_products_mapping.py b/mappings/generate_search_products_mapping.py new file mode 100644 index 0000000..ea9b24c --- /dev/null +++ b/mappings/generate_search_products_mapping.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +ALL_LANGUAGE_CODES = [ + "zh", + "en", + "ar", + "hy", + "eu", + "pt_br", + "bg", + "ca", + "cjk", + "cs", + "da", + "nl", + "fi", + "fr", + "gl", + "de", + "el", + "hi", + "hu", + "id", + "it", + "no", + "fa", + "pt", + "ro", + "ru", + "es", + "sv", + "tr", + "th", +] + +CORE_INDEX_LANGUAGES = ["zh", "en"] + +LANGUAGE_GROUPS = { + "all": ALL_LANGUAGE_CODES, + "core": CORE_INDEX_LANGUAGES, +} + +ANALYZERS = { + "zh": "index_ik", + "en": "english", + "ar": "arabic", + "hy": "armenian", + "eu": "basque", + "pt_br": "brazilian", + "bg": "bulgarian", + "ca": "catalan", + "cjk": "cjk", + "cs": "czech", + "da": "danish", + "nl": "dutch", + "fi": "finnish", + "fr": "french", + "gl": "galician", + "de": "german", + "el": "greek", + "hi": "hindi", + "hu": "hungarian", + "id": "indonesian", + "it": "italian", + "no": "norwegian", + "fa": "persian", + "pt": "portuguese", + "ro": "romanian", + "ru": "russian", + "es": "spanish", + "sv": "swedish", + "tr": "turkish", + "th": "thai", +} + +SETTINGS = { + "number_of_shards": 1, + "number_of_replicas": 0, + "refresh_interval": "30s", + "analysis": { + "analyzer": { + "index_ik": { + "type": "custom", + "tokenizer": "ik_max_word", + "filter": ["lowercase", "asciifolding"], + }, + "query_ik": { + "type": "custom", + "tokenizer": "ik_smart", + "filter": ["lowercase", "asciifolding"], + }, + }, + "normalizer": { + "lowercase": { + "type": "custom", + "filter": ["lowercase"], + } + }, + }, + "similarity": { + "default": { + "type": "BM25", + "b": 0.0, + "k1": 0.0, + } + }, +} + +TEXT_FIELD_TEMPLATES = { + "all_language_text": { + "language_group": 
"all", + "with_keyword": False, + }, + "all_language_text_with_keyword": { + "language_group": "all", + "with_keyword": True, + }, + "core_language_text": { + "language_group": "core", + "with_keyword": False, + }, + "core_language_text_with_keyword": { + "language_group": "core", + "with_keyword": True, + }, +} + + +def scalar_field(name: str, field_type: str, **extra: Any) -> dict[str, Any]: + spec = { + "name": name, + "kind": "scalar", + "type": field_type, + } + if extra: + spec["extra"] = extra + return spec + + +def text_field(name: str, template: str) -> dict[str, Any]: + return { + "name": name, + "kind": "text", + "template": template, + } + + +def nested_field(name: str, *fields: dict[str, Any]) -> dict[str, Any]: + return { + "name": name, + "kind": "nested", + "fields": list(fields), + } + +TEXT_EMBEDDING_SIZE = 1024 +IMAGE_EMBEDDING_SIZE = 768 + +FIELD_SPECS = [ + scalar_field("tenant_id", "keyword"), + scalar_field("spu_id", "keyword"), + scalar_field("create_time", "date"), + scalar_field("update_time", "date"), + text_field("title", "all_language_text"), + text_field("keywords", "all_language_text_with_keyword"), + text_field("brief", "all_language_text"), + text_field("description", "all_language_text"), + text_field("vendor", "all_language_text_with_keyword"), + scalar_field("image_url", "keyword", index=False), + scalar_field( + "title_embedding", + "dense_vector", + dims=TEXT_EMBEDDING_SIZE, + index=True, + similarity="dot_product", + element_type="bfloat16", + ), + nested_field( + "image_embedding", + scalar_field( + "vector", + "dense_vector", + dims=IMAGE_EMBEDDING_SIZE, + index=True, + similarity="dot_product", + element_type="bfloat16", + ), + scalar_field("url", "text"), + ), + text_field("category_path", "all_language_text_with_keyword"), + text_field("category_name_text", "all_language_text_with_keyword"), + text_field("qanchors", "core_language_text"), + text_field("tags", "core_language_text_with_keyword"), + 
scalar_field("category_id", "keyword"), + scalar_field("category_name", "keyword"), + scalar_field("category_level", "integer"), + scalar_field("category1_name", "keyword"), + scalar_field("category2_name", "keyword"), + scalar_field("category3_name", "keyword"), + nested_field( + "specifications", + scalar_field("sku_id", "keyword"), + scalar_field("name", "keyword"), + scalar_field("value", "keyword"),  # name must match search_products.json, otherwise --check fails + text_field("value_text", "all_language_text_with_keyword"),  # all languages + .keyword, as in search_products.json + ), + nested_field( + "enriched_attributes", + scalar_field("name", "keyword"), + text_field("value", "core_language_text_with_keyword"), + ), + scalar_field("option1_name", "keyword"), + scalar_field("option2_name", "keyword"), + scalar_field("option3_name", "keyword"), + text_field("option1_values", "core_language_text_with_keyword"), + text_field("option2_values", "core_language_text_with_keyword"), + text_field("option3_values", "core_language_text_with_keyword"), + scalar_field("min_price", "float"), + scalar_field("max_price", "float"), + scalar_field("compare_at_price", "float"), + scalar_field("sku_prices", "float"), + scalar_field("sku_weights", "long"), + scalar_field("sku_weight_units", "keyword"), + scalar_field("total_inventory", "long"), + scalar_field("sales", "long"), + nested_field( + "skus", + scalar_field("sku_id", "keyword"), + scalar_field("price", "float"), + scalar_field("compare_at_price", "float"), + scalar_field("sku_code", "keyword"), + scalar_field("stock", "long"), + scalar_field("weight", "float"), + scalar_field("weight_unit", "keyword"), + scalar_field("option1_value", "keyword"), + scalar_field("option2_value", "keyword"), + scalar_field("option3_value", "keyword"), + scalar_field("image_src", "keyword", index=False), + ), +] + + +def build_keyword_fields() -> dict[str, Any]: + return { + "keyword": { + "type": "keyword", + "normalizer": "lowercase", + } + } + + +def build_text_field(language: str, *, add_keyword: bool) -> dict[str, Any]: + field = { + "type": 
"text", + "analyzer": ANALYZERS[language], + } + if language == "zh": + field["search_analyzer"] = "query_ik" + if add_keyword: + field["fields"] = build_keyword_fields() + return field + + +def render_field(spec: dict[str, Any]) -> dict[str, Any]: + kind = spec["kind"] + + if kind == "scalar": + rendered = {"type": spec["type"]} + rendered.update(spec.get("extra", {})) + return rendered + + if kind == "text": + template = TEXT_FIELD_TEMPLATES[spec["template"]] + languages = LANGUAGE_GROUPS[template["language_group"]] + properties = {} + for language in languages: + properties[language] = build_text_field( + language, + add_keyword=template["with_keyword"], + ) + return { + "type": "object", + "properties": properties, + } + + if kind == "nested": + properties = {} + for child in spec["fields"]: + properties[child["name"]] = render_field(child) + return { + "type": "nested", + "properties": properties, + } + + raise ValueError(f"Unknown field kind: {kind}") + + +def build_mapping() -> dict[str, Any]: + properties = {} + for spec in FIELD_SPECS: + properties[spec["name"]] = render_field(spec) + + return { + "settings": SETTINGS, + "mappings": { + "properties": properties, + }, + } + + +def render_mapping() -> str: + return json.dumps(build_mapping(), indent=2, ensure_ascii=False) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Generate mappings/search_products.json from a compact Python spec.", + ) + parser.add_argument( + "-o", + "--output", + type=Path, + help="Write the generated mapping to this file. 
Defaults to stdout.", + ) + parser.add_argument( + "--check", + type=Path, + help="Fail if the generated output does not exactly match this file.", + ) + args = parser.parse_args() + + rendered = render_mapping() + + if args.check is not None: + existing = args.check.read_text(encoding="utf-8") + if existing != rendered: + print(f"Generated mapping does not match {args.check}") + return 1 + print(f"Generated mapping matches {args.check}") + + if args.output is not None: + args.output.write_text(rendered, encoding="utf-8") + elif args.check is None: + print(rendered, end="") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/mappings/search_products.json b/mappings/search_products.json index fc120f2..db407d1 100644 --- a/mappings/search_products.json +++ b/mappings/search_products.json @@ -185,7 +185,13 @@ "zh": { "type": "text", "analyzer": "index_ik", - "search_analyzer": "query_ik" + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "en": { "type": "text", @@ -737,7 +743,13 @@ "zh": { "type": "text", "analyzer": "index_ik", - "search_analyzer": "query_ik" + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "en": { "type": "text", @@ -1063,123 +1075,303 @@ "zh": { "type": "text", "analyzer": "index_ik", - "search_analyzer": "query_ik" + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "en": { "type": "text", - "analyzer": "english" + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ar": { "type": "text", - "analyzer": "arabic" + "analyzer": "arabic", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "hy": { "type": "text", - "analyzer": "armenian" + "analyzer": "armenian", + "fields": { + "keyword": { + "type": "keyword", 
+ "normalizer": "lowercase" + } + } }, "eu": { "type": "text", - "analyzer": "basque" + "analyzer": "basque", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "pt_br": { "type": "text", - "analyzer": "brazilian" + "analyzer": "brazilian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "bg": { "type": "text", - "analyzer": "bulgarian" + "analyzer": "bulgarian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ca": { "type": "text", - "analyzer": "catalan" + "analyzer": "catalan", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "cjk": { "type": "text", - "analyzer": "cjk" + "analyzer": "cjk", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "cs": { "type": "text", - "analyzer": "czech" + "analyzer": "czech", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "da": { "type": "text", - "analyzer": "danish" + "analyzer": "danish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "nl": { "type": "text", - "analyzer": "dutch" + "analyzer": "dutch", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "fi": { "type": "text", - "analyzer": "finnish" + "analyzer": "finnish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "fr": { "type": "text", - "analyzer": "french" + "analyzer": "french", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "gl": { "type": "text", - "analyzer": "galician" + "analyzer": "galician", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "de": { "type": "text", - "analyzer": "german" + "analyzer": "german", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "el": { "type": "text", - 
"analyzer": "greek" + "analyzer": "greek", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "hi": { "type": "text", - "analyzer": "hindi" + "analyzer": "hindi", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "hu": { "type": "text", - "analyzer": "hungarian" + "analyzer": "hungarian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "id": { "type": "text", - "analyzer": "indonesian" + "analyzer": "indonesian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "it": { "type": "text", - "analyzer": "italian" + "analyzer": "italian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "no": { "type": "text", - "analyzer": "norwegian" + "analyzer": "norwegian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "fa": { "type": "text", - "analyzer": "persian" + "analyzer": "persian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "pt": { "type": "text", - "analyzer": "portuguese" + "analyzer": "portuguese", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ro": { "type": "text", - "analyzer": "romanian" + "analyzer": "romanian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ru": { "type": "text", - "analyzer": "russian" + "analyzer": "russian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "es": { "type": "text", - "analyzer": "spanish" + "analyzer": "spanish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "sv": { "type": "text", - "analyzer": "swedish" + "analyzer": "swedish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "tr": { "type": "text", - "analyzer": "turkish" + "analyzer": 
"turkish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "th": { "type": "text", - "analyzer": "thai" + "analyzer": "thai", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } } } }, @@ -1189,123 +1381,303 @@ "zh": { "type": "text", "analyzer": "index_ik", - "search_analyzer": "query_ik" + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "en": { "type": "text", - "analyzer": "english" + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ar": { "type": "text", - "analyzer": "arabic" + "analyzer": "arabic", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "hy": { "type": "text", - "analyzer": "armenian" + "analyzer": "armenian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "eu": { "type": "text", - "analyzer": "basque" + "analyzer": "basque", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "pt_br": { "type": "text", - "analyzer": "brazilian" + "analyzer": "brazilian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "bg": { "type": "text", - "analyzer": "bulgarian" + "analyzer": "bulgarian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ca": { "type": "text", - "analyzer": "catalan" + "analyzer": "catalan", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "cjk": { "type": "text", - "analyzer": "cjk" + "analyzer": "cjk", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "cs": { "type": "text", - "analyzer": "czech" + "analyzer": "czech", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "da": { "type": "text", - "analyzer": "danish" + 
"analyzer": "danish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "nl": { "type": "text", - "analyzer": "dutch" + "analyzer": "dutch", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "fi": { "type": "text", - "analyzer": "finnish" + "analyzer": "finnish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "fr": { "type": "text", - "analyzer": "french" + "analyzer": "french", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "gl": { "type": "text", - "analyzer": "galician" + "analyzer": "galician", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "de": { "type": "text", - "analyzer": "german" + "analyzer": "german", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "el": { "type": "text", - "analyzer": "greek" + "analyzer": "greek", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "hi": { "type": "text", - "analyzer": "hindi" + "analyzer": "hindi", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "hu": { "type": "text", - "analyzer": "hungarian" + "analyzer": "hungarian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "id": { "type": "text", - "analyzer": "indonesian" + "analyzer": "indonesian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "it": { "type": "text", - "analyzer": "italian" + "analyzer": "italian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "no": { "type": "text", - "analyzer": "norwegian" + "analyzer": "norwegian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "fa": { "type": "text", - "analyzer": "persian" + "analyzer": "persian", + "fields": { + "keyword": { + 
"type": "keyword", + "normalizer": "lowercase" + } + } }, "pt": { "type": "text", - "analyzer": "portuguese" + "analyzer": "portuguese", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ro": { "type": "text", - "analyzer": "romanian" + "analyzer": "romanian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ru": { "type": "text", - "analyzer": "russian" + "analyzer": "russian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "es": { "type": "text", - "analyzer": "spanish" + "analyzer": "spanish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "sv": { "type": "text", - "analyzer": "swedish" + "analyzer": "swedish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "tr": { "type": "text", - "analyzer": "turkish" + "analyzer": "turkish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "th": { "type": "text", - "analyzer": "thai" + "analyzer": "thai", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } } } }, @@ -1377,6 +1749,9 @@ "type": "keyword" }, "value": { + "type": "keyword" + }, + "value_text": { "type": "object", "properties": { "zh": { @@ -1399,6 +1774,286 @@ "normalizer": "lowercase" } } + }, + "ar": { + "type": "text", + "analyzer": "arabic", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "hy": { + "type": "text", + "analyzer": "armenian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "eu": { + "type": "text", + "analyzer": "basque", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "pt_br": { + "type": "text", + "analyzer": "brazilian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "bg": { + "type": 
"text", + "analyzer": "bulgarian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "ca": { + "type": "text", + "analyzer": "catalan", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "cjk": { + "type": "text", + "analyzer": "cjk", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "cs": { + "type": "text", + "analyzer": "czech", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "da": { + "type": "text", + "analyzer": "danish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "nl": { + "type": "text", + "analyzer": "dutch", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "fi": { + "type": "text", + "analyzer": "finnish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "fr": { + "type": "text", + "analyzer": "french", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "gl": { + "type": "text", + "analyzer": "galician", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "de": { + "type": "text", + "analyzer": "german", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "el": { + "type": "text", + "analyzer": "greek", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "hi": { + "type": "text", + "analyzer": "hindi", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "hu": { + "type": "text", + "analyzer": "hungarian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "id": { + "type": "text", + "analyzer": "indonesian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "it": { 
+ "type": "text", + "analyzer": "italian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "no": { + "type": "text", + "analyzer": "norwegian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "fa": { + "type": "text", + "analyzer": "persian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "pt": { + "type": "text", + "analyzer": "portuguese", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "ro": { + "type": "text", + "analyzer": "romanian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "ru": { + "type": "text", + "analyzer": "russian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "es": { + "type": "text", + "analyzer": "spanish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "sv": { + "type": "text", + "analyzer": "swedish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "tr": { + "type": "text", + "analyzer": "turkish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "th": { + "type": "text", + "analyzer": "thai", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } } } } diff --git a/mappings/search_products.json.bak b/mappings/search_products.json.bak new file mode 100644 index 0000000..332b4aa --- /dev/null +++ b/mappings/search_products.json.bak @@ -0,0 +1,629 @@ +{ + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0, + "refresh_interval": "30s", + "analysis": { + "analyzer": { + "index_ik": { + "type": "custom", + "tokenizer": "ik_max_word", + "filter": [ + "lowercase", + "asciifolding" + ] + }, + "query_ik": { + "type": "custom", + "tokenizer": "ik_smart", + "filter": [ + "lowercase", + "asciifolding" + ] 
+ } + }, + "normalizer": { + "lowercase": { + "type": "custom", + "filter": [ + "lowercase" + ] + } + } + }, + "similarity": { + "default": { + "type": "BM25", + "b": 0.0, + "k1": 0.0 + } + } + }, + "mappings": { + "properties": { + "tenant_id": { + "type": "keyword" + }, + "spu_id": { + "type": "keyword" + }, + "create_time": { + "type": "date" + }, + "update_time": { + "type": "date" + }, + "title": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "ar": { + "type": "text", + "analyzer": "arabic" + }, + "hy": { + "type": "text", + "analyzer": "armenian" + }, + "eu": { + "type": "text", + "analyzer": "basque" + }, + "pt_br": { + "type": "text", + "analyzer": "brazilian" + }, + "bg": { + "type": "text", + "analyzer": "bulgarian" + }, + "ca": { + "type": "text", + "analyzer": "catalan" + }, + "cjk": { + "type": "text", + "analyzer": "cjk" + }, + "cs": { + "type": "text", + "analyzer": "czech" + }, + "da": { + "type": "text", + "analyzer": "danish" + }, + "nl": { + "type": "text", + "analyzer": "dutch" + }, + "fi": { + "type": "text", + "analyzer": "finnish" + }, + "fr": { + "type": "text", + "analyzer": "french" + }, + "gl": { + "type": "text", + "analyzer": "galician" + }, + "de": { + "type": "text", + "analyzer": "german" + }, + "el": { + "type": "text", + "analyzer": "greek" + }, + "hi": { + "type": "text", + "analyzer": "hindi" + }, + "hu": { + "type": "text", + "analyzer": "hungarian" + }, + "id": { + "type": "text", + "analyzer": "indonesian" + }, + "it": { + "type": "text", + "analyzer": "italian" + }, + "no": { + "type": "text", + "analyzer": "norwegian" + }, + "fa": { + "type": "text", + "analyzer": "persian" + }, + "pt": { + "type": "text", + "analyzer": "portuguese" + }, + "ro": { + "type": "text", + "analyzer": "romanian" + }, + "ru": { + "type": "text", + "analyzer": "russian" + }, + "es": { + "type": "text", + 
"analyzer": "spanish" + }, + "sv": { + "type": "text", + "analyzer": "swedish" + }, + "tr": { + "type": "text", + "analyzer": "turkish" + }, + "th": { + "type": "text", + "analyzer": "thai" + } + } + }, + "keywords": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik" + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "ar": { + "type": "text", + "analyzer": "arabic", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, +... + } + }, + "brief": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "ar": { + "type": "text", + "analyzer": "arabic" + }, + ... + } + }, + "description": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "ar": { + "type": "text", + "analyzer": "arabic" + }, + ... + } + }, + "vendor": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik" + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "ar": { + "type": "text", + "analyzer": "arabic", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + ... 
+ } + }, + "image_url": { + "type": "keyword", + "index": false + }, + "title_embedding": { + "type": "dense_vector", + "dims": 1024, + "index": true, + "similarity": "dot_product", + "element_type": "bfloat16" + }, + "image_embedding": { + "type": "nested", + "properties": { + "vector": { + "type": "dense_vector", + "dims": 768, + "index": true, + "similarity": "dot_product", + "element_type": "bfloat16" + }, + "url": { + "type": "text" + } + } + }, + "category_path": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "ar": { + "type": "text", + "analyzer": "arabic" + }, + ... + } + } + }, + "category_name_text": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "ar": { + "type": "text", + "analyzer": "arabic" + }, + ... 
+ + } + }, + "qanchors": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik" + }, + "en": { + "type": "text", + "analyzer": "english" + } + } + }, + "tags": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } + }, + "category_id": { + "type": "keyword" + }, + "category_name": { + "type": "keyword" + }, + "category_level": { + "type": "integer" + }, + "category1_name": { + "type": "keyword" + }, + "category2_name": { + "type": "keyword" + }, + "category3_name": { + "type": "keyword" + }, + "specifications": { + "type": "nested", + "properties": { + "sku_id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "value": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } + } + } + }, + "enriched_attributes": { + "type": "nested", + "properties": { + "name": { + "type": "keyword" + }, + "value": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } + } + } + }, + "option1_name": { + "type": "keyword" + }, + "option2_name": { + "type": "keyword" + }, + 
"option3_name": { + "type": "keyword" + }, + "option1_values": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } + }, + "option2_values": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } + }, + "option3_values": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } + }, + "min_price": { + "type": "float" + }, + "max_price": { + "type": "float" + }, + "compare_at_price": { + "type": "float" + }, + "sku_prices": { + "type": "float" + }, + "sku_weights": { + "type": "long" + }, + "sku_weight_units": { + "type": "keyword" + }, + "total_inventory": { + "type": "long" + }, + "sales": { + "type": "long" + }, + "skus": { + "type": "nested", + "properties": { + "sku_id": { + "type": "keyword" + }, + "price": { + "type": "float" + }, + "compare_at_price": { + "type": "float" + }, + "sku_code": { + "type": "keyword" + }, + "stock": { + "type": "long" + }, + "weight": { + "type": "float" + }, + "weight_unit": { + "type": "keyword" + }, + "option1_value": { + "type": "keyword" + }, + "option2_value": { + "type": "keyword" + }, + "option3_value": { + 
"type": "keyword" + }, + "image_src": { + "type": "keyword", + "index": false + } + } + } + } + } +} \ No newline at end of file -- libgit2 0.21.2