From 36cf0ef91e3ae77c0c5f29d459962df70cbda623 Mon Sep 17 00:00:00 2001 From: tangwang Date: Mon, 30 Mar 2026 16:20:24 +0800 Subject: [PATCH] es索引结果修改 --- api/routes/indexer.py | 14 +++++++------- docs/工作总结-微服务性能优化与架构.md | 4 ++-- docs/常用查询 - ES.md | 16 ++++++++++++++++ docs/搜索API对接指南-00-总览与快速开始.md | 2 +- docs/搜索API对接指南-05-索引接口(Indexer).md | 12 ++++++------ docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md | 2 +- indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md | 30 +++++++++++++++--------------- indexer/document_transformer.py | 12 ++++++------ mappings/search_products.json | 562 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scripts/es_debug_search.py | 4 +++- 10 files changed, 429 insertions(+), 229 deletions(-) diff --git a/api/routes/indexer.py b/api/routes/indexer.py index 5b121b3..57baf69 100644 --- a/api/routes/indexer.py +++ b/api/routes/indexer.py @@ -88,7 +88,7 @@ class EnrichContentItem(BaseModel): class EnrichContentRequest(BaseModel): """ - 内容理解字段生成请求:根据商品标题批量生成 qanchors、semantic_attributes、tags。 + 内容理解字段生成请求:根据商品标题批量生成 qanchors、enriched_attributes、tags。 供外部 indexer 在自行组织 doc 时调用,与翻译、向量化等微服务并列。 """ tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") @@ -447,7 +447,7 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]: """ 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM, - 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。 + 再聚合成每 SPU 的 qanchors、enriched_attributes、tags。供 run_in_executor 调用。 """ from indexer.product_enrich import analyze_products, split_multi_value_field @@ -473,11 +473,11 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: "features", ] - # 按 spu_id 聚合:qanchors[lang], semantic_attributes[], tags[] + # 按 spu_id 聚合:qanchors[lang], enriched_attributes[], tags[] by_spu: Dict[str, Dict[str, Any]] = {} for it in items: sid = str(it["spu_id"]) - by_spu[sid] = {"qanchors": {}, "semantic_attributes": [], "tags": []} + by_spu[sid] = {"qanchors": {}, "enriched_attributes": [], "tags": []} for lang in llm_langs: try: @@ -511,7 +511,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: if not raw: continue for value in split_multi_value_field(str(raw)): - rec["semantic_attributes"].append({"lang": lang, "name": name, "value": value}) + rec["enriched_attributes"].append({"lang": lang, "name": name, "value": value}) if name == "tags": rec["tags"].append(value) @@ -524,7 +524,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: out.append({ "spu_id": sid, "qanchors": rec["qanchors"], - "semantic_attributes": rec["semantic_attributes"], + "enriched_attributes": rec["enriched_attributes"], "tags": tags, **({"error": rec["error"]} if rec.get("error") else {}), }) @@ -534,7 +534,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: @router.post("/enrich-content") async def enrich_content(request: EnrichContentRequest): """ - 内容理解字段生成接口:根据商品标题批量生成 qanchors、semantic_attributes、tags。 + 内容理解字段生成接口:根据商品标题批量生成 qanchors、enriched_attributes、tags。 使用场景: - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 diff --git a/docs/工作总结-微服务性能优化与架构.md b/docs/工作总结-微服务性能优化与架构.md index 802aeb7..b846ef8 100644 --- a/docs/工作总结-微服务性能优化与架构.md +++ b/docs/工作总结-微服务性能优化与架构.md @@ -95,10 +95,10 @@ instruction: "Given a shopping query, rank product titles by relevance" ### 5. 内容理解字段(支撑 Suggest) -**能力**:支持根据商品标题批量生成 **qanchors**(锚文本)、**semantic_attributes**、**tags**,供索引与 suggest 使用。 +**能力**:支持根据商品标题批量生成 **qanchors**(锚文本)、**enriched_attributes**、**tags**,供索引与 suggest 使用。 **具体内容**: -- **接口**:`POST /indexer/enrich-content`(Indexer 服务端口 **6004**)。请求体为 `items` 数组,每项含 `spu_id`、`title`(必填)及可选多语言标题等;单次请求最多 **50 条**,建议批量调用。响应 `results` 与 `items` 一一对应,每项含 `spu_id`、`qanchors`(按语言键,如 `qanchors.zh`、`qanchors.en`,逗号分隔短语)、`semantic_attributes`、`tags`。 +- **接口**:`POST /indexer/enrich-content`(Indexer 服务端口 **6004**)。请求体为 `items` 数组,每项含 `spu_id`、`title`(必填)及可选多语言标题等;单次请求最多 **50 条**,建议批量调用。响应 `results` 与 `items` 一一对应,每项含 `spu_id`、`qanchors`(按语言键,如 `qanchors.zh`、`qanchors.en`,逗号分隔短语)、`enriched_attributes`、`tags`。 -- **索引侧**:微服务组合方式下,调用方先拿不含 qanchors/tags 的 doc,再调用本接口补齐后写入 ES 的 `qanchors.{lang}` 等字段;索引 transformer(`indexer/document_transformer.py`、`indexer/product_enrich.py`)内也可在构建 doc 时调用内容理解逻辑,写入 `qanchors.{lang}`。 - **Suggest 侧**:`suggestion/builder.py` 从 ES 商品索引读取 `_source: ["id", "spu_id", "title", "qanchors"]`,对 `qanchors.{lang}` 用 `_split_qanchors` 拆成词条,以 `source="qanchor"` 加入候选,排序时 `qanchor` 权重大于纯 title(`add_product("qanchor", ...)`);suggest 配置中 `sources: ["query_log", "qanchor"]` 表示候选来源包含 qanchor。 - **实现与依赖**:内容理解内部使用大模型(需 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存(如 `product_anchors`);逻辑与 `indexer/product_enrich` 一致。 diff --git a/docs/常用查询 - ES.md b/docs/常用查询 - ES.md index 6a08941..459ce80 100644 --- a/docs/常用查询 - ES.md +++ b/docs/常用查询 - ES.md @@ -664,4 +664,20 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \ } } }' + +curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \ +'http://localhost:9200/search_products_tenant_163/_count' \ +-H 'Content-Type: application/json' \ +-d '{ +"query": { +"nested": { + "path": "image_embedding", + "query": { + "exists": { + "field": "image_embedding.vector" + } + } +} +} +}' ``` \ No newline at end of file diff --git a/docs/搜索API对接指南-00-总览与快速开始.md b/docs/搜索API对接指南-00-总览与快速开始.md index b573ca1..71948b1 100644 --- a/docs/搜索API对接指南-00-总览与快速开始.md +++ b/docs/搜索API对接指南-00-总览与快速开始.md @@ -90,7 +90,7 @@ curl -X POST "http://43.166.252.75:6002/search/" \ | 查询文档 | POST | `/indexer/documents` | 查询SPU文档数据(不写入ES) | | 构建ES文档(正式对接) | POST | `/indexer/build-docs` | 基于上游提供的 MySQL 行数据构建 ES doc,不写入 ES,供 Java 等调用后自行写入 | | 构建ES文档(测试用) | POST | `/indexer/build-docs-from-db` | 仅在测试/调试时使用,根据 `tenant_id + spu_ids` 内部查库并构建 ES doc | -| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、semantic_attributes、tags,供微服务组合方式使用 | +| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、enriched_attributes、tags,供微服务组合方式使用 | | 索引健康检查 | GET | `/indexer/health` | 检查索引服务状态 | | 健康检查 | GET | `/admin/health` | 服务健康检查 | | 获取配置 | GET | `/admin/config` | 获取租户配置 | diff --git a/docs/搜索API对接指南-05-索引接口(Indexer).md b/docs/搜索API对接指南-05-索引接口(Indexer).md index 6911913..c3129fa 100644 --- a/docs/搜索API对接指南-05-索引接口(Indexer).md +++ b/docs/搜索API对接指南-05-索引接口(Indexer).md @@ -649,7 +649,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ ### 5.8 内容理解字段生成接口 - **端点**: `POST /indexer/enrich-content` -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**semantic_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_enrich` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 +- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_enrich` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 #### 请求参数 @@ -714,7 +714,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ "zh": "短袖T恤,纯棉,男装,夏季", "en": "cotton t-shirt, short sleeve, men, summer" }, - "semantic_attributes": [ + "enriched_attributes": [ { "lang": "zh", "name": "tags", "value": "纯棉" }, { "lang": "zh", "name": "usage_scene", "value": "日常" }, { "lang": "en", "name": "tags", "value": "cotton" } @@ -724,7 +724,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ { "spu_id": "223168", "qanchors": { "en": "dolls, toys, 12pcs" }, - "semantic_attributes": [], + "enriched_attributes": [], "tags": ["dolls", "toys"] } ] @@ -733,9 +733,9 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | 字段 | 类型 | 说明 | |------|------|------| -| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`semantic_attributes`、`tags` | +| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`tags` | | `results[].qanchors` | object | 按语言键的锚文本(逗号分隔短语),可写入 ES 文档的 `qanchors.{lang}` | -| `results[].semantic_attributes` | array | 语义属性列表,每项为 `{ "lang", "name", "value" }`,可写入 ES 的 `semantic_attributes` nested 字段 | +| `results[].enriched_attributes` | array | 语义属性列表,每项为 `{ "lang", "name", "value" }`,可写入 ES 的 `enriched_attributes` nested 字段 | | `results[].tags` | array | 从语义属性中抽取的 `name=tags` 的 value 集合,可与业务原有 `tags` 合并后写入 ES 的 `tags` 字段 | | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 | @@ -753,7 +753,7 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ "items": [ { "spu_id": "223167", - "title": "纯棉短袖T恤 夏季男装", + "title": "纯棉短袖T恤 夏季男装夏季男装", "brief": "夏季透气纯棉短袖,舒适亲肤", "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", "image_url": "https://example.com/images/223167.jpg" diff --git a/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md b/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md index 7b92bff..b6e5e93 100644 --- a/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md +++ b/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @@ -444,7 +444,7 @@ curl "http://localhost:6006/health" - **Base URL**: Indexer 服务地址,如 `http://localhost:6004` - **路径**: `POST /indexer/enrich-content` -- **说明**: 根据商品标题批量生成 `qanchors`、`semantic_attributes`、`tags`,用于拼装 ES 文档。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 +- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`tags`,用于拼装 ES 文档。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 diff --git a/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md b/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md index d58159b..65329b5 100644 --- a/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md +++ b/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md @@ -1,9 +1,9 @@ -## qanchors 与 semantic_attributes 设计与索引逻辑说明 +## qanchors 与 enriched_attributes 设计与索引逻辑说明 本文档详细说明: - **锚文本字段 `qanchors.{lang}` 的作用与来源** -- **语义属性字段 `semantic_attributes` 的结构、用途与写入流程** +- **语义属性字段 `enriched_attributes` 的结构、用途与写入流程** - **多语言支持策略(zh / en / de / ru / fr)** - **索引阶段与 LLM 调用的集成方式** @@ -43,13 +43,13 @@ - 主搜索:作为额外的全文字段参与 BM25 召回与打分(可在 `search/query_config.py` 中给一定权重); - Suggestion:`suggestion/builder.py` 会从 `qanchors.{lang}` 中拆分词条作为候选(`source="qanchor"`,权重大于 `title`)。 -#### 1.2 `semantic_attributes`:面向过滤/分面的通用语义属性 +#### 1.2 `enriched_attributes`:面向过滤/分面的通用语义属性 - **Mapping 位置**:`mappings/search_products.json`,追加的 nested 字段。 - **结构**: ```1392:1410:/home/tw/saas-search/mappings/search_products.json -"semantic_attributes": { +"enriched_attributes": { "type": "nested", "properties": { "lang": { "type": "keyword" }, // 语言:zh / en / de / ru / fr @@ -199,7 +199,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st self._fill_llm_attributes(doc, spu_row) ``` -也就是说,**每个 SPU 文档默认会尝试补充 qanchors 与 semantic_attributes**。 +也就是说,**每个 SPU 文档默认会尝试补充 qanchors 与 enriched_attributes**。 #### 3.2 语言选择策略 @@ -237,7 +237,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st if not spu_id or not title: return - semantic_list = doc.get("semantic_attributes") or [] + semantic_list = doc.get("enriched_attributes") or [] qanchors_obj = doc.get("qanchors") or {} dim_keys = [ @@ -291,7 +291,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st if qanchors_obj: doc["qanchors"] = qanchors_obj if semantic_list: - doc["semantic_attributes"] = semantic_list + doc["enriched_attributes"] = semantic_list ``` 要点: @@ -307,7 +307,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st - 没有 `title`; - 或者 `tenant_config.index_languages` 与 `SUPPORTED_LANGS` 没有交集; - 或 `DASHSCOPE_API_KEY` 未配置 / LLM 请求报错; -- 则 `_fill_llm_attributes` 会在日志中输出 `warning`,**不会抛异常**,索引流程继续,只是该 SPU 在这一轮不会得到 `qanchors` / `semantic_attributes`。 +- 则 `_fill_llm_attributes` 会在日志中输出 `warning`,**不会抛异常**,索引流程继续,只是该 SPU 在这一轮不会得到 `qanchors` / `enriched_attributes`。 这保证了整个索引服务在 LLM 不可用时表现为一个普通的“传统索引”,而不会中断。 @@ -344,13 +344,13 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st ```json { "nested": { - "path": "semantic_attributes", + "path": "enriched_attributes", "query": { "bool": { "must": [ - { "term": { "semantic_attributes.lang": "zh" } }, - { "term": { "semantic_attributes.name": "usage_scene" } }, - { "term": { "semantic_attributes.value": "通勤" } } + { "term": { "enriched_attributes.lang": "zh" } }, + { "term": { "enriched_attributes.name": "usage_scene" } }, + { "term": { "enriched_attributes.value": "通勤" } } ] } } @@ -400,7 +400,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st 1. **功能定位**: - `qanchors.{lang}`:更好地贴近用户真实查询词,用于召回与 suggestion; - - `semantic_attributes`:以结构化形式承载 LLM 抽取的语义维度,用于 filter / facet。 + - `enriched_attributes`:以结构化形式承载 LLM 抽取的语义维度,用于 filter / facet。 2. **多语言对齐**: - 完全复用租户级 `index_languages` 配置; - 对每种语言单独生成锚文本与语义属性,不互相混用。 @@ -409,7 +409,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st - 当 LLM/配置异常时,只是“缺少增强特征”,不影响基础搜索能力。 4. **未来扩展**: - 可以在 `dim_keys` 中新增维度名(如 `style`, `benefit` 等),只要在 prompt 与解析逻辑中增加对应列即可; - - 可以为 `semantic_attributes` 增加额外字段(如 `confidence`、`source`),用于更精细的控制(当前 mapping 为简单版)。 + - 可以为 `enriched_attributes` 增加额外字段(如 `confidence`、`source`),用于更精细的控制(当前 mapping 为简单版)。 -如需在查询层面增加基于 `semantic_attributes` 的统一 DSL(类似 `specifications` 的过滤/分面规则),推荐在 `docs/搜索API对接指南.md` 中新增一节,并在 `search/es_query_builder.py` 里封装构造逻辑,避免前端直接拼 nested 查询。 +如需在查询层面增加基于 `enriched_attributes` 的统一 DSL(类似 `specifications` 的过滤/分面规则),推荐在 `docs/搜索API对接指南.md` 中新增一节,并在 `search/es_query_builder.py` 里封装构造逻辑,避免前端直接拼 nested 查询。 diff --git a/indexer/document_transformer.py b/indexer/document_transformer.py index 3a66209..d220417 100644 --- a/indexer/document_transformer.py +++ b/indexer/document_transformer.py @@ -202,7 +202,7 @@ class SPUDocumentTransformer: """ 批量调用 LLM,为一批 doc 填充: - qanchors.{lang} - - semantic_attributes (lang/name/value) + - enriched_attributes (lang/name/value) 设计目标: - 尽可能攒批调用 LLM; @@ -271,7 +271,7 @@ class SPUDocumentTransformer: if row.get("error"): return - semantic_list = doc.get("semantic_attributes") or [] + semantic_list = doc.get("enriched_attributes") or [] qanchors_obj = doc.get("qanchors") or {} anchor_text = str(row.get("anchor_text") or "").strip() @@ -288,7 +288,7 @@ class SPUDocumentTransformer: if qanchors_obj: doc["qanchors"] = qanchors_obj if semantic_list: - doc["semantic_attributes"] = semantic_list + doc["enriched_attributes"] = semantic_list except Exception as e: logger.warning("Failed to apply LLM row to doc (spu_id=%s, lang=%s): %s", doc.get("spu_id"), lang, e) @@ -638,7 +638,7 @@ class SPUDocumentTransformer: """ 调用 indexer.product_enrich.analyze_products,为当前 SPU 填充: - qanchors.{lang} - - semantic_attributes (lang/name/value) + - enriched_attributes (lang/name/value) """ try: index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] @@ -653,7 +653,7 @@ class SPUDocumentTransformer: if not spu_id or not title: return - semantic_list = doc.get("semantic_attributes") or [] + semantic_list = doc.get("enriched_attributes") or [] qanchors_obj = doc.get("qanchors") or {} dim_keys = [ @@ -711,7 +711,7 @@ class SPUDocumentTransformer: if qanchors_obj: doc["qanchors"] = qanchors_obj if semantic_list: - doc["semantic_attributes"] = semantic_list + doc["enriched_attributes"] = semantic_list def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: """ diff --git a/mappings/search_products.json b/mappings/search_products.json index 6a796c4..fc120f2 100644 --- a/mappings/search_products.json +++ b/mappings/search_products.json @@ -179,132 +179,6 @@ } } }, - "qanchors": { - "type": "object", - "properties": { - "zh": { - "type": "text", - "analyzer": "index_ik", - "search_analyzer": "query_ik" - }, - "en": { - "type": "text", - "analyzer": "english" - }, - "ar": { - "type": "text", - "analyzer": "arabic" - }, - "hy": { - "type": "text", - "analyzer": "armenian" - }, - "eu": { - "type": "text", - "analyzer": "basque" - }, - "pt_br": { - "type": "text", - "analyzer": "brazilian" - }, - "bg": { - "type": "text", - "analyzer": "bulgarian" - }, - "ca": { - "type": "text", - "analyzer": "catalan" - }, - "cjk": { - "type": "text", - "analyzer": "cjk" - }, - "cs": { - "type": "text", - "analyzer": "czech" - }, - "da": { - "type": "text", - "analyzer": "danish" - }, - "nl": { - "type": "text", - "analyzer": "dutch" - }, - "fi": { - "type": "text", - "analyzer": "finnish" - }, - "fr": { - "type": "text", - "analyzer": "french" - }, - "gl": { - "type": "text", - "analyzer": "galician" - }, - "de": { - "type": "text", - "analyzer": "german" - }, - "el": { - "type": "text", - "analyzer": "greek" - }, - "hi": { - "type": "text", - "analyzer": "hindi" - }, - "hu": { - "type": "text", - "analyzer": "hungarian" - }, - "id": { - "type": "text", - "analyzer": "indonesian" - }, - "it": { - "type": "text", - "analyzer": "italian" - }, - "no": { - "type": "text", - "analyzer": "norwegian" - }, - "fa": { - "type": "text", - "analyzer": "persian" - }, - "pt": { - "type": "text", - "analyzer": "portuguese" - }, - "ro": { - "type": "text", - "analyzer": "romanian" - }, - "ru": { - "type": "text", - "analyzer": "russian" - }, - "es": { - "type": "text", - "analyzer": "spanish" - }, - "sv": { - "type": "text", - "analyzer": "swedish" - }, - "tr": { - "type": "text", - "analyzer": "turkish" - }, - "th": { - "type": "text", - "analyzer": "thai" - } - } - }, "keywords": { "type": "object", "properties": { @@ -315,119 +189,293 @@ }, "en": { "type": "text", - "analyzer": "english" + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ar": { "type": "text", - "analyzer": "arabic" + "analyzer": "arabic", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "hy": { "type": "text", - "analyzer": "armenian" + "analyzer": "armenian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "eu": { "type": "text", - "analyzer": "basque" + "analyzer": "basque", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "pt_br": { "type": "text", - "analyzer": "brazilian" + "analyzer": "brazilian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "bg": { "type": "text", - "analyzer": "bulgarian" + "analyzer": "bulgarian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ca": { "type": "text", - "analyzer": "catalan" + "analyzer": "catalan", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "cjk": { "type": "text", - "analyzer": "cjk" + "analyzer": "cjk", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "cs": { "type": "text", - "analyzer": "czech" + "analyzer": "czech", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "da": { "type": "text", - "analyzer": "danish" + "analyzer": "danish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "nl": { "type": "text", - "analyzer": "dutch" + "analyzer": "dutch", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "fi": { "type": "text", - "analyzer": "finnish" + "analyzer": "finnish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "fr": { "type": "text", - "analyzer": "french" + "analyzer": "french", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "gl": { "type": "text", - "analyzer": "galician" + "analyzer": "galician", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "de": { "type": "text", - "analyzer": "german" + "analyzer": "german", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "el": { "type": "text", - "analyzer": "greek" + "analyzer": "greek", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "hi": { "type": "text", - "analyzer": "hindi" + "analyzer": "hindi", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "hu": { "type": "text", - "analyzer": "hungarian" + "analyzer": "hungarian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "id": { "type": "text", - "analyzer": "indonesian" + "analyzer": "indonesian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "it": { "type": "text", - "analyzer": "italian" + "analyzer": "italian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "no": { "type": "text", - "analyzer": "norwegian" + "analyzer": "norwegian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "fa": { "type": "text", - "analyzer": "persian" + "analyzer": "persian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "pt": { "type": "text", - "analyzer": "portuguese" + "analyzer": "portuguese", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ro": { "type": "text", - "analyzer": "romanian" + "analyzer": "romanian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "ru": { "type": "text", - "analyzer": "russian" + "analyzer": "russian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "es": { "type": "text", - "analyzer": "spanish" + "analyzer": "spanish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "sv": { "type": "text", - "analyzer": "swedish" + "analyzer": "swedish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "tr": { "type": "text", - "analyzer": "turkish" + "analyzer": "turkish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } }, "th": { "type": "text", - "analyzer": "thai" + "analyzer": "thai", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } } } }, @@ -983,9 +1031,6 @@ } } }, - "tags": { - "type": "keyword" - }, "image_url": { "type": "keyword", "index": false @@ -1012,21 +1057,6 @@ } } }, - "image_embedding_512": { - "type": "nested", - "properties": { - "vector": { - "type": "dense_vector", - "dims": 512, - "index": true, - "similarity": "dot_product", - "element_type": "bfloat16" - }, - "url": { - "type": "text" - } - } - }, "category_path": { "type": "object", "properties": { @@ -1279,6 +1309,46 @@ } } }, + "qanchors": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik" + }, + "en": { + "type": "text", + "analyzer": "english" + } + } + }, + "tags": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } + }, "category_id": { "type": "keyword" }, @@ -1307,7 +1377,64 @@ "type": "keyword" }, "value": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } + } + } + }, + "enriched_attributes": { + "type": "nested", + "properties": { + "name": { "type": "keyword" + }, + "value": { + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } } } }, @@ -1321,13 +1448,82 @@ "type": "keyword" }, "option1_values": { - "type": "keyword" + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } }, "option2_values": { - "type": "keyword" + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } }, "option3_values": { - "type": "keyword" + "type": "object", + "properties": { + "zh": { + "type": "text", + "analyzer": "index_ik", + "search_analyzer": "query_ik", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + } + } }, "min_price": { "type": "float" @@ -1391,20 +1587,6 @@ "index": false } } - }, - "semantic_attributes": { - "type": "nested", - "properties": { - "lang": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "value": { - "type": "keyword" - } - } } } } diff --git a/scripts/es_debug_search.py b/scripts/es_debug_search.py index 8097150..e8e4989 100644 --- a/scripts/es_debug_search.py +++ b/scripts/es_debug_search.py @@ -279,7 +279,9 @@ def _run_es( body: Dict[str, Any], size: int, ) -> List[Dict[str, Any]]: - resp = es.search(index=index_name, body=body, size=size) + # Avoid passing size= alongside body= (deprecated in elasticsearch-py). + payload = {**body, "size": size} + resp = es.search(index=index_name, body=payload) if hasattr(resp, "body"): payload = resp.body else: -- libgit2 0.21.2