Commit 36cf0ef91e3ae77c0c5f29d459962df70cbda623
1 parent
c3425429
es索引结果修改
Showing
10 changed files
with
429 additions
and
229 deletions
Show diff stats
api/routes/indexer.py
| @@ -88,7 +88,7 @@ class EnrichContentItem(BaseModel): | @@ -88,7 +88,7 @@ class EnrichContentItem(BaseModel): | ||
| 88 | 88 | ||
| 89 | class EnrichContentRequest(BaseModel): | 89 | class EnrichContentRequest(BaseModel): |
| 90 | """ | 90 | """ |
| 91 | - 内容理解字段生成请求:根据商品标题批量生成 qanchors、semantic_attributes、tags。 | 91 | + 内容理解字段生成请求:根据商品标题批量生成 qanchors、enriched_attributes、tags。 |
| 92 | 供外部 indexer 在自行组织 doc 时调用,与翻译、向量化等微服务并列。 | 92 | 供外部 indexer 在自行组织 doc 时调用,与翻译、向量化等微服务并列。 |
| 93 | """ | 93 | """ |
| 94 | tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") | 94 | tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") |
| @@ -447,7 +447,7 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): | @@ -447,7 +447,7 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): | ||
| 447 | def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]: | 447 | def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: List[str]) -> List[Dict[str, Any]]: |
| 448 | """ | 448 | """ |
| 449 | 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM, | 449 | 同步执行内容理解:调用 product_enrich.analyze_products,按语言批量跑 LLM, |
| 450 | - 再聚合成每 SPU 的 qanchors、semantic_attributes、tags。供 run_in_executor 调用。 | 450 | + 再聚合成每 SPU 的 qanchors、enriched_attributes、tags。供 run_in_executor 调用。 |
| 451 | """ | 451 | """ |
| 452 | from indexer.product_enrich import analyze_products, split_multi_value_field | 452 | from indexer.product_enrich import analyze_products, split_multi_value_field |
| 453 | 453 | ||
| @@ -473,11 +473,11 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: | @@ -473,11 +473,11 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: | ||
| 473 | "features", | 473 | "features", |
| 474 | ] | 474 | ] |
| 475 | 475 | ||
| 476 | - # 按 spu_id 聚合:qanchors[lang], semantic_attributes[], tags[] | 476 | + # 按 spu_id 聚合:qanchors[lang], enriched_attributes[], tags[] |
| 477 | by_spu: Dict[str, Dict[str, Any]] = {} | 477 | by_spu: Dict[str, Dict[str, Any]] = {} |
| 478 | for it in items: | 478 | for it in items: |
| 479 | sid = str(it["spu_id"]) | 479 | sid = str(it["spu_id"]) |
| 480 | - by_spu[sid] = {"qanchors": {}, "semantic_attributes": [], "tags": []} | 480 | + by_spu[sid] = {"qanchors": {}, "enriched_attributes": [], "tags": []} |
| 481 | 481 | ||
| 482 | for lang in llm_langs: | 482 | for lang in llm_langs: |
| 483 | try: | 483 | try: |
| @@ -511,7 +511,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: | @@ -511,7 +511,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: | ||
| 511 | if not raw: | 511 | if not raw: |
| 512 | continue | 512 | continue |
| 513 | for value in split_multi_value_field(str(raw)): | 513 | for value in split_multi_value_field(str(raw)): |
| 514 | - rec["semantic_attributes"].append({"lang": lang, "name": name, "value": value}) | 514 | + rec["enriched_attributes"].append({"lang": lang, "name": name, "value": value}) |
| 515 | if name == "tags": | 515 | if name == "tags": |
| 516 | rec["tags"].append(value) | 516 | rec["tags"].append(value) |
| 517 | 517 | ||
| @@ -524,7 +524,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: | @@ -524,7 +524,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: | ||
| 524 | out.append({ | 524 | out.append({ |
| 525 | "spu_id": sid, | 525 | "spu_id": sid, |
| 526 | "qanchors": rec["qanchors"], | 526 | "qanchors": rec["qanchors"], |
| 527 | - "semantic_attributes": rec["semantic_attributes"], | 527 | + "enriched_attributes": rec["enriched_attributes"], |
| 528 | "tags": tags, | 528 | "tags": tags, |
| 529 | **({"error": rec["error"]} if rec.get("error") else {}), | 529 | **({"error": rec["error"]} if rec.get("error") else {}), |
| 530 | }) | 530 | }) |
| @@ -534,7 +534,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: | @@ -534,7 +534,7 @@ def _run_enrich_content(tenant_id: str, items: List[Dict[str, str]], languages: | ||
| 534 | @router.post("/enrich-content") | 534 | @router.post("/enrich-content") |
| 535 | async def enrich_content(request: EnrichContentRequest): | 535 | async def enrich_content(request: EnrichContentRequest): |
| 536 | """ | 536 | """ |
| 537 | - 内容理解字段生成接口:根据商品标题批量生成 qanchors、semantic_attributes、tags。 | 537 | + 内容理解字段生成接口:根据商品标题批量生成 qanchors、enriched_attributes、tags。 |
| 538 | 538 | ||
| 539 | 使用场景: | 539 | 使用场景: |
| 540 | - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 | 540 | - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 |
docs/工作总结-微服务性能优化与架构.md
| @@ -95,10 +95,10 @@ instruction: "Given a shopping query, rank product titles by relevance" | @@ -95,10 +95,10 @@ instruction: "Given a shopping query, rank product titles by relevance" | ||
| 95 | 95 | ||
| 96 | ### 5. 内容理解字段(支撑 Suggest) | 96 | ### 5. 内容理解字段(支撑 Suggest) |
| 97 | 97 | ||
| 98 | -**能力**:支持根据商品标题批量生成 **qanchors**(锚文本)、**semantic_attributes**、**tags**,供索引与 suggest 使用。 | 98 | +**能力**:支持根据商品标题批量生成 **qanchors**(锚文本)、**enriched_attributes**、**tags**,供索引与 suggest 使用。 |
| 99 | 99 | ||
| 100 | **具体内容**: | 100 | **具体内容**: |
| 101 | -- **接口**:`POST /indexer/enrich-content`(Indexer 服务端口 **6004**)。请求体为 `items` 数组,每项含 `spu_id`、`title`(必填)及可选多语言标题等;单次请求最多 **50 条**,建议批量调用。响应 `results` 与 `items` 一一对应,每项含 `spu_id`、`qanchors`(按语言键,如 `qanchors.zh`、`qanchors.en`,逗号分隔短语)、`semantic_attributes`、`tags`。 | 101 | +- **接口**:`POST /indexer/enrich-content`(Indexer 服务端口 **6004**)。请求体为 `items` 数组,每项含 `spu_id`、`title`(必填)及可选多语言标题等;单次请求最多 **50 条**,建议批量调用。响应 `results` 与 `items` 一一对应,每项含 `spu_id`、`qanchors`(按语言键,如 `qanchors.zh`、`qanchors.en`,逗号分隔短语)、`enriched_attributes`、`tags`。 |
| 102 | -- **索引侧**:微服务组合方式下,调用方先拿不含 qanchors/tags 的 doc,再调用本接口补齐后写入 ES 的 `qanchors.{lang}` 等字段;索引 transformer(`indexer/document_transformer.py`、`indexer/product_enrich.py`)内也可在构建 doc 时调用内容理解逻辑,写入 `qanchors.{lang}`。 | 102 | -- **索引侧**:微服务组合方式下,调用方先拿不含 qanchors/tags 的 doc,再调用本接口补齐后写入 ES 的 `qanchors.{lang}` 等字段;索引 transformer(`indexer/document_transformer.py`、`indexer/product_enrich.py`)内也可在构建 doc 时调用内容理解逻辑,写入 `qanchors.{lang}`。 |
| 103 | - **Suggest 侧**:`suggestion/builder.py` 从 ES 商品索引读取 `_source: ["id", "spu_id", "title", "qanchors"]`,对 `qanchors.{lang}` 用 `_split_qanchors` 拆成词条,以 `source="qanchor"` 加入候选,排序时 `qanchor` 权重大于纯 title(`add_product("qanchor", ...)`);suggest 配置中 `sources: ["query_log", "qanchor"]` 表示候选来源包含 qanchor。 | 103 | - **Suggest 侧**:`suggestion/builder.py` 从 ES 商品索引读取 `_source: ["id", "spu_id", "title", "qanchors"]`,对 `qanchors.{lang}` 用 `_split_qanchors` 拆成词条,以 `source="qanchor"` 加入候选,排序时 `qanchor` 权重大于纯 title(`add_product("qanchor", ...)`);suggest 配置中 `sources: ["query_log", "qanchor"]` 表示候选来源包含 qanchor。 |
| 104 | - **实现与依赖**:内容理解内部使用大模型(需 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存(如 `product_anchors`);逻辑与 `indexer/product_enrich` 一致。 | 104 | - **实现与依赖**:内容理解内部使用大模型(需 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存(如 `product_anchors`);逻辑与 `indexer/product_enrich` 一致。 |
docs/常用查询 - ES.md
| @@ -664,4 +664,20 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \ | @@ -664,4 +664,20 @@ curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \ | ||
| 664 | } | 664 | } |
| 665 | } | 665 | } |
| 666 | }' | 666 | }' |
| 667 | + | ||
| 668 | +curl -u "${ES_USER}:${ES_PASSWORD}" -X POST \ | ||
| 669 | +'http://localhost:9200/search_products_tenant_163/_count' \ | ||
| 670 | +-H 'Content-Type: application/json' \ | ||
| 671 | +-d '{ | ||
| 672 | +"query": { | ||
| 673 | +"nested": { | ||
| 674 | + "path": "image_embedding", | ||
| 675 | + "query": { | ||
| 676 | + "exists": { | ||
| 677 | + "field": "image_embedding.vector" | ||
| 678 | + } | ||
| 679 | + } | ||
| 680 | +} | ||
| 681 | +} | ||
| 682 | +}' | ||
| 667 | ``` | 683 | ``` |
| 668 | \ No newline at end of file | 684 | \ No newline at end of file |
docs/搜索API对接指南-00-总览与快速开始.md
| @@ -90,7 +90,7 @@ curl -X POST "http://43.166.252.75:6002/search/" \ | @@ -90,7 +90,7 @@ curl -X POST "http://43.166.252.75:6002/search/" \ | ||
| 90 | | 查询文档 | POST | `/indexer/documents` | 查询SPU文档数据(不写入ES) | | 90 | | 查询文档 | POST | `/indexer/documents` | 查询SPU文档数据(不写入ES) | |
| 91 | | 构建ES文档(正式对接) | POST | `/indexer/build-docs` | 基于上游提供的 MySQL 行数据构建 ES doc,不写入 ES,供 Java 等调用后自行写入 | | 91 | | 构建ES文档(正式对接) | POST | `/indexer/build-docs` | 基于上游提供的 MySQL 行数据构建 ES doc,不写入 ES,供 Java 等调用后自行写入 | |
| 92 | | 构建ES文档(测试用) | POST | `/indexer/build-docs-from-db` | 仅在测试/调试时使用,根据 `tenant_id + spu_ids` 内部查库并构建 ES doc | | 92 | | 构建ES文档(测试用) | POST | `/indexer/build-docs-from-db` | 仅在测试/调试时使用,根据 `tenant_id + spu_ids` 内部查库并构建 ES doc | |
| 93 | -| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、semantic_attributes、tags,供微服务组合方式使用 | | 93 | +| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、enriched_attributes、tags,供微服务组合方式使用 | |
| 94 | | 索引健康检查 | GET | `/indexer/health` | 检查索引服务状态 | | 94 | | 索引健康检查 | GET | `/indexer/health` | 检查索引服务状态 | |
| 95 | | 健康检查 | GET | `/admin/health` | 服务健康检查 | | 95 | | 健康检查 | GET | `/admin/health` | 服务健康检查 | |
| 96 | | 获取配置 | GET | `/admin/config` | 获取租户配置 | | 96 | | 获取配置 | GET | `/admin/config` | 获取租户配置 | |
docs/搜索API对接指南-05-索引接口(Indexer).md
| @@ -649,7 +649,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | @@ -649,7 +649,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | ||
| 649 | ### 5.8 内容理解字段生成接口 | 649 | ### 5.8 内容理解字段生成接口 |
| 650 | 650 | ||
| 651 | - **端点**: `POST /indexer/enrich-content` | 651 | - **端点**: `POST /indexer/enrich-content` |
| 652 | -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**semantic_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_enrich` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 | 652 | +- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(语义属性)、**tags**(细分标签),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。内部逻辑与 `indexer.product_enrich` 一致,支持多语言与 Redis 缓存;单次请求在线程池中执行,避免阻塞其他接口。 |
| 653 | 653 | ||
| 654 | #### 请求参数 | 654 | #### 请求参数 |
| 655 | 655 | ||
| @@ -714,7 +714,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | @@ -714,7 +714,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | ||
| 714 | "zh": "短袖T恤,纯棉,男装,夏季", | 714 | "zh": "短袖T恤,纯棉,男装,夏季", |
| 715 | "en": "cotton t-shirt, short sleeve, men, summer" | 715 | "en": "cotton t-shirt, short sleeve, men, summer" |
| 716 | }, | 716 | }, |
| 717 | - "semantic_attributes": [ | 717 | + "enriched_attributes": [ |
| 718 | { "lang": "zh", "name": "tags", "value": "纯棉" }, | 718 | { "lang": "zh", "name": "tags", "value": "纯棉" }, |
| 719 | { "lang": "zh", "name": "usage_scene", "value": "日常" }, | 719 | { "lang": "zh", "name": "usage_scene", "value": "日常" }, |
| 720 | { "lang": "en", "name": "tags", "value": "cotton" } | 720 | { "lang": "en", "name": "tags", "value": "cotton" } |
| @@ -724,7 +724,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | @@ -724,7 +724,7 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | ||
| 724 | { | 724 | { |
| 725 | "spu_id": "223168", | 725 | "spu_id": "223168", |
| 726 | "qanchors": { "en": "dolls, toys, 12pcs" }, | 726 | "qanchors": { "en": "dolls, toys, 12pcs" }, |
| 727 | - "semantic_attributes": [], | 727 | + "enriched_attributes": [], |
| 728 | "tags": ["dolls", "toys"] | 728 | "tags": ["dolls", "toys"] |
| 729 | } | 729 | } |
| 730 | ] | 730 | ] |
| @@ -733,9 +733,9 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | @@ -733,9 +733,9 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ | ||
| 733 | 733 | ||
| 734 | | 字段 | 类型 | 说明 | | 734 | | 字段 | 类型 | 说明 | |
| 735 | |------|------|------| | 735 | |------|------|------| |
| 736 | -| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`semantic_attributes`、`tags` | | 736 | +| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`tags` | |
| 737 | | `results[].qanchors` | object | 按语言键的锚文本(逗号分隔短语),可写入 ES 文档的 `qanchors.{lang}` | | 737 | | `results[].qanchors` | object | 按语言键的锚文本(逗号分隔短语),可写入 ES 文档的 `qanchors.{lang}` | |
| 738 | -| `results[].semantic_attributes` | array | 语义属性列表,每项为 `{ "lang", "name", "value" }`,可写入 ES 的 `semantic_attributes` nested 字段 | | 738 | +| `results[].enriched_attributes` | array | 语义属性列表,每项为 `{ "lang", "name", "value" }`,可写入 ES 的 `enriched_attributes` nested 字段 | |
| 739 | | `results[].tags` | array | 从语义属性中抽取的 `name=tags` 的 value 集合,可与业务原有 `tags` 合并后写入 ES 的 `tags` 字段 | | 739 | | `results[].tags` | array | 从语义属性中抽取的 `name=tags` 的 value 集合,可与业务原有 `tags` 合并后写入 ES 的 `tags` 字段 | |
| 740 | | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 | | 740 | | `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 | |
| 741 | 741 | ||
| @@ -753,7 +753,7 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ | @@ -753,7 +753,7 @@ curl -X POST "http://localhost:6004/indexer/enrich-content" \ | ||
| 753 | "items": [ | 753 | "items": [ |
| 754 | { | 754 | { |
| 755 | "spu_id": "223167", | 755 | "spu_id": "223167", |
| 756 | "title": "纯棉短袖T恤 夏季男装", | 756 | "title": "纯棉短袖T恤 夏季男装", |
| 757 | "brief": "夏季透气纯棉短袖,舒适亲肤", | 757 | "brief": "夏季透气纯棉短袖,舒适亲肤", |
| 758 | "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", | 758 | "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", |
| 759 | "image_url": "https://example.com/images/223167.jpg" | 759 | "image_url": "https://example.com/images/223167.jpg" |
docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md
| @@ -444,7 +444,7 @@ curl "http://localhost:6006/health" | @@ -444,7 +444,7 @@ curl "http://localhost:6006/health" | ||
| 444 | 444 | ||
| 445 | - **Base URL**: Indexer 服务地址,如 `http://localhost:6004` | 445 | - **Base URL**: Indexer 服务地址,如 `http://localhost:6004` |
| 446 | - **路径**: `POST /indexer/enrich-content` | 446 | - **路径**: `POST /indexer/enrich-content` |
| 447 | -- **说明**: 根据商品标题批量生成 `qanchors`、`semantic_attributes`、`tags`,用于拼装 ES 文档。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 | 447 | +- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`tags`,用于拼装 ES 文档。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 |
| 448 | 448 | ||
| 449 | 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 | 449 | 请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 |
| 450 | 450 |
indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md
| 1 | -## qanchors 与 semantic_attributes 设计与索引逻辑说明 | 1 | +## qanchors 与 enriched_attributes 设计与索引逻辑说明 |
| 2 | 2 | ||
| 3 | 本文档详细说明: | 3 | 本文档详细说明: |
| 4 | 4 | ||
| 5 | - **锚文本字段 `qanchors.{lang}` 的作用与来源** | 5 | - **锚文本字段 `qanchors.{lang}` 的作用与来源** |
| 6 | -- **语义属性字段 `semantic_attributes` 的结构、用途与写入流程** | 6 | +- **语义属性字段 `enriched_attributes` 的结构、用途与写入流程** |
| 7 | - **多语言支持策略(zh / en / de / ru / fr)** | 7 | - **多语言支持策略(zh / en / de / ru / fr)** |
| 8 | - **索引阶段与 LLM 调用的集成方式** | 8 | - **索引阶段与 LLM 调用的集成方式** |
| 9 | 9 | ||
| @@ -43,13 +43,13 @@ | @@ -43,13 +43,13 @@ | ||
| 43 | - 主搜索:作为额外的全文字段参与 BM25 召回与打分(可在 `search/query_config.py` 中给一定权重); | 43 | - 主搜索:作为额外的全文字段参与 BM25 召回与打分(可在 `search/query_config.py` 中给一定权重); |
| 44 | - Suggestion:`suggestion/builder.py` 会从 `qanchors.{lang}` 中拆分词条作为候选(`source="qanchor"`,权重大于 `title`)。 | 44 | - Suggestion:`suggestion/builder.py` 会从 `qanchors.{lang}` 中拆分词条作为候选(`source="qanchor"`,权重大于 `title`)。 |
| 45 | 45 | ||
| 46 | -#### 1.2 `semantic_attributes`:面向过滤/分面的通用语义属性 | 46 | +#### 1.2 `enriched_attributes`:面向过滤/分面的通用语义属性 |
| 47 | 47 | ||
| 48 | - **Mapping 位置**:`mappings/search_products.json`,追加的 nested 字段。 | 48 | - **Mapping 位置**:`mappings/search_products.json`,追加的 nested 字段。 |
| 49 | - **结构**: | 49 | - **结构**: |
| 50 | 50 | ||
| 51 | ```1392:1410:/home/tw/saas-search/mappings/search_products.json | 51 | ```1392:1410:/home/tw/saas-search/mappings/search_products.json |
| 52 | -"semantic_attributes": { | 52 | +"enriched_attributes": { |
| 53 | "type": "nested", | 53 | "type": "nested", |
| 54 | "properties": { | 54 | "properties": { |
| 55 | "lang": { "type": "keyword" }, // 语言:zh / en / de / ru / fr | 55 | "lang": { "type": "keyword" }, // 语言:zh / en / de / ru / fr |
| @@ -199,7 +199,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | @@ -199,7 +199,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | ||
| 199 | self._fill_llm_attributes(doc, spu_row) | 199 | self._fill_llm_attributes(doc, spu_row) |
| 200 | ``` | 200 | ``` |
| 201 | 201 | ||
| 202 | -也就是说,**每个 SPU 文档默认会尝试补充 qanchors 与 semantic_attributes**。 | 202 | +也就是说,**每个 SPU 文档默认会尝试补充 qanchors 与 enriched_attributes**。 |
| 203 | 203 | ||
| 204 | #### 3.2 语言选择策略 | 204 | #### 3.2 语言选择策略 |
| 205 | 205 | ||
| @@ -237,7 +237,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | @@ -237,7 +237,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | ||
| 237 | if not spu_id or not title: | 237 | if not spu_id or not title: |
| 238 | return | 238 | return |
| 239 | 239 | ||
| 240 | - semantic_list = doc.get("semantic_attributes") or [] | 240 | + semantic_list = doc.get("enriched_attributes") or [] |
| 241 | qanchors_obj = doc.get("qanchors") or {} | 241 | qanchors_obj = doc.get("qanchors") or {} |
| 242 | 242 | ||
| 243 | dim_keys = [ | 243 | dim_keys = [ |
| @@ -291,7 +291,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | @@ -291,7 +291,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | ||
| 291 | if qanchors_obj: | 291 | if qanchors_obj: |
| 292 | doc["qanchors"] = qanchors_obj | 292 | doc["qanchors"] = qanchors_obj |
| 293 | if semantic_list: | 293 | if semantic_list: |
| 294 | - doc["semantic_attributes"] = semantic_list | 294 | + doc["enriched_attributes"] = semantic_list |
| 295 | ``` | 295 | ``` |
| 296 | 296 | ||
| 297 | 要点: | 297 | 要点: |
| @@ -307,7 +307,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | @@ -307,7 +307,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | ||
| 307 | - 没有 `title`; | 307 | - 没有 `title`; |
| 308 | - 或者 `tenant_config.index_languages` 与 `SUPPORTED_LANGS` 没有交集; | 308 | - 或者 `tenant_config.index_languages` 与 `SUPPORTED_LANGS` 没有交集; |
| 309 | - 或 `DASHSCOPE_API_KEY` 未配置 / LLM 请求报错; | 309 | - 或 `DASHSCOPE_API_KEY` 未配置 / LLM 请求报错; |
| 310 | -- 则 `_fill_llm_attributes` 会在日志中输出 `warning`,**不会抛异常**,索引流程继续,只是该 SPU 在这一轮不会得到 `qanchors` / `semantic_attributes`。 | 310 | +- 则 `_fill_llm_attributes` 会在日志中输出 `warning`,**不会抛异常**,索引流程继续,只是该 SPU 在这一轮不会得到 `qanchors` / `enriched_attributes`。 |
| 311 | 311 | ||
| 312 | 这保证了整个索引服务在 LLM 不可用时表现为一个普通的“传统索引”,而不会中断。 | 312 | 这保证了整个索引服务在 LLM 不可用时表现为一个普通的“传统索引”,而不会中断。 |
| 313 | 313 | ||
| @@ -344,13 +344,13 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | @@ -344,13 +344,13 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | ||
| 344 | ```json | 344 | ```json |
| 345 | { | 345 | { |
| 346 | "nested": { | 346 | "nested": { |
| 347 | - "path": "semantic_attributes", | 347 | + "path": "enriched_attributes", |
| 348 | "query": { | 348 | "query": { |
| 349 | "bool": { | 349 | "bool": { |
| 350 | "must": [ | 350 | "must": [ |
| 351 | - { "term": { "semantic_attributes.lang": "zh" } }, | ||
| 352 | - { "term": { "semantic_attributes.name": "usage_scene" } }, | ||
| 353 | - { "term": { "semantic_attributes.value": "通勤" } } | 351 | + { "term": { "enriched_attributes.lang": "zh" } }, |
| 352 | + { "term": { "enriched_attributes.name": "usage_scene" } }, | ||
| 353 | + { "term": { "enriched_attributes.value": "通勤" } } | ||
| 354 | ] | 354 | ] |
| 355 | } | 355 | } |
| 356 | } | 356 | } |
| @@ -400,7 +400,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | @@ -400,7 +400,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | ||
| 400 | 400 | ||
| 401 | 1. **功能定位**: | 401 | 1. **功能定位**: |
| 402 | - `qanchors.{lang}`:更好地贴近用户真实查询词,用于召回与 suggestion; | 402 | - `qanchors.{lang}`:更好地贴近用户真实查询词,用于召回与 suggestion; |
| 403 | - - `semantic_attributes`:以结构化形式承载 LLM 抽取的语义维度,用于 filter / facet。 | 403 | + - `enriched_attributes`:以结构化形式承载 LLM 抽取的语义维度,用于 filter / facet。 |
| 404 | 2. **多语言对齐**: | 404 | 2. **多语言对齐**: |
| 405 | - 完全复用租户级 `index_languages` 配置; | 405 | - 完全复用租户级 `index_languages` 配置; |
| 406 | - 对每种语言单独生成锚文本与语义属性,不互相混用。 | 406 | - 对每种语言单独生成锚文本与语义属性,不互相混用。 |
| @@ -409,7 +409,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | @@ -409,7 +409,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st | ||
| 409 | - 当 LLM/配置异常时,只是“缺少增强特征”,不影响基础搜索能力。 | 409 | - 当 LLM/配置异常时,只是“缺少增强特征”,不影响基础搜索能力。 |
| 410 | 4. **未来扩展**: | 410 | 4. **未来扩展**: |
| 411 | - 可以在 `dim_keys` 中新增维度名(如 `style`, `benefit` 等),只要在 prompt 与解析逻辑中增加对应列即可; | 411 | - 可以在 `dim_keys` 中新增维度名(如 `style`, `benefit` 等),只要在 prompt 与解析逻辑中增加对应列即可; |
| 412 | - - 可以为 `semantic_attributes` 增加额外字段(如 `confidence`、`source`),用于更精细的控制(当前 mapping 为简单版)。 | 412 | + - 可以为 `enriched_attributes` 增加额外字段(如 `confidence`、`source`),用于更精细的控制(当前 mapping 为简单版)。 |
| 413 | 413 | ||
| 414 | -如需在查询层面增加基于 `semantic_attributes` 的统一 DSL(类似 `specifications` 的过滤/分面规则),推荐在 `docs/搜索API对接指南.md` 中新增一节,并在 `search/es_query_builder.py` 里封装构造逻辑,避免前端直接拼 nested 查询。 | 414 | +如需在查询层面增加基于 `enriched_attributes` 的统一 DSL(类似 `specifications` 的过滤/分面规则),推荐在 `docs/搜索API对接指南.md` 中新增一节,并在 `search/es_query_builder.py` 里封装构造逻辑,避免前端直接拼 nested 查询。 |
| 415 | 415 |
indexer/document_transformer.py
| @@ -202,7 +202,7 @@ class SPUDocumentTransformer: | @@ -202,7 +202,7 @@ class SPUDocumentTransformer: | ||
| 202 | """ | 202 | """ |
| 203 | 批量调用 LLM,为一批 doc 填充: | 203 | 批量调用 LLM,为一批 doc 填充: |
| 204 | - qanchors.{lang} | 204 | - qanchors.{lang} |
| 205 | - - semantic_attributes (lang/name/value) | 205 | + - enriched_attributes (lang/name/value) |
| 206 | 206 | ||
| 207 | 设计目标: | 207 | 设计目标: |
| 208 | - 尽可能攒批调用 LLM; | 208 | - 尽可能攒批调用 LLM; |
| @@ -271,7 +271,7 @@ class SPUDocumentTransformer: | @@ -271,7 +271,7 @@ class SPUDocumentTransformer: | ||
| 271 | if row.get("error"): | 271 | if row.get("error"): |
| 272 | return | 272 | return |
| 273 | 273 | ||
| 274 | - semantic_list = doc.get("semantic_attributes") or [] | 274 | + semantic_list = doc.get("enriched_attributes") or [] |
| 275 | qanchors_obj = doc.get("qanchors") or {} | 275 | qanchors_obj = doc.get("qanchors") or {} |
| 276 | 276 | ||
| 277 | anchor_text = str(row.get("anchor_text") or "").strip() | 277 | anchor_text = str(row.get("anchor_text") or "").strip() |
| @@ -288,7 +288,7 @@ class SPUDocumentTransformer: | @@ -288,7 +288,7 @@ class SPUDocumentTransformer: | ||
| 288 | if qanchors_obj: | 288 | if qanchors_obj: |
| 289 | doc["qanchors"] = qanchors_obj | 289 | doc["qanchors"] = qanchors_obj |
| 290 | if semantic_list: | 290 | if semantic_list: |
| 291 | - doc["semantic_attributes"] = semantic_list | 291 | + doc["enriched_attributes"] = semantic_list |
| 292 | except Exception as e: | 292 | except Exception as e: |
| 293 | logger.warning("Failed to apply LLM row to doc (spu_id=%s, lang=%s): %s", doc.get("spu_id"), lang, e) | 293 | logger.warning("Failed to apply LLM row to doc (spu_id=%s, lang=%s): %s", doc.get("spu_id"), lang, e) |
| 294 | 294 | ||
| @@ -638,7 +638,7 @@ class SPUDocumentTransformer: | @@ -638,7 +638,7 @@ class SPUDocumentTransformer: | ||
| 638 | """ | 638 | """ |
| 639 | 调用 indexer.product_enrich.analyze_products,为当前 SPU 填充: | 639 | 调用 indexer.product_enrich.analyze_products,为当前 SPU 填充: |
| 640 | - qanchors.{lang} | 640 | - qanchors.{lang} |
| 641 | - - semantic_attributes (lang/name/value) | 641 | + - enriched_attributes (lang/name/value) |
| 642 | """ | 642 | """ |
| 643 | try: | 643 | try: |
| 644 | index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] | 644 | index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] |
| @@ -653,7 +653,7 @@ class SPUDocumentTransformer: | @@ -653,7 +653,7 @@ class SPUDocumentTransformer: | ||
| 653 | if not spu_id or not title: | 653 | if not spu_id or not title: |
| 654 | return | 654 | return |
| 655 | 655 | ||
| 656 | - semantic_list = doc.get("semantic_attributes") or [] | 656 | + semantic_list = doc.get("enriched_attributes") or [] |
| 657 | qanchors_obj = doc.get("qanchors") or {} | 657 | qanchors_obj = doc.get("qanchors") or {} |
| 658 | 658 | ||
| 659 | dim_keys = [ | 659 | dim_keys = [ |
| @@ -711,7 +711,7 @@ class SPUDocumentTransformer: | @@ -711,7 +711,7 @@ class SPUDocumentTransformer: | ||
| 711 | if qanchors_obj: | 711 | if qanchors_obj: |
| 712 | doc["qanchors"] = qanchors_obj | 712 | doc["qanchors"] = qanchors_obj |
| 713 | if semantic_list: | 713 | if semantic_list: |
| 714 | - doc["semantic_attributes"] = semantic_list | 714 | + doc["enriched_attributes"] = semantic_list |
| 715 | 715 | ||
| 716 | def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: | 716 | def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: |
| 717 | """ | 717 | """ |
mappings/search_products.json
| @@ -179,132 +179,6 @@ | @@ -179,132 +179,6 @@ | ||
| 179 | } | 179 | } |
| 180 | } | 180 | } |
| 181 | }, | 181 | }, |
| 182 | - "qanchors": { | ||
| 183 | - "type": "object", | ||
| 184 | - "properties": { | ||
| 185 | - "zh": { | ||
| 186 | - "type": "text", | ||
| 187 | - "analyzer": "index_ik", | ||
| 188 | - "search_analyzer": "query_ik" | ||
| 189 | - }, | ||
| 190 | - "en": { | ||
| 191 | - "type": "text", | ||
| 192 | - "analyzer": "english" | ||
| 193 | - }, | ||
| 194 | - "ar": { | ||
| 195 | - "type": "text", | ||
| 196 | - "analyzer": "arabic" | ||
| 197 | - }, | ||
| 198 | - "hy": { | ||
| 199 | - "type": "text", | ||
| 200 | - "analyzer": "armenian" | ||
| 201 | - }, | ||
| 202 | - "eu": { | ||
| 203 | - "type": "text", | ||
| 204 | - "analyzer": "basque" | ||
| 205 | - }, | ||
| 206 | - "pt_br": { | ||
| 207 | - "type": "text", | ||
| 208 | - "analyzer": "brazilian" | ||
| 209 | - }, | ||
| 210 | - "bg": { | ||
| 211 | - "type": "text", | ||
| 212 | - "analyzer": "bulgarian" | ||
| 213 | - }, | ||
| 214 | - "ca": { | ||
| 215 | - "type": "text", | ||
| 216 | - "analyzer": "catalan" | ||
| 217 | - }, | ||
| 218 | - "cjk": { | ||
| 219 | - "type": "text", | ||
| 220 | - "analyzer": "cjk" | ||
| 221 | - }, | ||
| 222 | - "cs": { | ||
| 223 | - "type": "text", | ||
| 224 | - "analyzer": "czech" | ||
| 225 | - }, | ||
| 226 | - "da": { | ||
| 227 | - "type": "text", | ||
| 228 | - "analyzer": "danish" | ||
| 229 | - }, | ||
| 230 | - "nl": { | ||
| 231 | - "type": "text", | ||
| 232 | - "analyzer": "dutch" | ||
| 233 | - }, | ||
| 234 | - "fi": { | ||
| 235 | - "type": "text", | ||
| 236 | - "analyzer": "finnish" | ||
| 237 | - }, | ||
| 238 | - "fr": { | ||
| 239 | - "type": "text", | ||
| 240 | - "analyzer": "french" | ||
| 241 | - }, | ||
| 242 | - "gl": { | ||
| 243 | - "type": "text", | ||
| 244 | - "analyzer": "galician" | ||
| 245 | - }, | ||
| 246 | - "de": { | ||
| 247 | - "type": "text", | ||
| 248 | - "analyzer": "german" | ||
| 249 | - }, | ||
| 250 | - "el": { | ||
| 251 | - "type": "text", | ||
| 252 | - "analyzer": "greek" | ||
| 253 | - }, | ||
| 254 | - "hi": { | ||
| 255 | - "type": "text", | ||
| 256 | - "analyzer": "hindi" | ||
| 257 | - }, | ||
| 258 | - "hu": { | ||
| 259 | - "type": "text", | ||
| 260 | - "analyzer": "hungarian" | ||
| 261 | - }, | ||
| 262 | - "id": { | ||
| 263 | - "type": "text", | ||
| 264 | - "analyzer": "indonesian" | ||
| 265 | - }, | ||
| 266 | - "it": { | ||
| 267 | - "type": "text", | ||
| 268 | - "analyzer": "italian" | ||
| 269 | - }, | ||
| 270 | - "no": { | ||
| 271 | - "type": "text", | ||
| 272 | - "analyzer": "norwegian" | ||
| 273 | - }, | ||
| 274 | - "fa": { | ||
| 275 | - "type": "text", | ||
| 276 | - "analyzer": "persian" | ||
| 277 | - }, | ||
| 278 | - "pt": { | ||
| 279 | - "type": "text", | ||
| 280 | - "analyzer": "portuguese" | ||
| 281 | - }, | ||
| 282 | - "ro": { | ||
| 283 | - "type": "text", | ||
| 284 | - "analyzer": "romanian" | ||
| 285 | - }, | ||
| 286 | - "ru": { | ||
| 287 | - "type": "text", | ||
| 288 | - "analyzer": "russian" | ||
| 289 | - }, | ||
| 290 | - "es": { | ||
| 291 | - "type": "text", | ||
| 292 | - "analyzer": "spanish" | ||
| 293 | - }, | ||
| 294 | - "sv": { | ||
| 295 | - "type": "text", | ||
| 296 | - "analyzer": "swedish" | ||
| 297 | - }, | ||
| 298 | - "tr": { | ||
| 299 | - "type": "text", | ||
| 300 | - "analyzer": "turkish" | ||
| 301 | - }, | ||
| 302 | - "th": { | ||
| 303 | - "type": "text", | ||
| 304 | - "analyzer": "thai" | ||
| 305 | - } | ||
| 306 | - } | ||
| 307 | - }, | ||
| 308 | "keywords": { | 182 | "keywords": { |
| 309 | "type": "object", | 183 | "type": "object", |
| 310 | "properties": { | 184 | "properties": { |
| @@ -315,119 +189,293 @@ | @@ -315,119 +189,293 @@ | ||
| 315 | }, | 189 | }, |
| 316 | "en": { | 190 | "en": { |
| 317 | "type": "text", | 191 | "type": "text", |
| 318 | - "analyzer": "english" | 192 | + "analyzer": "english", |
| 193 | + "fields": { | ||
| 194 | + "keyword": { | ||
| 195 | + "type": "keyword", | ||
| 196 | + "normalizer": "lowercase" | ||
| 197 | + } | ||
| 198 | + } | ||
| 319 | }, | 199 | }, |
| 320 | "ar": { | 200 | "ar": { |
| 321 | "type": "text", | 201 | "type": "text", |
| 322 | - "analyzer": "arabic" | 202 | + "analyzer": "arabic", |
| 203 | + "fields": { | ||
| 204 | + "keyword": { | ||
| 205 | + "type": "keyword", | ||
| 206 | + "normalizer": "lowercase" | ||
| 207 | + } | ||
| 208 | + } | ||
| 323 | }, | 209 | }, |
| 324 | "hy": { | 210 | "hy": { |
| 325 | "type": "text", | 211 | "type": "text", |
| 326 | - "analyzer": "armenian" | 212 | + "analyzer": "armenian", |
| 213 | + "fields": { | ||
| 214 | + "keyword": { | ||
| 215 | + "type": "keyword", | ||
| 216 | + "normalizer": "lowercase" | ||
| 217 | + } | ||
| 218 | + } | ||
| 327 | }, | 219 | }, |
| 328 | "eu": { | 220 | "eu": { |
| 329 | "type": "text", | 221 | "type": "text", |
| 330 | - "analyzer": "basque" | 222 | + "analyzer": "basque", |
| 223 | + "fields": { | ||
| 224 | + "keyword": { | ||
| 225 | + "type": "keyword", | ||
| 226 | + "normalizer": "lowercase" | ||
| 227 | + } | ||
| 228 | + } | ||
| 331 | }, | 229 | }, |
| 332 | "pt_br": { | 230 | "pt_br": { |
| 333 | "type": "text", | 231 | "type": "text", |
| 334 | - "analyzer": "brazilian" | 232 | + "analyzer": "brazilian", |
| 233 | + "fields": { | ||
| 234 | + "keyword": { | ||
| 235 | + "type": "keyword", | ||
| 236 | + "normalizer": "lowercase" | ||
| 237 | + } | ||
| 238 | + } | ||
| 335 | }, | 239 | }, |
| 336 | "bg": { | 240 | "bg": { |
| 337 | "type": "text", | 241 | "type": "text", |
| 338 | - "analyzer": "bulgarian" | 242 | + "analyzer": "bulgarian", |
| 243 | + "fields": { | ||
| 244 | + "keyword": { | ||
| 245 | + "type": "keyword", | ||
| 246 | + "normalizer": "lowercase" | ||
| 247 | + } | ||
| 248 | + } | ||
| 339 | }, | 249 | }, |
| 340 | "ca": { | 250 | "ca": { |
| 341 | "type": "text", | 251 | "type": "text", |
| 342 | - "analyzer": "catalan" | 252 | + "analyzer": "catalan", |
| 253 | + "fields": { | ||
| 254 | + "keyword": { | ||
| 255 | + "type": "keyword", | ||
| 256 | + "normalizer": "lowercase" | ||
| 257 | + } | ||
| 258 | + } | ||
| 343 | }, | 259 | }, |
| 344 | "cjk": { | 260 | "cjk": { |
| 345 | "type": "text", | 261 | "type": "text", |
| 346 | - "analyzer": "cjk" | 262 | + "analyzer": "cjk", |
| 263 | + "fields": { | ||
| 264 | + "keyword": { | ||
| 265 | + "type": "keyword", | ||
| 266 | + "normalizer": "lowercase" | ||
| 267 | + } | ||
| 268 | + } | ||
| 347 | }, | 269 | }, |
| 348 | "cs": { | 270 | "cs": { |
| 349 | "type": "text", | 271 | "type": "text", |
| 350 | - "analyzer": "czech" | 272 | + "analyzer": "czech", |
| 273 | + "fields": { | ||
| 274 | + "keyword": { | ||
| 275 | + "type": "keyword", | ||
| 276 | + "normalizer": "lowercase" | ||
| 277 | + } | ||
| 278 | + } | ||
| 351 | }, | 279 | }, |
| 352 | "da": { | 280 | "da": { |
| 353 | "type": "text", | 281 | "type": "text", |
| 354 | - "analyzer": "danish" | 282 | + "analyzer": "danish", |
| 283 | + "fields": { | ||
| 284 | + "keyword": { | ||
| 285 | + "type": "keyword", | ||
| 286 | + "normalizer": "lowercase" | ||
| 287 | + } | ||
| 288 | + } | ||
| 355 | }, | 289 | }, |
| 356 | "nl": { | 290 | "nl": { |
| 357 | "type": "text", | 291 | "type": "text", |
| 358 | - "analyzer": "dutch" | 292 | + "analyzer": "dutch", |
| 293 | + "fields": { | ||
| 294 | + "keyword": { | ||
| 295 | + "type": "keyword", | ||
| 296 | + "normalizer": "lowercase" | ||
| 297 | + } | ||
| 298 | + } | ||
| 359 | }, | 299 | }, |
| 360 | "fi": { | 300 | "fi": { |
| 361 | "type": "text", | 301 | "type": "text", |
| 362 | - "analyzer": "finnish" | 302 | + "analyzer": "finnish", |
| 303 | + "fields": { | ||
| 304 | + "keyword": { | ||
| 305 | + "type": "keyword", | ||
| 306 | + "normalizer": "lowercase" | ||
| 307 | + } | ||
| 308 | + } | ||
| 363 | }, | 309 | }, |
| 364 | "fr": { | 310 | "fr": { |
| 365 | "type": "text", | 311 | "type": "text", |
| 366 | - "analyzer": "french" | 312 | + "analyzer": "french", |
| 313 | + "fields": { | ||
| 314 | + "keyword": { | ||
| 315 | + "type": "keyword", | ||
| 316 | + "normalizer": "lowercase" | ||
| 317 | + } | ||
| 318 | + } | ||
| 367 | }, | 319 | }, |
| 368 | "gl": { | 320 | "gl": { |
| 369 | "type": "text", | 321 | "type": "text", |
| 370 | - "analyzer": "galician" | 322 | + "analyzer": "galician", |
| 323 | + "fields": { | ||
| 324 | + "keyword": { | ||
| 325 | + "type": "keyword", | ||
| 326 | + "normalizer": "lowercase" | ||
| 327 | + } | ||
| 328 | + } | ||
| 371 | }, | 329 | }, |
| 372 | "de": { | 330 | "de": { |
| 373 | "type": "text", | 331 | "type": "text", |
| 374 | - "analyzer": "german" | 332 | + "analyzer": "german", |
| 333 | + "fields": { | ||
| 334 | + "keyword": { | ||
| 335 | + "type": "keyword", | ||
| 336 | + "normalizer": "lowercase" | ||
| 337 | + } | ||
| 338 | + } | ||
| 375 | }, | 339 | }, |
| 376 | "el": { | 340 | "el": { |
| 377 | "type": "text", | 341 | "type": "text", |
| 378 | - "analyzer": "greek" | 342 | + "analyzer": "greek", |
| 343 | + "fields": { | ||
| 344 | + "keyword": { | ||
| 345 | + "type": "keyword", | ||
| 346 | + "normalizer": "lowercase" | ||
| 347 | + } | ||
| 348 | + } | ||
| 379 | }, | 349 | }, |
| 380 | "hi": { | 350 | "hi": { |
| 381 | "type": "text", | 351 | "type": "text", |
| 382 | - "analyzer": "hindi" | 352 | + "analyzer": "hindi", |
| 353 | + "fields": { | ||
| 354 | + "keyword": { | ||
| 355 | + "type": "keyword", | ||
| 356 | + "normalizer": "lowercase" | ||
| 357 | + } | ||
| 358 | + } | ||
| 383 | }, | 359 | }, |
| 384 | "hu": { | 360 | "hu": { |
| 385 | "type": "text", | 361 | "type": "text", |
| 386 | - "analyzer": "hungarian" | 362 | + "analyzer": "hungarian", |
| 363 | + "fields": { | ||
| 364 | + "keyword": { | ||
| 365 | + "type": "keyword", | ||
| 366 | + "normalizer": "lowercase" | ||
| 367 | + } | ||
| 368 | + } | ||
| 387 | }, | 369 | }, |
| 388 | "id": { | 370 | "id": { |
| 389 | "type": "text", | 371 | "type": "text", |
| 390 | - "analyzer": "indonesian" | 372 | + "analyzer": "indonesian", |
| 373 | + "fields": { | ||
| 374 | + "keyword": { | ||
| 375 | + "type": "keyword", | ||
| 376 | + "normalizer": "lowercase" | ||
| 377 | + } | ||
| 378 | + } | ||
| 391 | }, | 379 | }, |
| 392 | "it": { | 380 | "it": { |
| 393 | "type": "text", | 381 | "type": "text", |
| 394 | - "analyzer": "italian" | 382 | + "analyzer": "italian", |
| 383 | + "fields": { | ||
| 384 | + "keyword": { | ||
| 385 | + "type": "keyword", | ||
| 386 | + "normalizer": "lowercase" | ||
| 387 | + } | ||
| 388 | + } | ||
| 395 | }, | 389 | }, |
| 396 | "no": { | 390 | "no": { |
| 397 | "type": "text", | 391 | "type": "text", |
| 398 | - "analyzer": "norwegian" | 392 | + "analyzer": "norwegian", |
| 393 | + "fields": { | ||
| 394 | + "keyword": { | ||
| 395 | + "type": "keyword", | ||
| 396 | + "normalizer": "lowercase" | ||
| 397 | + } | ||
| 398 | + } | ||
| 399 | }, | 399 | }, |
| 400 | "fa": { | 400 | "fa": { |
| 401 | "type": "text", | 401 | "type": "text", |
| 402 | - "analyzer": "persian" | 402 | + "analyzer": "persian", |
| 403 | + "fields": { | ||
| 404 | + "keyword": { | ||
| 405 | + "type": "keyword", | ||
| 406 | + "normalizer": "lowercase" | ||
| 407 | + } | ||
| 408 | + } | ||
| 403 | }, | 409 | }, |
| 404 | "pt": { | 410 | "pt": { |
| 405 | "type": "text", | 411 | "type": "text", |
| 406 | - "analyzer": "portuguese" | 412 | + "analyzer": "portuguese", |
| 413 | + "fields": { | ||
| 414 | + "keyword": { | ||
| 415 | + "type": "keyword", | ||
| 416 | + "normalizer": "lowercase" | ||
| 417 | + } | ||
| 418 | + } | ||
| 407 | }, | 419 | }, |
| 408 | "ro": { | 420 | "ro": { |
| 409 | "type": "text", | 421 | "type": "text", |
| 410 | - "analyzer": "romanian" | 422 | + "analyzer": "romanian", |
| 423 | + "fields": { | ||
| 424 | + "keyword": { | ||
| 425 | + "type": "keyword", | ||
| 426 | + "normalizer": "lowercase" | ||
| 427 | + } | ||
| 428 | + } | ||
| 411 | }, | 429 | }, |
| 412 | "ru": { | 430 | "ru": { |
| 413 | "type": "text", | 431 | "type": "text", |
| 414 | - "analyzer": "russian" | 432 | + "analyzer": "russian", |
| 433 | + "fields": { | ||
| 434 | + "keyword": { | ||
| 435 | + "type": "keyword", | ||
| 436 | + "normalizer": "lowercase" | ||
| 437 | + } | ||
| 438 | + } | ||
| 415 | }, | 439 | }, |
| 416 | "es": { | 440 | "es": { |
| 417 | "type": "text", | 441 | "type": "text", |
| 418 | - "analyzer": "spanish" | 442 | + "analyzer": "spanish", |
| 443 | + "fields": { | ||
| 444 | + "keyword": { | ||
| 445 | + "type": "keyword", | ||
| 446 | + "normalizer": "lowercase" | ||
| 447 | + } | ||
| 448 | + } | ||
| 419 | }, | 449 | }, |
| 420 | "sv": { | 450 | "sv": { |
| 421 | "type": "text", | 451 | "type": "text", |
| 422 | - "analyzer": "swedish" | 452 | + "analyzer": "swedish", |
| 453 | + "fields": { | ||
| 454 | + "keyword": { | ||
| 455 | + "type": "keyword", | ||
| 456 | + "normalizer": "lowercase" | ||
| 457 | + } | ||
| 458 | + } | ||
| 423 | }, | 459 | }, |
| 424 | "tr": { | 460 | "tr": { |
| 425 | "type": "text", | 461 | "type": "text", |
| 426 | - "analyzer": "turkish" | 462 | + "analyzer": "turkish", |
| 463 | + "fields": { | ||
| 464 | + "keyword": { | ||
| 465 | + "type": "keyword", | ||
| 466 | + "normalizer": "lowercase" | ||
| 467 | + } | ||
| 468 | + } | ||
| 427 | }, | 469 | }, |
| 428 | "th": { | 470 | "th": { |
| 429 | "type": "text", | 471 | "type": "text", |
| 430 | - "analyzer": "thai" | 472 | + "analyzer": "thai", |
| 473 | + "fields": { | ||
| 474 | + "keyword": { | ||
| 475 | + "type": "keyword", | ||
| 476 | + "normalizer": "lowercase" | ||
| 477 | + } | ||
| 478 | + } | ||
| 431 | } | 479 | } |
| 432 | } | 480 | } |
| 433 | }, | 481 | }, |
| @@ -983,9 +1031,6 @@ | @@ -983,9 +1031,6 @@ | ||
| 983 | } | 1031 | } |
| 984 | } | 1032 | } |
| 985 | }, | 1033 | }, |
| 986 | - "tags": { | ||
| 987 | - "type": "keyword" | ||
| 988 | - }, | ||
| 989 | "image_url": { | 1034 | "image_url": { |
| 990 | "type": "keyword", | 1035 | "type": "keyword", |
| 991 | "index": false | 1036 | "index": false |
| @@ -1012,21 +1057,6 @@ | @@ -1012,21 +1057,6 @@ | ||
| 1012 | } | 1057 | } |
| 1013 | } | 1058 | } |
| 1014 | }, | 1059 | }, |
| 1015 | - "image_embedding_512": { | ||
| 1016 | - "type": "nested", | ||
| 1017 | - "properties": { | ||
| 1018 | - "vector": { | ||
| 1019 | - "type": "dense_vector", | ||
| 1020 | - "dims": 512, | ||
| 1021 | - "index": true, | ||
| 1022 | - "similarity": "dot_product", | ||
| 1023 | - "element_type": "bfloat16" | ||
| 1024 | - }, | ||
| 1025 | - "url": { | ||
| 1026 | - "type": "text" | ||
| 1027 | - } | ||
| 1028 | - } | ||
| 1029 | - }, | ||
| 1030 | "category_path": { | 1060 | "category_path": { |
| 1031 | "type": "object", | 1061 | "type": "object", |
| 1032 | "properties": { | 1062 | "properties": { |
| @@ -1279,6 +1309,46 @@ | @@ -1279,6 +1309,46 @@ | ||
| 1279 | } | 1309 | } |
| 1280 | } | 1310 | } |
| 1281 | }, | 1311 | }, |
| 1312 | + "qanchors": { | ||
| 1313 | + "type": "object", | ||
| 1314 | + "properties": { | ||
| 1315 | + "zh": { | ||
| 1316 | + "type": "text", | ||
| 1317 | + "analyzer": "index_ik", | ||
| 1318 | + "search_analyzer": "query_ik" | ||
| 1319 | + }, | ||
| 1320 | + "en": { | ||
| 1321 | + "type": "text", | ||
| 1322 | + "analyzer": "english" | ||
| 1323 | + } | ||
| 1324 | + } | ||
| 1325 | + }, | ||
| 1326 | + "tags": { | ||
| 1327 | + "type": "object", | ||
| 1328 | + "properties": { | ||
| 1329 | + "zh": { | ||
| 1330 | + "type": "text", | ||
| 1331 | + "analyzer": "index_ik", | ||
| 1332 | + "search_analyzer": "query_ik", | ||
| 1333 | + "fields": { | ||
| 1334 | + "keyword": { | ||
| 1335 | + "type": "keyword", | ||
| 1336 | + "normalizer": "lowercase" | ||
| 1337 | + } | ||
| 1338 | + } | ||
| 1339 | + }, | ||
| 1340 | + "en": { | ||
| 1341 | + "type": "text", | ||
| 1342 | + "analyzer": "english", | ||
| 1343 | + "fields": { | ||
| 1344 | + "keyword": { | ||
| 1345 | + "type": "keyword", | ||
| 1346 | + "normalizer": "lowercase" | ||
| 1347 | + } | ||
| 1348 | + } | ||
| 1349 | + } | ||
| 1350 | + } | ||
| 1351 | + }, | ||
| 1282 | "category_id": { | 1352 | "category_id": { |
| 1283 | "type": "keyword" | 1353 | "type": "keyword" |
| 1284 | }, | 1354 | }, |
| @@ -1307,7 +1377,64 @@ | @@ -1307,7 +1377,64 @@ | ||
| 1307 | "type": "keyword" | 1377 | "type": "keyword" |
| 1308 | }, | 1378 | }, |
| 1309 | "value": { | 1379 | "value": { |
| 1380 | + "type": "object", | ||
| 1381 | + "properties": { | ||
| 1382 | + "zh": { | ||
| 1383 | + "type": "text", | ||
| 1384 | + "analyzer": "index_ik", | ||
| 1385 | + "search_analyzer": "query_ik", | ||
| 1386 | + "fields": { | ||
| 1387 | + "keyword": { | ||
| 1388 | + "type": "keyword", | ||
| 1389 | + "normalizer": "lowercase" | ||
| 1390 | + } | ||
| 1391 | + } | ||
| 1392 | + }, | ||
| 1393 | + "en": { | ||
| 1394 | + "type": "text", | ||
| 1395 | + "analyzer": "english", | ||
| 1396 | + "fields": { | ||
| 1397 | + "keyword": { | ||
| 1398 | + "type": "keyword", | ||
| 1399 | + "normalizer": "lowercase" | ||
| 1400 | + } | ||
| 1401 | + } | ||
| 1402 | + } | ||
| 1403 | + } | ||
| 1404 | + } | ||
| 1405 | + } | ||
| 1406 | + }, | ||
| 1407 | + "enriched_attributes": { | ||
| 1408 | + "type": "nested", | ||
| 1409 | + "properties": { | ||
| 1410 | + "name": { | ||
| 1310 | "type": "keyword" | 1411 | "type": "keyword" |
| 1412 | + }, | ||
| 1413 | + "value": { | ||
| 1414 | + "type": "object", | ||
| 1415 | + "properties": { | ||
| 1416 | + "zh": { | ||
| 1417 | + "type": "text", | ||
| 1418 | + "analyzer": "index_ik", | ||
| 1419 | + "search_analyzer": "query_ik", | ||
| 1420 | + "fields": { | ||
| 1421 | + "keyword": { | ||
| 1422 | + "type": "keyword", | ||
| 1423 | + "normalizer": "lowercase" | ||
| 1424 | + } | ||
| 1425 | + } | ||
| 1426 | + }, | ||
| 1427 | + "en": { | ||
| 1428 | + "type": "text", | ||
| 1429 | + "analyzer": "english", | ||
| 1430 | + "fields": { | ||
| 1431 | + "keyword": { | ||
| 1432 | + "type": "keyword", | ||
| 1433 | + "normalizer": "lowercase" | ||
| 1434 | + } | ||
| 1435 | + } | ||
| 1436 | + } | ||
| 1437 | + } | ||
| 1311 | } | 1438 | } |
| 1312 | } | 1439 | } |
| 1313 | }, | 1440 | }, |
| @@ -1321,13 +1448,82 @@ | @@ -1321,13 +1448,82 @@ | ||
| 1321 | "type": "keyword" | 1448 | "type": "keyword" |
| 1322 | }, | 1449 | }, |
| 1323 | "option1_values": { | 1450 | "option1_values": { |
| 1324 | - "type": "keyword" | 1451 | + "type": "object", |
| 1452 | + "properties": { | ||
| 1453 | + "zh": { | ||
| 1454 | + "type": "text", | ||
| 1455 | + "analyzer": "index_ik", | ||
| 1456 | + "search_analyzer": "query_ik", | ||
| 1457 | + "fields": { | ||
| 1458 | + "keyword": { | ||
| 1459 | + "type": "keyword", | ||
| 1460 | + "normalizer": "lowercase" | ||
| 1461 | + } | ||
| 1462 | + } | ||
| 1463 | + }, | ||
| 1464 | + "en": { | ||
| 1465 | + "type": "text", | ||
| 1466 | + "analyzer": "english", | ||
| 1467 | + "fields": { | ||
| 1468 | + "keyword": { | ||
| 1469 | + "type": "keyword", | ||
| 1470 | + "normalizer": "lowercase" | ||
| 1471 | + } | ||
| 1472 | + } | ||
| 1473 | + } | ||
| 1474 | + } | ||
| 1325 | }, | 1475 | }, |
| 1326 | "option2_values": { | 1476 | "option2_values": { |
| 1327 | - "type": "keyword" | 1477 | + "type": "object", |
| 1478 | + "properties": { | ||
| 1479 | + "zh": { | ||
| 1480 | + "type": "text", | ||
| 1481 | + "analyzer": "index_ik", | ||
| 1482 | + "search_analyzer": "query_ik", | ||
| 1483 | + "fields": { | ||
| 1484 | + "keyword": { | ||
| 1485 | + "type": "keyword", | ||
| 1486 | + "normalizer": "lowercase" | ||
| 1487 | + } | ||
| 1488 | + } | ||
| 1489 | + }, | ||
| 1490 | + "en": { | ||
| 1491 | + "type": "text", | ||
| 1492 | + "analyzer": "english", | ||
| 1493 | + "fields": { | ||
| 1494 | + "keyword": { | ||
| 1495 | + "type": "keyword", | ||
| 1496 | + "normalizer": "lowercase" | ||
| 1497 | + } | ||
| 1498 | + } | ||
| 1499 | + } | ||
| 1500 | + } | ||
| 1328 | }, | 1501 | }, |
| 1329 | "option3_values": { | 1502 | "option3_values": { |
| 1330 | - "type": "keyword" | 1503 | + "type": "object", |
| 1504 | + "properties": { | ||
| 1505 | + "zh": { | ||
| 1506 | + "type": "text", | ||
| 1507 | + "analyzer": "index_ik", | ||
| 1508 | + "search_analyzer": "query_ik", | ||
| 1509 | + "fields": { | ||
| 1510 | + "keyword": { | ||
| 1511 | + "type": "keyword", | ||
| 1512 | + "normalizer": "lowercase" | ||
| 1513 | + } | ||
| 1514 | + } | ||
| 1515 | + }, | ||
| 1516 | + "en": { | ||
| 1517 | + "type": "text", | ||
| 1518 | + "analyzer": "english", | ||
| 1519 | + "fields": { | ||
| 1520 | + "keyword": { | ||
| 1521 | + "type": "keyword", | ||
| 1522 | + "normalizer": "lowercase" | ||
| 1523 | + } | ||
| 1524 | + } | ||
| 1525 | + } | ||
| 1526 | + } | ||
| 1331 | }, | 1527 | }, |
| 1332 | "min_price": { | 1528 | "min_price": { |
| 1333 | "type": "float" | 1529 | "type": "float" |
| @@ -1391,20 +1587,6 @@ | @@ -1391,20 +1587,6 @@ | ||
| 1391 | "index": false | 1587 | "index": false |
| 1392 | } | 1588 | } |
| 1393 | } | 1589 | } |
| 1394 | - }, | ||
| 1395 | - "semantic_attributes": { | ||
| 1396 | - "type": "nested", | ||
| 1397 | - "properties": { | ||
| 1398 | - "lang": { | ||
| 1399 | - "type": "keyword" | ||
| 1400 | - }, | ||
| 1401 | - "name": { | ||
| 1402 | - "type": "keyword" | ||
| 1403 | - }, | ||
| 1404 | - "value": { | ||
| 1405 | - "type": "keyword" | ||
| 1406 | - } | ||
| 1407 | - } | ||
| 1408 | } | 1590 | } |
| 1409 | } | 1591 | } |
| 1410 | } | 1592 | } |
scripts/es_debug_search.py
| @@ -279,7 +279,9 @@ def _run_es( | @@ -279,7 +279,9 @@ def _run_es( | ||
| 279 | body: Dict[str, Any], | 279 | body: Dict[str, Any], |
| 280 | size: int, | 280 | size: int, |
| 281 | ) -> List[Dict[str, Any]]: | 281 | ) -> List[Dict[str, Any]]: |
| 282 | - resp = es.search(index=index_name, body=body, size=size) | 282 | + # Avoid passing size= alongside body= (deprecated in elasticsearch-py). |
| 283 | + payload = {**body, "size": size} | ||
| 284 | + resp = es.search(index=index_name, body=payload) | ||
| 283 | if hasattr(resp, "body"): | 285 | if hasattr(resp, "body"): |
| 284 | payload = resp.body | 286 | payload = resp.body |
| 285 | else: | 287 | else: |