From d7d48f5238c6c83665e359cda931dbc17dc748f8 Mon Sep 17 00:00:00 2001 From: tangwang Date: Tue, 6 Jan 2026 19:42:20 +0800 Subject: [PATCH] 改动(mapping + 灌入结构) mappings/search_products.json:把原来的 title_zh/title_en/brief_zh/... 改成 按语言 key 的对象结构( /products/_doc/1 { "title": {"en":...} } ) 同时在这些字段下 预置了全部 analyzer 语言: arabic, armenian, basque, brazilian, bulgarian, catalan, chinese, cjk, czech, danish, dutch, english, finnish, french, galician, german, greek, hindi, hungarian, indonesian, italian, norwegian, persian, portuguese, romanian, russian, spanish, swedish, turkish, thai --- CLAUDE.md | 22 +++++++++++----------- README.md | 4 ++-- api/models.py | 2 +- api/result_formatter.py | 21 +++++++++++++++------ config/config.yaml | 44 ++++++++++++++++++++++---------------------- config/utils.py | 2 +- docs/MySQL到ES字段映射说明-业务版.md | 68 ++++++++++++++++++++++++++++++++++---------------------------------- docs/MySQL到ES文档映射说明.md | 40 ++++++++++++++++++++-------------------- docs/Search-API-Examples.md | 4 ++-- docs/基础配置指南.md | 36 ++++++++++++++++++------------------ docs/常用查询 - ES.md | 65 +++++++++++++++++++++++++++++++++++++++++++++-------------------- docs/搜索API对接指南.md | 30 +++++++++++++++++------------- docs/搜索API速查表.md | 2 +- docs/相关性检索优化说明.md | 10 +++++----- docs/系统设计文档.md | 38 +++++++++++++++++++------------------- docs/系统设计文档v1.md | 6 +++--- docs/索引字段说明v1.md | 2 +- docs/索引字段说明v2-mapping结构.md | 24 ++++++++++++------------ docs/索引字段说明v2-plan.md | 18 +++++++++--------- docs/索引字段说明v2.md | 64 ++++++++++++++++++++++++++++++++-------------------------------- docs/索引数据接口文档___old.md | 54 +++++++++++++++++++++++++++--------------------------- docs/索引方案.md | 4 ++-- docs/翻译功能测试说明.md | 14 +++++++------- indexer/document_transformer.py | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------------------------- indexer/incremental_service.py | 20 ++++++++++++++++++-- indexer/test_indexing.py | 31 +++++++++++++++++-------------- 
mappings/search_products.json | 1311 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------------------------------------------- scripts/check_es_data.py | 10 ++++++---- scripts/check_index_mapping.py | 33 +++++++++++++++++++++------------ search/es_query_builder.py | 40 ++++++++++++++++++++-------------------- tests/conftest.py | 28 ++++++++++++++-------------- 31 files changed, 1673 insertions(+), 503 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 78a26b6..c617e62 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -256,12 +256,12 @@ The system uses centralized configuration through `config/config.yaml`: { "tenant_id": "keyword", // Multi-tenant isolation "spu_id": "keyword", // Product identifier - "title_zh/en": "text", // Multi-language titles - "brief_zh/en": 
"text", // Short descriptions - "description_zh/en": "text", // Detailed descriptions - "vendor_zh/en": "text", // Supplier/brand with keyword subfield - "category_path_zh/en": "text", // Hierarchical category paths - "category_name_zh/en": "text", // Category names for search + "title.zh/en": "text", // Multi-language titles + "brief.zh/en": "text", // Short descriptions + "description.zh/en": "text", // Detailed descriptions + "vendor.zh/en": "text", // Supplier/brand with keyword subfield + "category_path.zh/en": "text", // Hierarchical category paths + "category_name_text.zh/en": "text", // Category names for search "category1/2/3_name": "keyword", // Multi-level category filtering "tags": "keyword", // Product tags "specifications": "nested", // Product variants (color, size, etc.) @@ -318,11 +318,11 @@ The system uses centralized configuration through `config/config.yaml`: **Field Boost Configuration**: ```yaml field_boosts: - title_zh/en: 3.0 # Highest priority - brief_zh/en: 1.5 # Medium priority - description_zh/en: 1.0 # Lower priority - vendor_zh/en: 1.5 # Brand emphasis - category_path_zh/en: 1.5 # Category relevance + title.zh/en: 3.0 # Highest priority + brief.zh/en: 1.5 # Medium priority + description.zh/en: 1.0 # Lower priority + vendor.zh/en: 1.5 # Brand emphasis + category_path.zh/en: 1.5 # Category relevance tags: 1.0 # Tag matching ``` diff --git a/README.md b/README.md index 972a7bb..1eef882 100644 --- a/README.md +++ b/README.md @@ -15,8 +15,8 @@ "query" : { "dis_max" : { "queries" : [ - {"match" : { "title_en" : xxx }}, - {"match" : { "title_zh" : xxx }}, + {"match" : { "title.en" : xxx }}, + {"match" : { "title.zh" : xxx }}, {"match" : { "title_xx" : xxx }} ], "tie_breakler" : 0.8 diff --git a/api/models.py b/api/models.py index 0b5ab3c..661c24d 100644 --- a/api/models.py +++ b/api/models.py @@ -86,7 +86,7 @@ class SearchRequest(BaseModel): "examples": [ { "category_name": ["手机", "电子产品"], - "vendor_zh.keyword": "奇乐", + "vendor.zh.keyword": 
"奇乐", "specifications": {"name": "颜色", "value": "白色"} }, { diff --git a/api/result_formatter.py b/api/result_formatter.py index f3fd9f7..56e2b1e 100644 --- a/api/result_formatter.py +++ b/api/result_formatter.py @@ -32,13 +32,22 @@ class ResultFormatter: lang = "en" def pick_lang_field(src: Dict[str, Any], base: str) -> Optional[str]: - """从 *_zh / *_en 字段中按语言选择一个值,若目标语言缺失则回退到另一种。""" - zh_val = src.get(f"{base}_zh") - en_val = src.get(f"{base}_en") + """ + 从多语言对象字段中按语言选择一个值: + - 新结构: {base: {"zh": "...", "en": "...", ...}} + - 兼容旧结构: {base_zh: "...", base_en: "..."} + 若目标语言缺失则回退到另一种。 + """ + obj = src.get(base) + if isinstance(obj, dict): + zh_val = obj.get("zh") + en_val = obj.get("en") + else: + zh_val = src.get(f"{base}_zh") + en_val = src.get(f"{base}_en") if lang == "zh": return zh_val or en_val - else: - return en_val or zh_val + return en_val or zh_val for hit in es_hits: source = hit.get('_source', {}) @@ -60,7 +69,7 @@ class ResultFormatter: description = pick_lang_field(source, "description") vendor = pick_lang_field(source, "vendor") category_path = pick_lang_field(source, "category_path") - category_name = pick_lang_field(source, "category_name") + category_name = pick_lang_field(source, "category_name_text") or source.get("category_name") # Extract SKUs skus = [] diff --git a/config/config.yaml b/config/config.yaml index 7296ff5..25eefbe 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -15,20 +15,20 @@ es_settings: # 只配置权重,不配置字段结构(字段结构由 mappings/search_products.json 定义) field_boosts: # 文本相关性字段 - title_zh: 3.0 - brief_zh: 1.5 - description_zh: 1.0 - vendor_zh: 1.5 - title_en: 3.0 - brief_en: 1.5 - description_en: 1.0 - vendor_en: 1.5 + "title.zh": 3.0 + "brief.zh": 1.5 + "description.zh": 1.0 + "vendor.zh": 1.5 + "title.en": 3.0 + "brief.en": 1.5 + "description.en": 1.0 + "vendor.en": 1.5 # 分类相关字段 - category_path_zh: 1.5 - category_name_zh: 1.5 - category_path_en: 1.5 - category_name_en: 1.5 + "category_path.zh": 1.5 + 
"category_name_text.zh": 1.5 + "category_path.en": 1.5 + "category_name_text.en": 1.5 # 标签和属性值字段 tags: 1.0 @@ -42,33 +42,33 @@ indexes: - name: "default" label: "默认搜索" fields: - - "title_zh" - - "brief_zh" - - "description_zh" - - "vendor_zh" + - "title.zh" + - "brief.zh" + - "description.zh" + - "vendor.zh" - "tags" - - "category_path_zh" - - "category_name_zh" + - "category_path.zh" + - "category_name_text.zh" - "option1_values" boost: 1.0 - name: "title" label: "标题搜索" fields: - - "title_zh" + - "title.zh" boost: 2.0 - name: "vendor" label: "品牌搜索" fields: - - "vendor_zh" + - "vendor.zh" boost: 1.5 - name: "category" label: "类目搜索" fields: - - "category_path_zh" - - "category_name_zh" + - "category_path.zh" + - "category_name_text.zh" boost: 1.5 - name: "tags" diff --git a/config/utils.py b/config/utils.py index 0bd7ff1..b0247e0 100644 --- a/config/utils.py +++ b/config/utils.py @@ -17,7 +17,7 @@ def get_match_fields_for_index(config: SearchConfig, index_name: str = "default" index_name: Name of the index domain (default: "default") Returns: - List of field names with boost, e.g., ["title_zh^3.0", "brief_zh^1.5"] + List of field names with boost, e.g., ["title.zh^3.0", "brief.zh^1.5"] """ # Find the index config index_config = None diff --git a/docs/MySQL到ES字段映射说明-业务版.md b/docs/MySQL到ES字段映射说明-业务版.md index 33c9ed1..7ca7260 100644 --- a/docs/MySQL到ES字段映射说明-业务版.md +++ b/docs/MySQL到ES字段映射说明-业务版.md @@ -28,14 +28,14 @@ | ES 字段 | 数据来源表 | 表中字段 | 转换说明 | |---------|-----------|----------|----------| -| `title_zh` | SPU 表 | `title` | 如果主语言是中文,直接使用;否则翻译为中文 | -| `title_en` | SPU 表 | `title` | 如果主语言是英文,直接使用;否则翻译为英文 | -| `brief_zh` | SPU 表 | `brief` | 同上 | -| `brief_en` | SPU 表 | `brief` | 同上 | -| `description_zh` | SPU 表 | `description` | 同上 | -| `description_en` | SPU 表 | `description` | 同上 | -| `vendor_zh` | SPU 表 | `vendor` | 同上 | -| `vendor_en` | SPU 表 | `vendor` | 同上 | +| `title.zh` | SPU 表 | `title` | 如果主语言是中文,直接使用;否则翻译为中文 | +| `title.en` | SPU 表 | `title` | 
如果主语言是英文,直接使用;否则翻译为英文 | +| `brief.zh` | SPU 表 | `brief` | 同上 | +| `brief.en` | SPU 表 | `brief` | 同上 | +| `description.zh` | SPU 表 | `description` | 同上 | +| `description.en` | SPU 表 | `description` | 同上 | +| `vendor.zh` | SPU 表 | `vendor` | 同上 | +| `vendor.en` | SPU 表 | `vendor` | 同上 | **翻译规则:** - 根据租户配置的 `primary_language` 确定主语言 @@ -104,7 +104,7 @@ category_ids = ["1", "2", "3"] | ES 字段 | 数据来源 | 转换说明 | |---------|----------|----------| -| `category_path_zh` | 类目名称列表 | 用 `/` 连接:`"电子产品/手机/iPhone"` | +| `category_path.zh` | 类目名称列表 | 用 `/` 连接:`"电子产品/手机/iPhone"` | | `category1_name` | 类目名称列表[0] | 一级类目:`"电子产品"` | | `category2_name` | 类目名称列表[1] | 二级类目:`"手机"` | | `category3_name` | 类目名称列表[2] | 三级类目:`"iPhone"` | @@ -300,13 +300,13 @@ sku_weight_units = 去重后的重量单位列表 | ES 字段 | 数据来源 | 转换说明 | |---------|----------|----------| -| `title_embedding` | SPU 表 `title` | 使用文本编码器(BGE)将标题转换为 1024 维向量
优先使用 `title_en`,如果没有则使用 `title_zh` | +| `title_embedding` | SPU 表 `title` | 使用文本编码器(BGE)将标题转换为 1024 维向量
优先使用 `title.en`,如果没有则使用 `title.zh` | **生成逻辑:** ``` 如果启用向量搜索: - 文本 = title_en 或 title_zh + 文本 = title.en 或 title.zh 向量 = 文本编码器.encode(文本) title_embedding = 向量(1024 维浮点数组) ``` @@ -322,17 +322,17 @@ sku_weight_units = 去重后的重量单位列表 | **基础字段** | | `tenant_id` | SPU | `tenant_id` | 租户ID | | `spu_id` | SPU | `id` | 商品ID | -| `title_zh/en` | SPU | `title` | 标题(多语言) | -| `brief_zh/en` | SPU | `brief` | 简介(多语言) | -| `description_zh/en` | SPU | `description` | 描述(多语言) | -| `vendor_zh/en` | SPU | `vendor` | 品牌(多语言) | +| `title.zh/en` | SPU | `title` | 标题(多语言) | +| `brief.zh/en` | SPU | `brief` | 简介(多语言) | +| `description.zh/en` | SPU | `description` | 描述(多语言) | +| `vendor.zh/en` | SPU | `vendor` | 品牌(多语言) | | `tags` | SPU | `tags` | 标签数组 | | `image_url` | SPU | `image_src` | 主图URL | | `sales` | SPU | `fake_sales` | 销量 | | `create_time` | SPU | `create_time` | 创建时间 | | `update_time` | SPU | `update_time` | 更新时间 | | **类别字段** | -| `category_path_zh` | SPU + 类目映射 | `category_path` → 类目名称 | 类目路径 | +| `category_path.zh` | SPU + 类目映射 | `category_path` → 类目名称 | 类目路径 | | `category1_name` | SPU + 类目映射 | `category_path` → 类目名称[0] | 一级类目 | | `category2_name` | SPU + 类目映射 | `category_path` → 类目名称[1] | 二级类目 | | `category3_name` | SPU + 类目映射 | `category_path` → 类目名称[2] | 三级类目 | @@ -451,14 +451,14 @@ GET /search_products/_search "_source": { "tenant_id": "162", "spu_id": "74174", - "title_zh": "实色二阶碳纤维魔方 方形 2阶 塑料【英文包装】", - "title_en": "Solid Color 2nd Order Carbon Fiber Rubik's Cube Square 2 Steps Plastic [English Packaging", - "brief_zh": "实色二阶碳纤维魔方 方形 2阶 塑料【英文包装】", - "brief_en": "Solid Color 2nd Order Carbon Fiber Rubik's Cube Square 2 Steps Plastic [English Packaging", - "description_zh": "

实色二阶碳纤维魔方 方形 2阶 塑料【英文包装】

", - "description_en": "

Solid Color 2nd Order Carbon Fiber Rubik's Cube Square 2 Steps Plastic [English Packaging]实色二阶碳纤维魔方 方形 2阶 塑料【英文包装】

", + "description.en": "

Solid Color 2nd Order Carbon Fiber Rubik's Cube Square 2 Steps Plastic [English Packaging] 0: @@ -358,7 +358,7 @@ if category_names: ```python elif pd.notna(spu_row.get('category')): category = str(spu_row['category']) - doc['category_name_zh'] = category + doc['category_name_text.zh'] = category # 尝试从 category 字段解析多级分类(如果包含 "/") if '/' in category: @@ -375,8 +375,8 @@ elif pd.notna(spu_row.get('category')): | ES 字段 | 类型 | 说明 | |---------|------|------| -| `category_path_zh` | text | 类目路径字符串(如:"电子产品/手机/iPhone") | -| `category_path_en` | text | 类目路径英文(暂未实现) | +| `category_path.zh` | text | 类目路径字符串(如:"电子产品/手机/iPhone") | +| `category_path.en` | text | 类目路径英文(暂未实现) | | `category1_name` | keyword | 一级类目名称 | | `category2_name` | keyword | 二级类目名称 | | `category3_name` | keyword | 三级类目名称 | @@ -673,7 +673,7 @@ if enable_embedding and encoder and documents: title_texts = [] title_doc_indices = [] for i, (_, doc) in enumerate(documents): - title_text = doc.get("title_en") or doc.get("title_zh") + title_text = doc.get("title.en") or doc.get("title.zh") if title_text and str(title_text).strip(): title_texts.append(str(title_text)) title_doc_indices.append(i) @@ -858,16 +858,16 @@ spu_df["_is_deleted"] = spu_df["deleted"].apply(_is_deleted_value) { "tenant_id": "1", "spu_id": "12345", - "title_zh": "iPhone 15 Pro Max", - "title_en": "iPhone 15 Pro Max", - "brief_zh": "最新款 iPhone", - "brief_en": "Latest iPhone", - "description_zh": "详细描述...", - "description_en": "Detailed description...", - "vendor_zh": "Apple", - "vendor_en": "Apple", + "title.zh": "iPhone 15 Pro Max", + "title.en": "iPhone 15 Pro Max", + "brief.zh": "最新款 iPhone", + "brief.en": "Latest iPhone", + "description.zh": "详细描述...", + "description.en": "Detailed description...", + "vendor.zh": "Apple", + "vendor.en": "Apple", "tags": ["手机", "智能手机", "Apple"], - "category_path_zh": "电子产品/手机/iPhone", + "category_path.zh": "电子产品/手机/iPhone", "category1_name": "电子产品", "category2_name": "手机", "category3_name": "iPhone", diff 
--git a/docs/Search-API-Examples.md b/docs/Search-API-Examples.md index c8efa67..2427edb 100644 --- a/docs/Search-API-Examples.md +++ b/docs/Search-API-Examples.md @@ -132,7 +132,7 @@ curl -X POST "http://localhost:6002/search/" \ "language": "zh", "filters": { "category_name": "手机", - "vendor_zh.keyword": "奇乐" + "vendor.zh.keyword": "奇乐" } }' ``` @@ -294,7 +294,7 @@ curl -X POST "http://localhost:6002/search/" \ "language": "zh", "filters": { "category_name": ["手机", "电子产品"], - "vendor_zh.keyword": "品牌A" + "vendor.zh.keyword": "品牌A" }, "range_filters": { "min_price": { diff --git a/docs/基础配置指南.md b/docs/基础配置指南.md index ca67edd..6e060fe 100644 --- a/docs/基础配置指南.md +++ b/docs/基础配置指南.md @@ -26,12 +26,12 @@ - `create_time`, `update_time` (date) - 时间字段 #### 多语言文本字段 -- `title_zh`, `title_en` (text) - 标题(中英文) -- `brief_zh`, `brief_en` (text) - 短描述(中英文) -- `description_zh`, `description_en` (text) - 详细描述(中英文) -- `vendor_zh`, `vendor_en` (text) - 供应商/品牌(中英文,含keyword子字段) -- `category_path_zh`, `category_path_en` (text) - 类目路径(中英文) -- `category_name_zh`, `category_name_en` (text) - 类目名称(中英文) +- `title.zh`, `title.en` (text) - 标题(中英文) +- `brief.zh`, `brief.en` (text) - 短描述(中英文) +- `description.zh`, `description.en` (text) - 详细描述(中英文) +- `vendor.zh`, `vendor.en` (text) - 供应商/品牌(中英文,含keyword子字段) +- `category_path.zh`, `category_path.en` (text) - 类目路径(中英文) +- `category_name_text.zh`, `category_name_text.en` (text) - 类目名称(中英文) #### 类目字段 - `category_id` (keyword) - 类目ID @@ -64,12 +64,12 @@ ### 文本召回字段 默认同时搜索以下字段(中英文都包含): -- `title_zh^3.0`, `title_en^3.0` -- `brief_zh^1.5`, `brief_en^1.5` -- `description_zh^1.0`, `description_en^1.0` -- `vendor_zh^1.5`, `vendor_en^1.5` -- `category_path_zh^1.5`, `category_path_en^1.5` -- `category_name_zh^1.5`, `category_name_en^1.5` +- `title.zh^3.0`, `title.en^3.0` +- `brief.zh^1.5`, `brief.en^1.5` +- `description.zh^1.0`, `description.en^1.0` +- `vendor.zh^1.5`, `vendor.en^1.5` +- `category_path.zh^1.5`, `category_path.en^1.5` +- 
`category_name_text.zh^1.5`, `category_name_text.en^1.5` - `tags^1.0` ### 查询架构 @@ -126,12 +126,12 @@ - `language="en"`: 优先返回 `*_en` 字段,如果为空则回退到 `*_zh` 字段 映射规则: -- `title_zh/en` → `title` -- `brief_zh/en` → `brief` -- `description_zh/en` → `description` -- `vendor_zh/en` → `vendor` -- `category_path_zh/en` → `category_path` -- `category_name_zh/en` → `category_name` +- `title.zh/en` → `title` +- `brief.zh/en` → `brief` +- `description.zh/en` → `description` +- `vendor.zh/en` → `vendor` +- `category_path.zh/en` → `category_path` +- `category_name_text.zh/en` → `category_name` ## 配置修改 diff --git a/docs/常用查询 - ES.md b/docs/常用查询 - ES.md index e2f4f23..db51ad1 100644 --- a/docs/常用查询 - ES.md +++ b/docs/常用查询 - ES.md @@ -22,7 +22,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 100, - "_source": ["title_zh", "title_en"], + "_source": ["title"], "query": { "bool": { "filter": [ @@ -34,13 +34,13 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 1, - "_source": ["title_zh", "title_en"], + "_source": ["title"], "query": { "bool": { "must": [ { "match": { - "title_zh": { + "title.zh": { "query": "裙子" } } @@ -54,7 +54,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/ }' curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_analyze' -H 'Content-Type: application/json' -d '{ - "analyzer": "query_ansj", + "analyzer": "icu_analyzer", "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝" }' @@ -73,13 +73,13 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/ "multi_match": { "_name": "base_query", "fields": [ - "title_zh^3.0", - "brief_zh^1.5", - "description_zh", 
- "vendor_zh^1.5", + "title.zh^3.0", + "brief.zh^1.5", + "description.zh", + "vendor.zh^1.5", "tags", - "category_path_zh^1.5", - "category_name_zh^1.5", + "category_path.zh^1.5", + "category_name_text.zh^1.5", "option1_values^0.5" ], "minimum_should_match": "75%", @@ -110,13 +110,13 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/ "multi_match": { "_name": "base_query", "fields": [ - "title_zh^3.0", - "brief_zh^1.5", - "description_zh", - "vendor_zh^1.5", + "title.zh^3.0", + "brief.zh^1.5", + "description.zh", + "vendor.zh^1.5", "tags", - "category_path_zh^1.5", - "category_name_zh^1.5", + "category_path.zh^1.5", + "category_name_text.zh^1.5", "option1_values^0.5" ], "minimum_should_match": "75%", @@ -271,7 +271,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s "size": 1, "_source": [ "spu_id", - "title_zh", + "title", "category1_name", "category2_name", "category3_name", @@ -552,7 +552,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s } }, "size": 1, - "_source": ["spu_id", "title_zh", "specifications"] + "_source": ["spu_id", "title", "specifications"] }' ## 4. 
统计查询 @@ -597,7 +597,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s } }, "size": 10, - "_source": ["spu_id", "title_zh", "category_name_zh", "category_path_zh"] + "_source": ["spu_id", "title", "category_name_text", "category_path"] }' ### 5.2 查找有option但没有specifications的文档(数据转换问题) @@ -614,7 +614,32 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s } }, "size": 10, - "_source": ["spu_id", "title_zh", "option1_name", "option2_name", "option3_name", "specifications"] + "_source": ["spu_id", "title", "option1_name", "option2_name", "option3_name", "specifications"] }' +重排序: +GET /search_products/_search +{ + "query": { + "match": { + "title.en": { + "query": "quick brown fox", + "minimum_should_match": "90%" + } + } + }, + "rescore": { + "window_size": 50, + "query": { + "rescore_query": { + "match_phrase": { + "title.en": { + "query": "quick brown fox", + "slop": 50 + } + } + } + } + } +} diff --git a/docs/搜索API对接指南.md b/docs/搜索API对接指南.md index 21dd5dd..09f72d7 100644 --- a/docs/搜索API对接指南.md +++ b/docs/搜索API对接指南.md @@ -203,7 +203,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ "category1_name": "服装", // 可以为单值 或者 数组 匹配数组中任意一个 "category2_name": "男装", // 可以为单值 或者 数组 匹配数组中任意一个 "category3_name": "衬衫", // 可以为单值 或者 数组 匹配数组中任意一个 - "vendor_zh.keyword": ["奇乐", "品牌A"], // 可以为单值 或者 数组 匹配数组中任意一个 + "vendor.zh.keyword": ["奇乐", "品牌A"], // 可以为单值 或者 数组 匹配数组中任意一个 "tags": "手机", // 可以为单值 或者 数组 匹配数组中任意一个 // specifications 嵌套过滤(特殊格式) "specifications": { @@ -274,7 +274,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ - `category_name`: 类目名称 - `category1_name`, `category2_name`, `category3_name`: 多级类目 - `category_id`: 类目ID -- `vendor_zh.keyword`, `vendor_en.keyword`: 供应商/品牌(使用keyword子字段) +- `vendor.zh.keyword`, `vendor.en.keyword`: 供应商/品牌(使用keyword子字段) - `tags`: 标签(keyword类型,支持数组) - `option1_name`, `option2_name`, `option3_name`: 选项名称 - `specifications`: 规格过滤(嵌套字段,格式见上文) @@ -602,7 +602,9 @@ curl 
"http://localhost:6002/search/instant?q=玩具&size=5" { "id": "12345", "source": { - "title_zh": "芭比时尚娃娃", + "title": { + "zh": "芭比时尚娃娃" + }, "min_price": 89.99, "category1_name": "玩具" } @@ -739,7 +741,7 @@ curl "http://localhost:6002/search/12345" | 字段 | 类型 | 说明 | |------|------|------| | `spu_id` | string | SPU ID | -| `title` | string | 商品标题(根据language参数自动选择title_zh或title_en) | +| `title` | string | 商品标题(根据language参数自动选择 `title.zh` 或 `title.en`) | | `brief` | string | 商品短描述(根据language参数自动选择) | | `description` | string | 商品详细描述(根据language参数自动选择) | | `vendor` | string | 供应商/品牌(根据language参数自动选择) | @@ -1088,7 +1090,9 @@ curl -X POST "http://localhost:6004/indexer/index" \ "document": { "tenant_id": "162", "spu_id": "123", - "title_zh": "商品标题", + "title": { + "zh": "商品标题" + }, ... } }, @@ -1298,7 +1302,7 @@ curl -X GET "http://localhost:6004/indexer/health" "size": 20, "language": "zh", "filters": { - "vendor_zh.keyword": ["品牌A", "品牌B"] + "vendor.zh.keyword": ["品牌A", "品牌B"] }, "range_filters": { "min_price": { @@ -1505,12 +1509,12 @@ curl -X GET "http://localhost:6004/indexer/health" |--------|------|------| | `tenant_id` | keyword | 租户ID(多租户隔离) | | `spu_id` | keyword | SPU ID | -| `title_zh`, `title_en` | text | 商品标题(中英文) | -| `brief_zh`, `brief_en` | text | 商品短描述(中英文) | -| `description_zh`, `description_en` | text | 商品详细描述(中英文) | -| `vendor_zh`, `vendor_en` | text | 供应商/品牌(中英文,含keyword子字段) | -| `category_path_zh`, `category_path_en` | text | 类目路径(中英文,用于搜索) | -| `category_name_zh`, `category_name_en` | text | 类目名称(中英文,用于搜索) | +| `title.` | object/text | 商品标题(多语言对象,如 `title.zh`, `title.en`) | +| `brief.` | object/text | 商品短描述(多语言对象,如 `brief.zh`, `brief.en`) | +| `description.` | object/text | 商品详细描述(多语言对象,如 `description.zh`, `description.en`) | +| `vendor.` | object/text | 供应商/品牌(多语言对象,且带 keyword 子字段,如 `vendor.zh.keyword`) | +| `category_path.` | object/text | 类目路径(多语言对象,用于搜索,如 `category_path.zh`) | +| `category_name_text.` | object/text | 类目名称(多语言对象,用于搜索,如 
`category_name_text.zh`) | | `category_id` | keyword | 类目ID | | `category_name` | keyword | 类目名称(用于过滤) | | `category_level` | integer | 类目层级 | @@ -1552,7 +1556,7 @@ curl -X GET "http://localhost:6004/indexer/health" - `category_name`: 类目名称 - `category1_name`, `category2_name`, `category3_name`: 多级类目 - `category_id`: 类目ID -- `vendor_zh.keyword`, `vendor_en.keyword`: 供应商/品牌(使用keyword子字段) +- `vendor.zh.keyword`, `vendor.en.keyword`: 供应商/品牌(使用keyword子字段) - `tags`: 标签(keyword类型) - `option1_name`, `option2_name`, `option3_name`: 选项名称 - `specifications`: 规格过滤(嵌套字段,格式见[过滤器详解](#33-过滤器详解)) diff --git a/docs/搜索API速查表.md b/docs/搜索API速查表.md index 0c313c2..e485230 100644 --- a/docs/搜索API速查表.md +++ b/docs/搜索API速查表.md @@ -19,7 +19,7 @@ POST /search/ "filters": { "category_name": "手机", // 单值 "category1_name": "服装", // 一级类目 - "vendor_zh.keyword": ["奇乐", "品牌A"], // 多值(OR) + "vendor.zh.keyword": ["奇乐", "品牌A"], // 多值(OR) "tags": "手机", // 标签 // specifications 嵌套过滤 "specifications": { diff --git a/docs/相关性检索优化说明.md b/docs/相关性检索优化说明.md index 638029a..f3eeccf 100644 --- a/docs/相关性检索优化说明.md +++ b/docs/相关性检索优化说明.md @@ -20,7 +20,7 @@ { "multi_match": { "query": "戏水动物", - "fields": ["title_zh^3.0", "brief_zh^1.5", ...], + "fields": ["title.zh^3.0", "brief.zh^1.5", ...], "minimum_should_match": "67%", "tie_breaker": 0.9, "boost": 1, @@ -40,7 +40,7 @@ { "multi_match": { "_name": "base_query", - "fields": ["title_zh^3.0", "brief_zh^1.5", ...], + "fields": ["title.zh^3.0", "brief.zh^1.5", ...], "minimum_should_match": "75%", "operator": "AND", "query": "戏水动物", @@ -51,7 +51,7 @@ "multi_match": { "_name": "base_query_trans_en", "boost": 0.4, - "fields": ["title_en^3.0", ...], + "fields": ["title.en^3.0", ...], "minimum_should_match": "75%", "operator": "AND", "query": "water sports", @@ -61,7 +61,7 @@ { "multi_match": { "query": "戏水动物", - "fields": ["title_zh^3.0", "brief_zh^1.5", ...], + "fields": ["title.zh^3.0", "brief.zh^1.5", ...], "type": "phrase", "slop": 2, "boost": 1.0, @@ -71,7 +71,7 @@ { 
"multi_match": { "query": "戏水 动物", - "fields": ["title_zh^3.0", "brief_zh^1.5", ...], + "fields": ["title.zh^3.0", "brief.zh^1.5", ...], "operator": "AND", "tie_breaker": 0.9, "boost": 0.1, diff --git a/docs/系统设计文档.md b/docs/系统设计文档.md index 3aeaeb4..1b9890d 100644 --- a/docs/系统设计文档.md +++ b/docs/系统设计文档.md @@ -31,18 +31,18 @@ { "tenant_id": "1", "spu_id": "123", - "title_zh": "蓝牙耳机", - "title_en": "Bluetooth Headphones", - "brief_zh": "高品质蓝牙耳机", - "brief_en": "High-quality Bluetooth headphones", + "title.zh": "蓝牙耳机", + "title.en": "Bluetooth Headphones", + "brief.zh": "高品质蓝牙耳机", + "brief.en": "High-quality Bluetooth headphones", "category_name": "电子产品", - "category_path_zh": "电子产品/音频设备/耳机", - "category_path_en": "Electronics/Audio/Headphones", + "category_path.zh": "电子产品/音频设备/耳机", + "category_path.en": "Electronics/Audio/Headphones", "category1_name": "电子产品", "category2_name": "音频设备", "category3_name": "耳机", - "vendor_zh": "品牌A", - "vendor_en": "Brand A", + "vendor.zh": "品牌A", + "vendor.en": "Brand A", "min_price": 199.99, "max_price": 299.99, "option1_name": "color", @@ -93,7 +93,7 @@ - **查询配置硬编码**:查询相关配置(字段 boost、查询域等)硬编码在 `search/query_config.py` **索引结构特点**: -1. **多语言字段**:所有文本字段支持中英文(`title_zh/en`, `brief_zh/en`, `description_zh/en`, `vendor_zh/en`, `category_path_zh/en`, `category_name_zh/en`) +1. **多语言字段**:所有文本字段支持中英文(`title.zh/en`, `brief.zh/en`, `description.zh/en`, `vendor.zh/en`, `category_path.zh/en`, `category_name_text.zh/en`) 2. 
**嵌套字段**: - `skus`: SKU 嵌套数组(包含价格、库存、选项值等) - `specifications`: 规格嵌套数组(包含 name、value、sku_id) @@ -125,12 +125,12 @@ - `create_time`, `update_time` (date): 创建和更新时间 #### 多语言文本字段 -- `title_zh/en` (text): 标题(中英文) -- `brief_zh/en` (text): 短描述(中英文) -- `description_zh/en` (text): 详细描述(中英文) -- `vendor_zh/en` (text): 供应商/品牌(中英文) -- `category_path_zh/en` (text): 类目路径(中英文) -- `category_name_zh/en` (text): 类目名称(中英文) +- `title.zh/en` (text): 标题(中英文) +- `brief.zh/en` (text): 短描述(中英文) +- `description.zh/en` (text): 详细描述(中英文) +- `vendor.zh/en` (text): 供应商/品牌(中英文) +- `category_path.zh/en` (text): 类目路径(中英文) +- `category_name_text.zh/en` (text): 类目名称(中英文) **分析器配置**: - 中文字段:`hanlp_index`(索引时)/ `hanlp_standard`(查询时) @@ -282,7 +282,7 @@ indexes: 2. **数据转换**(`indexer/spu_transformer.py`): - 按`spu_id`和`tenant_id`关联SPU和SKU数据 - **多语言字段映射**: - - MySQL的`title` → ES的`title_zh`(英文字段设为空) + - MySQL的`title` → ES的`title.zh`(英文字段设为空) - 其他文本字段类似处理 - **分类字段映射**: - 从SPU表的`category_path`解析多级类目(`category1_name`, `category2_name`, `category3_name`) @@ -483,7 +483,7 @@ laptop AND (gaming OR professional) ANDNOT cheap - 过滤逻辑:不同维度(不同name)是AND关系,相同维度(相同name)的多个值是OR关系 - 使用ES的`nested`查询实现 - **text_recall**: 文本相关性召回 - - 同时搜索中英文字段(`title_zh/en`, `brief_zh/en`, `description_zh/en`, `vendor_zh/en`, `category_path_zh/en`, `category_name_zh/en`, `tags`) + - 同时搜索中英文字段(`title.zh/en`, `brief.zh/en`, `description.zh/en`, `vendor.zh/en`, `category_path.zh/en`, `category_name_text.zh/en`, `tags`) - 使用 `multi_match` 查询,支持字段 boost - 中文字段使用中文分词器,英文字段使用英文分析器 - **embedding_recall**: 向量召回(KNN) @@ -503,8 +503,8 @@ laptop AND (gaming OR professional) ANDNOT cheap "multi_match": { "query": "手机", "fields": [ - "title_zh^3.0", "title_en^3.0", - "brief_zh^1.5", "brief_en^1.5", + "title.zh^3.0", "title.en^3.0", + "brief.zh^1.5", "brief.en^1.5", ... 
] } diff --git a/docs/系统设计文档v1.md b/docs/系统设计文档v1.md index 63dfd61..8b32944 100644 --- a/docs/系统设计文档v1.md +++ b/docs/系统设计文档v1.md @@ -424,7 +424,7 @@ laptop AND (gaming OR professional) ANDNOT cheap - **结构**: `filters AND (text_recall OR embedding_recall)` - **filters**: 前端传递的过滤条件(永远起作用,放在 `filter` 中) - **text_recall**: 文本相关性召回 - - 同时搜索中英文字段(`title_zh/en`, `brief_zh/en`, `description_zh/en`, `vendor_zh/en`, `category_path_zh/en`, `category_name_zh/en`, `tags`) + - 同时搜索中英文字段(`title.zh/en`, `brief.zh/en`, `description.zh/en`, `vendor.zh/en`, `category_path.zh/en`, `category_name_text.zh/en`, `tags`) - 使用 `multi_match` 查询,支持字段 boost - **embedding_recall**: 向量召回(KNN) - 使用 `title_embedding` 字段进行 KNN 搜索 @@ -443,8 +443,8 @@ laptop AND (gaming OR professional) ANDNOT cheap "multi_match": { "query": "手机", "fields": [ - "title_zh^3.0", "title_en^3.0", - "brief_zh^1.5", "brief_en^1.5", + "title.zh^3.0", "title.en^3.0", + "brief.zh^1.5", "brief.en^1.5", ... ] } diff --git a/docs/索引字段说明v1.md b/docs/索引字段说明v1.md index 219bf41..c24cc61 100644 --- a/docs/索引字段说明v1.md +++ b/docs/索引字段说明v1.md @@ -539,7 +539,7 @@ title brief description seo_title seo_description seo_keywords vendor vendor_key 2. 
tenant - 数据灌入: 对每个tenant设置一一个语言,作为tenant的一个基本配置。 -写入索引的时候,根据语言配置将title 等文本字段写入对应的索引字段(比如 title_en) +写入索引的时候,根据语言配置将title 等文本字段写入对应的索引字段(比如 title.en) 查询的时候,将query转为商家所用语言,并到对应的field去查。 diff --git a/docs/索引字段说明v2-mapping结构.md b/docs/索引字段说明v2-mapping结构.md index 5698bbe..1c5598e 100644 --- a/docs/索引字段说明v2-mapping结构.md +++ b/docs/索引字段说明v2-mapping结构.md @@ -16,22 +16,22 @@ }, // 文本相关性相关字段 - "title_zh": { + "title.zh": { "type": "text", "analyzer": "hanlp_index", "search_analyzer": "hanlp_standard" }, - "brief_zh": { + "brief.zh": { "type": "text", "analyzer": "hanlp_index", "search_analyzer": "hanlp_standard" }, - "description_zh": { + "description.zh": { "type": "text", "analyzer": "hanlp_index", "search_analyzer": "hanlp_standard" }, - "vendor_zh": { + "vendor.zh": { "type": "text", "analyzer": "hanlp_index", "search_analyzer": "hanlp_standard", @@ -43,23 +43,23 @@ } }, - "title_en": { + "title.en": { "type": "text", "analyzer": "english", "search_analyzer": "english", }, - "brief_en": { + "brief.en": { "type": "text", "analyzer": "english", "search_analyzer": "english", }, - "description_en": { + "description.en": { "type": "text", "analyzer": "english", "search_analyzer": "english", }, - "vendor_en": { + "vendor.en": { "type": "text", "analyzer": "english", "search_analyzer": "english", @@ -103,22 +103,22 @@ }, // 分类相关 - "category_path_zh": { // 提供模糊查询功能,辅助相关性计算 + "category_path.zh": { // 提供模糊查询功能,辅助相关性计算 "type": "text", "analyzer": "hanlp_index", "search_analyzer": "hanlp_standard" }, - "category_path_en": { // 提供模糊查询功能,辅助相关性计算 + "category_path.en": { // 提供模糊查询功能,辅助相关性计算 "type": "text", "analyzer": "english", "search_analyzer": "english" }, - "category_name_zh": { // 提供模糊查询功能,辅助相关性计算 + "category_name_text.zh": { // 提供模糊查询功能,辅助相关性计算 "type": "text", "analyzer": "hanlp_index", "search_analyzer": "hanlp_standard" }, - "category_name_en": { // 提供模糊查询功能,辅助相关性计算 + "category_name_text.en": { // 提供模糊查询功能,辅助相关性计算 "type": "text", "analyzer": "english", "search_analyzer": 
"english" diff --git a/docs/索引字段说明v2-plan.md b/docs/索引字段说明v2-plan.md index a50c428..5cdbfa1 100644 --- a/docs/索引字段说明v2-plan.md +++ b/docs/索引字段说明v2-plan.md @@ -10,7 +10,7 @@ #### 1.1 多语言文本字段 -- 为文本字段添加中英文双字段支持(title_zh/title_en, brief_zh/brief_en, description_zh/description_en, vendor_zh/vendor_en) +- 为文本字段添加中英文双字段支持(title.zh/title.en, brief.zh/brief.en, description.zh/description.en, vendor.zh/vendor.en) - 中文字段使用 `index_ansj`/`query_ansj` 分析器(对应文档中的hanlp_index/hanlp_standard) - 英文字段使用 `english` 分析器 - **暂时只填充中文字段,英文字段设为空**(不需要语言检测,每个tenant的语言预先知道) @@ -28,22 +28,22 @@ category_path varchar(500) mapping: - "category_path_zh": { // 提供模糊查询功能,辅助相关性计算 + "category_path.zh": { // 提供模糊查询功能,辅助相关性计算 "type": "text", "analyzer": "hanlp_index", "search_analyzer": "hanlp_standard" }, - "category_path_en": { // 提供模糊查询功能,辅助相关性计算 + "category_path.en": { // 提供模糊查询功能,辅助相关性计算 "type": "text", "analyzer": "english", "search_analyzer": "english" }, - "category_name_zh": { // 提供模糊查询功能,辅助相关性计算 + "category_name_text.zh": { // 提供模糊查询功能,辅助相关性计算 "type": "text", "analyzer": "hanlp_index", "search_analyzer": "hanlp_standard" }, - "category_name_en": { // 提供模糊查询功能,辅助相关性计算 + "category_name_text.en": { // 提供模糊查询功能,辅助相关性计算 "type": "text", "analyzer": "english", "search_analyzer": "english" @@ -104,15 +104,15 @@ mapping: #### 2.1 多语言文本处理 -- **简化处理**:暂时只填充中文字段(title_zh, brief_zh, description_zh, vendor_zh) -- 英文字段(title_en, brief_en, description_en, vendor_en)设为空或None +- **简化处理**:暂时只填充中文字段(title.zh, brief.zh, description.zh, vendor.zh) +- 英文字段(title.en, brief.en, description.en, vendor.en)设为空或None - 不需要语言检测逻辑 #### 2.2 分类路径解析 - 从 `category_path` 字段按 "/" 分割提取分类层级 - 分割结果赋值给 `category1_name`, `category2_name`, `category3_name` -- 生成 `category_path_zh`(暂时填充,`category_path_en` 设为空) +- 生成 `category_path.zh`(暂时填充,`category_path.en` 设为空) #### 2.3 SKU字段展开计算 @@ -152,7 +152,7 @@ mapping: **文件**: `indexer/spu_transformer.py` -- **简化多语言处理**:只填充中文字段(title_zh, brief_zh等),英文字段设为空或None +- **简化多语言处理**:只填充中文字段(title.zh, 
brief.zh等),英文字段设为空或None - 实现分类路径的解析和展开 - 实现SKU字段的展开计算(价格、重量、库存) - 实现选项字段的处理 diff --git a/docs/索引字段说明v2.md b/docs/索引字段说明v2.md index 40900ec..6da30f4 100644 --- a/docs/索引字段说明v2.md +++ b/docs/索引字段说明v2.md @@ -28,24 +28,24 @@ | 字段名 | ES类型 | 分析器 | 说明 | 数据来源 | |--------|--------|--------|------|----------| -| `title_zh` | text | hanlp_index / hanlp_standard | 中文标题 | MySQL: `shoplazza_product_spu.title` | -| `title_en` | text | english | 英文标题 | 暂为空(待翻译服务填充) | +| `title.zh` | text | hanlp_index / hanlp_standard | 中文标题 | MySQL: `shoplazza_product_spu.title` | +| `title.en` | text | english | 英文标题 | 暂为空(待翻译服务填充) | #### 2.2 描述字段 | 字段名 | ES类型 | 分析器 | 说明 | 数据来源 | |--------|--------|--------|------|----------| -| `brief_zh` | text | hanlp_index / hanlp_standard | 中文短描述 | MySQL: `shoplazza_product_spu.brief` | -| `brief_en` | text | english | 英文短描述 | 暂为空 | -| `description_zh` | text | hanlp_index / hanlp_standard | 中文详细描述 | MySQL: `shoplazza_product_spu.description` | -| `description_en` | text | english | 英文详细描述 | 暂为空 | +| `brief.zh` | text | hanlp_index / hanlp_standard | 中文短描述 | MySQL: `shoplazza_product_spu.brief` | +| `brief.en` | text | english | 英文短描述 | 暂为空 | +| `description.zh` | text | hanlp_index / hanlp_standard | 中文详细描述 | MySQL: `shoplazza_product_spu.description` | +| `description.en` | text | english | 英文详细描述 | 暂为空 | #### 2.3 供应商/品牌字段 | 字段名 | ES类型 | 分析器 | 子字段 | 说明 | 数据来源 | |--------|--------|--------|--------|------|----------| -| `vendor_zh` | text | hanlp_index / hanlp_standard | `vendor_zh.keyword` (keyword, normalizer: lowercase) | 中文供应商/品牌 | MySQL: `shoplazza_product_spu.vendor` | -| `vendor_en` | text | english | `vendor_en.keyword` (keyword, normalizer: lowercase) | 英文供应商/品牌 | 暂为空 | +| `vendor.zh` | text | hanlp_index / hanlp_standard | `vendor.zh.keyword` (keyword, normalizer: lowercase) | 中文供应商/品牌 | MySQL: `shoplazza_product_spu.vendor` | +| `vendor.en` | text | english | `vendor.en.keyword` (keyword, normalizer: lowercase) | 英文供应商/品牌 | 暂为空 | **用途**: - 
`text` 类型:用于全文搜索(支持模糊匹配) @@ -65,15 +65,15 @@ | 字段名 | ES类型 | 分析器 | 说明 | 数据来源 | |--------|--------|--------|------|----------| -| `category_path_zh` | text | hanlp_index / hanlp_standard | 中文类目路径(如"服装/男装/衬衫") | MySQL: `shoplazza_product_spu.category_path` | -| `category_path_en` | text | english | 英文类目路径 | 暂为空 | +| `category_path.zh` | text | hanlp_index / hanlp_standard | 中文类目路径(如"服装/男装/衬衫") | MySQL: `shoplazza_product_spu.category_path` | +| `category_path.en` | text | english | 英文类目路径 | 暂为空 | #### 4.2 类目名称(用于搜索) | 字段名 | ES类型 | 分析器 | 说明 | 数据来源 | |--------|--------|--------|------|----------| -| `category_name_zh` | text | hanlp_index / hanlp_standard | 中文类目名称 | MySQL: `shoplazza_product_spu.category` | -| `category_name_en` | text | english | 英文类目名称 | 暂为空 | +| `category_name_text.zh` | text | hanlp_index / hanlp_standard | 中文类目名称 | MySQL: `shoplazza_product_spu.category` | +| `category_name_text.en` | text | english | 英文类目名称 | 暂为空 | #### 4.3 类目标识(用于过滤和分面) @@ -87,7 +87,7 @@ | `category3_name` | keyword | 三级类目名称 | 从 `category_path` 解析 | **用途**: -- `category_path_zh/en`, `category_name_zh/en`: 用于全文搜索,支持模糊匹配 +- `category_path.zh/en`, `category_name_text.zh/en`: 用于全文搜索,支持模糊匹配 - `category_id`, `category_name`, `category_level`, `category1/2/3_name`: 用于精确过滤和分面聚合 ### 5. 
规格字段(Specifications) @@ -348,20 +348,20 @@ ### 搜索字段(参与相关性计算) -- `title_zh`, `title_en` (boost: 3.0) -- `brief_zh`, `brief_en` (boost: 1.5) -- `description_zh`, `description_en` (boost: 1.0) -- `vendor_zh`, `vendor_en` (boost: 1.5) +- `title.zh`, `title.en` (boost: 3.0) +- `brief.zh`, `brief.en` (boost: 1.5) +- `description.zh`, `description.en` (boost: 1.0) +- `vendor.zh`, `vendor.en` (boost: 1.5) - `tags` (boost: 1.0) -- `category_path_zh`, `category_path_en` (boost: 1.5) -- `category_name_zh`, `category_name_en` (boost: 1.5) +- `category_path.zh`, `category_path.en` (boost: 1.5) +- `category_name_text.zh`, `category_name_text.en` (boost: 1.5) - `title_embedding` (向量召回,boost: 0.2) ### 过滤字段(精确匹配) - `tenant_id` (必需,多租户隔离) - `category_id`, `category_name`, `category1_name`, `category2_name`, `category3_name` -- `vendor_zh.keyword`, `vendor_en.keyword` +- `vendor.zh.keyword`, `vendor.en.keyword` - `specifications` (嵌套查询) - `min_price`, `max_price` (范围过滤) - `sales` (范围过滤) @@ -395,12 +395,12 @@ - `language="en"`: 优先返回 `*_en` 字段,如果为空则回退到 `*_zh` 字段 映射到前端字段: -- `title_zh/en` → `title` -- `brief_zh/en` → `brief` -- `description_zh/en` → `description` -- `vendor_zh/en` → `vendor` -- `category_path_zh/en` → `category_path` -- `category_name_zh/en` → `category_name` +- `title.zh/en` → `title` +- `brief.zh/en` → `brief` +- `description.zh/en` → `description` +- `vendor.zh/en` → `vendor` +- `category_path.zh/en` → `category_path` +- `category_name_text.zh/en` → `category_name` ### 规格数据构建 @@ -434,12 +434,12 @@ filters AND (text_recall OR embedding_recall) ### 文本召回字段 根据查询词的语言选择对应的索引字段: -- `title_zh^3.0`, `title_en^3.0` -- `brief_zh^1.5`, `brief_en^1.5` -- `description_zh^1.0`, `description_en^1.0` -- `vendor_zh^1.5`, `vendor_en^1.5` -- `category_path_zh^1.5`, `category_path_en^1.5` -- `category_name_zh^1.5`, `category_name_en^1.5` +- `title.zh^3.0`, `title.en^3.0` +- `brief.zh^1.5`, `brief.en^1.5` +- `description.zh^1.0`, `description.en^1.0` +- `vendor.zh^1.5`, `vendor.en^1.5` +- 
`category_path.zh^1.5`, `category_path.en^1.5` +- `category_name_text.zh^1.5`, `category_name_text.en^1.5` - `tags^1.0` ## 注意事项 diff --git a/docs/索引数据接口文档___old.md b/docs/索引数据接口文档___old.md index 8e968b6..22a7d5f 100644 --- a/docs/索引数据接口文档___old.md +++ b/docs/索引数据接口文档___old.md @@ -52,12 +52,12 @@ tenant_config: ### 配置规则 1. **主语言**:指定SKU表中 `title`、`brief`、`description`、`vendor` 等字段的语言。 - - 如果主语言是 `zh`,这些字段的值会填充到 `title_zh`、`brief_zh` 等字段 - - 如果主语言是 `en`,这些字段的值会填充到 `title_en`、`brief_en` 等字段 + - 如果主语言是 `zh`,这些字段的值会填充到 `title.zh`、`brief.zh` 等字段 + - 如果主语言是 `en`,这些字段的值会填充到 `title.en`、`brief.en` 等字段 2. **翻译配置**: - - `translate_to_en: true`:如果主语言是中文,则会将中文内容翻译为英文,填充到 `title_en` 等字段 - - `translate_to_zh: true`:如果主语言是英文,则会将英文内容翻译为中文,填充到 `title_zh` 等字段 + - `translate_to_en: true`:如果主语言是中文,则会将中文内容翻译为英文,填充到 `title.en` 等字段 + - `translate_to_zh: true`:如果主语言是英文,则会将英文内容翻译为中文,填充到 `title.zh` 等字段 - **注意**:如果主语言本身就是目标语言,则不会触发翻译(例如主语言是英文,`translate_to_en: true` 不会触发翻译) 3. **默认配置**:如果租户ID不在 `tenants` 中,则使用 `default` 配置。 @@ -72,8 +72,8 @@ tenant_config: "translate_to_zh": false } ``` -- SKU表的 `title` 字段(中文)→ `title_zh` -- 翻译服务将中文翻译为英文 → `title_en` +- SKU表的 `title` 字段(中文)→ `title.zh` +- 翻译服务将中文翻译为英文 → `title.en` **示例2:英文主语言,需要翻译中文** ```json @@ -83,8 +83,8 @@ tenant_config: "translate_to_zh": true } ``` -- SKU表的 `title` 字段(英文)→ `title_en` -- 翻译服务将英文翻译为中文 → `title_zh` +- SKU表的 `title` 字段(英文)→ `title.en` +- 翻译服务将英文翻译为中文 → `title.zh` **示例3:仅使用主语言,不翻译** ```json @@ -94,8 +94,8 @@ tenant_config: "translate_to_zh": false } ``` -- SKU表的 `title` 字段(中文)→ `title_zh` -- `title_en` 保持为 `null` +- SKU表的 `title` 字段(中文)→ `title.zh` +- `title.en` 保持为 `null` ### 配置更新 @@ -272,19 +272,19 @@ String json = response.body().string(); { "tenant_id": "1", "spu_id": "123", - "title_zh": "商品标题", - "title_en": null, - "brief_zh": "商品简介", - "brief_en": null, - "description_zh": "商品详细描述", - "description_en": null, - "vendor_zh": "供应商名称", - "vendor_en": null, + "title.zh": "商品标题", + "title.en": null, + "brief.zh": "商品简介", + 
"brief.en": null, + "description.zh": "商品详细描述", + "description.en": null, + "vendor.zh": "供应商名称", + "vendor.en": null, "tags": ["标签1", "标签2"], - "category_path_zh": "类目1/类目2/类目3", - "category_path_en": null, - "category_name_zh": "类目名称", - "category_name_en": null, + "category_path.zh": "类目1/类目2/类目3", + "category_path.en": null, + "category_name_text.zh": "类目名称", + "category_name_text.en": null, "category_id": "100", "category_name": "类目名称", "category_level": 3, @@ -453,11 +453,11 @@ for (String spuId : changedSpuIds) { |---------|--------|------|------| | 基础标识 | `tenant_id` | keyword | 租户ID | | 基础标识 | `spu_id` | keyword | SPU ID | -| 文本字段 | `title_zh`, `title_en` | text | 标题(中英文) | -| 文本字段 | `brief_zh`, `brief_en` | text | 简介(中英文) | -| 文本字段 | `description_zh`, `description_en` | text | 描述(中英文) | -| 文本字段 | `vendor_zh`, `vendor_en` | text | 供应商(中英文) | -| 类目字段 | `category_path_zh`, `category_path_en` | text | 类目路径(中英文) | +| 文本字段 | `title.zh`, `title.en` | text | 标题(中英文) | +| 文本字段 | `brief.zh`, `brief.en` | text | 简介(中英文) | +| 文本字段 | `description.zh`, `description.en` | text | 描述(中英文) | +| 文本字段 | `vendor.zh`, `vendor.en` | text | 供应商(中英文) | +| 类目字段 | `category_path.zh`, `category_path.en` | text | 类目路径(中英文) | | 类目字段 | `category1_name`, `category2_name`, `category3_name` | keyword | 分层类目名称 | | 价格字段 | `min_price`, `max_price` | float | 价格范围 | | 库存字段 | `total_inventory` | long | 总库存 | diff --git a/docs/索引方案.md b/docs/索引方案.md index a1c5f85..4e3b546 100644 --- a/docs/索引方案.md +++ b/docs/索引方案.md @@ -178,12 +178,12 @@ category_path varchar(500) 方案:采用方案2 4. categoryPath索引 + Prefix 查询(categoryPath.keyword: "服装/男装")(如果满足条件的key太多的则性能较差,比如 查询的是一级类目,类目树叶子节点太多时性能较差) 5. 
categoryPath支撑模糊查询 和 多级cate keyword索引支撑精确查询。 索引阶段冗余,查询性能高。 - "category_path_zh": { // 提供模糊查询功能,辅助相关性计算 + "category_path.zh": { // 提供模糊查询功能,辅助相关性计算 "type": "text", "analyzer": "hanlp_index", "search_analyzer": "hanlp_standard" }, - "category_path_en": { // 提供模糊查询功能,辅助相关性计算 + "category_path.en": { // 提供模糊查询功能,辅助相关性计算 "type": "text", "analyzer": "english", "search_analyzer": "english" diff --git a/docs/翻译功能测试说明.md b/docs/翻译功能测试说明.md index a524c8c..76bb4a9 100644 --- a/docs/翻译功能测试说明.md +++ b/docs/翻译功能测试说明.md @@ -22,8 +22,8 @@ ```yaml translation_prompts: # 商品标题翻译提示词 - product_title_zh: "请将原文翻译成中文商品SKU名称,要求:确保精确、完整地传达原文信息的基础上,语言简洁清晰、地道、专业。" - product_title_en: "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language." + product_title_zh: "请将原文翻译成中文商品SKU名称,要求:确保精确、完整地传达原文信息的基础上,语言简洁清晰、地道、专业。" + product_title_en: "Translate the original text into an English product SKU name. Requirements: Ensure accurate and complete transmission of the original information, with concise, clear, authentic, and professional language." # query翻译提示词 query_zh: "电商领域" query_en: "e-commerce domain" @@ -35,8 +35,8 @@ translation_prompts: ### 提示词使用规则 1. **商品标题翻译**: - - 中文→英文:使用 `product_title_en` - - 英文→中文:使用 `product_title_zh` + - 中文→英文:使用 `product_title_en` + - 英文→中文:使用 `product_title_zh` 2. 
**其他字段翻译**(brief, description, vendor): - 根据目标语言选择 `default_zh` 或 `default_en` @@ -72,7 +72,7 @@ translator = Translator( # 测试商品标题翻译 text = "蓝牙耳机" -prompt = config.query_config.translation_prompts.get('product_title_en') +prompt = config.query_config.translation_prompts.get('product_title_en') result = translator.translate( text, target_lang='en', @@ -140,8 +140,8 @@ doc = transformer.transform_spu_to_doc( options=pd.DataFrame() ) -print(f"title_zh: {doc.get('title_zh')}") -print(f"title_en: {doc.get('title_en')}") # 应该包含翻译结果 +print(f"title.zh: {doc.get('title', {}).get('zh')}") +print(f"title.en: {doc.get('title', {}).get('en')}") # 应该包含翻译结果 ``` ### 5. 测试缓存功能 diff --git a/indexer/document_transformer.py b/indexer/document_transformer.py index 69a4a92..a7a87b6 100644 --- a/indexer/document_transformer.py +++ b/indexer/document_transformer.py @@ -184,17 +184,40 @@ class SPUDocumentTransformer: translate_to_en = bool(self.tenant_config.get('translate_to_en')) translate_to_zh = bool(self.tenant_config.get('translate_to_zh')) + def _set_lang_obj(field_name: str, source_text: Optional[str], translations: Optional[Dict[str, str]] = None): + """ + Write multilingual text field as an object, e.g.: + doc[field_name] = {"zh": "...", "en": "..."} + Only writes keys based on tenant primary_language + translate_to_en/translate_to_zh. 
+ """ + if not source_text or not str(source_text).strip(): + return + + obj: Dict[str, str] = {} + src = str(source_text) + obj[primary_lang] = src + + tr = translations or {} + if translate_to_en and primary_lang != "en": + en_text = tr.get("en") + if en_text and str(en_text).strip(): + obj["en"] = str(en_text) + if translate_to_zh and primary_lang != "zh": + zh_text = tr.get("zh") + if zh_text and str(zh_text).strip(): + obj["zh"] = str(zh_text) + + if obj: + doc[field_name] = obj + # Title if pd.notna(spu_row.get('title')): title_text = str(spu_row['title']) - - # 使用translator的translate_for_indexing方法,自动处理多语言翻译 + + translations: Dict[str, str] = {} if self.translator: - # 根据目标语言选择对应的提示词 prompt_zh = self.translation_prompts.get('product_title_zh') or self.translation_prompts.get('default_zh') prompt_en = self.translation_prompts.get('product_title_en') or self.translation_prompts.get('default_en') - - # 调用translate_for_indexing,自动处理翻译逻辑 translations = self.translator.translate_for_indexing( title_text, shop_language=primary_lang, @@ -202,26 +225,14 @@ class SPUDocumentTransformer: prompt=prompt_zh if primary_lang == 'zh' else prompt_en, translate_to_en=translate_to_en, translate_to_zh=translate_to_zh, - ) - - # 填充翻译结果 - doc['title_zh'] = translations.get('zh') or (title_text if primary_lang == 'zh' else None) - doc['title_en'] = translations.get('en') or (title_text if primary_lang == 'en' else None) - else: - # 无翻译器,只填充主语言字段 - if primary_lang == 'zh': - doc['title_zh'] = title_text - doc['title_en'] = None - else: - doc['title_zh'] = None - doc['title_en'] = title_text - else: - doc['title_zh'] = None - doc['title_en'] = None + ) or {} + + _set_lang_obj("title", title_text, translations) # Brief if pd.notna(spu_row.get('brief')): brief_text = str(spu_row['brief']) + translations: Dict[str, str] = {} if self.translator: prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') translations = 
self.translator.translate_for_indexing( @@ -231,23 +242,13 @@ class SPUDocumentTransformer: prompt=prompt, translate_to_en=translate_to_en, translate_to_zh=translate_to_zh, - ) - doc['brief_zh'] = translations.get('zh') or (brief_text if primary_lang == 'zh' else None) - doc['brief_en'] = translations.get('en') or (brief_text if primary_lang == 'en' else None) - else: - if primary_lang == 'zh': - doc['brief_zh'] = brief_text - doc['brief_en'] = None - else: - doc['brief_zh'] = None - doc['brief_en'] = brief_text - else: - doc['brief_zh'] = None - doc['brief_en'] = None + ) or {} + _set_lang_obj("brief", brief_text, translations) # Description if pd.notna(spu_row.get('description')): desc_text = str(spu_row['description']) + translations: Dict[str, str] = {} if self.translator: prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') translations = self.translator.translate_for_indexing( @@ -257,23 +258,13 @@ class SPUDocumentTransformer: prompt=prompt, translate_to_en=translate_to_en, translate_to_zh=translate_to_zh, - ) - doc['description_zh'] = translations.get('zh') or (desc_text if primary_lang == 'zh' else None) - doc['description_en'] = translations.get('en') or (desc_text if primary_lang == 'en' else None) - else: - if primary_lang == 'zh': - doc['description_zh'] = desc_text - doc['description_en'] = None - else: - doc['description_zh'] = None - doc['description_en'] = desc_text - else: - doc['description_zh'] = None - doc['description_en'] = None + ) or {} + _set_lang_obj("description", desc_text, translations) # Vendor if pd.notna(spu_row.get('vendor')): vendor_text = str(spu_row['vendor']) + translations: Dict[str, str] = {} if self.translator: prompt = self.translation_prompts.get('default_zh') or self.translation_prompts.get('default_en') translations = self.translator.translate_for_indexing( @@ -283,19 +274,8 @@ class SPUDocumentTransformer: prompt=prompt, translate_to_en=translate_to_en, 
translate_to_zh=translate_to_zh, - ) - doc['vendor_zh'] = translations.get('zh') or (vendor_text if primary_lang == 'zh' else None) - doc['vendor_en'] = translations.get('en') or (vendor_text if primary_lang == 'en' else None) - else: - if primary_lang == 'zh': - doc['vendor_zh'] = vendor_text - doc['vendor_en'] = None - else: - doc['vendor_zh'] = None - doc['vendor_en'] = vendor_text - else: - doc['vendor_zh'] = None - doc['vendor_en'] = None + ) or {} + _set_lang_obj("vendor", vendor_text, translations) def _fill_category_fields(self, doc: Dict[str, Any], spu_row: pd.Series): """填充类目相关字段。""" @@ -303,6 +283,8 @@ class SPUDocumentTransformer: # - 当商品的类目ID在映射中不存在时,视为“不合法类目”,整条类目相关字段都不写入(当成没有类目) # - 仅记录错误日志,不阻塞索引流程 + primary_lang = self.tenant_config.get('primary_language', 'zh') + if pd.notna(spu_row.get('category_path')): category_path = str(spu_row['category_path']) @@ -329,8 +311,7 @@ class SPUDocumentTransformer: # 构建类目路径字符串(用于搜索) if category_names: category_path_str = '/'.join(category_names) - doc['category_path_zh'] = category_path_str - doc['category_path_en'] = None # 暂时设为空 + doc['category_path'] = {primary_lang: category_path_str} # 填充分层类目名称 if len(category_names) > 0: @@ -342,8 +323,7 @@ class SPUDocumentTransformer: elif pd.notna(spu_row.get('category')): # 如果category_path为空,使用category字段作为category1_name的备选 category = str(spu_row['category']) - doc['category_name_zh'] = category - doc['category_name_en'] = None + doc['category_name_text'] = {primary_lang: category} doc['category_name'] = category # 尝试从category字段解析多级分类 @@ -362,10 +342,8 @@ class SPUDocumentTransformer: if pd.notna(spu_row.get('category')): # 确保category相关字段都被设置(如果前面没有设置) category_name = str(spu_row['category']) - if 'category_name_zh' not in doc: - doc['category_name_zh'] = category_name - if 'category_name_en' not in doc: - doc['category_name_en'] = None + if 'category_name_text' not in doc: + doc['category_name_text'] = {primary_lang: category_name} if 'category_name' not in doc: 
doc['category_name'] = category_name @@ -587,13 +565,22 @@ class SPUDocumentTransformer: """ 填充标题向量化字段。 - 使用英文标题(title_en)生成embedding。如果title_en不存在,则使用title_zh。 + 使用英文标题(title.en)生成embedding。如果title.en不存在,则使用title.zh。 Args: doc: ES文档字典 """ - # 优先使用英文标题,如果没有则使用中文标题 - title_text = doc.get('title_en') or doc.get('title_zh') + # 优先使用英文标题,如果没有则使用中文标题;再没有则取任意可用语言 + title_obj = doc.get("title") or {} + if isinstance(title_obj, dict): + title_text = title_obj.get("en") or title_obj.get("zh") + if not title_text: + for v in title_obj.values(): + if v and str(v).strip(): + title_text = str(v) + break + else: + title_text = None if not title_text or not title_text.strip(): logger.debug(f"No title text available for embedding, SPU: {doc.get('spu_id')}") diff --git a/indexer/incremental_service.py b/indexer/incremental_service.py index 1170602..cd9f49b 100644 --- a/indexer/incremental_service.py +++ b/indexer/incremental_service.py @@ -130,7 +130,15 @@ class IncrementalIndexerService: # 单条场景下也可补齐 embedding(仍走缓存) if enable_embedding and encoder: - title_text = doc.get("title_en") or doc.get("title_zh") + title_obj = doc.get("title") or {} + title_text = None + if isinstance(title_obj, dict): + title_text = title_obj.get("en") or title_obj.get("zh") + if not title_text: + for v in title_obj.values(): + if v and str(v).strip(): + title_text = str(v) + break if title_text and str(title_text).strip(): try: embeddings = encoder.encode(title_text) @@ -560,7 +568,15 @@ class IncrementalIndexerService: title_texts: List[str] = [] title_doc_indices: List[int] = [] for i, (_, doc) in enumerate(documents): - title_text = doc.get("title_en") or doc.get("title_zh") + title_obj = doc.get("title") or {} + title_text = None + if isinstance(title_obj, dict): + title_text = title_obj.get("en") or title_obj.get("zh") + if not title_text: + for v in title_obj.values(): + if v and str(v).strip(): + title_text = str(v) + break if title_text and str(title_text).strip(): 
title_texts.append(str(title_text)) title_doc_indices.append(i) diff --git a/indexer/test_indexing.py b/indexer/test_indexing.py index c018054..58330d7 100755 --- a/indexer/test_indexing.py +++ b/indexer/test_indexing.py @@ -114,16 +114,17 @@ def test_full_indexing(tenant_id: str = "162"): print(f"\n文档 {i+1}:") print(f" SPU ID: {doc.get('spu_id')}") print(f" Tenant ID: {doc.get('tenant_id')}") - print(f" 标题 (中文): {doc.get('title_zh', 'N/A')}") - print(f" 标题 (英文): {doc.get('title_en', 'N/A')}") + title_obj = doc.get("title") or {} + print(f" 标题 (中文): {title_obj.get('zh', 'N/A') if isinstance(title_obj, dict) else 'N/A'}") + print(f" 标题 (英文): {title_obj.get('en', 'N/A') if isinstance(title_obj, dict) else 'N/A'}") # 检查租户162的翻译状态 if tenant_id == "162": - # 租户162翻译应该关闭,title_en应该为None - if doc.get('title_en') is None: - print(f" ✓ 翻译已关闭(title_en为None)") + # 租户162翻译应该关闭:只写入主语言,不应出现 title.en + if isinstance(title_obj, dict) and title_obj.get("en") is None: + print(f" ✓ 翻译已关闭(title.en为空)") else: - print(f" ⚠ 警告:翻译应该关闭,但title_en有值: {doc.get('title_en')}") + print(f" ⚠ 警告:翻译应该关闭,但title.en有值: {title_obj.get('en') if isinstance(title_obj, dict) else None}") return True @@ -192,17 +193,18 @@ def test_incremental_indexing(tenant_id: str = "162"): print(f"✓ SPU文档获取成功") print(f" SPU ID: {doc.get('spu_id')}") print(f" Tenant ID: {doc.get('tenant_id')}") - print(f" 标题 (中文): {doc.get('title_zh', 'N/A')}") - print(f" 标题 (英文): {doc.get('title_en', 'N/A')}") + title_obj = doc.get("title") or {} + print(f" 标题 (中文): {title_obj.get('zh', 'N/A') if isinstance(title_obj, dict) else 'N/A'}") + print(f" 标题 (英文): {title_obj.get('en', 'N/A') if isinstance(title_obj, dict) else 'N/A'}") print(f" SKU数量: {len(doc.get('skus', []))}") print(f" 规格数量: {len(doc.get('specifications', []))}") # 检查租户162的翻译状态 if tenant_id == "162": - if doc.get('title_en') is None: - print(f" ✓ 翻译已关闭(title_en为None)") + if isinstance(title_obj, dict) and title_obj.get("en") is None: + print(f" ✓ 翻译已关闭(title.en为空)") else: - 
print(f" ⚠ 警告:翻译应该关闭,但title_en有值: {doc.get('title_en')}") + print(f" ⚠ 警告:翻译应该关闭,但title.en有值: {title_obj.get('en') if isinstance(title_obj, dict) else None}") return True @@ -291,12 +293,13 @@ def test_document_transformer(): if doc: print(f"✓ 文档转换成功") - print(f" title_zh: {doc.get('title_zh')}") - print(f" title_en: {doc.get('title_en')}") + title_obj = doc.get("title") or {} + print(f" title.zh: {title_obj.get('zh') if isinstance(title_obj, dict) else None}") + print(f" title.en: {title_obj.get('en') if isinstance(title_obj, dict) else None}") print(f" SKU数量: {len(doc.get('skus', []))}") # 验证租户162翻译关闭 - if doc.get('title_en') is None: + if isinstance(title_obj, dict) and title_obj.get("en") is None: print(f" ✓ 翻译已关闭(符合租户162配置)") else: print(f" ⚠ 警告:翻译应该关闭") diff --git a/mappings/search_products.json b/mappings/search_products.json index 9d0a5a8..acda2df 100644 --- a/mappings/search_products.json +++ b/mappings/search_products.json @@ -4,22 +4,12 @@ "number_of_replicas": 0, "refresh_interval": "30s", "analysis": { - "analyzer": { - "hanlp_index": { - "type": "custom", - "tokenizer": "standard", - "filter": ["lowercase", "asciifolding"] - }, - "hanlp_standard": { - "type": "custom", - "tokenizer": "standard", - "filter": ["lowercase", "asciifolding"] - } - }, "normalizer": { "lowercase": { "type": "custom", - "filter": ["lowercase"] + "filter": [ + "lowercase" + ] } } }, @@ -45,75 +35,963 @@ "update_time": { "type": "date" }, - "title_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard" - }, - "qanchors_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard" - }, - "keywords_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard" - }, - "brief_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard" - }, - "description_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard" - }, - 
"vendor_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase" + "title": { + "type": "object", + "properties": { + "ar": { + "type": "text", + "analyzer": "arabic" + }, + "hy": { + "type": "text", + "analyzer": "armenian" + }, + "eu": { + "type": "text", + "analyzer": "basque" + }, + "pt_br": { + "type": "text", + "analyzer": "brazilian" + }, + "bg": { + "type": "text", + "analyzer": "bulgarian" + }, + "ca": { + "type": "text", + "analyzer": "catalan" + }, + "zh": { + "type": "text", + "analyzer": "icu_analyzer" + }, + "chinese": { + "type": "text", + "analyzer": "chinese" + }, + "cjk": { + "type": "text", + "analyzer": "cjk" + }, + "cs": { + "type": "text", + "analyzer": "czech" + }, + "da": { + "type": "text", + "analyzer": "danish" + }, + "nl": { + "type": "text", + "analyzer": "dutch" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "fi": { + "type": "text", + "analyzer": "finnish" + }, + "fr": { + "type": "text", + "analyzer": "french" + }, + "gl": { + "type": "text", + "analyzer": "galician" + }, + "de": { + "type": "text", + "analyzer": "german" + }, + "el": { + "type": "text", + "analyzer": "greek" + }, + "hi": { + "type": "text", + "analyzer": "hindi" + }, + "hu": { + "type": "text", + "analyzer": "hungarian" + }, + "id": { + "type": "text", + "analyzer": "indonesian" + }, + "it": { + "type": "text", + "analyzer": "italian" + }, + "no": { + "type": "text", + "analyzer": "norwegian" + }, + "fa": { + "type": "text", + "analyzer": "persian" + }, + "pt": { + "type": "text", + "analyzer": "portuguese" + }, + "ro": { + "type": "text", + "analyzer": "romanian" + }, + "ru": { + "type": "text", + "analyzer": "russian" + }, + "es": { + "type": "text", + "analyzer": "spanish" + }, + "sv": { + "type": "text", + "analyzer": "swedish" + }, + "tr": { + "type": "text", + "analyzer": "turkish" + }, + "th": { + "type": "text", + "analyzer": 
"thai" } } }, - "title_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english" - }, - "qanchors_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english" - }, - "keywords_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english" - }, - "brief_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english" - }, - "description_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english" - }, - "vendor_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase" + "qanchors": { + "type": "object", + "properties": { + "ar": { + "type": "text", + "analyzer": "arabic" + }, + "hy": { + "type": "text", + "analyzer": "armenian" + }, + "eu": { + "type": "text", + "analyzer": "basque" + }, + "pt_br": { + "type": "text", + "analyzer": "brazilian" + }, + "bg": { + "type": "text", + "analyzer": "bulgarian" + }, + "ca": { + "type": "text", + "analyzer": "catalan" + }, + "zh": { + "type": "text", + "analyzer": "icu_analyzer" + }, + "chinese": { + "type": "text", + "analyzer": "chinese" + }, + "cjk": { + "type": "text", + "analyzer": "cjk" + }, + "cs": { + "type": "text", + "analyzer": "czech" + }, + "da": { + "type": "text", + "analyzer": "danish" + }, + "nl": { + "type": "text", + "analyzer": "dutch" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "fi": { + "type": "text", + "analyzer": "finnish" + }, + "fr": { + "type": "text", + "analyzer": "french" + }, + "gl": { + "type": "text", + "analyzer": "galician" + }, + "de": { + "type": "text", + "analyzer": "german" + }, + "el": { + "type": "text", + "analyzer": "greek" + }, + "hi": { + "type": "text", + "analyzer": "hindi" + }, + "hu": { + "type": "text", + "analyzer": "hungarian" + }, + "id": { + "type": "text", + "analyzer": "indonesian" + }, + "it": { + "type": "text", + "analyzer": "italian" + }, + "no": { + 
"type": "text", + "analyzer": "norwegian" + }, + "fa": { + "type": "text", + "analyzer": "persian" + }, + "pt": { + "type": "text", + "analyzer": "portuguese" + }, + "ro": { + "type": "text", + "analyzer": "romanian" + }, + "ru": { + "type": "text", + "analyzer": "russian" + }, + "es": { + "type": "text", + "analyzer": "spanish" + }, + "sv": { + "type": "text", + "analyzer": "swedish" + }, + "tr": { + "type": "text", + "analyzer": "turkish" + }, + "th": { + "type": "text", + "analyzer": "thai" + } + } + }, + "keywords": { + "type": "object", + "properties": { + "ar": { + "type": "text", + "analyzer": "arabic" + }, + "hy": { + "type": "text", + "analyzer": "armenian" + }, + "eu": { + "type": "text", + "analyzer": "basque" + }, + "pt_br": { + "type": "text", + "analyzer": "brazilian" + }, + "bg": { + "type": "text", + "analyzer": "bulgarian" + }, + "ca": { + "type": "text", + "analyzer": "catalan" + }, + "zh": { + "type": "text", + "analyzer": "icu_analyzer" + }, + "chinese": { + "type": "text", + "analyzer": "chinese" + }, + "cjk": { + "type": "text", + "analyzer": "cjk" + }, + "cs": { + "type": "text", + "analyzer": "czech" + }, + "da": { + "type": "text", + "analyzer": "danish" + }, + "nl": { + "type": "text", + "analyzer": "dutch" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "fi": { + "type": "text", + "analyzer": "finnish" + }, + "fr": { + "type": "text", + "analyzer": "french" + }, + "gl": { + "type": "text", + "analyzer": "galician" + }, + "de": { + "type": "text", + "analyzer": "german" + }, + "el": { + "type": "text", + "analyzer": "greek" + }, + "hi": { + "type": "text", + "analyzer": "hindi" + }, + "hu": { + "type": "text", + "analyzer": "hungarian" + }, + "id": { + "type": "text", + "analyzer": "indonesian" + }, + "it": { + "type": "text", + "analyzer": "italian" + }, + "no": { + "type": "text", + "analyzer": "norwegian" + }, + "fa": { + "type": "text", + "analyzer": "persian" + }, + "pt": { + "type": "text", + "analyzer": "portuguese" 
+ }, + "ro": { + "type": "text", + "analyzer": "romanian" + }, + "ru": { + "type": "text", + "analyzer": "russian" + }, + "es": { + "type": "text", + "analyzer": "spanish" + }, + "sv": { + "type": "text", + "analyzer": "swedish" + }, + "tr": { + "type": "text", + "analyzer": "turkish" + }, + "th": { + "type": "text", + "analyzer": "thai" + } + } + }, + "brief": { + "type": "object", + "properties": { + "ar": { + "type": "text", + "analyzer": "arabic" + }, + "hy": { + "type": "text", + "analyzer": "armenian" + }, + "eu": { + "type": "text", + "analyzer": "basque" + }, + "pt_br": { + "type": "text", + "analyzer": "brazilian" + }, + "bg": { + "type": "text", + "analyzer": "bulgarian" + }, + "ca": { + "type": "text", + "analyzer": "catalan" + }, + "zh": { + "type": "text", + "analyzer": "icu_analyzer" + }, + "chinese": { + "type": "text", + "analyzer": "chinese" + }, + "cjk": { + "type": "text", + "analyzer": "cjk" + }, + "cs": { + "type": "text", + "analyzer": "czech" + }, + "da": { + "type": "text", + "analyzer": "danish" + }, + "nl": { + "type": "text", + "analyzer": "dutch" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "fi": { + "type": "text", + "analyzer": "finnish" + }, + "fr": { + "type": "text", + "analyzer": "french" + }, + "gl": { + "type": "text", + "analyzer": "galician" + }, + "de": { + "type": "text", + "analyzer": "german" + }, + "el": { + "type": "text", + "analyzer": "greek" + }, + "hi": { + "type": "text", + "analyzer": "hindi" + }, + "hu": { + "type": "text", + "analyzer": "hungarian" + }, + "id": { + "type": "text", + "analyzer": "indonesian" + }, + "it": { + "type": "text", + "analyzer": "italian" + }, + "no": { + "type": "text", + "analyzer": "norwegian" + }, + "fa": { + "type": "text", + "analyzer": "persian" + }, + "pt": { + "type": "text", + "analyzer": "portuguese" + }, + "ro": { + "type": "text", + "analyzer": "romanian" + }, + "ru": { + "type": "text", + "analyzer": "russian" + }, + "es": { + "type": "text", + "analyzer": 
"spanish" + }, + "sv": { + "type": "text", + "analyzer": "swedish" + }, + "tr": { + "type": "text", + "analyzer": "turkish" + }, + "th": { + "type": "text", + "analyzer": "thai" + } + } + }, + "description": { + "type": "object", + "properties": { + "ar": { + "type": "text", + "analyzer": "arabic" + }, + "hy": { + "type": "text", + "analyzer": "armenian" + }, + "eu": { + "type": "text", + "analyzer": "basque" + }, + "pt_br": { + "type": "text", + "analyzer": "brazilian" + }, + "bg": { + "type": "text", + "analyzer": "bulgarian" + }, + "ca": { + "type": "text", + "analyzer": "catalan" + }, + "zh": { + "type": "text", + "analyzer": "icu_analyzer" + }, + "chinese": { + "type": "text", + "analyzer": "chinese" + }, + "cjk": { + "type": "text", + "analyzer": "cjk" + }, + "cs": { + "type": "text", + "analyzer": "czech" + }, + "da": { + "type": "text", + "analyzer": "danish" + }, + "nl": { + "type": "text", + "analyzer": "dutch" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "fi": { + "type": "text", + "analyzer": "finnish" + }, + "fr": { + "type": "text", + "analyzer": "french" + }, + "gl": { + "type": "text", + "analyzer": "galician" + }, + "de": { + "type": "text", + "analyzer": "german" + }, + "el": { + "type": "text", + "analyzer": "greek" + }, + "hi": { + "type": "text", + "analyzer": "hindi" + }, + "hu": { + "type": "text", + "analyzer": "hungarian" + }, + "id": { + "type": "text", + "analyzer": "indonesian" + }, + "it": { + "type": "text", + "analyzer": "italian" + }, + "no": { + "type": "text", + "analyzer": "norwegian" + }, + "fa": { + "type": "text", + "analyzer": "persian" + }, + "pt": { + "type": "text", + "analyzer": "portuguese" + }, + "ro": { + "type": "text", + "analyzer": "romanian" + }, + "ru": { + "type": "text", + "analyzer": "russian" + }, + "es": { + "type": "text", + "analyzer": "spanish" + }, + "sv": { + "type": "text", + "analyzer": "swedish" + }, + "tr": { + "type": "text", + "analyzer": "turkish" + }, + "th": { + "type": "text", 
+ "analyzer": "thai" + } + } + }, + "vendor": { + "type": "object", + "properties": { + "ar": { + "type": "text", + "analyzer": "arabic", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "hy": { + "type": "text", + "analyzer": "armenian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "eu": { + "type": "text", + "analyzer": "basque", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "pt_br": { + "type": "text", + "analyzer": "brazilian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "bg": { + "type": "text", + "analyzer": "bulgarian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "ca": { + "type": "text", + "analyzer": "catalan", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "zh": { + "type": "text", + "analyzer": "icu_analyzer", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "chinese": { + "type": "text", + "analyzer": "chinese", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "cjk": { + "type": "text", + "analyzer": "cjk", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "cs": { + "type": "text", + "analyzer": "czech", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "da": { + "type": "text", + "analyzer": "danish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "nl": { + "type": "text", + "analyzer": "dutch", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "en": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "fi": { + "type": "text", + 
"analyzer": "finnish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "fr": { + "type": "text", + "analyzer": "french", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "gl": { + "type": "text", + "analyzer": "galician", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "de": { + "type": "text", + "analyzer": "german", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "el": { + "type": "text", + "analyzer": "greek", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "hi": { + "type": "text", + "analyzer": "hindi", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "hu": { + "type": "text", + "analyzer": "hungarian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "id": { + "type": "text", + "analyzer": "indonesian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "it": { + "type": "text", + "analyzer": "italian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "no": { + "type": "text", + "analyzer": "norwegian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "fa": { + "type": "text", + "analyzer": "persian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "pt": { + "type": "text", + "analyzer": "portuguese", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "ro": { + "type": "text", + "analyzer": "romanian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "ru": { + "type": "text", + "analyzer": "russian", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "es": 
{ + "type": "text", + "analyzer": "spanish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "sv": { + "type": "text", + "analyzer": "swedish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "tr": { + "type": "text", + "analyzer": "turkish", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } + }, + "th": { + "type": "text", + "analyzer": "thai", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase" + } + } } } }, @@ -158,25 +1036,263 @@ } } }, - "category_path_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard" - }, - "category_path_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english" - }, - "category_name_zh": { - "type": "text", - "analyzer": "hanlp_index", - "search_analyzer": "hanlp_standard" + "category_path": { + "type": "object", + "properties": { + "ar": { + "type": "text", + "analyzer": "arabic" + }, + "hy": { + "type": "text", + "analyzer": "armenian" + }, + "eu": { + "type": "text", + "analyzer": "basque" + }, + "pt_br": { + "type": "text", + "analyzer": "brazilian" + }, + "bg": { + "type": "text", + "analyzer": "bulgarian" + }, + "ca": { + "type": "text", + "analyzer": "catalan" + }, + "zh": { + "type": "text", + "analyzer": "icu_analyzer" + }, + "chinese": { + "type": "text", + "analyzer": "chinese" + }, + "cjk": { + "type": "text", + "analyzer": "cjk" + }, + "cs": { + "type": "text", + "analyzer": "czech" + }, + "da": { + "type": "text", + "analyzer": "danish" + }, + "nl": { + "type": "text", + "analyzer": "dutch" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "fi": { + "type": "text", + "analyzer": "finnish" + }, + "fr": { + "type": "text", + "analyzer": "french" + }, + "gl": { + "type": "text", + "analyzer": "galician" + }, + "de": { + "type": "text", + "analyzer": "german" + }, + "el": { + "type": "text", + 
"analyzer": "greek" + }, + "hi": { + "type": "text", + "analyzer": "hindi" + }, + "hu": { + "type": "text", + "analyzer": "hungarian" + }, + "id": { + "type": "text", + "analyzer": "indonesian" + }, + "it": { + "type": "text", + "analyzer": "italian" + }, + "no": { + "type": "text", + "analyzer": "norwegian" + }, + "fa": { + "type": "text", + "analyzer": "persian" + }, + "pt": { + "type": "text", + "analyzer": "portuguese" + }, + "ro": { + "type": "text", + "analyzer": "romanian" + }, + "ru": { + "type": "text", + "analyzer": "russian" + }, + "es": { + "type": "text", + "analyzer": "spanish" + }, + "sv": { + "type": "text", + "analyzer": "swedish" + }, + "tr": { + "type": "text", + "analyzer": "turkish" + }, + "th": { + "type": "text", + "analyzer": "thai" + } + } }, - "category_name_en": { - "type": "text", - "analyzer": "english", - "search_analyzer": "english" + "category_name_text": { + "type": "object", + "properties": { + "ar": { + "type": "text", + "analyzer": "arabic" + }, + "hy": { + "type": "text", + "analyzer": "armenian" + }, + "eu": { + "type": "text", + "analyzer": "basque" + }, + "pt_br": { + "type": "text", + "analyzer": "brazilian" + }, + "bg": { + "type": "text", + "analyzer": "bulgarian" + }, + "ca": { + "type": "text", + "analyzer": "catalan" + }, + "zh": { + "type": "text", + "analyzer": "icu_analyzer" + }, + "chinese": { + "type": "text", + "analyzer": "chinese" + }, + "cjk": { + "type": "text", + "analyzer": "cjk" + }, + "cs": { + "type": "text", + "analyzer": "czech" + }, + "da": { + "type": "text", + "analyzer": "danish" + }, + "nl": { + "type": "text", + "analyzer": "dutch" + }, + "en": { + "type": "text", + "analyzer": "english" + }, + "fi": { + "type": "text", + "analyzer": "finnish" + }, + "fr": { + "type": "text", + "analyzer": "french" + }, + "gl": { + "type": "text", + "analyzer": "galician" + }, + "de": { + "type": "text", + "analyzer": "german" + }, + "el": { + "type": "text", + "analyzer": "greek" + }, + "hi": { + "type": "text", 
+ "analyzer": "hindi" + }, + "hu": { + "type": "text", + "analyzer": "hungarian" + }, + "id": { + "type": "text", + "analyzer": "indonesian" + }, + "it": { + "type": "text", + "analyzer": "italian" + }, + "no": { + "type": "text", + "analyzer": "norwegian" + }, + "fa": { + "type": "text", + "analyzer": "persian" + }, + "pt": { + "type": "text", + "analyzer": "portuguese" + }, + "ro": { + "type": "text", + "analyzer": "romanian" + }, + "ru": { + "type": "text", + "analyzer": "russian" + }, + "es": { + "type": "text", + "analyzer": "spanish" + }, + "sv": { + "type": "text", + "analyzer": "swedish" + }, + "tr": { + "type": "text", + "analyzer": "turkish" + }, + "th": { + "type": "text", + "analyzer": "thai" + } + } }, "category_id": { "type": "keyword" @@ -294,4 +1410,3 @@ } } } - diff --git a/scripts/check_es_data.py b/scripts/check_es_data.py index ef9c735..bc3e753 100755 --- a/scripts/check_es_data.py +++ b/scripts/check_es_data.py @@ -28,12 +28,12 @@ def check_es_facet_fields(es_client, tenant_id: str, size: int = 5): "size": size, "_source": [ "spu_id", - "title_zh", + "title", "category1_name", "category2_name", "category3_name", "category_name", - "category_path_zh", + "category_path", "specifications", "option1_name", "option2_name", @@ -51,14 +51,16 @@ def check_es_facet_fields(es_client, tenant_id: str, size: int = 5): for i, hit in enumerate(hits, 1): source = hit.get('_source', {}) + title_obj = source.get("title") or {} + category_path_obj = source.get("category_path") or {} print(f"文档 {i}:") print(f" spu_id: {source.get('spu_id')}") - print(f" title_zh: {source.get('title_zh', '')[:50]}") + print(f" title.zh: {str(title_obj.get('zh', ''))[:50] if isinstance(title_obj, dict) else ''}") print(f" category1_name: {source.get('category1_name')}") print(f" category2_name: {source.get('category2_name')}") print(f" category3_name: {source.get('category3_name')}") print(f" category_name: {source.get('category_name')}") - print(f" category_path_zh: 
{source.get('category_path_zh')}") + print(f" category_path.zh: {category_path_obj.get('zh') if isinstance(category_path_obj, dict) else None}") print(f" option1_name: {source.get('option1_name')}") print(f" option2_name: {source.get('option2_name')}") print(f" option3_name: {source.get('option3_name')}") diff --git a/scripts/check_index_mapping.py b/scripts/check_index_mapping.py index 3640633..c7b06bd 100644 --- a/scripts/check_index_mapping.py +++ b/scripts/check_index_mapping.py @@ -20,11 +20,19 @@ def check_field_mapping(mapping_dict, field_path): current = mapping_dict for part in parts: - if isinstance(current, dict): - current = current.get(part) - if current is None: - return None - else: + if not isinstance(current, dict): + return None + + # ES mapping nesting: object fields store subfields under "properties" + if "properties" in current and isinstance(current["properties"], dict): + current = current["properties"] + + # multi-fields store subfields under "fields" (e.g. vendor.zh.keyword) + if part != parts[0] and "fields" in current and isinstance(current["fields"], dict) and part in current["fields"]: + current = current["fields"] + + current = current.get(part) + if current is None: return None return current @@ -70,12 +78,13 @@ def main(): # 检查关键字段 fields_to_check = [ - 'title_zh', - 'brief_zh', - 'description_zh', - 'vendor_zh', - 'category_path_zh', - 'category_name_zh' + "title.zh", + "brief.zh", + "description.zh", + "vendor.zh", + "vendor.zh.keyword", + "category_path.zh", + "category_name_text.zh" ] print("=" * 80) @@ -83,7 +92,7 @@ def main(): print("=" * 80) for field_name in fields_to_check: - field_mapping = index_mapping.get(field_name) + field_mapping = check_field_mapping(index_mapping, field_name) if field_mapping is None: print(f"\n❌ {field_name}: 字段不存在") diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 7b6bd77..eacc4fa 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -367,37 +367,37 
@@ class ESQueryBuilder: """ if language == 'zh': all_fields = [ - "title_zh^3.0", - "brief_zh^1.5", - "description_zh", - "vendor_zh^1.5", + "title.zh^3.0", + "brief.zh^1.5", + "description.zh", + "vendor.zh^1.5", "tags", - "category_path_zh^1.5", - "category_name_zh^1.5", + "category_path.zh^1.5", + "category_name_text.zh^1.5", "option1_values^0.5" ] core_fields = [ - "title_zh^3.0", - "brief_zh^1.5", - "vendor_zh^1.5", - "category_name_zh^1.5" + "title.zh^3.0", + "brief.zh^1.5", + "vendor.zh^1.5", + "category_name_text.zh^1.5" ] else: # en all_fields = [ - "title_en^3.0", - "brief_en^1.5", - "description_en", - "vendor_en^1.5", + "title.en^3.0", + "brief.en^1.5", + "description.en", + "vendor.en^1.5", "tags", - "category_path_en^1.5", - "category_name_en^1.5", + "category_path.en^1.5", + "category_name_text.en^1.5", "option1_values^0.5" ] core_fields = [ - "title_en^3.0", - "brief_en^1.5", - "vendor_en^1.5", - "category_name_en^1.5" + "title.en^3.0", + "brief.en^1.5", + "vendor.en^1.5", + "category_name_text.en^1.5" ] return all_fields, core_fields diff --git a/tests/conftest.py b/tests/conftest.py index 573d53e..49373c6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,7 +28,7 @@ def sample_index_config() -> IndexConfig: return IndexConfig( name="default", label="默认索引", - fields=["title_zh", "brief_zh", "tags"], + fields=["title.zh", "brief.zh", "tags"], boost=1.0 ) @@ -60,10 +60,10 @@ def sample_search_config(sample_index_config) -> SearchConfig: es_index_name="test_products", field_boosts={ "tenant_id": 1.0, - "title_zh": 3.0, - "brief_zh": 1.5, + "title.zh": 3.0, + "brief.zh": 1.5, "tags": 1.0, - "category_path_zh": 1.5, + "category_path.zh": 1.5, }, indexes=[sample_index_config], query_config=query_config, @@ -89,8 +89,8 @@ def mock_es_client() -> Mock: "_id": "1", "_score": 2.5, "_source": { - "title_zh": "红色连衣裙", - "vendor_zh": "测试品牌", + "title": {"zh": "红色连衣裙"}, + "vendor": {"zh": "测试品牌"}, "min_price": 299.0, "category_id": "1" } @@ -99,8 
+99,8 @@ def mock_es_client() -> Mock: "_id": "2", "_score": 2.2, "_source": { - "title_zh": "蓝色连衣裙", - "vendor_zh": "测试品牌", + "title": {"zh": "蓝色连衣裙"}, + "vendor": {"zh": "测试品牌"}, "min_price": 399.0, "category_id": "1" } @@ -142,8 +142,8 @@ def sample_search_results() -> Dict[str, Any]: "query": "红色连衣裙", "expected_total": 2, "expected_products": [ - {"title_zh": "红色连衣裙", "min_price": 299.0}, - {"title_zh": "蓝色连衣裙", "min_price": 399.0} + {"title": "红色连衣裙", "min_price": 299.0}, + {"title": "蓝色连衣裙", "min_price": 399.0} ] } @@ -157,16 +157,16 @@ def temp_config_file() -> Generator[str, None, None]: config_data = { "es_index_name": "test_products", "field_boosts": { - "title_zh": 3.0, - "brief_zh": 1.5, + "title.zh": 3.0, + "brief.zh": 1.5, "tags": 1.0, - "category_path_zh": 1.5 + "category_path.zh": 1.5 }, "indexes": [ { "name": "default", "label": "默认索引", - "fields": ["title_zh", "brief_zh", "tags"], + "fields": ["title.zh", "brief.zh", "tags"], "boost": 1.0 } ], -- libgit2 0.21.2