check_index_mapping.py
4.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
"""
检查ES索引的实际映射配置,特别是中文字段的analyzer设置
"""
import os
import sys
import json
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from utils.es_client import get_es_client_from_env
from indexer.mapping_generator import DEFAULT_INDEX_NAME
def check_field_mapping(mapping_dict, field_path):
    """Look up the mapping stored under a dotted field path.

    Each dot-separated segment of ``field_path`` is first looked up as a
    literal key of the current dict, so explicit paths such as
    ``"title_zh.fields.keyword"`` keep working. If the literal key is
    missing, the segment is also searched inside the ``"properties"``
    (object sub-fields) and ``"fields"`` (multi-fields) containers of the
    current dict, so Elasticsearch-style paths such as ``"title_zh.keyword"``
    resolve as well.

    Args:
        mapping_dict: Nested dict to search, e.g. the ``properties`` dict of
            an index mapping.
        field_path: Dot-separated path, e.g. ``"title_zh"`` or
            ``"title_zh.keyword"``.

    Returns:
        The value found at the path, or ``None`` when any segment is missing
        or a non-dict value is reached before the path is exhausted.
    """
    current = mapping_dict
    for part in field_path.split('.'):
        if not isinstance(current, dict):
            # Ran into a leaf value while path segments remain.
            return None

        child = current.get(part)
        if child is None:
            # Literal key not present: fall back to the containers in which
            # Elasticsearch nests child field definitions.
            for container_key in ('properties', 'fields'):
                container = current.get(container_key)
                if isinstance(container, dict) and container.get(part) is not None:
                    child = container[part]
                    break

        if child is None:
            return None
        current = child
    return current
def _select_index_entry(response, index_name):
    """Return the per-index entry of a mapping/settings response, or ``{}``.

    The entry keyed by ``index_name`` is preferred. When that key is absent
    and the response holds exactly one entry, that entry is used instead:
    Elasticsearch keys its response by the concrete index name, which differs
    from ``index_name`` when the latter is an alias.
    """
    entry = response.get(index_name)
    if entry is None and len(response) == 1:
        only_key = next(iter(response))
        entry = response[only_key]
    return entry or {}


def _print_field_mapping(field_name, field_mapping):
    """Print the type, analyzers and sub-fields of one field mapping."""
    print(f"\n📋 {field_name}:")
    print(f" 类型: {field_mapping.get('type', 'N/A')}")

    analyzer = field_mapping.get('analyzer')
    search_analyzer = field_mapping.get('search_analyzer')

    if analyzer:
        print(f" 索引分析器 (analyzer): {analyzer}")
    else:
        print(" 索引分析器 (analyzer): 未设置(使用默认)")

    if search_analyzer:
        print(f" 查询分析器 (search_analyzer): {search_analyzer}")
    else:
        print(" 查询分析器 (search_analyzer): 未设置(使用analyzer或默认)")

    # Multi-fields (e.g. a keyword variant of a text field), if any.
    if 'fields' in field_mapping:
        print(" 子字段:")
        for sub_field, sub_mapping in field_mapping['fields'].items():
            print(f" - {sub_field}: {sub_mapping.get('type', 'N/A')}")
            if 'normalizer' in sub_mapping:
                print(f" normalizer: {sub_mapping['normalizer']}")


def _print_analyzer_definitions(es_client, index_name):
    """Print the analyzers defined in the index settings (best effort)."""
    # Any failure here is reported as a warning only; the field report that
    # was already printed stays valid.
    try:
        settings = es_client.client.indices.get_settings(index=index_name)
        index_settings = (
            _select_index_entry(settings, index_name)
            .get('settings', {})
            .get('index', {})
        )
        analyzers = index_settings.get('analysis', {}).get('analyzer', {})

        if analyzers:
            print("\n定义的 Analyzer:")
            for analyzer_name, analyzer_config in analyzers.items():
                print(f"\n {analyzer_name}:")
                if isinstance(analyzer_config, dict):
                    print(f" 类型: {analyzer_config.get('type', 'N/A')}")
                    if 'tokenizer' in analyzer_config:
                        print(f" tokenizer: {analyzer_config['tokenizer']}")
                    if 'filter' in analyzer_config:
                        print(f" filter: {analyzer_config['filter']}")
                else:
                    print(f" 配置: {analyzer_config}")
        else:
            print("\n⚠ 未找到自定义 analyzer 定义")
    except Exception as e:
        print(f"\n⚠ 无法获取 settings: {e}")


def main():
    """Print the live mapping of the Chinese text fields of the default index.

    Connects to Elasticsearch via ``get_es_client_from_env()``, reads the
    mapping of ``DEFAULT_INDEX_NAME``, prints type / analyzer /
    search_analyzer / sub-field information for a fixed list of ``*_zh``
    fields, and finally prints the analyzers defined in the index settings.

    Returns:
        ``0`` when the report was printed; ``1`` when the connection fails,
        the index does not exist, or its mapping cannot be obtained or parsed.
    """
    print("=" * 80)
    print("检查 Elasticsearch 索引实际映射配置")
    print("=" * 80)

    # Connect to ES.
    try:
        es_client = get_es_client_from_env()
        if not es_client.ping():
            print("✗ 无法连接到 Elasticsearch")
            return 1
        print("✓ Elasticsearch 连接成功\n")
    except Exception as e:
        print(f"✗ 连接 Elasticsearch 失败: {e}")
        return 1

    index_name = DEFAULT_INDEX_NAME

    # Make sure the index exists.
    if not es_client.index_exists(index_name):
        print(f"✗ 索引 '{index_name}' 不存在")
        return 1

    # Fetch the live mapping.
    print(f"获取索引 '{index_name}' 的映射配置...\n")
    mapping = es_client.get_mapping(index_name)
    if not mapping:
        print("✗ 无法获取索引映射")
        return 1

    # Expected response shape: {index_name: {mappings: {properties: {...}}}}.
    # The key may be the concrete index name when index_name is an alias,
    # which _select_index_entry accounts for.
    index_mapping = (
        _select_index_entry(mapping, index_name)
        .get('mappings', {})
        .get('properties', {})
    )
    if not index_mapping:
        print("✗ 无法解析映射结构")
        return 1

    # Fields whose analyzer configuration is reported.
    fields_to_check = [
        'title_zh',
        'brief_zh',
        'description_zh',
        'vendor_zh',
        'category_path_zh',
        'category_name_zh',
    ]

    print("=" * 80)
    print("中文字段实际映射配置")
    print("=" * 80)

    for field_name in fields_to_check:
        field_mapping = index_mapping.get(field_name)
        if field_mapping is None:
            print(f"\n❌ {field_name}: 字段不存在")
            continue
        _print_field_mapping(field_name, field_mapping)

    # Analyzer definitions from the index settings.
    print("\n" + "=" * 80)
    print("索引 Settings 中的 Analyzer 定义")
    print("=" * 80)
    _print_analyzer_definitions(es_client, index_name)

    print("\n" + "=" * 80)
    print("检查完成")
    print("=" * 80)

    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())