Blame view

scripts/check_index_mapping.py 5.18 KB
5ac64fc7   tangwang   多语言查询
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
  #!/usr/bin/env python3
  """
  检查ES索引的实际映射配置,特别是中文字段的analyzer设置
  """
  
  import os
  import sys
  import json
  from pathlib import Path
  
  sys.path.insert(0, str(Path(__file__).parent.parent))
  
  from utils.es_client import get_es_client_from_env
  from indexer.mapping_generator import DEFAULT_INDEX_NAME
  
  
  def check_field_mapping(mapping_dict, field_path):
      """Resolve a dotted field path inside an Elasticsearch mapping dict.

      The path is walked one segment at a time. Before each segment is looked
      up, a ``"properties"`` wrapper (object sub-fields) is unwrapped when
      present, and, for every segment after the first, a ``"fields"`` wrapper
      (multi-fields, e.g. ``vendor.zh.keyword``) is unwrapped when it contains
      the segment.

      Args:
          mapping_dict: Mapping dict to search, e.g. the ``properties`` dict of
              an index mapping.
          field_path: Dotted field path such as ``"vendor.zh.keyword"``.

      Returns:
          The value stored for the final path segment, or ``None`` when any
          segment cannot be resolved.
      """
      current = mapping_dict

      for depth, part in enumerate(field_path.split('.')):
          if not isinstance(current, dict):
              return None

          # Object fields keep their sub-fields under "properties".
          properties = current.get("properties")
          if isinstance(properties, dict):
              current = properties

          # Multi-fields keep their sub-fields under "fields". The first
          # segment is skipped by *position* (not by value), so a later
          # segment that happens to repeat the first segment's name is still
          # resolved through "fields".
          if depth > 0:
              multi_fields = current.get("fields")
              if isinstance(multi_fields, dict) and part in multi_fields:
                  current = multi_fields

          current = current.get(part)
          if current is None:
              return None

      return current
  
  
  def main():
      """Print the live mapping and analyzer settings of the default index.

      Connects to Elasticsearch, reports type / analyzer / search_analyzer and
      sub-fields for a fixed list of Chinese-language fields, then lists the
      analyzers defined in the index settings.

      Returns:
          0 when the check runs to completion, 1 when the connection, the
          index, or its mapping is unavailable.
      """
      separator = "=" * 80

      print(separator)
      print("检查 Elasticsearch 索引实际映射配置")
      print(separator)

      # Connect to ES
      try:
          es_client = get_es_client_from_env()
          if not es_client.ping():
              print("✗ 无法连接到 Elasticsearch")
              return 1
          print("✓ Elasticsearch 连接成功\n")
      except Exception as e:
          print(f"✗ 连接 Elasticsearch 失败: {e}")
          return 1

      index_name = DEFAULT_INDEX_NAME

      # Make sure the index exists
      if not es_client.index_exists(index_name):
          print(f"✗ 索引 '{index_name}' 不存在")
          return 1

      # Fetch the live mapping
      print(f"获取索引 '{index_name}' 的映射配置...\n")
      mapping = es_client.get_mapping(index_name)

      if not mapping:
          print("✗ 无法获取索引映射")
          return 1

      # Extract the mapping structure
      # Expected ES response shape: {index_name: {mappings: {properties: {...}}}}
      index_entry = mapping.get(index_name, {})
      mappings_section = index_entry.get('mappings', {})
      index_mapping = mappings_section.get('properties', {})

      if not index_mapping:
          print("✗ 无法解析映射结构")
          return 1

      # Key fields to inspect
      zh_field_paths = (
          "title.zh",
          "brief.zh",
          "description.zh",
          "vendor.zh",
          "vendor.zh.keyword",
          "category_path.zh",
          "category_name_text.zh",
      )

      print(separator)
      print("中文字段实际映射配置")
      print(separator)

      for field_name in zh_field_paths:
          field_mapping = check_field_mapping(index_mapping, field_name)

          if field_mapping is None:
              print(f"\n❌ {field_name}: 字段不存在")
              continue

          print(f"\n📋 {field_name}:")
          field_type = field_mapping.get('type', 'N/A')
          print(f"   类型: {field_type}")

          analyzer = field_mapping.get('analyzer')
          search_analyzer = field_mapping.get('search_analyzer')

          if not analyzer:
              print("   索引分析器 (analyzer): 未设置(使用默认)")
          else:
              print(f"   索引分析器 (analyzer): {analyzer}")

          if not search_analyzer:
              print("   查询分析器 (search_analyzer): 未设置(使用analyzer或默认)")
          else:
              print(f"   查询分析器 (search_analyzer): {search_analyzer}")

          # Report sub-fields (multi-fields), if any
          if 'fields' not in field_mapping:
              continue

          print("   子字段:")
          for sub_name, sub_spec in field_mapping['fields'].items():
              sub_type = sub_spec.get('type', 'N/A')
              print(f"     - {sub_name}: {sub_type}")
              if 'normalizer' in sub_spec:
                  normalizer = sub_spec['normalizer']
                  print(f"       normalizer: {normalizer}")

      # Analyzer definitions from the index settings
      print("\n" + separator)
      print("索引 Settings 中的 Analyzer 定义")
      print(separator)

      try:
          settings = es_client.client.indices.get_settings(index=index_name)
          settings_entry = settings.get(index_name, {})
          index_settings = settings_entry.get('settings', {}).get('index', {})
          analyzers = index_settings.get('analysis', {}).get('analyzer', {})

          if not analyzers:
              print("\n⚠ 未找到自定义 analyzer 定义")
          else:
              print("\n定义的 Analyzer:")
              for analyzer_id, spec in analyzers.items():
                  print(f"\n  {analyzer_id}:")
                  if not isinstance(spec, dict):
                      print(f"    配置: {spec}")
                      continue
                  spec_type = spec.get('type', 'N/A')
                  print(f"    类型: {spec_type}")
                  if 'tokenizer' in spec:
                      tokenizer = spec['tokenizer']
                      print(f"    tokenizer: {tokenizer}")
                  if 'filter' in spec:
                      token_filters = spec['filter']
                      print(f"    filter: {token_filters}")

      except Exception as e:
          print(f"\n⚠ 无法获取 settings: {e}")

      print("\n" + separator)
      print("检查完成")
      print(separator)

      return 0
  
  
  # Script entry point: the process exit status is main()'s return value.
  if __name__ == "__main__":
      raise SystemExit(main())