check_index_mapping.py 4.71 KB
#!/usr/bin/env python3
"""
检查ES索引的实际映射配置,特别是中文字段的analyzer设置
"""

import os
import sys
import json
from pathlib import Path

# Put the parent of this script's directory at the front of sys.path so the
# `utils` and `indexer` imports below resolve when the file is run directly.
# NOTE(review): presumably that directory is the project root - confirm.
sys.path.insert(0, str(Path(__file__).parent.parent))

from utils.es_client import get_es_client_from_env
from indexer.mapping_generator import DEFAULT_INDEX_NAME


def check_field_mapping(mapping_dict, field_path):
    """Look up a dot-separated path inside nested dictionaries.

    Args:
        mapping_dict: Outermost dictionary to start the lookup from.
        field_path: Keys joined by ``.``, e.g. ``"a.b.c"``.

    Returns:
        The value stored at the end of the path, or ``None`` when a key is
        missing, a stored value is ``None``, or a non-dict value is reached
        while path segments remain.
    """
    node = mapping_dict
    for key in field_path.split('.'):
        # Cannot descend into anything that is not a dictionary.
        if not isinstance(node, dict):
            return None
        node = node.get(key)
        if node is None:
            return None
    return node


def _print_banner(title, leading_blank=False):
    """Print ``title`` framed by two 80-column ``=`` rules.

    Args:
        title: Heading text printed between the two rules.
        leading_blank: When true, an empty line is emitted before the
            first rule.
    """
    rule = "=" * 80
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def _resolve_index_entry(response, index_name):
    """Return the per-index section of a response keyed by index name.

    Elasticsearch keys get-mapping / get-settings responses by the concrete
    index name, so a lookup by an alias misses. When ``index_name`` is not a
    key and the response holds exactly one index, that single entry is taken
    as the target.

    Args:
        response: Mapping-like object keyed by index name.
        index_name: Name (or alias) the request was issued for.

    Returns:
        The matching per-index dictionary, or an empty dict when the target
        cannot be determined unambiguously.
    """
    entry = response.get(index_name)
    if entry is None and len(response) == 1:
        entry = next(iter(response.values()))
    return entry or {}


def _report_field_mapping(field_name, field_mapping):
    """Print type, analyzers and sub-fields of one field's mapping.

    Args:
        field_name: Name of the field being reported.
        field_mapping: Mapping dictionary of the field, or ``None`` when the
            field is absent from the index mapping.
    """
    if field_mapping is None:
        print(f"\n❌ {field_name}: 字段不存在")
        return

    print(f"\n📋 {field_name}:")
    print(f"   类型: {field_mapping.get('type', 'N/A')}")

    analyzer = field_mapping.get('analyzer')
    if analyzer:
        print(f"   索引分析器 (analyzer): {analyzer}")
    else:
        print("   索引分析器 (analyzer): 未设置(使用默认)")

    search_analyzer = field_mapping.get('search_analyzer')
    if search_analyzer:
        print(f"   查询分析器 (search_analyzer): {search_analyzer}")
    else:
        print("   查询分析器 (search_analyzer): 未设置(使用analyzer或默认)")

    # Sub-fields (multi-fields) are listed with their type and normalizer.
    if 'fields' in field_mapping:
        print("   子字段:")
        for sub_field, sub_mapping in field_mapping['fields'].items():
            print(f"     - {sub_field}: {sub_mapping.get('type', 'N/A')}")
            if 'normalizer' in sub_mapping:
                print(f"       normalizer: {sub_mapping['normalizer']}")


def _report_analyzers(es_client, index_name):
    """Print the analyzers defined in the index settings.

    Best effort: any exception raised while fetching or reading the settings
    is reported as a warning instead of aborting the script.

    Args:
        es_client: Client wrapper exposing ``client.indices.get_settings``.
        index_name: Index whose settings are inspected.
    """
    try:
        settings = es_client.client.indices.get_settings(index=index_name)
        index_settings = (
            _resolve_index_entry(settings, index_name)
            .get('settings', {})
            .get('index', {})
        )
        analyzers = index_settings.get('analysis', {}).get('analyzer', {})

        if not analyzers:
            print("\n⚠ 未找到自定义 analyzer 定义")
            return

        print("\n定义的 Analyzer:")
        for analyzer_name, analyzer_config in analyzers.items():
            print(f"\n  {analyzer_name}:")
            if not isinstance(analyzer_config, dict):
                print(f"    配置: {analyzer_config}")
                continue
            print(f"    类型: {analyzer_config.get('type', 'N/A')}")
            if 'tokenizer' in analyzer_config:
                print(f"    tokenizer: {analyzer_config['tokenizer']}")
            if 'filter' in analyzer_config:
                print(f"    filter: {analyzer_config['filter']}")
    except Exception as e:
        print(f"\n⚠ 无法获取 settings: {e}")


def main():
    """Print the live mapping of the Chinese text fields of the default index.

    Connects to Elasticsearch, fetches the mapping of ``DEFAULT_INDEX_NAME``,
    reports type / analyzer / search_analyzer / sub-fields for each ``*_zh``
    field, then lists the analyzers defined in the index settings.

    Returns:
        ``0`` when the report was produced; ``1`` when the connection fails,
        the index does not exist, or the mapping cannot be fetched or parsed.
    """
    _print_banner("检查 Elasticsearch 索引实际映射配置")

    # Connect to Elasticsearch.
    try:
        es_client = get_es_client_from_env()
        if not es_client.ping():
            print("✗ 无法连接到 Elasticsearch")
            return 1
        print("✓ Elasticsearch 连接成功\n")
    except Exception as e:
        print(f"✗ 连接 Elasticsearch 失败: {e}")
        return 1

    index_name = DEFAULT_INDEX_NAME

    # Make sure the index exists before asking for its mapping.
    if not es_client.index_exists(index_name):
        print(f"✗ 索引 '{index_name}' 不存在")
        return 1

    # Fetch the live mapping.
    print(f"获取索引 '{index_name}' 的映射配置...\n")
    mapping = es_client.get_mapping(index_name)

    if not mapping:
        print("✗ 无法获取索引映射")
        return 1

    # Expected format: {index_name: {mappings: {properties: {...}}}}.
    # The outer key is the concrete index name, so resolve it leniently in
    # case ``index_name`` is an alias.
    index_mapping = (
        _resolve_index_entry(mapping, index_name)
        .get('mappings', {})
        .get('properties', {})
    )

    if not index_mapping:
        print("✗ 无法解析映射结构")
        return 1

    # Chinese text fields whose analyzer configuration is reported.
    fields_to_check = [
        'title_zh',
        'brief_zh',
        'description_zh',
        'vendor_zh',
        'category_path_zh',
        'category_name_zh',
    ]

    _print_banner("中文字段实际映射配置")
    for field_name in fields_to_check:
        _report_field_mapping(field_name, index_mapping.get(field_name))

    # Analyzer definitions from the index settings.
    _print_banner("索引 Settings 中的 Analyzer 定义", leading_blank=True)
    _report_analyzers(es_client, index_name)

    _print_banner("检查完成", leading_blank=True)

    return 0


if __name__ == '__main__':
    # Hand main()'s status code to the interpreter as the process exit code.
    raise SystemExit(main())