# debug_query.py
#!/usr/bin/env python3
"""
Debug script to investigate why query "车" returns no results for tenant_id=2
"""

import json
import os
import sys
# Import ESClient directly to avoid sqlalchemy dependency
from elasticsearch import Elasticsearch
from typing import Dict, Any, List, Optional

class ESClient:
    """Thin wrapper around ``Elasticsearch`` used by this debugging script.

    Every method is best-effort: failures are converted into a neutral value
    (``False``, ``0`` or an empty hit list) instead of being raised, so the
    script can keep running its remaining checks.  Only ``Exception``
    subclasses are absorbed; ``KeyboardInterrupt`` and ``SystemExit`` still
    propagate so the script can be interrupted.
    """

    def __init__(self, hosts: Optional[List[str]] = None, username: Optional[str] = None, password: Optional[str] = None):
        """Create the underlying client.

        Args:
            hosts: Elasticsearch URLs.  When ``None``, a single host is taken
                from the ``ES_HOST`` environment variable, falling back to
                ``http://localhost:9200``.
            username: Basic-auth user; used only when ``password`` is also set.
            password: Basic-auth password; used only when ``username`` is also set.
        """
        if hosts is None:
            hosts = [os.getenv('ES_HOST', 'http://localhost:9200')]
        client_config = {'hosts': hosts, 'timeout': 30}
        if username and password:
            client_config['http_auth'] = (username, password)
        self.client = Elasticsearch(**client_config)

    def ping(self) -> bool:
        """Return ``True`` when the cluster answers a ping, ``False`` otherwise."""
        try:
            return bool(self.client.ping())
        except Exception as exc:
            # Report the reason on stderr; the caller only sees a boolean.
            print(f"WARNING: ping failed: {exc}", file=sys.stderr)
            return False

    def index_exists(self, index_name: str) -> bool:
        """Return whether ``index_name`` exists.

        Unlike the other methods, errors from the client are not caught here.
        """
        # Coerce to a real bool so the return value matches the annotation
        # whatever truthy/falsy object the client hands back.
        return bool(self.client.indices.exists(index=index_name))

    def count(self, index_name: str, body: Optional[Dict[str, Any]] = None) -> int:
        """Return the number of documents matching ``body`` in ``index_name``.

        Returns ``0`` when the request fails; the failure is reported on
        stderr so that it is not mistaken for a genuinely empty result.
        """
        try:
            result = self.client.count(index=index_name, body=body)
            return result['count']
        except Exception as exc:
            print(f"WARNING: count on index '{index_name}' failed: {exc}", file=sys.stderr)
            return 0

    def search(self, index_name: str, body: Dict[str, Any], size: int = 10, from_: int = 0) -> Dict[str, Any]:
        """Run a search and return the raw response.

        Args:
            index_name: Index to search.
            body: Query DSL body.  It is not modified.
            size: Maximum number of hits to return.
            from_: Offset of the first hit.

        Returns:
            The client's response, or on failure an empty result of the form
            ``{'hits': {'total': {'value': 0}, 'hits': []}, 'error': <message>}``.
            Callers should check the ``'error'`` key to tell a failed request
            apart from a search that matched nothing.
        """
        # ``size`` and ``from_`` are always sent as keyword arguments, so drop
        # any copies from the body.  Elasticsearch documents that the
        # query-string value wins when both are supplied, so this keeps the
        # effective request the same.
        # NOTE(review): newer elasticsearch-py releases appear to reject a
        # value given both in ``body`` and as a keyword -- confirm against the
        # installed client version.
        request_body = body
        if isinstance(body, dict):
            request_body = {key: value for key, value in body.items() if key not in ('size', 'from')}
        try:
            return self.client.search(index=index_name, body=request_body, size=size, from_=from_)
        except Exception as e:
            return {'hits': {'total': {'value': 0}, 'hits': []}, 'error': str(e)}

from config.config_loader import ConfigLoader

# Tenant whose missing results are being investigated.  Kept as a string
# because every term filter in this script sends the value as a string.
_TENANT_ID = "2"

# Boosted fields targeted by the full-text queries.
_SEARCH_FIELDS = [
    "title^3.0",
    "brief^1.5",
    "description",
    "seo_title^2.0",
    "seo_description^1.5",
    "seo_keywords^2.0",
    "vendor^1.5",
    "product_type^1.5",
    "tags",
    "category^1.5",
]


def _tenant_filter() -> Dict[str, Any]:
    """Return a new term clause restricting a query to ``_TENANT_ID``."""
    return {"term": {"tenant_id": _TENANT_ID}}


def _total_hits(result: Dict[str, Any]) -> int:
    """Return the hit count stored under ``hits.total`` of a search response.

    ``hits.total`` is accepted either as a ``{"value": n}`` mapping or as a
    bare number; a missing entry counts as 0.
    """
    total = result.get('hits', {}).get('total', {})
    if isinstance(total, dict):
        return total.get('value', 0)
    return total


def _run_search(es_client: "ESClient", index_name: str, body: Dict[str, Any], size: int, label: str) -> Dict[str, Any]:
    """Run a search and print the error message if the request failed.

    ``ESClient.search`` converts exceptions into an empty result carrying an
    ``'error'`` key; without this check a failed request would be
    indistinguishable from a query that matched nothing.
    """
    result = es_client.search(index_name, body, size=size)
    error = result.get('error')
    if error:
        print(f"ERROR: search for {label} failed: {error}")
    return result


def _print_ranked_hits(result: Dict[str, Any], limit: int, show_tenant: bool = False) -> None:
    """Print up to ``limit`` hits as a numbered list of title and score."""
    for i, hit in enumerate(result.get('hits', {}).get('hits', [])[:limit], 1):
        source = hit.get('_source', {})
        # ``_score`` can be present but None; fall back to 0 so the
        # format spec does not raise.
        score = hit.get('_score') or 0
        print(f"  {i}. {source.get('title', 'N/A')} (score: {score:.3f})")
        if show_tenant:
            print(f"     tenant_id: {source.get('tenant_id', 'N/A')}")


def main() -> int:
    """Run the diagnostic sequence for the empty "车" search on tenant 2.

    Steps: load the config, connect to Elasticsearch, verify the index,
    count the tenant's documents, run a plain multi-field search for each
    probe term, replay the exact failing query, and -- if that query still
    finds nothing -- retry without ``minimum_should_match`` and print a few
    sample documents.

    Returns:
        1 when Elasticsearch is unreachable or the index does not exist,
        otherwise 0.
    """
    # Load config
    config = ConfigLoader("config/config.yaml").load_config()

    # ESClient only enables basic auth when both credentials are set, so the
    # (possibly missing) values can be passed straight through.
    es_host = os.getenv('ES_HOST', 'http://localhost:9200')
    es_client = ESClient(
        hosts=[es_host],
        username=os.getenv('ES_USERNAME'),
        password=os.getenv('ES_PASSWORD'),
    )

    if not es_client.ping():
        print(f"ERROR: Cannot connect to Elasticsearch at {es_host}")
        return 1

    index_name = config.es_index_name
    print(f"Using index: {index_name}")
    print(f"ES host: {es_host}\n")

    # 1. Check if index exists
    if not es_client.index_exists(index_name):
        print(f"ERROR: Index '{index_name}' does not exist")
        return 1

    # 2. Count documents for the tenant.  ESClient.count() also returns 0 when
    #    the request itself fails, so a zero here is not conclusive on its own.
    tenant_count = es_client.count(index_name, {"query": _tenant_filter()})
    print(f"Documents with tenant_id='{_TENANT_ID}': {tenant_count}")

    # 3. Check if there are documents containing "遥控车" or "车"
    for term in ["遥控车", "车"]:
        term_query = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "multi_match": {
                                "query": term,
                                "fields": list(_SEARCH_FIELDS),
                            }
                        }
                    ],
                    "filter": [_tenant_filter()],
                }
            },
            "size": 5,
        }
        result = _run_search(es_client, index_name, term_query, 5, f"'{term}'")
        total_value = _total_hits(result)
        print(f"\nSearch for '{term}' with tenant_id={_TENANT_ID}: {total_value} results")

        if total_value > 0:
            print("Sample results:")
            _print_ranked_hits(result, 3, show_tenant=True)

    # 4. Test the exact query structure from the user
    print("\n" + "="*60)
    print("Testing exact query structure from user:")
    print("="*60)

    exact_query = {
        "query": {
            "bool": {
                "must": [
                    {
                        "bool": {
                            "should": [
                                {
                                    "multi_match": {
                                        "query": "车",
                                        "fields": list(_SEARCH_FIELDS),
                                        "minimum_should_match": "67%",
                                        "tie_breaker": 0.9,
                                        "boost": 1,
                                        "_name": "default_query",
                                    }
                                }
                            ],
                            "minimum_should_match": 1,
                        }
                    }
                ],
                "filter": [_tenant_filter()],
            }
        },
        "size": 10,
    }

    print("\nQuery structure:")
    print(json.dumps(exact_query, indent=2, ensure_ascii=False))

    result = _run_search(es_client, index_name, exact_query, 10, "the exact query")
    total_value = _total_hits(result)

    # max_score is null/absent when nothing matched.
    max_score = result.get('hits', {}).get('max_score') or 0.0
    print(f"\nResults: {total_value} hits, max_score: {max_score}")

    if total_value != 0:
        print("\n✅ Found results!")
        print("\nTop results:")
        _print_ranked_hits(result, 5)
        return 0

    print("\n⚠️  No results found! Investigating...")

    # 5. Check if "车" matches without minimum_should_match
    print("\nTesting without minimum_should_match:")
    simple_query = {
        "query": {
            "bool": {
                "must": [
                    {
                        "multi_match": {
                            "query": "车",
                            "fields": [
                                "title^3.0",
                                "brief^1.5",
                                "description",
                            ],
                        }
                    }
                ],
                "filter": [_tenant_filter()],
            }
        },
        "size": 5,
    }
    simple_result = _run_search(es_client, index_name, simple_query, 5, "the relaxed query")
    print(f"Results without minimum_should_match: {_total_hits(simple_result)}")

    # 6. Check field values of a few of the tenant's documents
    print(f"\nChecking sample documents with tenant_id={_TENANT_ID}:")
    sample_query = {
        "query": _tenant_filter(),
        "size": 3,
        "_source": ["title", "brief", "description", "category", "tags", "tenant_id"],
    }
    sample_result = _run_search(es_client, index_name, sample_query, 3, "sample documents")
    for i, hit in enumerate(sample_result.get('hits', {}).get('hits', []), 1):
        source = hit.get('_source', {})
        brief = source.get('brief')
        print(f"\n  Document {i}:")
        print(f"    title: {source.get('title', 'N/A')}")
        # Only the first 50 characters of the brief are shown.
        print(f"    brief: {brief[:50] if brief else 'N/A'}")
        print(f"    category: {source.get('category', 'N/A')}")
        print(f"    tags: {source.get('tags', 'N/A')}")

    return 0

if __name__ == "__main__":
    sys.exit(main())