data/mai_jia_jing_ling/keywords/process_excel.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Process Excel files to extract keywords and ASINs by country
"""
import pandas as pd
import os
import re
from collections import defaultdict
def extract_country_from_filename(filename):
    """Extract country code from filename like KeywordResearch-US-202501-342779.xlsx"""
    match = re.match(r'KeywordResearch-(\w{2})-\d{6}-\d+\.xlsx', filename)
    if match:
        return match.group(1)
    return None
def process_excel_files():
    """Process all Excel files in current directory"""
    # Get all Excel files
    excel_files = [f for f in os.listdir('.') if f.endswith('.xlsx')]
    # Dictionary to store data by country
    # keywords: dict mapping keyword to max monthly search volume
    country_data = defaultdict(lambda: {
        'keywords': {},
        'asins': set()
    })
    print(f"Found {len(excel_files)} Excel files")
    # Process each file
    for file_path in excel_files:
        country = extract_country_from_filename(file_path)
        if not country:
            print(f"Could not extract country from {file_path}, skipping...")
            continue
        print(f"Processing {file_path} (Country: {country})")
        try:
            # Read Excel file using openpyxl engine
            df = pd.read_excel(file_path, engine='openpyxl')
            # Check if required columns exist
            if len(df.columns) < 4:
                print(f"  Insufficient columns in {file_path}")
                continue
            keyword_col = df.columns[0]      # "关键词"
            search_vol_col = df.columns[3]   # "月搜索量"
            asins_col = df.columns[-1]       # "前10ASIN"
            print(f"  Columns: {keyword_col}, {search_vol_col}, {asins_col}")
            # Process each row
            for _, row in df.iterrows():
                keyword = row[keyword_col]
                search_volume = row[search_vol_col]
                asins_str = row[asins_col]
                # Skip if keyword is NaN or empty
                if pd.isna(keyword) or not str(keyword).strip():
                    continue
                keyword_clean = str(keyword).strip()
                # Store keyword with max monthly search volume
                # If keyword already exists, keep the maximum search volume
                current_vol = country_data[country]['keywords'].get(keyword_clean, 0)
                new_vol = search_volume if not pd.isna(search_volume) else 0
                # Convert to numeric if it's a string
                if isinstance(new_vol, str):
                    try:
                        new_vol = float(new_vol)
                    except:
                        new_vol = 0
                if new_vol > current_vol:
                    country_data[country]['keywords'][keyword_clean] = int(new_vol)
                # Process ASINs if not NaN
                if not pd.isna(asins_str) and str(asins_str).strip():
                    asins = str(asins_str).split(',')
                    for asin in asins:
                        asin = asin.strip()
                        if asin:
                            country_data[country]['asins'].add(asin)
            print(f"  Processed {len(df)} rows")
        except Exception as e:
            print(f"  Error processing {file_path}: {e}")
            continue
    return country_data
def save_country_data(country_data):
    """Save keywords and ASINs for each country to separate files"""
    # Create output directory if it doesn't exist
    os.makedirs('output', exist_ok=True)
    for country, data in country_data.items():
        print(f"\nSaving data for {country}:")
        print(f"  Keywords: {len(data['keywords'])}")
        print(f"  ASINs: {len(data['asins'])}")
        # Save keywords with monthly search volume (tab-separated)
        keywords_file = f'output/{country}_keywords.txt'
        with open(keywords_file, 'w', encoding='utf-8') as f:
            # Sort by keyword alphabetically
            for keyword in sorted(data['keywords'].keys()):
                search_volume = data['keywords'][keyword]
                f.write(f"{keyword}\t{search_volume}\n")
        # Save ASINs (1000 per line, comma-separated)
        asins_file = f'output/{country}_asins.txt'
        sorted_asins = sorted(data['asins'])
        with open(asins_file, 'w', encoding='utf-8') as f:
            for i in range(0, len(sorted_asins), 1000):
                chunk = sorted_asins[i:i+1000]
                f.write(','.join(chunk) + '\n')
        print(f"  Saved to {keywords_file} and {asins_file}")
def main():
    print("Starting to process Excel files...")
    # Process all files
    country_data = process_excel_files()
    # Print summary
    print(f"\nSummary:")
    print(f"Processed {len(country_data)} countries:")
    for country in sorted(country_data.keys()):
        data = country_data[country]
        print(f"  {country}: {len(data['keywords'])} keywords, {len(data['asins'])} ASINs")
    # Save to files
    save_country_data(country_data)
    print(f"\nDone! Files saved in 'output' directory with format: keyword\\tmonthly_search_volume")
if __name__ == "__main__":
    main()