#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Process Excel files to extract keywords and ASINs by country """ import pandas as pd import os import re from collections import defaultdict def extract_country_from_filename(filename): """Extract country code from filename like KeywordResearch-US-202501-342779.xlsx""" match = re.match(r'KeywordResearch-(\w{2})-\d{6}-\d+\.xlsx', filename) if match: return match.group(1) return None def process_excel_files(): """Process all Excel files in current directory""" # Get all Excel files excel_files = [f for f in os.listdir('.') if f.endswith('.xlsx')] # Dictionary to store data by country country_data = defaultdict(lambda: { 'keywords': set(), 'asins': set() }) print(f"Found {len(excel_files)} Excel files") # Process each file for file_path in excel_files: country = extract_country_from_filename(file_path) if not country: print(f"Could not extract country from {file_path}, skipping...") continue print(f"Processing {file_path} (Country: {country})") try: # Read Excel file using openpyxl engine df = pd.read_excel(file_path, engine='openpyxl') # Check if required columns exist if len(df.columns) < 1: print(f" No columns found in {file_path}") continue first_col = df.columns[0] # Should be "关键词" last_col = df.columns[-1] # Should be "前10ASIN" print(f" Columns: {first_col} -> {last_col}") # Process each row for _, row in df.iterrows(): keyword = row[first_col] asins_str = row[last_col] # Skip if keyword is NaN or empty if pd.isna(keyword) or not str(keyword).strip(): continue # Add keyword to country set country_data[country]['keywords'].add(str(keyword).strip()) # Process ASINs if not NaN if not pd.isna(asins_str) and str(asins_str).strip(): asins = str(asins_str).split(',') for asin in asins: asin = asin.strip() if asin: country_data[country]['asins'].add(asin) print(f" Processed {len(df)} rows") except Exception as e: print(f" Error processing {file_path}: {e}") continue return country_data def save_country_data(country_data): """Save keywords and ASINs for each country to separate files""" # Create output directory if it doesn't exist os.makedirs('output', exist_ok=True) for country, data in country_data.items(): print(f"\nSaving data for {country}:") print(f" Keywords: {len(data['keywords'])}") print(f" ASINs: {len(data['asins'])}") # Save keywords keywords_file = f'output/{country}_keywords.txt' with open(keywords_file, 'w', encoding='utf-8') as f: for keyword in sorted(data['keywords']): f.write(f"{keyword}\n") # Save ASINs (1000 per line, comma-separated) asins_file = f'output/{country}_asins.txt' sorted_asins = sorted(data['asins']) with open(asins_file, 'w', encoding='utf-8') as f: for i in range(0, len(sorted_asins), 1000): chunk = sorted_asins[i:i+1000] f.write(','.join(chunk) + '\n') print(f" Saved to {keywords_file} and {asins_file}") def main(): print("Starting to process Excel files...") # Process all files country_data = process_excel_files() # Print summary print(f"\nSummary:") print(f"Processed {len(country_data)} countries:") for country in sorted(country_data.keys()): data = country_data[country] print(f" {country}: {len(data['keywords'])} keywords, {len(data['asins'])} ASINs") # Save to files save_country_data(country_data) print(f"\nDone! Files saved in 'output' directory.") if __name__ == "__main__": main()