#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Process Excel files to extract keywords and ASINs by country
"""

import pandas as pd
import os
import re
from collections import defaultdict


def extract_country_from_filename(filename):
    """Extract the 2-letter country code from a filename.

    Expected pattern: KeywordResearch-US-202501-342779.xlsx
    Returns the country code (e.g. "US") or None if the name
    does not match.
    """
    # Anchored with $ so trailing junk (e.g. ".xlsx.bak") is rejected;
    # re.match alone only anchors at the start.
    match = re.match(r'KeywordResearch-(\w{2})-\d{6}-\d+\.xlsx$', filename)
    if match:
        return match.group(1)
    return None


def process_excel_files(directory='.'):
    """Scan *directory* for .xlsx files and aggregate per-country data.

    Returns a dict mapping country code -> {
        'keywords': dict of keyword -> max monthly search volume (int),
        'asins': set of ASIN strings,
    }

    Files whose name does not match the expected pattern are skipped;
    per-file read errors are reported and skipped.
    """
    # Get all Excel files
    excel_files = [f for f in os.listdir(directory) if f.endswith('.xlsx')]

    # Dictionary to store data by country.
    # keywords: dict mapping keyword to max monthly search volume
    country_data = defaultdict(lambda: {
        'keywords': {},
        'asins': set()
    })

    print(f"Found {len(excel_files)} Excel files")

    for file_path in excel_files:
        country = extract_country_from_filename(file_path)
        if not country:
            print(f"Could not extract country from {file_path}, skipping...")
            continue

        print(f"Processing {file_path} (Country: {country})")

        try:
            # Read Excel file using openpyxl engine
            df = pd.read_excel(os.path.join(directory, file_path),
                               engine='openpyxl')

            # Check if required columns exist (we index positionally below)
            if len(df.columns) < 4:
                print(f"  Insufficient columns in {file_path}")
                continue

            # Columns are identified by position, not by header text,
            # because the headers are localized (Chinese).
            keyword_col = df.columns[0]      # "关键词" (keyword)
            search_vol_col = df.columns[3]   # "月搜索量" (monthly search volume)
            asins_col = df.columns[-1]       # "前10ASIN" (top-10 ASINs)

            print(f"  Columns: {keyword_col}, {search_vol_col}, {asins_col}")

            for _, row in df.iterrows():
                keyword = row[keyword_col]
                search_volume = row[search_vol_col]
                asins_str = row[asins_col]

                # Skip rows with a missing/blank keyword
                if pd.isna(keyword) or not str(keyword).strip():
                    continue

                keyword_clean = str(keyword).strip()

                # Keep the maximum monthly search volume seen for a keyword
                # across all files for the same country.
                current_vol = country_data[country]['keywords'].get(
                    keyword_clean, 0)
                new_vol = search_volume if not pd.isna(search_volume) else 0

                # Coerce string volumes ("1234") to numeric; anything
                # unparseable counts as 0.
                if isinstance(new_vol, str):
                    try:
                        new_vol = float(new_vol)
                    except (ValueError, TypeError):
                        new_vol = 0

                if new_vol > current_vol:
                    country_data[country]['keywords'][keyword_clean] = int(new_vol)

                # ASIN cell holds a comma-separated list; dedupe via the set
                if not pd.isna(asins_str) and str(asins_str).strip():
                    for asin in str(asins_str).split(','):
                        asin = asin.strip()
                        if asin:
                            country_data[country]['asins'].add(asin)

            print(f"  Processed {len(df)} rows")

        except Exception as e:
            # Best-effort batch processing: report and move to the next file
            print(f"  Error processing {file_path}: {e}")
            continue

    return country_data


def save_country_data(country_data):
    """Save keywords and ASINs for each country to separate files.

    Writes output/{CC}_keywords.txt (tab-separated keyword\\tvolume,
    sorted alphabetically) and output/{CC}_asins.txt (sorted ASINs,
    1000 per line, comma-separated).
    """
    os.makedirs('output', exist_ok=True)

    for country, data in country_data.items():
        print(f"\nSaving data for {country}:")
        print(f"  Keywords: {len(data['keywords'])}")
        print(f"  ASINs: {len(data['asins'])}")

        # Save keywords with monthly search volume (tab-separated)
        keywords_file = f'output/{country}_keywords.txt'
        with open(keywords_file, 'w', encoding='utf-8') as f:
            for keyword in sorted(data['keywords'].keys()):
                search_volume = data['keywords'][keyword]
                f.write(f"{keyword}\t{search_volume}\n")

        # Save ASINs (1000 per line, comma-separated)
        asins_file = f'output/{country}_asins.txt'
        sorted_asins = sorted(data['asins'])
        with open(asins_file, 'w', encoding='utf-8') as f:
            for i in range(0, len(sorted_asins), 1000):
                chunk = sorted_asins[i:i + 1000]
                f.write(','.join(chunk) + '\n')

        print(f"  Saved to {keywords_file} and {asins_file}")


def main():
    """Run the full pipeline: scan, aggregate, summarize, save."""
    print("Starting to process Excel files...")

    country_data = process_excel_files()

    # Print summary
    print("\nSummary:")
    print(f"Processed {len(country_data)} countries:")
    for country in sorted(country_data.keys()):
        data = country_data[country]
        print(f"  {country}: {len(data['keywords'])} keywords, "
              f"{len(data['asins'])} ASINs")

    save_country_data(country_data)

    # NOTE: \\t is intentional — the message shows the literal "\t" token
    print(f"\nDone! Files saved in 'output' directory with format: "
          f"keyword\\tmonthly_search_volume")


if __name__ == "__main__":
    main()