process_excel.py
4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Process Excel files to extract keywords and ASINs by country
"""
import pandas as pd
import os
import re
from collections import defaultdict
def extract_country_from_filename(filename):
"""Extract country code from filename like KeywordResearch-US-202501-342779.xlsx"""
match = re.match(r'KeywordResearch-(\w{2})-\d{6}-\d+\.xlsx', filename)
if match:
return match.group(1)
return None
def process_excel_files():
"""Process all Excel files in current directory"""
# Get all Excel files
excel_files = [f for f in os.listdir('.') if f.endswith('.xlsx')]
# Dictionary to store data by country
country_data = defaultdict(lambda: {
'keywords': set(),
'asins': set()
})
print(f"Found {len(excel_files)} Excel files")
# Process each file
for file_path in excel_files:
country = extract_country_from_filename(file_path)
if not country:
print(f"Could not extract country from {file_path}, skipping...")
continue
print(f"Processing {file_path} (Country: {country})")
try:
# Read Excel file using openpyxl engine
df = pd.read_excel(file_path, engine='openpyxl')
# Check if required columns exist
if len(df.columns) < 1:
print(f" No columns found in {file_path}")
continue
first_col = df.columns[0] # Should be "关键词"
last_col = df.columns[-1] # Should be "前10ASIN"
print(f" Columns: {first_col} -> {last_col}")
# Process each row
for _, row in df.iterrows():
keyword = row[first_col]
asins_str = row[last_col]
# Skip if keyword is NaN or empty
if pd.isna(keyword) or not str(keyword).strip():
continue
# Add keyword to country set
country_data[country]['keywords'].add(str(keyword).strip())
# Process ASINs if not NaN
if not pd.isna(asins_str) and str(asins_str).strip():
asins = str(asins_str).split(',')
for asin in asins:
asin = asin.strip()
if asin:
country_data[country]['asins'].add(asin)
print(f" Processed {len(df)} rows")
except Exception as e:
print(f" Error processing {file_path}: {e}")
continue
return country_data
def save_country_data(country_data):
"""Save keywords and ASINs for each country to separate files"""
# Create output directory if it doesn't exist
os.makedirs('output', exist_ok=True)
for country, data in country_data.items():
print(f"\nSaving data for {country}:")
print(f" Keywords: {len(data['keywords'])}")
print(f" ASINs: {len(data['asins'])}")
# Save keywords
keywords_file = f'output/{country}_keywords.txt'
with open(keywords_file, 'w', encoding='utf-8') as f:
for keyword in sorted(data['keywords']):
f.write(f"{keyword}\n")
# Save ASINs (1000 per line, comma-separated)
asins_file = f'output/{country}_asins.txt'
sorted_asins = sorted(data['asins'])
with open(asins_file, 'w', encoding='utf-8') as f:
for i in range(0, len(sorted_asins), 1000):
chunk = sorted_asins[i:i+1000]
f.write(','.join(chunk) + '\n')
print(f" Saved to {keywords_file} and {asins_file}")
def main():
print("Starting to process Excel files...")
# Process all files
country_data = process_excel_files()
# Print summary
print(f"\nSummary:")
print(f"Processed {len(country_data)} countries:")
for country in sorted(country_data.keys()):
data = country_data[country]
print(f" {country}: {len(data['keywords'])} keywords, {len(data['asins'])} ASINs")
# Save to files
save_country_data(country_data)
print(f"\nDone! Files saved in 'output' directory.")
if __name__ == "__main__":
main()