process_excel.py
3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Process Excel files to extract keywords and ASINs by country
"""
import pandas as pd
import os
import re
from collections import defaultdict
def extract_country_from_filename(filename):
"""Extract country code from filename like KeywordResearch-US-202501-342779.xlsx"""
match = re.match(r'KeywordResearch-(\w{2})-\d{6}-\d+\.xlsx', filename)
if match:
return match.group(1)
return None
def process_excel_files():
"""Process all Excel files in current directory"""
# Get all Excel files
excel_files = [f for f in os.listdir('.') if f.endswith('.xlsx')]
# Dictionary to store data by country
country_data = defaultdict(lambda: {
'keywords': set(),
'asins': set()
})
print(f"Found {len(excel_files)} Excel files")
# Process each file
for file_path in excel_files:
country = extract_country_from_filename(file_path)
if not country:
print(f"Could not extract country from {file_path}, skipping...")
continue
print(f"Processing {file_path} (Country: {country})")
try:
# Read Excel file using openpyxl engine
df = pd.read_excel(file_path, engine='openpyxl')
# Check if required columns exist
if len(df.columns) < 1:
print(f" No columns found in {file_path}")
continue
first_col = df.columns[0] # Should be "关键词"
last_col = df.columns[-1] # Should be "前10ASIN"
print(f" Columns: {first_col} -> {last_col}")
# Process each row
for _, row in df.iterrows():
keyword = row[first_col]
asins_str = row[last_col]
# Skip if keyword is NaN or empty
if pd.isna(keyword) or not str(keyword).strip():
continue
# Add keyword to country set
country_data[country]['keywords'].add(str(keyword).strip())
# Process ASINs if not NaN
if not pd.isna(asins_str) and str(asins_str).strip():
asins = str(asins_str).split(',')
for asin in asins:
asin = asin.strip()
if asin:
country_data[country]['asins'].add(asin)
print(f" Processed {len(df)} rows")
except Exception as e:
print(f" Error processing {file_path}: {e}")
continue
return country_data
def save_country_data(country_data):
"""Save keywords and ASINs for each country to separate files"""
# Create output directory if it doesn't exist
os.makedirs('output', exist_ok=True)
for country, data in country_data.items():
print(f"\nSaving data for {country}:")
print(f" Keywords: {len(data['keywords'])}")
print(f" ASINs: {len(data['asins'])}")
# Save keywords
keywords_file = f'output/{country}_keywords.txt'
with open(keywords_file, 'w', encoding='utf-8') as f:
for keyword in sorted(data['keywords']):
f.write(f"{keyword}\n")
# Save ASINs
asins_file = f'output/{country}_asins.txt'
with open(asins_file, 'w', encoding='utf-8') as f:
for asin in sorted(data['asins']):
f.write(f"{asin}\n")
print(f" Saved to {keywords_file} and {asins_file}")
def main():
print("Starting to process Excel files...")
# Process all files
country_data = process_excel_files()
# Print summary
print(f"\nSummary:")
print(f"Processed {len(country_data)} countries:")
for country in sorted(country_data.keys()):
data = country_data[country]
print(f" {country}: {len(data['keywords'])} keywords, {len(data['asins'])} ASINs")
# Save to files
save_country_data(country_data)
print(f"\nDone! Files saved in 'output' directory.")
if __name__ == "__main__":
main()