eb835c2a
tangwang
amazon keywords
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Process Excel files to extract keywords and ASINs by country
"""
import pandas as pd
import os
import re
from collections import defaultdict
def extract_country_from_filename(filename):
    """Extract the two-letter country code from an export filename.

    Expected pattern: ``KeywordResearch-US-202501-342779.xlsx``
    (tool name, country, YYYYMM period, numeric export id).

    Args:
        filename: Base filename (no directory part) to parse.

    Returns:
        The two-letter country code (e.g. ``"US"``), or ``None`` when the
        filename does not match the expected pattern.
    """
    # fullmatch (not match) so trailing garbage like ".xlsx.bak" is rejected,
    # and [A-Za-z]{2} (not \w{2}) so digits/underscore can't pass as a country.
    match = re.fullmatch(r'KeywordResearch-([A-Za-z]{2})-\d{6}-\d+\.xlsx', filename)
    if match:
        return match.group(1)
    return None
def process_excel_files():
    """Scan the current directory for ``.xlsx`` exports and aggregate by country.

    For every file whose name matches the ``KeywordResearch-XX-...`` pattern,
    reads the sheet with pandas/openpyxl and collects:

    - ``keywords``: dict mapping keyword -> max monthly search volume seen
      across all files for that country (duplicates keep the maximum).
    - ``asins``: set of ASINs gathered from the comma-separated last column.

    Files that fail to parse are reported and skipped (best-effort batch run).

    Returns:
        defaultdict mapping country code -> ``{'keywords': dict, 'asins': set}``.
    """
    # Get all Excel files in the working directory.
    excel_files = [f for f in os.listdir('.') if f.endswith('.xlsx')]

    # Dictionary to store data by country.
    # keywords: dict mapping keyword to max monthly search volume
    country_data = defaultdict(lambda: {
        'keywords': {},
        'asins': set()
    })

    print(f"Found {len(excel_files)} Excel files")

    # Process each file
    for file_path in excel_files:
        country = extract_country_from_filename(file_path)
        if not country:
            print(f"Could not extract country from {file_path}, skipping...")
            continue

        print(f"Processing {file_path} (Country: {country})")
        try:
            # Read Excel file using openpyxl engine
            df = pd.read_excel(file_path, engine='openpyxl')

            # Columns are addressed positionally, so at least 4 are required.
            if len(df.columns) < 4:
                print(f"  Insufficient columns in {file_path}")
                continue

            # Positional column mapping (headers are in Chinese in the export):
            keyword_col = df.columns[0]     # "关键词" = keyword
            search_vol_col = df.columns[3]  # "月搜索量" = monthly search volume
            asins_col = df.columns[-1]      # "前10ASIN" = top-10 ASINs
            print(f"  Columns: {keyword_col}, {search_vol_col}, {asins_col}")

            # Process each row
            for _, row in df.iterrows():
                keyword = row[keyword_col]
                search_volume = row[search_vol_col]
                asins_str = row[asins_col]

                # Skip if keyword is NaN or empty
                if pd.isna(keyword) or not str(keyword).strip():
                    continue

                keyword_clean = str(keyword).strip()

                # Store keyword with max monthly search volume:
                # if the keyword already exists, keep the maximum.
                current_vol = country_data[country]['keywords'].get(keyword_clean, 0)
                new_vol = search_volume if not pd.isna(search_volume) else 0
                # Convert to numeric if the cell came through as a string.
                if isinstance(new_vol, str):
                    try:
                        new_vol = float(new_vol)
                    except (TypeError, ValueError):
                        # Non-numeric text (e.g. "-") counts as no volume.
                        new_vol = 0
                if new_vol > current_vol:
                    country_data[country]['keywords'][keyword_clean] = int(new_vol)

                # Process ASINs if not NaN: comma-separated list in last column.
                if not pd.isna(asins_str) and str(asins_str).strip():
                    asins = str(asins_str).split(',')
                    for asin in asins:
                        asin = asin.strip()
                        if asin:
                            country_data[country]['asins'].add(asin)

            print(f"  Processed {len(df)} rows")
        except Exception as e:
            # Best-effort batch: report the broken file and keep going.
            print(f"  Error processing {file_path}: {e}")
            continue

    return country_data
def save_country_data(country_data):
    """Save keywords and ASINs for each country to separate files.

    Writes, under an ``output/`` directory (created if missing):

    - ``output/{country}_keywords.txt``: one ``keyword<TAB>volume`` line per
      keyword, sorted alphabetically by keyword.
    - ``output/{country}_asins.txt``: ASINs sorted, 1000 per line,
      comma-separated (a chunked format for downstream bulk tools).

    Args:
        country_data: Mapping of country code -> ``{'keywords': dict,
            'asins': set}`` as produced by ``process_excel_files``.
    """
    # Create output directory if it doesn't exist
    os.makedirs('output', exist_ok=True)

    for country, data in country_data.items():
        print(f"\nSaving data for {country}:")
        print(f"  Keywords: {len(data['keywords'])}")
        print(f"  ASINs: {len(data['asins'])}")

        # Save keywords with monthly search volume (tab-separated)
        keywords_file = f'output/{country}_keywords.txt'
        with open(keywords_file, 'w', encoding='utf-8') as f:
            # Sort by keyword alphabetically for stable, diff-friendly output.
            for keyword in sorted(data['keywords'].keys()):
                search_volume = data['keywords'][keyword]
                f.write(f"{keyword}\t{search_volume}\n")

        # Save ASINs (1000 per line, comma-separated)
        asins_file = f'output/{country}_asins.txt'
        sorted_asins = sorted(data['asins'])
        with open(asins_file, 'w', encoding='utf-8') as f:
            for i in range(0, len(sorted_asins), 1000):
                chunk = sorted_asins[i:i + 1000]
                f.write(','.join(chunk) + '\n')

        print(f"  Saved to {keywords_file} and {asins_file}")
def main():
    """Entry point: process all Excel files, print a summary, save outputs."""
    print("Starting to process Excel files...")

    # Process all files in the current directory.
    country_data = process_excel_files()

    # Print summary
    print(f"\nSummary:")
    print(f"Processed {len(country_data)} countries:")
    for country in sorted(country_data.keys()):
        data = country_data[country]
        print(f"  {country}: {len(data['keywords'])} keywords, {len(data['asins'])} ASINs")

    # Save to files
    save_country_data(country_data)

    print(f"\nDone! Files saved in 'output' directory with format: keyword\\tmonthly_search_volume")


if __name__ == "__main__":
    main()
|