Blame view

data/mai_jia_jing_ling/keywords/process_excel.py 5.09 KB
eb835c2a   tangwang   amazon keywords
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
  #!/usr/bin/env python3
  # -*- coding: utf-8 -*-
  """
  Process Excel files to extract keywords and ASINs by country
  """
  
  import pandas as pd
  import os
  import re
  from collections import defaultdict
  
  def extract_country_from_filename(filename):
      """Extract country code from filename like KeywordResearch-US-202501-342779.xlsx"""
      match = re.match(r'KeywordResearch-(\w{2})-\d{6}-\d+\.xlsx', filename)
      if match:
          return match.group(1)
      return None
  
  def process_excel_files():
      """Process all Excel files in current directory"""
  
      # Get all Excel files
      excel_files = [f for f in os.listdir('.') if f.endswith('.xlsx')]
  
      # Dictionary to store data by country
40f1e391   tangwang   cnclip
26
      # keywords: dict mapping keyword to max monthly search volume
eb835c2a   tangwang   amazon keywords
27
      country_data = defaultdict(lambda: {
40f1e391   tangwang   cnclip
28
          'keywords': {},
eb835c2a   tangwang   amazon keywords
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
          'asins': set()
      })
  
      print(f"Found {len(excel_files)} Excel files")
  
      # Process each file
      for file_path in excel_files:
          country = extract_country_from_filename(file_path)
          if not country:
              print(f"Could not extract country from {file_path}, skipping...")
              continue
  
          print(f"Processing {file_path} (Country: {country})")
  
          try:
              # Read Excel file using openpyxl engine
              df = pd.read_excel(file_path, engine='openpyxl')
  
              # Check if required columns exist
40f1e391   tangwang   cnclip
48
49
              if len(df.columns) < 4:
                  print(f"  Insufficient columns in {file_path}")
eb835c2a   tangwang   amazon keywords
50
51
                  continue
  
40f1e391   tangwang   cnclip
52
53
54
              keyword_col = df.columns[0]      # "关键词"
              search_vol_col = df.columns[3]   # "月搜索量"
              asins_col = df.columns[-1]       # "前10ASIN"
eb835c2a   tangwang   amazon keywords
55
  
40f1e391   tangwang   cnclip
56
              print(f"  Columns: {keyword_col}, {search_vol_col}, {asins_col}")
eb835c2a   tangwang   amazon keywords
57
58
59
  
              # Process each row
              for _, row in df.iterrows():
40f1e391   tangwang   cnclip
60
61
62
                  keyword = row[keyword_col]
                  search_volume = row[search_vol_col]
                  asins_str = row[asins_col]
eb835c2a   tangwang   amazon keywords
63
64
65
66
67
  
                  # Skip if keyword is NaN or empty
                  if pd.isna(keyword) or not str(keyword).strip():
                      continue
  
40f1e391   tangwang   cnclip
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
                  keyword_clean = str(keyword).strip()
  
                  # Store keyword with max monthly search volume
                  # If keyword already exists, keep the maximum search volume
                  current_vol = country_data[country]['keywords'].get(keyword_clean, 0)
                  new_vol = search_volume if not pd.isna(search_volume) else 0
  
                  # Convert to numeric if it's a string
                  if isinstance(new_vol, str):
                      try:
                          new_vol = float(new_vol)
                      except:
                          new_vol = 0
  
                  if new_vol > current_vol:
                      country_data[country]['keywords'][keyword_clean] = int(new_vol)
eb835c2a   tangwang   amazon keywords
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
  
                  # Process ASINs if not NaN
                  if not pd.isna(asins_str) and str(asins_str).strip():
                      asins = str(asins_str).split(',')
                      for asin in asins:
                          asin = asin.strip()
                          if asin:
                              country_data[country]['asins'].add(asin)
  
              print(f"  Processed {len(df)} rows")
  
          except Exception as e:
              print(f"  Error processing {file_path}: {e}")
              continue
  
      return country_data
  
  def save_country_data(country_data):
      """Save keywords and ASINs for each country to separate files"""
  
      # Create output directory if it doesn't exist
      os.makedirs('output', exist_ok=True)
  
      for country, data in country_data.items():
          print(f"\nSaving data for {country}:")
          print(f"  Keywords: {len(data['keywords'])}")
          print(f"  ASINs: {len(data['asins'])}")
  
40f1e391   tangwang   cnclip
112
          # Save keywords with monthly search volume (tab-separated)
eb835c2a   tangwang   amazon keywords
113
114
          keywords_file = f'output/{country}_keywords.txt'
          with open(keywords_file, 'w', encoding='utf-8') as f:
40f1e391   tangwang   cnclip
115
116
117
118
              # Sort by keyword alphabetically
              for keyword in sorted(data['keywords'].keys()):
                  search_volume = data['keywords'][keyword]
                  f.write(f"{keyword}\t{search_volume}\n")
eb835c2a   tangwang   amazon keywords
119
  
ad248a90   tangwang   1. facet 前端调试页面: ...
120
          # Save ASINs (1000 per line, comma-separated)
eb835c2a   tangwang   amazon keywords
121
          asins_file = f'output/{country}_asins.txt'
ad248a90   tangwang   1. facet 前端调试页面: ...
122
          sorted_asins = sorted(data['asins'])
eb835c2a   tangwang   amazon keywords
123
          with open(asins_file, 'w', encoding='utf-8') as f:
ad248a90   tangwang   1. facet 前端调试页面: ...
124
125
126
              for i in range(0, len(sorted_asins), 1000):
                  chunk = sorted_asins[i:i+1000]
                  f.write(','.join(chunk) + '\n')
eb835c2a   tangwang   amazon keywords
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
  
          print(f"  Saved to {keywords_file} and {asins_file}")
  
  def main():
      print("Starting to process Excel files...")
  
      # Process all files
      country_data = process_excel_files()
  
      # Print summary
      print(f"\nSummary:")
      print(f"Processed {len(country_data)} countries:")
      for country in sorted(country_data.keys()):
          data = country_data[country]
          print(f"  {country}: {len(data['keywords'])} keywords, {len(data['asins'])} ASINs")
  
      # Save to files
      save_country_data(country_data)
  
40f1e391   tangwang   cnclip
146
      print(f"\nDone! Files saved in 'output' directory with format: keyword\\tmonthly_search_volume")
eb835c2a   tangwang   amazon keywords
147
148
149
  
  if __name__ == "__main__":
      main()