Blame view

data/mai_jia_jing_ling/keywords/process_excel.py 4.1 KB
eb835c2a   tangwang   amazon keywords
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
  #!/usr/bin/env python3
  # -*- coding: utf-8 -*-
  """
  Process Excel files to extract keywords and ASINs by country
  """
  
  import pandas as pd
  import os
  import re
  from collections import defaultdict
  
  def extract_country_from_filename(filename):
      """Extract country code from filename like KeywordResearch-US-202501-342779.xlsx"""
      match = re.match(r'KeywordResearch-(\w{2})-\d{6}-\d+\.xlsx', filename)
      if match:
          return match.group(1)
      return None
  
  def process_excel_files():
      """Process all Excel files in current directory"""
  
      # Get all Excel files
      excel_files = [f for f in os.listdir('.') if f.endswith('.xlsx')]
  
      # Dictionary to store data by country
      country_data = defaultdict(lambda: {
          'keywords': set(),
          'asins': set()
      })
  
      print(f"Found {len(excel_files)} Excel files")
  
      # Process each file
      for file_path in excel_files:
          country = extract_country_from_filename(file_path)
          if not country:
              print(f"Could not extract country from {file_path}, skipping...")
              continue
  
          print(f"Processing {file_path} (Country: {country})")
  
          try:
              # Read Excel file using openpyxl engine
              df = pd.read_excel(file_path, engine='openpyxl')
  
              # Check if required columns exist
              if len(df.columns) < 1:
                  print(f"  No columns found in {file_path}")
                  continue
  
              first_col = df.columns[0]  # Should be "关键词"
              last_col = df.columns[-1]   # Should be "前10ASIN"
  
              print(f"  Columns: {first_col} -> {last_col}")
  
              # Process each row
              for _, row in df.iterrows():
                  keyword = row[first_col]
                  asins_str = row[last_col]
  
                  # Skip if keyword is NaN or empty
                  if pd.isna(keyword) or not str(keyword).strip():
                      continue
  
                  # Add keyword to country set
                  country_data[country]['keywords'].add(str(keyword).strip())
  
                  # Process ASINs if not NaN
                  if not pd.isna(asins_str) and str(asins_str).strip():
                      asins = str(asins_str).split(',')
                      for asin in asins:
                          asin = asin.strip()
                          if asin:
                              country_data[country]['asins'].add(asin)
  
              print(f"  Processed {len(df)} rows")
  
          except Exception as e:
              print(f"  Error processing {file_path}: {e}")
              continue
  
      return country_data
  
  def save_country_data(country_data):
      """Save keywords and ASINs for each country to separate files"""
  
      # Create output directory if it doesn't exist
      os.makedirs('output', exist_ok=True)
  
      for country, data in country_data.items():
          print(f"\nSaving data for {country}:")
          print(f"  Keywords: {len(data['keywords'])}")
          print(f"  ASINs: {len(data['asins'])}")
  
          # Save keywords
          keywords_file = f'output/{country}_keywords.txt'
          with open(keywords_file, 'w', encoding='utf-8') as f:
              for keyword in sorted(data['keywords']):
                  f.write(f"{keyword}\n")
  
ad248a90   tangwang   1. facet 前端调试页面: ...
101
          # Save ASINs (1000 per line, comma-separated)
eb835c2a   tangwang   amazon keywords
102
          asins_file = f'output/{country}_asins.txt'
ad248a90   tangwang   1. facet 前端调试页面: ...
103
          sorted_asins = sorted(data['asins'])
eb835c2a   tangwang   amazon keywords
104
          with open(asins_file, 'w', encoding='utf-8') as f:
ad248a90   tangwang   1. facet 前端调试页面: ...
105
106
107
              for i in range(0, len(sorted_asins), 1000):
                  chunk = sorted_asins[i:i+1000]
                  f.write(','.join(chunk) + '\n')
eb835c2a   tangwang   amazon keywords
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
  
          print(f"  Saved to {keywords_file} and {asins_file}")
  
  def main():
      print("Starting to process Excel files...")
  
      # Process all files
      country_data = process_excel_files()
  
      # Print summary
      print(f"\nSummary:")
      print(f"Processed {len(country_data)} countries:")
      for country in sorted(country_data.keys()):
          data = country_data[country]
          print(f"  {country}: {len(data['keywords'])} keywords, {len(data['asins'])} ASINs")
  
      # Save to files
      save_country_data(country_data)
  
      print(f"\nDone! Files saved in 'output' directory.")
  
  if __name__ == "__main__":
      main()