Blame view

scripts/download_dataset.py 2.56 KB
e7f2b240   tangwang   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
  """
  Script to download the Fashion Product Images Dataset from Kaggle
  
  Requirements:
  1. Install Kaggle CLI: pip install kaggle
  2. Set up Kaggle API credentials:
     - Go to https://www.kaggle.com/settings/account
     - Click "Create New API Token"
     - Save kaggle.json to ~/.kaggle/kaggle.json
     - chmod 600 ~/.kaggle/kaggle.json
  
  Usage:
      python scripts/download_dataset.py
  """
  
  import subprocess
  import zipfile
  from pathlib import Path
  
  
  def download_dataset():
      """Download and extract the Fashion Product Images Dataset from Kaggle.

      The archive is fetched with the ``kaggle`` command-line tool into
      ``data/raw`` under the project root (the parent of the directory that
      contains this script), extracted in place, and the zip is then deleted.
      If ``data/raw/styles.csv`` already exists, the user is asked on stdin
      whether to download again.

      Credentials are accepted either as ``~/.kaggle/kaggle.json`` or as the
      ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` environment variables (both must
      be set).

      Returns:
          None. Progress and failures are reported on stdout; exceptions
          raised while downloading or extracting are caught and printed
          rather than propagated.
      """
      import os  # function-scope import: only the credential check needs it

      # Resolve first so a relative ``__file__`` (e.g. the script started from
      # inside ``scripts/``) still yields the real project root.
      project_root = Path(__file__).resolve().parent.parent
      raw_data_path = project_root / "data" / "raw"

      # Check if data already exists
      if (raw_data_path / "styles.csv").exists():
          print("Dataset already exists in data/raw/")
          response = input("Do you want to re-download? (y/n): ")
          # Tolerate stray whitespace around the answer ("y " still means yes).
          if response.strip().lower() != "y":
              print("Skipping download.")
              return

      # Check Kaggle credentials: either the token file or both env variables.
      kaggle_json = Path.home() / ".kaggle" / "kaggle.json"
      has_env_credentials = bool(
          os.environ.get("KAGGLE_USERNAME") and os.environ.get("KAGGLE_KEY")
      )
      if not (kaggle_json.exists() or has_env_credentials):
          print(" Kaggle API credentials not found!")
          return

      print("Downloading dataset from Kaggle...")

      try:
          # Make sure the target directory exists before handing it to the
          # CLI; done inside the ``try`` so a failure is reported like any
          # other error instead of escaping as a traceback.
          raw_data_path.mkdir(parents=True, exist_ok=True)

          # Download using the Kaggle CLI; check=True turns a non-zero exit
          # status into CalledProcessError, handled below.
          subprocess.run(
              [
                  "kaggle",
                  "datasets",
                  "download",
                  "-d",
                  "paramaggarwal/fashion-product-images-dataset",
                  "-p",
                  str(raw_data_path),
              ],
              check=True,
          )

          print("Download complete!")

          # Extract zip file
          zip_path = raw_data_path / "fashion-product-images-dataset.zip"
          if zip_path.exists():
              print("Extracting files...")
              with zipfile.ZipFile(zip_path, "r") as zip_ref:
                  zip_ref.extractall(raw_data_path)

              print("Extraction complete!")

              # Clean up zip file (only reached when extraction succeeded)
              zip_path.unlink()
              print("Cleaned up zip file")

          # Verify files
          styles_csv = raw_data_path / "styles.csv"
          images_dir = raw_data_path / "images"

          if styles_csv.exists() and images_dir.exists():
              # "\n" (not the invalid escape "\D") puts a blank line before
              # the summary.
              print("\nDataset ready!")

              # Count images without materialising the whole listing
              image_count = sum(1 for _ in images_dir.glob("*.jpg"))
              print(f"- Total images: {image_count:,}")
          else:
              print("Warning: Expected files not found")

      except subprocess.CalledProcessError:
          # The kaggle CLI ran but exited with a non-zero status.
          print("Download failed!")

      except Exception as e:
          # Top-level boundary of a CLI script: report anything else (missing
          # ``kaggle`` executable, corrupt zip, filesystem errors) and return.
          print(f"Error: {e}")
  
  
  # Script entry point: run the download only when this file is executed
  # directly, not when it is imported as a module.
  if __name__ == "__main__":
      download_dataset()