e7f2b240
tangwang
first commit
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
"""
Script to download the Fashion Product Images Dataset from Kaggle.

Requirements:
    1. Install the Kaggle CLI: pip install kaggle
    2. Set up Kaggle API credentials:
       - Go to https://www.kaggle.com/settings/account
       - Click "Create New API Token"
       - Save kaggle.json to ~/.kaggle/kaggle.json
       - Run: chmod 600 ~/.kaggle/kaggle.json

Usage:
    python scripts/download_dataset.py
"""
import subprocess
import zipfile
from pathlib import Path
def download_dataset():
    """Download and extract the Fashion Product Images Dataset.

    The archive is fetched with the ``kaggle`` CLI into ``data/raw`` under
    the project root (taken to be the parent of this script's directory),
    unpacked in place, and the zip file is removed afterwards.

    If ``data/raw/styles.csv`` already exists, the user is asked on stdin
    whether to re-download. The function returns early, without raising,
    when the user declines or when ``~/.kaggle/kaggle.json`` is missing.
    Download and extraction failures are reported on stdout rather than
    propagated to the caller.

    Returns:
        None.
    """
    # Get project root (two levels up from this file).
    project_root = Path(__file__).parent.parent
    raw_data_path = project_root / "data" / "raw"

    # Check if data already exists
    if (raw_data_path / "styles.csv").exists():
        print("Dataset already exists in data/raw/")
        response = input("Do you want to re-download? (y/n): ")
        # Strip so that answers such as "y " or " Y" are still accepted.
        if response.strip().lower() != "y":
            print("Skipping download.")
            return

    # Check Kaggle credentials
    kaggle_json = Path.home() / ".kaggle" / "kaggle.json"
    if not kaggle_json.exists():
        print(" Kaggle API credentials not found!")
        return

    print("Downloading dataset from Kaggle...")
    try:
        # Make sure the destination exists so neither the download nor the
        # extraction depends on the directory having been created beforehand.
        raw_data_path.mkdir(parents=True, exist_ok=True)

        # Download using Kaggle API
        subprocess.run(
            [
                "kaggle",
                "datasets",
                "download",
                "-d",
                "paramaggarwal/fashion-product-images-dataset",
                "-p",
                str(raw_data_path),
            ],
            check=True,
        )
        print("Download complete!")

        # Extract zip file
        zip_path = raw_data_path / "fashion-product-images-dataset.zip"
        if zip_path.exists():
            print("Extracting files...")
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(raw_data_path)
            print("Extraction complete!")

            # Clean up zip file (only reached if extraction succeeded)
            zip_path.unlink()
            print("Cleaned up zip file")

        # Verify files
        styles_csv = raw_data_path / "styles.csv"
        images_dir = raw_data_path / "images"
        if styles_csv.exists() and images_dir.exists():
            # "\n" gives a blank separator line; the previous "\D" was an
            # invalid escape sequence that printed a literal backslash.
            print("\nDataset ready!")
            # Count images without building a list of every path.
            image_count = sum(1 for _ in images_dir.glob("*.jpg"))
            print(f"- Total images: {image_count:,}")
        else:
            print("Warning: Expected files not found")
    except subprocess.CalledProcessError:
        # The kaggle CLI exited with a non-zero status.
        print("Download failed!")
    except Exception as e:
        # Top-level boundary of a CLI script: report instead of a traceback.
        print(f"Error: {e}")
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    download_dataset()
|