# download_dataset.py
"""
Script to download the Fashion Product Images Dataset from Kaggle

Requirements:
1. Install Kaggle CLI: pip install kaggle
2. Setup Kaggle API credentials:
   - Go to https://www.kaggle.com/settings/account
   - Click "Create New API Token"
   - Save kaggle.json to ~/.kaggle/kaggle.json
   - chmod 600 ~/.kaggle/kaggle.json

Usage:
    python scripts/download_dataset.py
"""

import subprocess
import zipfile
from pathlib import Path


def _has_kaggle_credentials():
    """Return True if Kaggle API credentials appear to be configured.

    Checks, in order:
    1. the KAGGLE_USERNAME and KAGGLE_KEY environment variables,
    2. kaggle.json inside $KAGGLE_CONFIG_DIR (when that variable is set),
    3. ~/.kaggle/kaggle.json.
    """
    import os  # local import so this block does not rely on a file-level import

    if os.environ.get("KAGGLE_USERNAME") and os.environ.get("KAGGLE_KEY"):
        return True

    config_dir = os.environ.get("KAGGLE_CONFIG_DIR")
    if config_dir and (Path(config_dir) / "kaggle.json").exists():
        return True

    return (Path.home() / ".kaggle" / "kaggle.json").exists()


def download_dataset():
    """Download and extract the Fashion Product Images Dataset.

    The dataset is stored in ``<project root>/data/raw``, where the project
    root is the parent of the directory containing this script.

    Steps:
    1. If ``styles.csv`` is already present, ask the user whether to
       re-download; anything other than ``y`` (or a closed stdin) skips.
    2. Make sure Kaggle credentials are available; otherwise print a message
       and return.
    3. Run ``kaggle datasets download`` into ``data/raw``.
    4. If the downloaded zip archive is present, extract it in place and
       delete the archive.
    5. Check that ``styles.csv`` and ``images/`` exist and report the number
       of ``*.jpg`` files directly inside ``images/``.

    Failures during download/extraction are reported on stdout rather than
    raised. Returns None in every case.
    """
    # Get project root
    project_root = Path(__file__).parent.parent
    raw_data_path = project_root / "data" / "raw"

    # Check if data already exists
    if (raw_data_path / "styles.csv").exists():
        print("Dataset already exists in data/raw/")
        try:
            response = input("Do you want to re-download? (y/n): ")
        except EOFError:
            # No interactive stdin (e.g. run from a pipeline): treat as "no".
            response = ""
        # strip() so that answers such as "y " or " Y" are still accepted.
        if response.strip().lower() != "y":
            print("Skipping download.")
            return

    # Check Kaggle credentials
    if not _has_kaggle_credentials():
        print(" Kaggle API credentials not found!")
        return

    print("Downloading dataset from Kaggle...")

    try:
        # Create the target directory up front so the download does not
        # depend on the CLI creating it.
        raw_data_path.mkdir(parents=True, exist_ok=True)

        # Download using Kaggle API; check=True raises CalledProcessError on
        # a non-zero exit status.
        subprocess.run(
            [
                "kaggle",
                "datasets",
                "download",
                "-d",
                "paramaggarwal/fashion-product-images-dataset",
                "-p",
                str(raw_data_path),
            ],
            check=True,
        )

        print("Download complete!")

        # Extract zip file
        zip_path = raw_data_path / "fashion-product-images-dataset.zip"
        if zip_path.exists():
            print("Extracting files...")
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(raw_data_path)

            print("Extraction complete!")

            # Clean up zip file
            zip_path.unlink()
            print("Cleaned up zip file")

        # Verify files
        styles_csv = raw_data_path / "styles.csv"
        images_dir = raw_data_path / "images"

        if styles_csv.exists() and images_dir.exists():
            # "\n" (not the invalid escape "\D") to print a blank line first.
            print("\nDataset ready!")

            # Count images without materialising the full list of paths.
            image_count = sum(1 for _ in images_dir.glob("*.jpg"))
            print(f"- Total images: {image_count:,}")
        else:
            print("Warning: Expected files not found")

    except subprocess.CalledProcessError:
        print("Download failed!")

    except Exception as e:
        # Top-level boundary of the script: report anything else (missing
        # kaggle executable, corrupt archive, permission errors, ...).
        print(f"Error: {e}")


# Run the download only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    download_dataset()