ingest.sh 1.16 KB
#!/bin/bash
#
# Data Ingestion Script for Customer1
#
# Usage: ingest.sh [LIMIT] [SKIP_EMBEDDINGS]
#   LIMIT            value forwarded to --limit (default: 1000)
#   SKIP_EMBEDDINGS  pass "true" to add --skip-embeddings (default: false)
#
# Environment:
#   CONDA_SH  path to conda's shell init script
#             (default: /home/tw/miniconda3/etc/profile.d/conda.sh)

# Abort as soon as any command fails.
set -e

# Work from the parent of the directory containing this script so the
# relative data/ paths used further down resolve. `--` keeps a path that
# starts with '-' from being parsed as an option.
cd -- "$(dirname -- "$0")/.."

# The conda init script defines the `conda` shell function needed by
# `conda activate`. Allow the location to be overridden so the script is not
# tied to a single user's home directory; the default is unchanged.
CONDA_SH="${CONDA_SH:-/home/tw/miniconda3/etc/profile.d/conda.sh}"
if [ ! -r "$CONDA_SH" ]; then
    echo "Error: conda init script not found: $CONDA_SH" >&2
    exit 1
fi
# shellcheck source=/dev/null
source "$CONDA_SH"
conda activate searchengine

# ANSI colour sequences. They are stored as literal backslash text
# (single-quoted) and only turned into escape characters at print time by
# `echo -e` / `printf %b`.
NC='\033[0m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'

# Opening banner: one %b-formatted line per argument.
printf '%b\n' \
    "${GREEN}========================================${NC}" \
    "${GREEN}Customer1 Data Ingestion${NC}" \
    "${GREEN}========================================${NC}"

# Positional arguments, each falling back to its default when unset or empty:
#   $1 -> LIMIT            (1000)
#   $2 -> SKIP_EMBEDDINGS  (false)
LIMIT="${1:-1000}"
SKIP_EMBEDDINGS="${2:-false}"

# Report the effective settings before doing any work.
printf '\n%b\n' "${YELLOW}Configuration:${NC}"
printf '  Limit: %s documents\n' "$LIMIT"
printf '  Skip embeddings: %s\n' "$SKIP_EMBEDDINGS"

# Input CSV. The path is relative, so it is resolved against the directory
# the script changed into during setup.
CSV_FILE="data/customer1/goods_with_pic.5years_congku.csv.shuf.1w"

# Fail fast if the input is missing. The diagnostic goes to stderr so it is
# not swallowed or mixed into captured output when stdout is redirected.
if [[ ! -f "$CSV_FILE" ]]; then
    echo "Error: CSV file not found: $CSV_FILE" >&2
    exit 1
fi

# Build the command as an array, one element per argument. Unlike a string
# passed through `eval`, the values reach python exactly as given: they are
# not re-parsed as shell code, word-split, or glob-expanded, so a crafted
# LIMIT (which comes straight from $1) cannot inject commands.
ingest_cmd=(
    python data/customer1/ingest_customer1.py
    --csv "$CSV_FILE"
    --limit "$LIMIT"
    --recreate-index
    --batch-size 100
)

# Only the exact string "true" enables the flag; any other value leaves it off.
if [ "$SKIP_EMBEDDINGS" = "true" ]; then
    ingest_cmd+=(--skip-embeddings)
fi

echo -e "\n${YELLOW}Starting ingestion...${NC}"
"${ingest_cmd[@]}"

# Closing banner, preceded by a blank line. It is only reached when every
# earlier command succeeded, because the script runs under `set -e`.
printf '\n%b\n' "${GREEN}========================================${NC}"
printf '%b\n' \
    "${GREEN}Ingestion Complete!${NC}" \
    "${GREEN}========================================${NC}"