ingest.sh
1.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/bin/bash
#
# Data Ingestion Script for Customer1
#
# Usage: ingest.sh [LIMIT] [SKIP_EMBEDDINGS]
#   LIMIT            value forwarded to --limit (default: 1000)
#   SKIP_EMBEDDINGS  "true" appends --skip-embeddings; any other value
#                    leaves it off (default: false)
#
# The script changes to the parent of the directory that contains it, so the
# relative paths below (data/customer1/...) resolve from there. It activates
# the "searchengine" conda environment before launching the Python ingester.
# Exits 1 if the input CSV is missing; otherwise, because of `set -e`, the
# first failing command aborts the run.
set -e

cd "$(dirname "$0")/.."

source /home/tw/miniconda3/etc/profile.d/conda.sh
conda activate searchengine

# ANSI colour escapes, interpreted by `echo -e`.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}Customer1 Data Ingestion${NC}"
echo -e "${GREEN}========================================${NC}"

# Default values
LIMIT=${1:-1000}
SKIP_EMBEDDINGS=${2:-false}

echo -e "\n${YELLOW}Configuration:${NC}"
echo " Limit: $LIMIT documents"
echo " Skip embeddings: $SKIP_EMBEDDINGS"

CSV_FILE="data/customer1/goods_with_pic.5years_congku.csv.shuf.1w"
if [[ ! -f "$CSV_FILE" ]]; then
  # Diagnostics go to stderr so they are not mixed into captured stdout.
  echo "Error: CSV file not found: $CSV_FILE" >&2
  exit 1
fi

# Build the command as an array rather than a string fed to `eval`: each
# element reaches python as exactly one argument, so a LIMIT or path that
# contains spaces, globs or shell metacharacters is never re-parsed or
# executed by the shell.
cmd=(
  python data/customer1/ingest_customer1.py
  --csv "$CSV_FILE"
  --limit "$LIMIT"
  --recreate-index
  --batch-size 100
)
if [[ "$SKIP_EMBEDDINGS" == "true" ]]; then
  cmd+=(--skip-embeddings)
fi

echo -e "\n${YELLOW}Starting ingestion...${NC}"
"${cmd[@]}"

echo -e "\n${GREEN}========================================${NC}"
echo -e "${GREEN}Ingestion Complete!${NC}"
echo -e "${GREEN}========================================${NC}"