Blame view

scripts/ingest.sh 1.16 KB
115047ee   tangwang   为一个租户灌入测试数据;实例的启动...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
  #!/bin/bash
  #
  # Data Ingestion Script for Customer1
  #
  # Usage: scripts/ingest.sh [LIMIT] [SKIP_EMBEDDINGS]
  #   LIMIT            Value forwarded to the ingest script's --limit option
  #                    (default: 1000).
  #   SKIP_EMBEDDINGS  Pass the literal string "true" to append
  #                    --skip-embeddings; any other value leaves it off
  #                    (default: false).
  #
  # Runs data/customer1/ingest_customer1.py inside the "searchengine" conda
  # environment. The ingest is always invoked with --recreate-index and
  # --batch-size 100.
  # Exits with status 1 if the input CSV file does not exist.
  
  set -e
  
  # All relative paths below are resolved from the parent of the directory
  # that contains this script.
  cd "$(dirname "$0")/.."
  source /home/tw/miniconda3/etc/profile.d/conda.sh
  conda activate searchengine
  
  # ANSI escape sequences, interpreted by `echo -e` below.
  GREEN='\033[0;32m'
  YELLOW='\033[1;33m'
  NC='\033[0m'
  
  echo -e "${GREEN}========================================${NC}"
  echo -e "${GREEN}Customer1 Data Ingestion${NC}"
  echo -e "${GREEN}========================================${NC}"
  
  # Default values
  LIMIT=${1:-1000}
  SKIP_EMBEDDINGS=${2:-false}
  
  echo -e "\n${YELLOW}Configuration:${NC}"
  echo "  Limit: $LIMIT documents"
  echo "  Skip embeddings: $SKIP_EMBEDDINGS"
  
  CSV_FILE="data/customer1/goods_with_pic.5years_congku.csv.shuf.1w"
  
  if [[ ! -f "$CSV_FILE" ]]; then
      # Diagnostics go to stderr so they are not mixed into captured stdout.
      echo "Error: CSV file not found: $CSV_FILE" >&2
      exit 1
  fi
  
  # Build the command as an array rather than a string passed to `eval`:
  # each element reaches python as exactly one argument, so whitespace or
  # shell metacharacters in $LIMIT or $CSV_FILE are never re-parsed (and
  # never executed) by the shell.
  cmd=(
      python data/customer1/ingest_customer1.py
      --csv "$CSV_FILE"
      --limit "$LIMIT"
      --recreate-index
      --batch-size 100
  )
  
  if [[ "$SKIP_EMBEDDINGS" == "true" ]]; then
      cmd+=(--skip-embeddings)
  fi
  
  echo -e "\n${YELLOW}Starting ingestion...${NC}"
  # Under `set -e`, a non-zero exit from the ingest aborts the script here,
  # so the completion banner is printed only on success.
  "${cmd[@]}"
  
  echo -e "\n${GREEN}========================================${NC}"
  echo -e "${GREEN}Ingestion Complete!${NC}"
  echo -e "${GREEN}========================================${NC}"