Blame view

offline_tasks/run.sh 2.57 KB
db578127   tangwang   offline tasks: me...
1
2
  #!/bin/bash
  
1721766b   tangwang   offline tasks
3
4
  # Work from the offline-tasks directory. Abort if it cannot be entered: later steps
  # delete output/* and logs/* relative to the current directory, so continuing from
  # the wrong directory would remove unrelated files.
  cd /home/tw/recommendation/offline_tasks || {
      echo "cannot cd to /home/tw/recommendation/offline_tasks" >&2
      exit 1
  }
  
db578127   tangwang   offline tasks: me...
5
6
7
8
  # Memory watchdog.
  #
  # Polls the resident memory (RSS) of a process every 60 seconds for as long as the
  # process exists. Each sample at or above the warning threshold is reported on stdout
  # and appended to logs/memory_monitor.log; at or above the kill threshold the process
  # is terminated with SIGKILL and the watchdog stops.
  #
  # Arguments:
  #   $1 - PID of the process to watch
  #   $2 - optional warning threshold in whole GB (default 25)
  #   $3 - optional kill threshold in whole GB (default 35)
  # Returns once the process no longer exists or after it has been killed.
  check_memory() {
      local pid=$1
      local threshold_warn=${2:-25}  # warn at >= 25 GB unless overridden
      local threshold_kill=${3:-35}  # SIGKILL at >= 35 GB unless overridden
      local rss_kb mem_mb mem_gb timestamp

      while kill -0 "$pid" 2>/dev/null; do
          # RSS as reported by ps (KB); empty when the process vanished in the meantime.
          rss_kb=$(ps -p "$pid" -o rss= 2>/dev/null | tr -d '[:space:]')

          if [[ $rss_kb =~ ^[0-9]+$ ]]; then
              mem_mb=$((rss_kb / 1024))
              # GB with two truncated decimals, for display only.
              mem_gb=$(printf '%d.%02d' "$((mem_mb / 1024))" "$((mem_mb % 1024 * 100 / 1024))")
              timestamp=$(date '+%Y-%m-%d %H:%M:%S')

              # Compare in whole MB so no external calculator is needed.
              if [ "$mem_mb" -ge "$((threshold_kill * 1024))" ]; then
                  echo "[$timestamp] ❌ 内存超限!当前使用: ${mem_gb}GB (>= ${threshold_kill}GB), 强制终止进程 PID=$pid" | tee -a logs/memory_monitor.log
                  kill -9 "$pid"
                  break
              elif [ "$mem_mb" -ge "$((threshold_warn * 1024))" ]; then
                  echo "[$timestamp] ⚠️  内存警告!当前使用: ${mem_gb}GB (>= ${threshold_warn}GB), PID=$pid" | tee -a logs/memory_monitor.log
              fi
          fi

          sleep 60
      done
  }
  
db578127   tangwang   offline tasks: me...
32
33
34
35
36
37
  # Clean up leftovers from earlier runs.
  # Kill every process whose command line matches one of the patterns, except this
  # script and its parent: when the script is started through a path containing
  # "recommendation", a plain match-and-kill would terminate the script itself.
  for stale_pattern in run_all.py recommendation; do
      for stale_pid in $(pgrep -f "$stale_pattern"); do
          if [ "$stale_pid" != "$$" ] && [ "$stale_pid" != "$PPID" ]; then
              kill -9 "$stale_pid" 2>/dev/null
          fi
      done
  done

  # Remove previous results and logs, then make sure the log directory exists.
  rm -rf -- output/* 2>/dev/null
  rm -rf -- logs/* 2>/dev/null
  mkdir -p logs
40442baf   tangwang   offline tasks: fi...
38
  
db578127   tangwang   offline tasks: me...
39
40
41
42
  # Start banner. The thresholds shown here must match the defaults in check_memory
  # (warn at 25 GB, force-kill at 35 GB).
  echo "======================================================================"
  echo "开始运行离线任务 - $(date '+%Y-%m-%d %H:%M:%S')"
  echo "内存监控: 警告阈值=25GB, 强制终止阈值=35GB"
  echo "======================================================================"
40442baf   tangwang   offline tasks: fi...
43
  
db578127   tangwang   offline tasks: me...
44
  
db578127   tangwang   offline tasks: me...
45
  echo
  printf '%s\n' ">>>  run_all.py"
  # Alternative invocation kept for reference:
  # python3 run_all.py --lookback_days 400 --top_n 50 --debug &
  # Start the job in the background and record its PID right away.
  python3 run_all.py --debug &
  PID_PROD=$!
  printf '%s\n' "生产任务 PID: $PID_PROD"
a1f370ee   tangwang   offline tasks
51
  
db578127   tangwang   offline tasks: me...
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
  # Run the memory watchdog in the background alongside the job.
  check_memory "$PID_PROD" &
  MONITOR_PID_2=$!

  # Block until the job exits, keep its status, then stop the watchdog.
  wait "$PID_PROD"
  PROD_EXIT_CODE=$?
  kill "$MONITOR_PID_2" 2>/dev/null

  # Any non-zero status of the job ends the script with exit code 1.
  case "$PROD_EXIT_CODE" in
      0)
          echo "✓ 生产模式完成"
          ;;
      *)
          echo "✗ 生产模式失败,退出码: $PROD_EXIT_CODE"
          exit 1
          ;;
  esac
a1f370ee   tangwang   offline tasks
67
  
6409ab2c   tangwang   offline tasks: me...
68
  
db578127   tangwang   offline tasks: me...
69
70
  echo
  printf '%s\n' ">>> 步骤3: 加载到Redis"
  # Run the Redis loader in the foreground and keep its exit status.
  python3 scripts/load_index_to_redis.py --redis-host localhost
  LOAD_EXIT_CODE=$?

  # A failed load ends the script with exit code 1.
  case "$LOAD_EXIT_CODE" in
      0)
          echo "✓ Redis加载完成"
          ;;
      *)
          echo "✗ Redis加载失败,退出码: $LOAD_EXIT_CODE"
          exit 1
          ;;
  esac

  # Closing banner with the completion time.
  echo
  printf '%s\n' "======================================================================"
  printf '%s\n' "所有任务完成 - $(date '+%Y-%m-%d %H:%M:%S')"
  printf '%s\n' "======================================================================"
14f3dcbe   tangwang   offline tasks