Blame view

offline_tasks/run.sh 3.27 KB
db578127   tangwang   offline tasks: me...
1
2
  #!/bin/bash
  
1721766b   tangwang   offline tasks
3
4
  # Work from the task directory. Abort if it cannot be entered: the cleanup
  # further down deletes output/* and logs/* relative to the current directory,
  # so continuing from the wrong place would remove unrelated files.
  cd /home/tw/recommendation/offline_tasks || {
      echo "cannot cd to /home/tw/recommendation/offline_tasks" >&2
      exit 1
  }
  
db578127   tangwang   offline tasks: me...
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
  #######################################
  # Memory watchdog for a single process.
  #
  # Polls the resident set size (RSS) of a process with `ps` until the process
  # is gone. Each sample at or above the warning threshold is reported; the
  # first sample at or above the kill threshold sends SIGKILL to the process
  # and ends the watchdog. Messages are printed to stdout and appended to
  # logs/memory_monitor.log (relative to the current directory).
  #
  # Arguments:
  #   $1 - PID to watch (required)
  #   $2 - warning threshold in GB (optional, default 25)
  #   $3 - kill threshold in GB    (optional, default 30)
  #   $4 - seconds between samples (optional, default 10)
  # Returns:
  #   2 if no PID was given; otherwise returns once the process is gone.
  #######################################
  check_memory() {
      local pid=${1:-}
      local threshold_warn=${2:-25}
      local threshold_kill=${3:-30}
      local interval=${4:-10}
      local log_file=logs/memory_monitor.log
      # Declared separately from the assignments below so that `local` does
      # not mask the exit status of the command substitutions.
      local rss_kb mem_gb level timestamp

      if [ -z "$pid" ]; then
          echo "check_memory: missing PID argument" >&2
          return 2
      fi

      # Make sure the log directory exists so `tee -a` can always append.
      mkdir -p -- "${log_file%/*}"

      while kill -0 "$pid" 2>/dev/null; do
          # RSS in KB; empty if the process exited between kill -0 and ps.
          rss_kb=$(ps -p "$pid" -o rss= 2>/dev/null | tr -d '[:space:]')

          if [ -n "$rss_kb" ]; then
              # A single awk call converts KB to GB and classifies the sample.
              # This replaces the external `bc` dependency: without bc the old
              # comparisons failed and the watchdog never triggered.
              read -r mem_gb level < <(
                  awk -v kb="$rss_kb" -v warn="$threshold_warn" -v limit="$threshold_kill" '
                      BEGIN {
                          gb = kb / 1048576
                          state = (gb >= limit) ? "kill" : ((gb >= warn) ? "warn" : "ok")
                          printf "%.2f %s\n", gb, state
                      }'
              )
              timestamp=$(date '+%Y-%m-%d %H:%M:%S')

              if [ "$level" = "kill" ]; then
                  echo "[$timestamp] ❌ 内存超限!当前使用: ${mem_gb}GB (>= ${threshold_kill}GB), 强制终止进程 PID=$pid" | tee -a "$log_file"
                  kill -9 "$pid"
                  break
              elif [ "$level" = "warn" ]; then
                  echo "[$timestamp] ⚠️  内存警告!当前使用: ${mem_gb}GB (>= ${threshold_warn}GB), PID=$pid" | tee -a "$log_file"
              fi
          fi

          # Pause between samples.
          sleep "$interval"
      done
  }
  
a1f370ee   tangwang   offline tasks
32
33
  # # 查看配置指南
  # cat UPDATE_CONFIG_GUIDE.md
1721766b   tangwang   offline tasks
34
  
a1f370ee   tangwang   offline tasks
35
  # 2. 测试连接
14f3dcbe   tangwang   offline tasks
36
  # python3 test_connection.py
1721766b   tangwang   offline tasks
37
  
db578127   tangwang   offline tasks: me...
38
39
40
41
42
43
  # Clean up leftovers from previous runs.
  # Stale processes are matched on their full command line, as before, but
  # this script and its parent are skipped: when the script is started via its
  # absolute path, its own command line contains "recommendation" and the old
  # `ps | grep | xargs kill -9` pipeline could kill the run in progress.
  # NOTE(review): the "recommendation" pattern is broad and still matches any
  # other process whose command line contains that word.
  for stale_pattern in run_all.py recommendation; do
      for stale_pid in $(pgrep -f "$stale_pattern"); do
          if [ "$stale_pid" != "$$" ] && [ "$stale_pid" != "$PPID" ]; then
              kill -9 "$stale_pid" 2>/dev/null
          fi
      done
  done
  # Remove old results and logs, then make sure the log directory exists.
  rm -rf -- output/* logs/* 2>/dev/null
  mkdir -p logs
40442baf   tangwang   offline tasks: fi...
44
  
db578127   tangwang   offline tasks: me...
45
46
47
48
  # Startup banner: start time and the thresholds used by the memory monitor.
  printf '%s\n' \
      "======================================================================" \
      "开始运行离线任务 - $(date '+%Y-%m-%d %H:%M:%S')" \
      "内存监控: 警告阈值=25GB, 强制终止阈值=30GB" \
      "======================================================================"
40442baf   tangwang   offline tasks: fi...
49
  
a1f370ee   tangwang   offline tasks
50
  # 3. 调试模式运行(小数据量)
db578127   tangwang   offline tasks: me...
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
  # Step 1: run the pipeline with --debug (small data set) in the background.
  echo ""
  echo ">>> 步骤1: 调试模式运行(小数据量)"
  python3 run_all.py --debug &
  PID_DEBUG=$!
  echo "调试任务 PID: $PID_DEBUG"

  # Watch the task's memory usage in the background while it runs.
  check_memory "$PID_DEBUG" &
  MONITOR_PID_1=$!

  # Block until the debug task exits and keep its exit status.
  wait "$PID_DEBUG"
  DEBUG_EXIT_CODE=$?
  # The monitor is no longer needed: stop it and reap it so no background job
  # is left behind.
  kill "$MONITOR_PID_1" 2>/dev/null
  wait "$MONITOR_PID_1" 2>/dev/null

  if [ "$DEBUG_EXIT_CODE" -eq 0 ]; then
      echo "✓ 调试模式完成"
      # Drop any output_debug left by an earlier run. Otherwise mv would nest
      # the new results inside it (output_debug/output) or fail, leaving the
      # debug results in output/ for the next step.
      rm -rf -- output_debug
      mv output output_debug 2>/dev/null
      # -p: do not fail if output/ is still present because the mv failed.
      mkdir -p output
  else
      echo "✗ 调试模式失败,退出码: $DEBUG_EXIT_CODE"
      exit 1
  fi
a1f370ee   tangwang   offline tasks
74
  
db578127   tangwang   offline tasks: me...
75
76
77
78
79
80
  # 4. Production run (large data set), started in the background.
  # NOTE(review): this command still passes --debug, exactly like the debug
  # step — confirm whether the flag should be dropped for the production run.
  printf '\n%s\n' ">>> 步骤2: 生产模式运行(大数据量)"
  python3 run_all.py --debug &
  PID_PROD=$!
  printf '%s\n' "生产任务 PID: ${PID_PROD}"
a1f370ee   tangwang   offline tasks
81
  
db578127   tangwang   offline tasks: me...
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
  # Launch the memory monitor for the production task in the background.
  check_memory "${PID_PROD}" &
  MONITOR_PID_2=$!

  # Wait for the production task, remember its status, then stop the monitor.
  wait "${PID_PROD}"
  PROD_EXIT_CODE=$?
  kill "${MONITOR_PID_2}" 2>/dev/null

  # Guard clause: any non-zero status aborts the whole script.
  if [[ ${PROD_EXIT_CODE} -ne 0 ]]; then
      echo "✗ 生产模式失败,退出码: $PROD_EXIT_CODE"
      exit 1
  fi
  echo "✓ 生产模式完成"
a1f370ee   tangwang   offline tasks
97
98
  
  # 5. 加载到Redis
db578127   tangwang   offline tasks: me...
99
100
  # Announce step 3 (blank line, then the step heading).
  printf '\n%s\n' ">>> 步骤3: 加载到Redis"
a1f370ee   tangwang   offline tasks
101
  # Run the Redis loader script against the Redis host "localhost".
  # Keep this as the last command before the exit status is captured: the
  # statement that follows reads $? to decide whether the load succeeded.
  python3 scripts/load_index_to_redis.py --redis-host localhost
db578127   tangwang   offline tasks: me...
102
103
104
105
106
107
108
109
110
111
112
113
114
  LOAD_EXIT_CODE=$?

  # Guard clause: stop with status 1 if the preceding command failed.
  if [ "$LOAD_EXIT_CODE" -ne 0 ]; then
      echo "✗ Redis加载失败,退出码: $LOAD_EXIT_CODE"
      exit 1
  fi
  echo "✓ Redis加载完成"

  # Closing banner with the finish time.
  printf '%s\n' \
      "" \
      "======================================================================" \
      "所有任务完成 - $(date '+%Y-%m-%d %H:%M:%S')" \
      "======================================================================"
14f3dcbe   tangwang   offline tasks