db578127
tangwang
offline tasks: me...
|
1
2
|
#!/bin/bash
|
1721766b
tangwang
offline tasks
|
3
4
|
# Work from the offline-tasks directory. Abort if it cannot be entered:
# the cleanup further down deletes the relative paths output/* and logs/*,
# which must never run from an unexpected working directory.
cd /home/tw/recommendation/offline_tasks || exit 1
|
db578127
tangwang
offline tasks: me...
|
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
#######################################
# Memory watchdog: polls the resident memory (RSS) of one process, logs a
# warning above a soft limit and SIGKILLs the process above a hard limit.
# Arguments:
#   $1 - PID of the process to watch (required)
#   $2 - warning threshold in GB (optional, default 25)
#   $3 - kill threshold in GB (optional, default 30)
#   $4 - polling interval in seconds (optional, default 10)
# Outputs:
#   Warning/kill messages on stdout, also appended to
#   logs/memory_monitor.log (relative to the current directory).
# Returns:
#   0 once the process has exited or has been killed, 2 if no PID is given.
#######################################
check_memory() {
  local pid=${1:-}
  local threshold_warn=${2:-25}
  local threshold_kill=${3:-30}
  local interval=${4:-10}
  local log_file=logs/memory_monitor.log
  local rss_kb mem_gb level timestamp

  if [[ -z "$pid" ]]; then
    echo "check_memory: missing PID argument" >&2
    return 2
  fi

  # tee -a cannot create missing directories.
  mkdir -p -- "${log_file%/*}"

  while kill -0 "$pid" 2>/dev/null; do
    # ps reports RSS in KiB. Empty output means the process vanished between
    # the liveness probe above and this call.
    rss_kb=$(ps -p "$pid" -o rss= 2>/dev/null | tr -d '[:space:]')
    if [[ -n "$rss_kb" ]]; then
      # A single awk call converts to GB and classifies the reading, so the
      # watchdog does not depend on bc being installed.
      read -r mem_gb level < <(
        awk -v kb="$rss_kb" -v warn="$threshold_warn" -v hard="$threshold_kill" \
          'BEGIN { gb = kb / 1048576; lvl = "ok"; if (gb >= warn) lvl = "warn"; if (gb >= hard) lvl = "kill"; printf "%.2f %s\n", gb, lvl }'
      )
      timestamp=$(date '+%Y-%m-%d %H:%M:%S')
      case "$level" in
        kill)
          echo "[$timestamp] ❌ 内存超限!当前使用: ${mem_gb}GB (>= ${threshold_kill}GB), 强制终止进程 PID=$pid" | tee -a "$log_file"
          # SIGKILL on purpose: this is the last-resort guard against the
          # process exhausting memory.
          kill -9 "$pid" 2>/dev/null
          break
          ;;
        warn)
          echo "[$timestamp] ⚠️ 内存警告!当前使用: ${mem_gb}GB (>= ${threshold_warn}GB), PID=$pid" | tee -a "$log_file"
          ;;
      esac
    fi
    sleep "$interval"
  done
  return 0
}
|
a1f370ee
tangwang
offline tasks
|
32
33
|
# # 查看配置指南
# cat UPDATE_CONFIG_GUIDE.md
|
1721766b
tangwang
offline tasks
|
34
|
|
a1f370ee
tangwang
offline tasks
|
35
|
# 2. 测试连接
|
14f3dcbe
tangwang
offline tasks
|
36
|
# python3 test_connection.py
|
1721766b
tangwang
offline tasks
|
37
|
|
db578127
tangwang
offline tasks: me...
|
38
39
40
41
42
43
|
# Clean up leftovers from previous runs.
# pgrep never reports itself. This script and its parent are skipped
# explicitly: when the script is started via a path containing
# "recommendation", their command lines match the second pattern and an
# unfiltered kill would take down the script itself.
for pattern in run_all.py recommendation; do
  # Collect first, kill afterwards, so the short-lived substitution subshell
  # is already gone when the list is processed.
  stale_pids=$(pgrep -f -- "$pattern")
  # Intentionally unquoted: one PID per word.
  for stale_pid in $stale_pids; do
    if [[ "$stale_pid" != "$$" && "$stale_pid" != "$PPID" ]]; then
      kill -9 "$stale_pid" 2>/dev/null
    fi
  done
done
# Remove previous results and logs, then make sure the log directory exists.
rm -rf -- output/* logs/*
mkdir -p logs
|
40442baf
tangwang
offline tasks: fi...
|
44
|
|
db578127
tangwang
offline tasks: me...
|
45
46
47
48
|
# Opening banner: start timestamp plus the memory limits announced to the user.
printf '%s\n' \
  "======================================================================" \
  "开始运行离线任务 - $(date '+%Y-%m-%d %H:%M:%S')" \
  "内存监控: 警告阈值=25GB, 强制终止阈值=30GB" \
  "======================================================================"
|
40442baf
tangwang
offline tasks: fi...
|
49
|
|
a1f370ee
tangwang
offline tasks
|
50
|
# Step 1: debug-mode run (small data volume)
|
db578127
tangwang
offline tasks: me...
|
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
echo ""
echo ">>> 步骤1: 调试模式运行(小数据量)"
python3 run_all.py --debug &
PID_DEBUG=$!
echo "调试任务 PID: $PID_DEBUG"
# Start the memory watchdog alongside the job.
check_memory "$PID_DEBUG" &
MONITOR_PID_1=$!
# Block until the debug job finishes and keep its exit status.
wait "$PID_DEBUG"
DEBUG_EXIT_CODE=$?
# The job is gone: stop the watchdog and reap it quietly.
kill "$MONITOR_PID_1" 2>/dev/null
wait "$MONITOR_PID_1" 2>/dev/null
if [[ "$DEBUG_EXIT_CODE" -eq 0 ]]; then
  echo "✓ 调试模式完成"
  # Drop the snapshot of an earlier run first; otherwise mv would nest
  # output/ inside the existing output_debug/ instead of replacing it.
  rm -rf -- output_debug
  mv -- output output_debug 2>/dev/null
  # -p: output/ may still exist if the move above did not happen.
  mkdir -p output
else
  echo "✗ 调试模式失败,退出码: $DEBUG_EXIT_CODE"
  exit 1
fi
|
a1f370ee
tangwang
offline tasks
|
74
|
|
db578127
tangwang
offline tasks: me...
|
75
76
77
78
79
80
|
# Step 2: production-mode run (large data volume).
echo ""
echo ">>> 步骤2: 生产模式运行(大数据量)"
# Launched without --debug: with the flag this step would repeat the
# small-data debug run of step 1 instead of the announced production run.
# NOTE(review): assumes run_all.py processes the full data set when --debug
# is omitted — confirm against run_all.py's options.
python3 run_all.py &
PID_PROD=$!
echo "生产任务 PID: $PID_PROD"
|
a1f370ee
tangwang
offline tasks
|
81
|
|
db578127
tangwang
offline tasks: me...
|
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
# Run the memory watchdog in the background for the production job.
check_memory "$PID_PROD" &
MONITOR_PID_2=$!
# Block until the production job ends and remember how it exited.
wait "$PID_PROD"
PROD_EXIT_CODE=$?
# The job is finished either way, so the watchdog is no longer needed.
kill "$MONITOR_PID_2" 2>/dev/null
if [[ "$PROD_EXIT_CODE" -ne 0 ]]; then
  echo "✗ 生产模式失败,退出码: $PROD_EXIT_CODE"
  exit 1
fi
echo "✓ 生产模式完成"
|
a1f370ee
tangwang
offline tasks
|
97
98
|
# Step 3: load the results into Redis
|
db578127
tangwang
offline tasks: me...
|
99
100
|
# Blank separator line followed by the step-3 heading.
printf '\n%s\n' ">>> 步骤3: 加载到Redis"
|
a1f370ee
tangwang
offline tasks
|
101
|
# Run the Redis loader script against the Redis host "localhost".
# The next statement reads this command's exit status via $?, so no other
# command may be inserted between the two.
python3 scripts/load_index_to_redis.py --redis-host localhost
|
db578127
tangwang
offline tasks: me...
|
102
103
104
105
106
107
108
109
110
111
112
113
114
|
# Must stay the first command here: it captures the exit status of the
# command that ran immediately before.
LOAD_EXIT_CODE=$?
if [[ "$LOAD_EXIT_CODE" -ne 0 ]]; then
  echo "✗ Redis加载失败,退出码: $LOAD_EXIT_CODE"
  exit 1
fi
echo "✓ Redis加载完成"
# Closing banner with the finish timestamp.
printf '\n%s\n' "======================================================================"
printf '%s\n' \
  "所有任务完成 - $(date '+%Y-%m-%d %H:%M:%S')" \
  "======================================================================"
|
14f3dcbe
tangwang
offline tasks
|
|
|