run.sh 3.27 KB
#!/bin/bash

# Work from the offline-task directory: everything below uses relative paths
# (output/, logs/, run_all.py, scripts/). Abort if the directory is not
# reachable, because the cleanup step further down runs `rm -rf` on relative
# globs and must never execute in whatever directory we happened to start in.
cd /home/tw/recommendation/offline_tasks || {
    echo "✗ 无法进入工作目录: /home/tw/recommendation/offline_tasks" >&2
    exit 1
}

# Memory watchdog for a single process.
#
# Polls the resident set size (RSS) of a process until that process is gone.
# Every sample at or above the warning threshold is logged; the first sample
# at or above the kill threshold is logged, the process is sent SIGKILL, and
# the watchdog stops.
#
# Arguments:
#   $1 - PID of the process to watch (required)
#   $2 - warning threshold in whole GB (optional, default 25)
#   $3 - kill threshold in whole GB (optional, default 30)
#   $4 - seconds to sleep between samples (optional, default 10)
# Outputs:
#   Warning/kill messages on stdout, also appended to logs/memory_monitor.log
#   (relative to the current directory).
# Returns:
#   Status of the polling loop (0 in normal operation).
check_memory() {
    local pid=$1
    local threshold_warn=${2:-25}
    local threshold_kill=${3:-30}
    local interval=${4:-10}
    local rss_kb mem_mb mem_gb timestamp

    while kill -0 "$pid" 2>/dev/null; do
        # `ps -o rss=` prints the RSS in KiB with padding and no header; the
        # output is empty if the process vanished after the kill -0 probe.
        rss_kb=$(ps -p "$pid" -o rss= 2>/dev/null)
        rss_kb=${rss_kb//[[:space:]]/}

        if [[ "$rss_kb" =~ ^[0-9]+$ ]]; then
            # Integer-only arithmetic keeps the watchdog free of external
            # calculators; 10# forces base 10 regardless of leading zeros.
            mem_mb=$(( 10#$rss_kb / 1024 ))
            # GB with two decimals, truncated (not rounded), for display only.
            printf -v mem_gb '%d.%02d' \
                "$(( mem_mb / 1024 ))" "$(( mem_mb % 1024 * 100 / 1024 ))"
            timestamp=$(date '+%Y-%m-%d %H:%M:%S')

            # Thresholds are compared in MB so no fractional math is needed.
            if (( mem_mb >= threshold_kill * 1024 )); then
                echo "[$timestamp] ❌ 内存超限!当前使用: ${mem_gb}GB (>= ${threshold_kill}GB), 强制终止进程 PID=$pid" | tee -a logs/memory_monitor.log
                kill -9 "$pid"
                break
            elif (( mem_mb >= threshold_warn * 1024 )); then
                echo "[$timestamp] ⚠️  内存警告!当前使用: ${mem_gb}GB (>= ${threshold_warn}GB), PID=$pid" | tee -a logs/memory_monitor.log
            fi
        fi

        sleep "$interval"  # pause before taking the next sample
    done
}

# 1. View the configuration guide
# cat UPDATE_CONFIG_GUIDE.md

# 2. Test the connection
# python3 test_connection.py

# Clean up leftovers from earlier runs.
#
# Kill every process whose command line mentions run_all.py or
# "recommendation", except this script and the process that launched it:
# when the script is started through its /home/tw/recommendation/... path,
# both match the second pattern and an unfiltered kill -9 would take down
# the current run before it starts.
for stale_pattern in run_all.py recommendation; do
    # pgrep never reports itself; PIDs are plain digits, so word splitting
    # of the command substitution is safe here.
    for stale_pid in $(pgrep -f -- "$stale_pattern"); do
        if [[ "$stale_pid" != "$$" && "$stale_pid" != "$PPID" ]]; then
            # Errors are ignored on purpose: the process may already be gone.
            kill -9 "$stale_pid" 2>/dev/null
        fi
    done
done
unset stale_pattern stale_pid

# Drop previous results and logs (options first, `--` ends option parsing),
# then make sure the log directory exists for this run.
rm -rf -- output/* 2>/dev/null
rm -rf -- logs/* 2>/dev/null
mkdir -p logs

# Start-of-run banner: timestamp plus the memory watchdog thresholds.
cat <<EOF
======================================================================
开始运行离线任务 - $(date '+%Y-%m-%d %H:%M:%S')
内存监控: 警告阈值=25GB, 强制终止阈值=30GB
======================================================================
EOF

# 3. Debug-mode run (small data volume), supervised by the memory watchdog.
echo ""
echo ">>> 步骤1: 调试模式运行(小数据量)"
python3 run_all.py --debug &
PID_DEBUG=$!
echo "调试任务 PID: $PID_DEBUG"

# Start the memory watchdog in the background for the job just launched.
check_memory "$PID_DEBUG" &
MONITOR_PID_1=$!

# Block until the debug job exits and keep its status.
wait "$PID_DEBUG"
DEBUG_EXIT_CODE=$?
# Stop the watchdog and reap it so no background job is left dangling;
# errors are ignored because it may already have exited on its own.
kill "$MONITOR_PID_1" 2>/dev/null
wait "$MONITOR_PID_1" 2>/dev/null

if [[ "$DEBUG_EXIT_CODE" -eq 0 ]]; then
    echo "✓ 调试模式完成"
    # Remove any output_debug left by a previous run first; otherwise mv
    # would nest output/ inside it (or fail on a later run) instead of
    # replacing it.
    rm -rf -- output_debug
    mv output output_debug 2>/dev/null
    # -p: do not fail if output/ is still there because the move did not happen.
    mkdir -p output
else
    echo "✗ 调试模式失败,退出码: $DEBUG_EXIT_CODE"
    exit 1
fi

# 4. Production-mode run (large data volume), supervised by the memory watchdog.
echo ""
echo ">>> 步骤2: 生产模式运行(大数据量)"
# No --debug here: this step is the production run. Launching it with
# --debug would only repeat step 1 and feed debug results to the Redis load.
# NOTE(review): assumes run_all.py treats --debug as an optional flag — confirm.
python3 run_all.py &
PID_PROD=$!
echo "生产任务 PID: $PID_PROD"

# Start the memory watchdog in the background for the job just launched.
check_memory "$PID_PROD" &
MONITOR_PID_2=$!

# Block until the production job exits and keep its status.
wait "$PID_PROD"
PROD_EXIT_CODE=$?
# Stop the watchdog and reap it so no background job is left dangling;
# errors are ignored because it may already have exited on its own.
kill "$MONITOR_PID_2" 2>/dev/null
wait "$MONITOR_PID_2" 2>/dev/null

if [[ "$PROD_EXIT_CODE" -eq 0 ]]; then
    echo "✓ 生产模式完成"
else
    echo "✗ 生产模式失败,退出码: $PROD_EXIT_CODE"
    exit 1
fi

# 5. Load into Redis: run scripts/load_index_to_redis.py against localhost.
printf '\n'
printf '%s\n' ">>> 步骤3: 加载到Redis"

# Record the loader's exit status (0 unless the command fails).
LOAD_EXIT_CODE=0
python3 scripts/load_index_to_redis.py --redis-host localhost || LOAD_EXIT_CODE=$?

# Guard clause: a non-zero status aborts the script with exit code 1.
if (( LOAD_EXIT_CODE != 0 )); then
    printf '%s\n' "✗ Redis加载失败,退出码: $LOAD_EXIT_CODE"
    exit 1
fi
printf '%s\n' "✓ Redis加载完成"

# End-of-run banner: blank line, then the completion message with timestamp.
cat <<EOF

======================================================================
所有任务完成 - $(date '+%Y-%m-%d %H:%M:%S')
======================================================================
EOF