Blame view

offline_tasks/test_memory_monitor.sh 2.7 KB
db578127   tangwang   offline tasks: me...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
  #!/bin/bash
  
  # Smoke test for the memory-monitoring helper.
  
  # Work from the project directory. Abort if it is missing so that logs/ is
  # never created in, and the tests never run from, an unrelated directory.
  cd /home/tw/recommendation/offline_tasks || exit 1
  mkdir -p logs || exit 1
  
  echo "======================================================================"
  echo "测试内存监控功能"
  echo "======================================================================"
  
  #######################################
  # Poll a process's resident memory and force-kill it past a hard limit.
  # (Per the original note, this function was copied from run.sh.)
  #
  # While the PID is alive, its RSS is read via ps, printed in GB and compared
  # against two thresholds. At or above the warn threshold a warning is printed
  # and appended to logs/memory_monitor.log; at or above the kill threshold the
  # process is sent SIGKILL and monitoring stops.
  #
  # Arguments:
  #   $1 - PID to watch (required)
  #   $2 - warning threshold in GB (optional, default 25)
  #   $3 - kill threshold in GB (optional, default 30)
  #   $4 - seconds between checks (optional, default 2)
  # Outputs:
  #   Status lines on stdout; warnings and kill notices are also appended to
  #   logs/memory_monitor.log, relative to the current directory.
  # Returns:
  #   0 when monitoring ends, 1 if no PID was given.
  #######################################
  check_memory() {
      local pid=${1:-}
      local threshold_warn=${2:-25}
      local threshold_kill=${3:-30}
      local interval=${4:-2}
      local log_file=logs/memory_monitor.log
      # Declared separately from the assignments below so that `local` does
      # not mask the exit status of the command substitutions.
      local rss_kb mem_gb timestamp
      
      if [[ -z "$pid" ]]; then
          echo "check_memory: 缺少 PID 参数" >&2
          return 1
      fi
      
      echo "启动内存监控: PID=$pid, 警告阈值=${threshold_warn}GB, 终止阈值=${threshold_kill}GB"
      
      while kill -0 "$pid" 2>/dev/null; do
          # `rss=` suppresses the header; strip the padding ps puts around the value.
          rss_kb=$(ps -p "$pid" -o rss= 2>/dev/null)
          rss_kb=${rss_kb//[[:space:]]/}
          
          # The process may vanish between `kill -0` and `ps`; skip the sample then.
          if [[ "$rss_kb" =~ ^[0-9]+$ ]]; then
              # Whole MB first, then GB truncated to two decimals. awk keeps the
              # leading zero for values below 1GB and removes the need for bc.
              mem_gb=$(awk -v kb="$rss_kb" \
                  'BEGIN { mb = int(kb / 1024); printf "%.2f", int(mb * 100 / 1024) / 100 }')
              timestamp=$(date '+%Y-%m-%d %H:%M:%S')
              
              echo "[$timestamp] 📊 当前内存: ${mem_gb}GB, PID=$pid"
              
              if awk -v a="$mem_gb" -v b="$threshold_kill" 'BEGIN { exit !(a + 0 >= b + 0) }'; then
                  echo "[$timestamp] ❌ 内存超限!当前使用: ${mem_gb}GB (>= ${threshold_kill}GB), 强制终止进程 PID=$pid" | tee -a "$log_file"
                  # SIGKILL is deliberate: the hard limit is a last-resort guard.
                  kill -9 "$pid"
                  break
              elif awk -v a="$mem_gb" -v b="$threshold_warn" 'BEGIN { exit !(a + 0 >= b + 0) }'; then
                  echo "[$timestamp] ⚠️  内存警告!当前使用: ${mem_gb}GB (>= ${threshold_warn}GB), PID=$pid" | tee -a "$log_file"
              fi
          fi
          
          sleep "$interval"
      done
      
      echo "内存监控结束: PID=$pid"
  }
  
  # Test 1: watch a short-lived, low-memory dummy process.
  echo ""
  echo ">>> 测试1: 启动一个简单进程(低内存)"
  sleep 30 &
  TEST_PID=$!
  echo "测试进程 PID: $TEST_PID"
  
  # Run the monitor in the background so this script can keep driving the test.
  check_memory "$TEST_PID" &
  MONITOR_PID=$!
  
  # Let the monitor take a couple of samples, then tear everything down.
  sleep 5
  kill "$TEST_PID" 2>/dev/null
  wait "$TEST_PID" 2>/dev/null
  kill "$MONITOR_PID" 2>/dev/null
  # Reap the monitor as well so no job is left behind and bash does not emit
  # a stray "Terminated" notice later in the output.
  wait "$MONITOR_PID" 2>/dev/null
  
  echo "✓ 测试1完成"
  echo ""
  
  # Test 2: list every process whose ps line mentions "python".
  printf '%s\n' ">>> 测试2: 查看当前运行的Python进程内存"
  # One awk pass does the filtering: drop lines mentioning grep, keep those
  # mentioning python. Field 6 of `ps aux` (RSS) is divided by 1024 twice to
  # get the figure printed as GB; field 11 is the first word of the command.
  ps aux | awk '!/grep/ && /python/ {printf "PID: %s, 内存: %.2fGB, 命令: %s\n", $2, $6/1024/1024, $11}'
  printf '\n'
  
  # Test 3: show system-wide memory totals in human-readable units.
  printf '%s\n' ">>> 测试3: 系统内存信息"
  free -h
  printf '\n'
  
  # Closing banner, followed by pointers to the monitor log and its settings.
  printf '%s\n' \
      "======================================================================" \
      "测试完成" \
      "======================================================================" \
      "" \
      "💡 提示:" \
      "  - 内存监控日志: logs/memory_monitor.log" \
      "  - 查看实时日志: tail -f logs/memory_monitor.log" \
      "  - 监控阈值可在 run.sh 中修改" \
      ""