Blame view

third-party/xinference/install_nvidia_container_toolkit.sh 3.7 KB
b401ef94   tangwang   third-party/xinfe...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
  #!/bin/bash

  # Automated installer for the NVIDIA Container Toolkit.
  # Written for Ubuntu/Debian systems.

  # Stop at the first command that exits with a non-zero status.
  set -e

  # Opening banner: a rule, the title, the rule again, then an empty line.
  banner_rule="========================================="
  printf '%s\n' \
      "$banner_rule" \
      "  NVIDIA Container Toolkit 安装脚本" \
      "$banner_rule" \
      ""
  
  # Detect the distribution. Without /etc/os-release there is nothing to go on,
  # so report the problem and stop.
  if [ ! -f /etc/os-release ]; then
      echo "❌ 无法检测系统类型"
      exit 1
  fi

  # Source the file and copy its ID / VERSION_ID values into OS / OS_VERSION.
  . /etc/os-release
  OS=$ID
  OS_VERSION=$VERSION_ID

  printf '%s\n' "检测到系统: $OS $OS_VERSION" ""
  
  # Require the NVIDIA driver: nvidia-smi has to be resolvable as a command.
  echo "🔍 检查 NVIDIA 驱动..."
  command -v nvidia-smi > /dev/null 2>&1 || {
      echo "❌ 未找到 NVIDIA 驱动,请先安装 NVIDIA 驱动"
      echo "   访问: https://www.nvidia.com/Download/index.aspx"
      exit 1
  }

  # Print each GPU's name and driver version as header-less CSV.
  echo "✅ NVIDIA 驱动已安装:"
  nvidia-smi --query-gpu=name,driver_version --format=csv,noheader
  echo ""
  
  # Require the Docker CLI: docker has to be resolvable as a command.
  echo "🔍 检查 Docker..."
  command -v docker > /dev/null 2>&1 || {
      echo "❌ 未找到 Docker,请先安装 Docker"
      exit 1
  }

  # Show the installed Docker version.
  echo "✅ Docker 已安装"
  docker --version
  echo ""
  
  # Register the NVIDIA Container Toolkit package repository.
  #
  # Reads:  OS (the ID value taken from /etc/os-release).
  # Writes: a dedicated keyring plus an APT source list on Ubuntu/Debian, or a
  #         yum repo file on CentOS/RHEL. Exits with status 1 on other systems.
  #
  # The generic "stable/deb" and "stable/rpm" repositories from NVIDIA's install
  # guide are used, so no per-release path (such as ubuntu20.04) has to be
  # derived from VERSION_ID. Every download goes to a temporary file through
  # `curl -f` first, so an HTTP error aborts the script (set -e) instead of
  # being written into the package manager's configuration.
  echo "📦 添加 NVIDIA Container Toolkit 仓库..."

  nvidia_repo_base=https://nvidia.github.io/libnvidia-container
  nvidia_repo_tmp=$(mktemp -d)
  # Remove the scratch directory on every exit path.
  trap 'rm -rf -- "$nvidia_repo_tmp"' EXIT

  if [ "$OS" = "ubuntu" ] || [ "$OS" = "debian" ]; then
      # Ubuntu/Debian
      nvidia_keyring=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg

      # apt-key is deprecated: keep the key in its own keyring and bind the
      # repository to it with signed-by. --batch/--yes let a re-run overwrite
      # an existing keyring without prompting.
      curl -fsSL -o "$nvidia_repo_tmp/gpgkey" "$nvidia_repo_base/gpgkey"
      sudo gpg --batch --yes -o "$nvidia_keyring" --dearmor "$nvidia_repo_tmp/gpgkey"

      curl -fsSL -o "$nvidia_repo_tmp/nvidia-container-toolkit.list" \
          "$nvidia_repo_base/stable/deb/nvidia-container-toolkit.list"
      sed "s#deb https://#deb [signed-by=$nvidia_keyring] https://#g" \
          "$nvidia_repo_tmp/nvidia-container-toolkit.list" | \
        sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

      echo "✅ 仓库配置完成"
  elif [ "$OS" = "centos" ] || [ "$OS" = "rhel" ]; then
      # CentOS/RHEL
      curl -fsSL -o "$nvidia_repo_tmp/nvidia-container-toolkit.repo" \
          "$nvidia_repo_base/stable/rpm/nvidia-container-toolkit.repo"
      sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo \
          < "$nvidia_repo_tmp/nvidia-container-toolkit.repo"

      echo "✅ 仓库配置完成"
  else
      echo "⚠️  不支持的系统: $OS"
      echo "   请手动安装,参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html"
      exit 1
  fi

  echo ""
  
  # Refresh package metadata with the package manager that matches $OS.
  # Any other $OS value falls through without running a command.
  echo "🔄 更新包列表..."
  case "$OS" in
      ubuntu | debian)
          sudo apt-get update
          ;;
      centos | rhel)
          sudo yum makecache
          ;;
  esac
  echo ""
  
  # Install the nvidia-container-toolkit package non-interactively (-y) with the
  # package manager that matches $OS. Any other $OS value installs nothing.
  echo "🔨 安装 NVIDIA Container Toolkit..."
  case "$OS" in
      ubuntu | debian)
          sudo apt-get install -y nvidia-container-toolkit
          ;;
      centos | rhel)
          sudo yum install -y nvidia-container-toolkit
          ;;
  esac
  echo ""
  
  # Have nvidia-ctk configure the NVIDIA runtime for Docker.
  printf '%s\n' "⚙️  配置 Docker 使用 NVIDIA 运行时..."
  sudo nvidia-ctk runtime configure --runtime=docker
  printf '%s\n' "✅ Docker 配置完成" ""

  # Restart the Docker service (presumably so the daemon picks up the runtime
  # configuration written above).
  printf '%s\n' "🔄 重启 Docker 服务..."
  sudo systemctl restart docker
  printf '%s\n' "✅ Docker 重启完成" ""
  
  # Verify the installation by running nvidia-smi inside a throwaway container.
  #
  # A plain `ubuntu` image is used, following NVIDIA's install guide: the
  # toolkit makes nvidia-smi available inside the container, so no CUDA image
  # is required. Versioned nvidia/cuda tags such as `11.0-base` get removed
  # from Docker Hub, which makes the check fail even when the toolkit itself
  # works.
  #
  # The container's combined stdout/stderr is captured and, on failure, printed
  # to stderr so the user can see why the check failed. The script then exits
  # with status 1.
  echo "🧪 验证安装..."
  echo "测试 Docker GPU 访问..."
  if verify_output=$(sudo docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi 2>&1); then
      echo "✅ NVIDIA Container Toolkit 安装成功!"
      echo ""
      echo "GPU 可用于 Docker 容器!"
  else
      echo "❌ 验证失败,请检查日志"
      printf '%s\n' "$verify_output" >&2
      exit 1
  fi
  
  # Closing banner followed by the suggested next steps for Xinference.
  # $(pwd) is expanded when this statement runs, so the "cd" hint shows the
  # directory the script was executed from.
  done_rule="========================================="
  printf '%s\n' \
      "" \
      "$done_rule" \
      "  🎉 安装完成!" \
      "$done_rule" \
      "" \
      "📝 下一步:" \
      "   1. 停止当前的 Xinference 容器(如果在运行):" \
      "      sudo docker stop xinference" \
      "      sudo docker rm xinference" \
      "" \
      "   2. 重新启动 Xinference 服务(会自动使用 GPU):" \
      "      cd $(pwd)" \
      "      bash start.sh" \
      "" \
      "   3. 部署模型:" \
      "      python deploy_models.py" \
      "" \
      "✅ 现在可以使用 GPU 加速了!" \
      ""