install_nvidia_container_toolkit.sh
3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/bash
# Automated installer for the NVIDIA Container Toolkit.
# Handles Ubuntu/Debian (apt) and CentOS/RHEL (yum) hosts.

# Strict mode: abort on any failed command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail), so a
# failed download that is piped into another command is not silently ignored.
set -euo pipefail

echo "========================================="
echo " NVIDIA Container Toolkit 安装脚本"
echo "========================================="
echo ""

# Detect the distribution from /etc/os-release, which is sourced for its
# ID and VERSION_ID variables. The results are stored in OS and OS_VERSION
# for the rest of the script.
if [ -f /etc/os-release ]; then
  # shellcheck source=/dev/null
  . /etc/os-release
  # Default to empty so an os-release file that omits a field does not
  # trip `set -u`; unknown systems are rejected later by the OS checks.
  OS=${ID:-}
  OS_VERSION=${VERSION_ID:-}
else
  # Diagnostics go to stderr so they are not lost when stdout is redirected.
  echo "❌ 无法检测系统类型" >&2
  exit 1
fi
echo "检测到系统: $OS $OS_VERSION"
echo ""
# Prerequisite 1: the NVIDIA driver, detected by nvidia-smi being on PATH.
printf '%s\n' "🔍 检查 NVIDIA 驱动..."
command -v nvidia-smi >/dev/null 2>&1 || {
  printf '%s\n' \
    "❌ 未找到 NVIDIA 驱动,请先安装 NVIDIA 驱动" \
    " 访问: https://www.nvidia.com/Download/index.aspx"
  exit 1
}
printf '%s\n' "✅ NVIDIA 驱动已安装:"
# Print one CSV line per GPU: model name and driver version.
nvidia-smi --query-gpu=name,driver_version --format=csv,noheader
printf '\n'

# Prerequisite 2: the Docker CLI must be on PATH.
printf '%s\n' "🔍 检查 Docker..."
command -v docker >/dev/null 2>&1 || {
  printf '%s\n' "❌ 未找到 Docker,请先安装 Docker"
  exit 1
}
printf '%s\n' "✅ Docker 已安装"
docker --version
printf '\n'
# Add the NVIDIA Container Toolkit package repository.
#
# Uses the libnvidia-container "stable" repository: a single apt list and a
# single yum repo file serve every supported release, so no per-release
# path (e.g. ubuntu20.04) has to be built from /etc/os-release. This
# replaces the deprecated nvidia-docker repository and the deprecated
# `apt-key` tool.
echo "📦 添加 NVIDIA Container Toolkit 仓库..."
nvidia_repo_url="https://nvidia.github.io/libnvidia-container"
if [ "$OS" = "ubuntu" ] || [ "$OS" = "debian" ]; then
  # Ubuntu/Debian
  nvidia_keyring="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
  # Download before writing anything: `curl -f` fails on HTTP errors, so an
  # error page can never end up in the keyring or the sources list.
  nvidia_gpg_key=$(curl -fsSL "${nvidia_repo_url}/gpgkey") || exit 1
  nvidia_apt_list=$(curl -fsSL "${nvidia_repo_url}/stable/deb/nvidia-container-toolkit.list") || exit 1
  # Store the key in a dedicated keyring; --batch --yes lets a re-run
  # overwrite the keyring without prompting.
  printf '%s\n' "$nvidia_gpg_key" \
    | sudo gpg --batch --yes --output "$nvidia_keyring" --dearmor
  # Pin every repository entry to that keyring via signed-by.
  printf '%s\n' "$nvidia_apt_list" \
    | sed "s#deb https://#deb [signed-by=${nvidia_keyring}] https://#g" \
    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
  echo "✅ 仓库配置完成"
elif [ "$OS" = "centos" ] || [ "$OS" = "rhel" ]; then
  # CentOS/RHEL
  nvidia_yum_repo=$(curl -fsSL "${nvidia_repo_url}/stable/rpm/nvidia-container-toolkit.repo") || exit 1
  printf '%s\n' "$nvidia_yum_repo" \
    | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
  echo "✅ 仓库配置完成"
else
  echo "⚠️ 不支持的系统: $OS" >&2
  echo " 请手动安装,参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html" >&2
  exit 1
fi
echo ""
# Refresh the package index with the package manager that matches the
# detected distribution (apt for Ubuntu/Debian, yum for CentOS/RHEL).
echo "🔄 更新包列表..."
case "$OS" in
  ubuntu | debian) sudo apt-get update ;;
  centos | rhel) sudo yum makecache ;;
esac
echo ""

# Install the nvidia-container-toolkit package non-interactively (-y).
echo "🔨 安装 NVIDIA Container Toolkit..."
case "$OS" in
  ubuntu | debian) sudo apt-get install -y nvidia-container-toolkit ;;
  centos | rhel) sudo yum install -y nvidia-container-toolkit ;;
esac
echo ""
# Configure Docker to use the NVIDIA runtime via the toolkit's nvidia-ctk CLI.
printf '%s\n' "⚙️ 配置 Docker 使用 NVIDIA 运行时..."
sudo nvidia-ctk runtime configure --runtime=docker
printf '%s\n\n' "✅ Docker 配置完成"

# Restart the Docker service after the runtime configuration step.
printf '%s\n' "🔄 重启 Docker 服务..."
sudo systemctl restart docker
printf '%s\n\n' "✅ Docker 重启完成"
# Verify the installation by running nvidia-smi inside a GPU-enabled
# container.
echo "🧪 验证安装..."
echo "测试 Docker GPU 访问..."
# Uses the sample workload from NVIDIA's install guide (plain `ubuntu` image
# with the nvidia runtime) instead of the nvidia/cuda:11.0-base tag, which
# is no longer published. Output is captured rather than discarded so that
# a failure can show the actual cause.
if verify_log=$(sudo docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi 2>&1); then
  echo "✅ NVIDIA Container Toolkit 安装成功!"
  echo ""
  echo "GPU 可用于 Docker 容器!"
else
  echo "❌ 验证失败,请检查日志" >&2
  # Print the captured docker/nvidia-smi output: this is the log the
  # message above refers to.
  printf '%s\n' "$verify_log" >&2
  exit 1
fi
# Closing banner and follow-up instructions, printed one argument per line.
# The `cd` hint embeds the directory the script was run from.
printf '%s\n' \
  "" \
  "=========================================" \
  " 🎉 安装完成!" \
  "=========================================" \
  "" \
  "📝 下一步:" \
  " 1. 停止当前的 Xinference 容器(如果在运行):" \
  " sudo docker stop xinference" \
  " sudo docker rm xinference" \
  "" \
  " 2. 重新启动 Xinference 服务(会自动使用 GPU):" \
  " cd $(pwd)" \
  " bash start.sh" \
  "" \
  " 3. 部署模型:" \
  " python deploy_models.py" \
  "" \
  "✅ 现在可以使用 GPU 加速了!" \
  ""