#!/bin/bash
#
# NVIDIA Container Toolkit automatic installation script.
#
# Supported distributions: Ubuntu/Debian (apt) and CentOS/RHEL (yum).
# Prerequisites: an installed NVIDIA driver (nvidia-smi on PATH), Docker,
# curl, gpg (apt-based systems only) and sudo privileges.
#
# Steps performed, in order:
#   1. Detect the distribution from /etc/os-release.
#   2. Verify that the NVIDIA driver and Docker are present.
#   3. Register NVIDIA's package repository.
#   4. Refresh the package index and install nvidia-container-toolkit.
#   5. Register the NVIDIA runtime with Docker and restart the daemon.
#   6. Run nvidia-smi inside a container as a smoke test.
#
# Exit status: 0 on success, non-zero as soon as any step fails.

# -e: stop on the first failing command; -u: treat unset variables as errors;
# pipefail: a failing download must fail the whole pipeline instead of being
# masked by the command that consumes its output.
set -euo pipefail

# Generic repository that serves every supported distribution, so no
# per-release path has to be derived from VERSION_ID.
readonly REPO_BASE_URL="https://nvidia.github.io/libnvidia-container"
# Dedicated keyring referenced through "signed-by" (replaces deprecated apt-key).
readonly KEYRING_PATH="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"

echo "========================================="
echo " NVIDIA Container Toolkit 安装脚本"
echo "========================================="
echo ""

# Detect the distribution.
if [[ -f /etc/os-release ]]; then
  # shellcheck source=/dev/null
  . /etc/os-release
  # Some releases omit VERSION_ID (or even ID); default to empty so that
  # "set -u" does not abort here.
  OS=${ID:-}
  OS_VERSION=${VERSION_ID:-}
else
  echo "❌ 无法检测系统类型" >&2
  exit 1
fi

echo "检测到系统: $OS $OS_VERSION"
echo ""

# Check for the NVIDIA driver.
echo "🔍 检查 NVIDIA 驱动..."
if ! command -v nvidia-smi &> /dev/null; then
  echo "❌ 未找到 NVIDIA 驱动,请先安装 NVIDIA 驱动" >&2
  echo " 访问: https://www.nvidia.com/Download/index.aspx" >&2
  exit 1
fi
echo "✅ NVIDIA 驱动已安装:"
nvidia-smi --query-gpu=name,driver_version --format=csv,noheader
echo ""

# Check for Docker.
echo "🔍 检查 Docker..."
if ! command -v docker &> /dev/null; then
  echo "❌ 未找到 Docker,请先安装 Docker" >&2
  exit 1
fi
echo "✅ Docker 已安装"
docker --version
echo ""

# Register the NVIDIA repository.
# Every download is captured into a variable first: if curl fails (-f turns
# HTTP errors into a non-zero status) the script stops before anything is
# written under /etc, so an error page can never end up in a repo file.
echo "📦 添加 NVIDIA Container Toolkit 仓库..."
case "$OS" in
  ubuntu | debian)
    pkg_family="deb"
    gpg_key=$(curl -fsSL "${REPO_BASE_URL}/gpgkey")
    # --yes lets a re-run overwrite an existing keyring without prompting.
    printf '%s\n' "$gpg_key" \
      | sudo gpg --yes --dearmor -o "$KEYRING_PATH"
    repo_definition=$(curl -fsSL \
      "${REPO_BASE_URL}/stable/deb/nvidia-container-toolkit.list")
    # Pin the repository to the keyring installed above.
    # NOTE(review): a stale /etc/apt/sources.list.d/nvidia-docker.list left by
    # an earlier setup may conflict with this entry — remove it manually if
    # "apt-get update" complains.
    printf '%s\n' "$repo_definition" \
      | sed "s#deb https://#deb [signed-by=${KEYRING_PATH}] https://#g" \
      | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
    echo "✅ 仓库配置完成"
    ;;
  centos | rhel)
    pkg_family="rpm"
    repo_definition=$(curl -fsSL \
      "${REPO_BASE_URL}/stable/rpm/nvidia-container-toolkit.repo")
    printf '%s\n' "$repo_definition" \
      | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
    echo "✅ 仓库配置完成"
    ;;
  *)
    echo "⚠️ 不支持的系统: $OS" >&2
    echo " 请手动安装,参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html" >&2
    exit 1
    ;;
esac
echo ""

# Refresh the package index.
echo "🔄 更新包列表..."
if [[ "$pkg_family" == "deb" ]]; then
  sudo apt-get update
else
  sudo yum makecache
fi
echo ""

# Install the NVIDIA Container Toolkit.
echo "🔨 安装 NVIDIA Container Toolkit..."
if [[ "$pkg_family" == "deb" ]]; then
  sudo apt-get install -y nvidia-container-toolkit
else
  sudo yum install -y nvidia-container-toolkit
fi
echo ""

# Register the NVIDIA runtime in Docker's daemon configuration.
echo "⚙️ 配置 Docker 使用 NVIDIA 运行时..."
sudo nvidia-ctk runtime configure --runtime=docker
echo "✅ Docker 配置完成"
echo ""

# Restart Docker so that the new runtime configuration takes effect.
echo "🔄 重启 Docker 服务..."
sudo systemctl restart docker
echo "✅ Docker 重启完成"
echo ""

# Verify the installation.
# A plain "ubuntu" image is sufficient: the NVIDIA runtime injects nvidia-smi
# and the driver libraries into the container, so the check does not depend
# on a specific CUDA image tag remaining available on the registry.
echo "🧪 验证安装..."
echo "测试 Docker GPU 访问..."
if smoke_output=$(sudo docker run --rm --runtime=nvidia --gpus all \
  ubuntu nvidia-smi 2>&1); then
  echo "✅ NVIDIA Container Toolkit 安装成功!"
  echo ""
  echo "GPU 可用于 Docker 容器!"
else
  echo "❌ 验证失败,请检查日志" >&2
  # Show what docker reported so there is actually a log to inspect.
  printf '%s\n' "$smoke_output" >&2
  exit 1
fi

echo ""
echo "========================================="
echo " 🎉 安装完成!"
echo "========================================="
echo ""
echo "📝 下一步:"
echo " 1. 停止当前的 Xinference 容器(如果在运行):"
echo " sudo docker stop xinference"
echo " sudo docker rm xinference"
echo ""
echo " 2. 重新启动 Xinference 服务(会自动使用 GPU):"
echo " cd $(pwd)"
echo " bash start.sh"
echo ""
echo " 3. 部署模型:"
echo " python deploy_models.py"
echo ""
echo "✅ 现在可以使用 GPU 加速了!"
echo ""