#!/bin/bash
# install_nvidia_container_toolkit.sh

# Automated installer for the NVIDIA Container Toolkit.
# Targets Ubuntu/Debian; CentOS/RHEL are also handled via yum.

# Abort on the first command that fails.
set -o errexit

# Title banner, followed by one blank line.
printf '%s\n' \
    "=========================================" \
    "  NVIDIA Container Toolkit 安装脚本" \
    "=========================================" \
    ""

# Detect the distribution: bail out early when /etc/os-release is missing,
# otherwise source it and keep its ID / VERSION_ID as OS / OS_VERSION.
if [ ! -f /etc/os-release ]; then
    echo "❌ 无法检测系统类型"
    exit 1
fi

. /etc/os-release
OS=$ID
OS_VERSION=$VERSION_ID

printf '%s\n' "检测到系统: $OS $OS_VERSION" ""

# Require the NVIDIA driver: nvidia-smi must be on PATH, otherwise point the
# user at the driver download page and stop.
echo "🔍 检查 NVIDIA 驱动..."
command -v nvidia-smi > /dev/null 2>&1 || {
    echo "❌ 未找到 NVIDIA 驱动,请先安装 NVIDIA 驱动"
    echo "   访问: https://www.nvidia.com/Download/index.aspx"
    exit 1
}

# Show the GPU model and driver version that were found.
echo "✅ NVIDIA 驱动已安装:"
nvidia-smi --query-gpu=name,driver_version --format=csv,noheader
echo ""

# Require Docker: the docker CLI must be on PATH before anything is installed.
printf '%s\n' "🔍 检查 Docker..."
if command -v docker > /dev/null 2>&1; then
    printf '%s\n' "✅ Docker 已安装"
    docker --version
    printf '\n'
else
    printf '%s\n' "❌ 未找到 Docker,请先安装 Docker"
    exit 1
fi

# Register the NVIDIA Container Toolkit package repository for $OS.
#
# Uses the distribution-independent "stable" repository published under
# nvidia.github.io/libnvidia-container instead of the legacy per-distribution
# nvidia-docker URLs. The previous code built that URL from VERSION_ID with the
# dot stripped (e.g. "ubuntu2004") and fetched it with curl lacking -f, so an
# HTTP error page could be written into the package-manager configuration
# while the script still reported success.
#
# Reads:  OS (distribution ID taken from /etc/os-release)
# Exits:  1 on an unsupported distribution or when a download fails.
echo "📦 添加 NVIDIA Container Toolkit 仓库..."

if [ "$OS" = "ubuntu" ] || [ "$OS" = "debian" ]; then
    # Ubuntu/Debian
    keyring=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg

    # Download everything first (-f turns HTTP errors into a non-zero exit),
    # so nothing under /etc or /usr is touched when the network step fails.
    gpg_key=$(curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey) || {
        echo "❌ 下载 NVIDIA GPG 密钥失败" >&2
        exit 1
    }
    repo_list=$(curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list) || {
        echo "❌ 下载 NVIDIA 仓库配置失败" >&2
        exit 1
    }

    # apt-key is deprecated: store the key in a dedicated keyring and bind the
    # repository to it with signed-by. --batch --yes lets re-runs overwrite
    # an existing keyring without prompting.
    printf '%s\n' "$gpg_key" | sudo gpg --batch --yes --dearmor -o "$keyring"
    printf '%s\n' "$repo_list" \
      | sed "s#deb https://#deb [signed-by=$keyring] https://#g" \
      | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

    echo "✅ 仓库配置完成"
elif [ "$OS" = "centos" ] || [ "$OS" = "rhel" ]; then
    # CentOS/RHEL
    repo_file=$(curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo) || {
        echo "❌ 下载 NVIDIA 仓库配置失败" >&2
        exit 1
    }
    printf '%s\n' "$repo_file" \
      | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo

    echo "✅ 仓库配置完成"
else
    echo "⚠️  不支持的系统: $OS"
    echo "   请手动安装,参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html"
    exit 1
fi

echo ""

# Refresh the package index with the package manager that matches $OS.
printf '%s\n' "🔄 更新包列表..."
case "$OS" in
    ubuntu | debian)
        sudo apt-get update
        ;;
    centos | rhel)
        sudo yum makecache
        ;;
esac
printf '\n'

# Install the nvidia-container-toolkit package non-interactively (-y) with the
# package manager that matches $OS.
printf '%s\n' "🔨 安装 NVIDIA Container Toolkit..."
case "$OS" in
    ubuntu | debian)
        sudo apt-get install -y nvidia-container-toolkit
        ;;
    centos | rhel)
        sudo yum install -y nvidia-container-toolkit
        ;;
esac
printf '\n'

# Configure Docker for the NVIDIA runtime through nvidia-ctk.
printf '%s\n' "⚙️  配置 Docker 使用 NVIDIA 运行时..."
sudo nvidia-ctk runtime configure --runtime=docker
printf '%s\n' "✅ Docker 配置完成" ""

# Restart the Docker service (presumably so the daemon picks up the runtime
# configuration written above).
printf '%s\n' "🔄 重启 Docker 服务..."
sudo systemctl restart docker
printf '%s\n' "✅ Docker 重启完成" ""

# Verify the installation by running nvidia-smi inside a GPU-enabled container.
#
# Uses the plain `ubuntu` image (the sample workload from NVIDIA's install
# guide): with --gpus all the container is expected to get nvidia-smi from the
# host driver, so no CUDA image is needed. The previously used tag
# nvidia/cuda:11.0-base is no longer published, which made this check fail
# even after a successful install.
#
# The container output is captured rather than discarded, so that the failure
# message ("check the logs") actually comes with something to check.
echo "🧪 验证安装..."
echo "测试 Docker GPU 访问..."
if verify_output=$(sudo docker run --rm --gpus all ubuntu nvidia-smi 2>&1); then
    echo "✅ NVIDIA Container Toolkit 安装成功!"
    echo ""
    echo "GPU 可用于 Docker 容器!"
else
    echo "❌ 验证失败,请检查日志"
    # Surface the docker/nvidia-smi output on stderr for diagnosis.
    printf '%s\n' "$verify_output" >&2
    exit 1
fi

# Closing banner and follow-up instructions. The here-document is unquoted on
# purpose so that $(pwd) expands to the directory the script was run from.
cat <<EOF

=========================================
  🎉 安装完成!
=========================================

📝 下一步:
   1. 停止当前的 Xinference 容器(如果在运行):
      sudo docker stop xinference
      sudo docker rm xinference

   2. 重新启动 Xinference 服务(会自动使用 GPU):
      cd $(pwd)
      bash start.sh

   3. 部署模型:
      python deploy_models.py

✅ 现在可以使用 GPU 加速了!

EOF