start.sh 5.14 KB
#!/bin/bash

# Xinference Docker 部署脚本
# 用于启动 Xinference 服务

set -e

# Work out how Docker has to be invoked and publish the result in two
# space-separated command strings that the rest of the script expands unquoted:
#   DOCKER          - "docker" or "sudo docker"
#   DOCKER_COMPOSE  - Compose V2 ("docker compose") or V1 ("docker-compose"),
#                     carrying the same optional "sudo " prefix
#
# sudo is only used when it is really needed: plain `docker info` must fail
# AND passwordless `sudo -n docker info` must succeed. In every other case
# (including Docker being unreachable either way) plain `docker` is used.
SUDO_PREFIX=""
if ! docker info &> /dev/null && sudo -n docker info &> /dev/null; then
    SUDO_PREFIX="sudo "
fi
DOCKER="${SUDO_PREFIX}docker"

# Prefer the Compose V2 plugin; fall back to the standalone V1 binary.
if $DOCKER compose version &> /dev/null; then
    DOCKER_COMPOSE="${SUDO_PREFIX}docker compose"
    echo "✅ 检测到 Docker Compose V2"
elif command -v docker-compose &> /dev/null; then
    DOCKER_COMPOSE="${SUDO_PREFIX}docker-compose"
    echo "✅ 检测到 Docker Compose V1"
else
    echo "❌ 错误: 未找到 Docker Compose"
    exit 1
fi

# Print the startup banner followed by one blank line.
banner_rule="========================================="
printf '%s\n' "$banner_rule" "  Xinference Docker 部署脚本" "$banner_rule" ""

# Probe for usable GPU support and set GPU_AVAILABLE to "true" or "false".
#
# GPU_AVAILABLE becomes "true" only when nvidia-smi exists on the host AND a
# throw-away container started with `--gpus all` can run nvidia-smi.
#
# The probe image can be overridden through the GPU_TEST_IMAGE environment
# variable. A stock Ubuntu image is enough: when the NVIDIA Container Toolkit
# handles `--gpus`, it makes nvidia-smi available inside the container. The
# previously hard-coded nvidia/cuda:11.0-base tag is no longer published, so
# the probe failed on hosts that did not already have it cached.
GPU_TEST_IMAGE="${GPU_TEST_IMAGE:-ubuntu:22.04}"

echo "🔍 检查 GPU 可用性..."
GPU_AVAILABLE=false
if command -v nvidia-smi &> /dev/null; then
    echo "✅ 检测到 NVIDIA GPU:"
    # Only the first reported GPU is shown.
    nvidia-smi --query-gpu=name,memory.total --format=csv,noheader | head -n 1

    # Check that containers can actually reach the GPU.
    if $DOCKER run --rm --gpus all "$GPU_TEST_IMAGE" nvidia-smi &> /dev/null; then
        GPU_AVAILABLE=true
        echo "✅ NVIDIA Container Toolkit 已安装,GPU 可用"
    else
        echo "⚠️  检测到 GPU 但 NVIDIA Container Toolkit 未安装"
        echo "⚠️  将使用 CPU 模式启动"
        echo "💡 如需 GPU 加速,请安装 NVIDIA Container Toolkit(见下方说明)"
    fi
else
    echo "⚠️  未检测到 NVIDIA GPU,将使用 CPU 模式"
fi

# Make sure a "models" directory exists in the current working directory.
printf '\n%s\n' "📁 创建模型存储目录..."
mkdir -p -- ./models

# Pull the Xinference image. The same image is pulled in both modes; CPU mode
# only adds a hint about configuring the NVIDIA Docker runtime.
printf '\n%s\n' "🐳 拉取 Xinference Docker 镜像..."
if [ "$GPU_AVAILABLE" != true ]; then
    echo "⚠️  CPU 模式:如需 GPU 支持,请配置好 NVIDIA Docker 运行时"
fi
$DOCKER pull xprobe/xinference:latest

# Best-effort cleanup of a previous deployment: stop and remove a container
# named "xinference", then bring down the Compose project. Failures (for
# example, nothing to remove) are deliberately ignored and their stderr hidden.
printf '\n%s\n' "🛑 停止并删除旧容器..."
for docker_action in stop rm; do
    $DOCKER "$docker_action" xinference 2> /dev/null || true
done
$DOCKER_COMPOSE down 2> /dev/null || true

# Start the service. GPU mode delegates to Docker Compose; CPU mode starts a
# single container directly with `docker run`.
printf '\n%s\n' "🚀 启动 Xinference 服务..."
if [ "$GPU_AVAILABLE" != true ]; then
    echo "💻 使用 CPU 模式启动..."
    # Container options, image and command for the CPU-only container.
    # ./models from the current directory is mounted at /data, which is also
    # passed to the container as XINFERENCE_HOME.
    cpu_run_args=(
        -d
        --name xinference
        -p 9997:9997
        -p 9998:9998
        -v "${PWD}/models:/data"
        -e XINFERENCE_HOME=/data
        --restart unless-stopped
        xprobe/xinference:latest
        xinference-local -H 0.0.0.0
    )
    $DOCKER run "${cpu_run_args[@]}"
else
    echo "🔥 使用 GPU 模式启动..."
    $DOCKER_COMPOSE up -d
fi

# Wait for the API on localhost:9997 to answer, polling up to 30 times with a
# 2-second pause between attempts. The loop only reports progress; it does not
# fail the script when the service never comes up.
#
# Each probe is bounded by --connect-timeout/--max-time so that a connection
# which is accepted but never answered cannot block the loop indefinitely.
echo ""
echo "⏳ 等待服务启动..."
for i in {1..30}; do
    if curl -s --connect-timeout 2 --max-time 5 \
        http://localhost:9997/v1/models > /dev/null 2>&1; then
        echo "✅ Xinference 服务启动成功!"
        break
    fi
    echo "   等待中... ($i/30)"
    # No point in sleeping after the final attempt.
    if [ "$i" -lt 30 ]; then
        sleep 2
    fi
done

# Final health check and summary.
#
# Probes the API once more (bounded by curl timeouts so the check cannot hang).
# On failure it prints how to inspect the container logs and exits with
# status 1. On success it prints the service addresses and next steps, and,
# when running without GPU support, instructions for installing the NVIDIA
# Container Toolkit.
#
# The printed install instructions use the libnvidia-container apt repository
# with a dedicated keyring, because `apt-key` and the old nvidia-docker
# repository are deprecated.
echo ""
echo "📊 服务状态检查..."
if ! curl -s --connect-timeout 2 --max-time 5 \
    http://localhost:9997/v1/models > /dev/null 2>&1; then
    echo "❌ 服务启动失败,请查看日志:"
    echo "   $DOCKER logs xinference"
    exit 1
fi

printf '%s\n' \
    "✅ 服务健康检查通过" \
    "" \
    "=========================================" \
    "  🎉 部署成功!" \
    "=========================================" \
    "" \
    "📍 服务地址:" \
    "   - API: http://localhost:9997" \
    "   - Dashboard: http://localhost:9998" \
    "" \
    "📝 下一步操作:" \
    "   1. 查看日志: $DOCKER logs -f xinference" \
    "   2. 部署模型: python deploy_models.py" \
    "   3. 测试搜索: python ecommerce_demo.py" \
    ""

if [ "$GPU_AVAILABLE" = false ]; then
    printf '%s\n' \
        "💡 启用 GPU 加速(可选):" \
        "   如果你想使用 GPU 加速,请安装 NVIDIA Container Toolkit:" \
        "" \
        "   # 1. 添加 NVIDIA 仓库" \
        "   curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg" \
        "   curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list" \
        "" \
        "   # 2. 安装 NVIDIA Container Toolkit" \
        "   sudo apt-get update" \
        "   sudo apt-get install -y nvidia-container-toolkit" \
        "" \
        "   # 3. 配置 Docker 运行时并重启 Docker" \
        "   sudo nvidia-ctk runtime configure --runtime=docker" \
        "   sudo systemctl restart docker" \
        "" \
        "   # 4. 重新运行此脚本" \
        "   bash start.sh" \
        ""
fi

printf '%s\n' \
    "📚 查看所有已部署模型:" \
    "   curl http://localhost:9997/v1/models" \
    ""