Commit 775db2b03caf3e78999e91e24b497e8352a6cf1d
1 parent
15eae5ee
xinfer
Showing
23 changed files
with
802 additions
and
2905 deletions
Show diff stats
third-party/xinference/ENV_SETUP.md deleted
| ... | ... | @@ -1,162 +0,0 @@ |
| 1 | -# Xinference 环境配置指南 | |
| 2 | - | |
| 3 | -独立的 Conda 环境用于运行 Xinference 客户端和演示脚本。 | |
| 4 | - | |
| 5 | -## 📋 快速开始 | |
| 6 | - | |
| 7 | -### 方法1:一键安装和配置(推荐) | |
| 8 | - | |
| 9 | -```bash | |
| 10 | -cd /data/tw/SearchEngine/third-party/xinference | |
| 11 | - | |
| 12 | -# 1. 创建环境 | |
| 13 | -bash setup_env.sh | |
| 14 | - | |
| 15 | -# 2. 设置快捷别名 | |
| 16 | -bash setup_alias.sh | |
| 17 | - | |
| 18 | -# 3. 使配置生效 | |
| 19 | -source ~/.bashrc # 或 source ~/.zshrc | |
| 20 | - | |
| 21 | -# 4. 激活环境 | |
| 22 | -xinference-env | |
| 23 | -``` | |
| 24 | - | |
| 25 | -### 方法2:手动配置 | |
| 26 | - | |
| 27 | -```bash | |
| 28 | -# 1. 创建环境 | |
| 29 | -conda create -n xinference python=3.10 -y | |
| 30 | - | |
| 31 | -# 2. 激活环境 | |
| 32 | -conda activate xinference | |
| 33 | - | |
| 34 | -# 3. 安装依赖 | |
| 35 | -pip install xinference-client numpy requests | |
| 36 | - | |
| 37 | -# 4. 运行演示 | |
| 38 | -python ecommerce_demo.py | |
| 39 | -``` | |
| 40 | - | |
| 41 | -## 🚀 快捷命令 | |
| 42 | - | |
| 43 | -配置别名后,可以使用以下快捷命令: | |
| 44 | - | |
| 45 | -| 命令 | 说明 | | |
| 46 | -|------|------| | |
| 47 | -| `xinference-env` | 激活环境并切换到 Xinference 目录 | | |
| 48 | -| `xinference-activate` | 仅激活 Xinference 环境 | | |
| 49 | -| `xinference-cd` | 切换到 Xinference 目录 | | |
| 50 | - | |
| 51 | -## 📦 已安装的包 | |
| 52 | - | |
| 53 | -- `xinference-client` - Xinference 客户端库 | |
| 54 | -- `numpy` - 数值计算库 | |
| 55 | -- `requests` - HTTP 请求库 | |
| 56 | - | |
| 57 | -## 💡 使用示例 | |
| 58 | - | |
| 59 | -### 激活环境后运行演示 | |
| 60 | - | |
| 61 | -```bash | |
| 62 | -# 方式1: 使用快捷命令 | |
| 63 | -xinference-env | |
| 64 | -python ecommerce_demo.py | |
| 65 | - | |
| 66 | -# 方式2: 手动激活 | |
| 67 | -conda activate xinference | |
| 68 | -python ecommerce_demo.py | |
| 69 | - | |
| 70 | -# 方式3: 使用激活脚本 | |
| 71 | -source activate.sh | |
| 72 | -python ecommerce_demo.py | |
| 73 | -``` | |
| 74 | - | |
| 75 | -### 部署模型 | |
| 76 | - | |
| 77 | -```bash | |
| 78 | -# 确保环境已激活 | |
| 79 | -xinference-env | |
| 80 | - | |
| 81 | -# 运行部署脚本 | |
| 82 | -python deploy_models.py | |
| 83 | -``` | |
| 84 | - | |
| 85 | -### 运行简单测试 | |
| 86 | - | |
| 87 | -```bash | |
| 88 | -xinference-env | |
| 89 | -python ecommerce_demo.py --simple | |
| 90 | -``` | |
| 91 | - | |
| 92 | -## 🛠️ 脚本说明 | |
| 93 | - | |
| 94 | -| 脚本 | 说明 | | |
| 95 | -|------|------| | |
| 96 | -| `setup_env.sh` | 创建 Xinference Conda 环境 | | |
| 97 | -| `setup_alias.sh` | 设置 Shell 快捷别名 | | |
| 98 | -| `activate.sh` | 快速激活环境的脚本 | | |
| 99 | -| `start.sh` | 启动 Xinference Docker 服务 | | |
| 100 | - | |
| 101 | -## 🔧 环境管理 | |
| 102 | - | |
| 103 | -### 更新环境 | |
| 104 | - | |
| 105 | -```bash | |
| 106 | -conda activate xinference | |
| 107 | -pip install --upgrade xinference-client numpy | |
| 108 | -``` | |
| 109 | - | |
| 110 | -### 删除环境 | |
| 111 | - | |
| 112 | -```bash | |
| 113 | -conda env remove -n xinference -y | |
| 114 | -``` | |
| 115 | - | |
| 116 | -### 查看已安装的包 | |
| 117 | - | |
| 118 | -```bash | |
| 119 | -conda activate xinference | |
| 120 | -pip list | |
| 121 | -``` | |
| 122 | - | |
| 123 | -## 📝 注意事项 | |
| 124 | - | |
| 125 | -1. **Docker 服务**:Xinference 服务本身运行在 Docker 中,不需要 Python 环境 | |
| 126 | -2. **客户端**:Python 环境仅用于运行客户端脚本(如 `ecommerce_demo.py`) | |
| 127 | -3. **独立环境**:使用独立环境避免与其他项目的依赖冲突 | |
| 128 | - | |
| 129 | -## 🐛 故障排除 | |
| 130 | - | |
| 131 | -### 问题:找不到 xinference-client | |
| 132 | - | |
| 133 | -```bash | |
| 134 | -# 重新安装 | |
| 135 | -conda activate xinference | |
| 136 | -pip install xinference-client --force-reinstall | |
| 137 | -``` | |
| 138 | - | |
| 139 | -### 问题:环境激活失败 | |
| 140 | - | |
| 141 | -```bash | |
| 142 | -# 初始化 conda | |
| 143 | -conda init bash | |
| 144 | -source ~/.bashrc | |
| 145 | - | |
| 146 | -# 然后再激活 | |
| 147 | -conda activate xinference | |
| 148 | -``` | |
| 149 | - | |
| 150 | -### 问题:快捷命令不生效 | |
| 151 | - | |
| 152 | -```bash | |
| 153 | -# 手动添加别名到 ~/.bashrc | |
| 154 | -echo 'alias xinference-env="source /data/tw/SearchEngine/third-party/xinference/activate.sh"' >> ~/.bashrc | |
| 155 | -source ~/.bashrc | |
| 156 | -``` | |
| 157 | - | |
| 158 | -## 📚 相关文档 | |
| 159 | - | |
| 160 | -- [Xinference 官方文档](https://inference.readthedocs.io/) | |
| 161 | -- [电商搜索演示](ecommerce_demo.py) | |
| 162 | -- [模型部署脚本](deploy_models.py) |
third-party/xinference/QUICKSTART.md deleted
| ... | ... | @@ -1,137 +0,0 @@ |
| 1 | -# Xinference 快速使用指南 | |
| 2 | - | |
| 3 | -## ✅ 环境已配置完成! | |
| 4 | - | |
| 5 | -您的 **xinference** conda 环境已创建并配置好了所有依赖。 | |
| 6 | - | |
| 7 | -## 🚀 快速开始 | |
| 8 | - | |
| 9 | -### 方式1:使用快捷命令(推荐) | |
| 10 | - | |
| 11 | -在新的终端窗口中: | |
| 12 | - | |
| 13 | -```bash | |
| 14 | -# 首次使用需要重新加载配置 | |
| 15 | -source ~/.bashrc | |
| 16 | - | |
| 17 | -# 然后就可以直接使用快捷命令 | |
| 18 | -xinference-env | |
| 19 | - | |
| 20 | -# 运行演示 | |
| 21 | -python ecommerce_demo.py | |
| 22 | -``` | |
| 23 | - | |
| 24 | -### 方式2:手动激活 | |
| 25 | - | |
| 26 | -```bash | |
| 27 | -# 激活环境 | |
| 28 | -conda activate xinference | |
| 29 | - | |
| 30 | -# 运行演示 | |
| 31 | -python ecommerce_demo.py | |
| 32 | -``` | |
| 33 | - | |
| 34 | -### 方式3:使用激活脚本 | |
| 35 | - | |
| 36 | -```bash | |
| 37 | -# 切换到目录 | |
| 38 | -cd /data/tw/SearchEngine/third-party/xinference | |
| 39 | - | |
| 40 | -# 使用激活脚本 | |
| 41 | -source activate.sh | |
| 42 | - | |
| 43 | -# 运行演示 | |
| 44 | -python ecommerce_demo.py | |
| 45 | -``` | |
| 46 | - | |
| 47 | -## 📝 可用命令 | |
| 48 | - | |
| 49 | -| 快捷命令 | 说明 | | |
| 50 | -|---------|------| | |
| 51 | -| `xinference-env` | 激活环境并切换到 Xinference 目录 | | |
| 52 | -| `xinference-activate` | 仅激活 xinference 环境 | | |
| 53 | -| `xinference-cd` | 切换到 Xinference 目录 | | |
| 54 | - | |
| 55 | -## 🧪 测试环境 | |
| 56 | - | |
| 57 | -激活环境后,运行以下命令测试: | |
| 58 | - | |
| 59 | -```bash | |
| 60 | -python -c "from xinference_client import RESTfulClient; print('✅ 环境配置成功!')" | |
| 61 | -``` | |
| 62 | - | |
| 63 | -应该显示:`✅ 环境配置成功!` | |
| 64 | - | |
| 65 | -## 🔧 环境信息 | |
| 66 | - | |
| 67 | -- **环境名**: xinference | |
| 68 | -- **Python 版本**: 3.10.19 | |
| 69 | -- **已安装包**: | |
| 70 | - - xinference-client 1.15.0 | |
| 71 | - - numpy 2.2.6 | |
| 72 | - - requests 2.32.5 | |
| 73 | - | |
| 74 | -## 📚 下一步 | |
| 75 | - | |
| 76 | -1. **启动 Xinference 服务**(如果还没启动): | |
| 77 | - ```bash | |
| 78 | - bash start.sh | |
| 79 | - ``` | |
| 80 | - | |
| 81 | -2. **部署模型**: | |
| 82 | - ```bash | |
| 83 | - xinference-env | |
| 84 | - python deploy_models.py | |
| 85 | - ``` | |
| 86 | - | |
| 87 | -3. **运行演示**: | |
| 88 | - ```bash | |
| 89 | - xinference-env | |
| 90 | - python ecommerce_demo.py | |
| 91 | - ``` | |
| 92 | - | |
| 93 | -4. **退出环境**: | |
| 94 | - ```bash | |
| 95 | - conda deactivate | |
| 96 | - ``` | |
| 97 | - | |
| 98 | -## 🐛 常见问题 | |
| 99 | - | |
| 100 | -### 问题1:快捷命令不生效 | |
| 101 | - | |
| 102 | -```bash | |
| 103 | -# 重新加载配置 | |
| 104 | -source ~/.bashrc | |
| 105 | - | |
| 106 | -# 或者打开新的终端窗口 | |
| 107 | -``` | |
| 108 | - | |
| 109 | -### 问题2:找不到 xinference_client | |
| 110 | - | |
| 111 | -```bash | |
| 112 | -# 确认环境已激活 | |
| 113 | -conda activate xinference | |
| 114 | - | |
| 115 | -# 检查安装 | |
| 116 | -pip list | grep xinference | |
| 117 | - | |
| 118 | -# 如果没有,重新安装 | |
| 119 | -pip install xinference-client | |
| 120 | -``` | |
| 121 | - | |
| 122 | -### 问题3:启动脚本找不到命令 | |
| 123 | - | |
| 124 | -确保使用正确的 Python 环境: | |
| 125 | -```bash | |
| 126 | -# 激活环境后再运行 | |
| 127 | -conda activate xinference | |
| 128 | -python ecommerce_demo.py | |
| 129 | -``` | |
| 130 | - | |
| 131 | -## 💡 提示 | |
| 132 | - | |
| 133 | -- Xinference 服务运行在 Docker 中(端口 9997/9998) | |
| 134 | -- Python 环境仅用于运行客户端脚本 | |
| 135 | -- 使用 `xinference-env` 快捷命令最方便 | |
| 136 | - | |
| 137 | -详细文档请参考: [ENV_SETUP.md](ENV_SETUP.md) |
third-party/xinference/README.md deleted
| ... | ... | @@ -1,715 +0,0 @@ |
| 1 | -# Xinference 电商搜索部署完整指南 | |
| 2 | - | |
| 3 | -使用 Qwen3-Embedding 和 Qwen3-Reranker 构建两阶段搜索系统(密集检索 + 精排) | |
| 4 | - | |
| 5 | -## 📋 快速导航 | |
| 6 | - | |
| 7 | -- [快速开始](#快速开始) | |
| 8 | -- [系统要求](#系统要求) | |
| 9 | -- [完整安装步骤](#完整安装步骤) | |
| 10 | -- [GPU 配置](#gpu-配置) | |
| 11 | -- [模型部署](#模型部署) | |
| 12 | -- [使用示例](#使用示例) | |
| 13 | -- [故障排除](#故障排除) | |
| 14 | - | |
| 15 | ---- | |
| 16 | - | |
| 17 | -## 快速开始 | |
| 18 | - | |
| 19 | -### 最快 5 分钟上手(CPU 模式) | |
| 20 | - | |
| 21 | -```bash | |
| 22 | -cd /data/tw/SearchEngine/third-party/xinference | |
| 23 | - | |
| 24 | -# 1. 启动 Xinference 服务(自动 CPU 模式) | |
| 25 | -bash start.sh | |
| 26 | - | |
| 27 | -# 2. 创建 Python 环境 | |
| 28 | -bash setup_env.sh | |
| 29 | -bash setup_alias.sh | |
| 30 | -source ~/.bashrc | |
| 31 | - | |
| 32 | -# 3. 激活环境 | |
| 33 | -xinference-env | |
| 34 | - | |
| 35 | -# 4. 部署模型(CPU 模式,慢但可用) | |
| 36 | -python deploy_models.py | |
| 37 | - | |
| 38 | -# 5. 运行演示 | |
| 39 | -python ecommerce_demo.py | |
| 40 | -``` | |
| 41 | - | |
| 42 | -**注意**:CPU 模式仅用于测试,生产环境请使用 GPU 模式。 | |
| 43 | - | |
| 44 | ---- | |
| 45 | - | |
| 46 | -## 系统要求 | |
| 47 | - | |
| 48 | -### 硬件配置 | |
| 49 | - | |
| 50 | -| 组件 | 最低配置 | 推荐配置 | | |
| 51 | -|------|---------|---------| | |
| 52 | -| CPU | 8核 | 16核+ | | |
| 53 | -| 内存 | 16GB | 64GB+ | | |
| 54 | -| GPU | 无(CPU 模式) | NVIDIA Tesla T4 16GB+ | | |
| 55 | -| 存储 | 30GB | 100GB+ SSD | | |
| 56 | - | |
| 57 | -**GPU 显存需求**: | |
| 58 | -- Qwen3-Embedding (4B): ~8GB | |
| 59 | -- Qwen3-Reranker (4B): ~8GB | |
| 60 | -- 两个模型同时运行: ~16GB+ | |
| 61 | - | |
| 62 | -### 软件要求 | |
| 63 | - | |
| 64 | -- **操作系统**: Linux (Ubuntu 20.04+, CentOS 7+) | |
| 65 | -- **Docker**: 20.10+ | |
| 66 | -- **Docker Compose**: v2.0+ | |
| 67 | -- **NVIDIA Driver**: 525.0+ (GPU 模式) | |
| 68 | -- **Conda**: Miniconda3 或 Anaconda3 | |
| 69 | -- **Python**: 3.10 | |
| 70 | - | |
| 71 | ---- | |
| 72 | - | |
| 73 | -## 完整安装步骤 | |
| 74 | - | |
| 75 | -### 步骤 1: 环境检查 | |
| 76 | - | |
| 77 | -```bash | |
| 78 | -# 检查系统信息 | |
| 79 | -cat /etc/os-release | |
| 80 | - | |
| 81 | -# 检查 GPU(如果有) | |
| 82 | -nvidia-smi | |
| 83 | - | |
| 84 | -# 检查 Docker | |
| 85 | -docker --version | |
| 86 | -docker compose version | |
| 87 | - | |
| 88 | -# 检查 Conda | |
| 89 | -conda --version | |
| 90 | -``` | |
| 91 | - | |
| 92 | -### 步骤 2: 启动 Xinference 服务 | |
| 93 | - | |
| 94 | -#### 选项 A: CPU 模式(快速测试) | |
| 95 | - | |
| 96 | -```bash | |
| 97 | -cd /data/tw/SearchEngine/third-party/xinference | |
| 98 | -bash start.sh | |
| 99 | -``` | |
| 100 | - | |
| 101 | -**特点**: | |
| 102 | -- ✅ 无需额外配置 | |
| 103 | -- ✅ 快速启动 | |
| 104 | -- ❌ 速度慢(10-50倍) | |
| 105 | -- ❌ 仅适合测试 | |
| 106 | - | |
| 107 | -#### 选项 B: GPU 模式(生产环境) | |
| 108 | - | |
| 109 | -**自动安装 NVIDIA Container Toolkit**: | |
| 110 | - | |
| 111 | -```bash | |
| 112 | -cd /data/tw/SearchEngine/third-party/xinference | |
| 113 | -bash install_nvidia_container_toolkit.sh | |
| 114 | -``` | |
| 115 | - | |
| 116 | -脚本会自动: | |
| 117 | -1. 检测系统类型 | |
| 118 | -2. 添加 NVIDIA 仓库 | |
| 119 | -3. 安装 nvidia-container-toolkit | |
| 120 | -4. 配置 Docker 运行时 | |
| 121 | -5. 重启 Docker 服务 | |
| 122 | -6. 验证安装 | |
| 123 | - | |
| 124 | -**手动安装(可选)**: | |
| 125 | - | |
| 126 | -```bash | |
| 127 | -# Ubuntu/Debian | |
| 128 | -distribution=$(. /etc/os-release;echo $ID$VERSION_ID) | |
| 129 | -curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - | |
| 130 | -curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \ | |
| 131 | - sudo tee /etc/apt/sources.list.d/nvidia-docker.list | |
| 132 | - | |
| 133 | -sudo apt-get update | |
| 134 | -sudo apt-get install -y nvidia-container-toolkit | |
| 135 | - | |
| 136 | -sudo nvidia-ctk runtime configure --runtime=docker | |
| 137 | -sudo systemctl restart docker | |
| 138 | -``` | |
| 139 | - | |
| 140 | -**安装完成后,重新启动服务**: | |
| 141 | - | |
| 142 | -```bash | |
| 143 | -# 停止旧容器 | |
| 144 | -sudo docker stop xinference | |
| 145 | -sudo docker rm xinference | |
| 146 | - | |
| 147 | -# 重新启动(自动使用 GPU) | |
| 148 | -bash start.sh | |
| 149 | -``` | |
| 150 | - | |
| 151 | -**验证 GPU 模式**: | |
| 152 | - | |
| 153 | -```bash | |
| 154 | -# 检查服务日志 | |
| 155 | -sudo docker logs xinference | grep -i gpu | |
| 156 | - | |
| 157 | -# 或者检查 GPU 使用 | |
| 158 | -nvidia-smi | |
| 159 | -``` | |
| 160 | - | |
| 161 | -### 步骤 3: 配置 Python 环境 | |
| 162 | - | |
| 163 | -```bash | |
| 164 | -cd /data/tw/SearchEngine/third-party/xinference | |
| 165 | - | |
| 166 | -# 自动创建环境 | |
| 167 | -bash setup_env.sh | |
| 168 | - | |
| 169 | -# 设置快捷别名 | |
| 170 | -bash setup_alias.sh | |
| 171 | - | |
| 172 | -# 使配置生效 | |
| 173 | -source ~/.bashrc | |
| 174 | -``` | |
| 175 | - | |
| 176 | -**环境信息**: | |
| 177 | -- 环境名: `xinference` | |
| 178 | -- Python: 3.10.19 | |
| 179 | -- 包: xinference-client 1.15.0, numpy 2.2.6 | |
| 180 | - | |
| 181 | -**快捷命令**: | |
| 182 | -```bash | |
| 183 | -xinference-env # 激活环境并切换目录 | |
| 184 | -xinference-activate # 仅激活环境 | |
| 185 | -xinference-cd # 切换到 xinference 目录 | |
| 186 | -``` | |
| 187 | - | |
| 188 | -### 步骤 4: 验证安装 | |
| 189 | - | |
| 190 | -```bash | |
| 191 | -# 检查 Xinference 服务 | |
| 192 | -curl http://localhost:9997/v1/models | |
| 193 | - | |
| 194 | -# 预期输出 | |
| 195 | -{"object":"list","data":[]} | |
| 196 | - | |
| 197 | -# 检查 Python 环境 | |
| 198 | -xinference-env | |
| 199 | -python -c "from xinference_client import RESTfulClient; print('✅ 环境配置成功!')" | |
| 200 | - | |
| 201 | -# 查看 Dashboard | |
| 202 | -open http://localhost:9998 | |
| 203 | -# 或在浏览器访问 | |
| 204 | -``` | |
| 205 | - | |
| 206 | ---- | |
| 207 | - | |
| 208 | -## GPU 配置 | |
| 209 | - | |
| 210 | -### 检查 GPU 状态 | |
| 211 | - | |
| 212 | -```bash | |
| 213 | -# 检查 NVIDIA 驱动 | |
| 214 | -nvidia-smi | |
| 215 | - | |
| 216 | -# 检查 Docker GPU 支持 | |
| 217 | -sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi | |
| 218 | - | |
| 219 | -# 预期输出(成功) | |
| 220 | -+-----------------------------------------------------------------------------+ | |
| 221 | -| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 11.0 | | |
| 222 | -|-------------------------------+----------------------+----------------------+ | |
| 223 | -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| 224 | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | |
| 225 | -|===============================+======================+======================| | |
| 226 | -| 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | | |
| 227 | -| 34% 42C P8 16W / 70W | 0MiB / 16384MiB | 0% Default | | |
| 228 | -+-------------------------------+----------------------+----------------------+ | |
| 229 | -``` | |
| 230 | - | |
| 231 | -### GPU 可用性测试 | |
| 232 | - | |
| 233 | -```bash | |
| 234 | -# 测试 1: 检查 Xinference 是否能看到 GPU | |
| 235 | -curl http://localhost:9997/v1/models | |
| 236 | - | |
| 237 | -# 测试 2: 尝试部署一个小模型测试 GPU | |
| 238 | -xinference-env | |
| 239 | -python -c " | |
| 240 | -from xinference_client import RESTfulClient | |
| 241 | -client = RESTfulClient('http://localhost:9997') | |
| 242 | -# 列出支持的 GPU 模型 | |
| 243 | -# 如果成功,说明 GPU 可用 | |
| 244 | -" | |
| 245 | -``` | |
| 246 | - | |
| 247 | -### 多 GPU 配置 | |
| 248 | - | |
| 249 | -```bash | |
| 250 | -# 使用 GPU 0 | |
| 251 | -python deploy_models.py --gpu 0 | |
| 252 | - | |
| 253 | -# 使用 GPU 1 | |
| 254 | -python deploy_models.py --gpu 1 | |
| 255 | - | |
| 256 | -# 使用多个 GPU | |
| 257 | -python deploy_models.py --gpu 0,1 | |
| 258 | -``` | |
| 259 | - | |
| 260 | ---- | |
| 261 | - | |
| 262 | -## 模型部署 | |
| 263 | - | |
| 264 | -### 部署 Qwen3 4B 模型 | |
| 265 | - | |
| 266 | -#### 完整部署(Embedding + Reranker) | |
| 267 | - | |
| 268 | -```bash | |
| 269 | -xinference-env | |
| 270 | - | |
| 271 | -# 部署所有模型 | |
| 272 | -python deploy_models.py | |
| 273 | -``` | |
| 274 | - | |
| 275 | -**预期输出**: | |
| 276 | -``` | |
| 277 | -============================================================ | |
| 278 | - Qwen3 模型自动部署 | |
| 279 | -============================================================ | |
| 280 | - | |
| 281 | -🔗 连接到 Xinference 服务: http://localhost:9997 | |
| 282 | -✅ 连接成功! | |
| 283 | - | |
| 284 | -============================================================ | |
| 285 | - 部署 Qwen3-Embedding 模型 (4B) | |
| 286 | -============================================================ | |
| 287 | - | |
| 288 | -⏳ 正在部署,首次运行需要下载模型,请耐心等待... | |
| 289 | - 模型大小: ~8GB | |
| 290 | - 上下文长度: 8192 tokens | |
| 291 | - 向量维度: 1024 | |
| 292 | - | |
| 293 | -✅ Qwen3-Embedding 部署成功! | |
| 294 | - 模型 UID: qwen3-embedding-4b | |
| 295 | - | |
| 296 | -============================================================ | |
| 297 | - 部署 Qwen3-Reranker 模型 (4B) | |
| 298 | -============================================================ | |
| 299 | - | |
| 300 | -✅ Qwen3-Reranker 部署成功! | |
| 301 | - 模型 UID: qwen3-reranker-4b | |
| 302 | - | |
| 303 | -============================================================ | |
| 304 | - 🎉 模型部署完成! | |
| 305 | -============================================================ | |
| 306 | -``` | |
| 307 | - | |
| 308 | -**预计时间**(GPU 模式): | |
| 309 | -- 首次下载: 5-15 分钟(取决于网络) | |
| 310 | -- 模型加载: 2-5 分钟 | |
| 311 | - | |
| 312 | -#### 单独部署 | |
| 313 | - | |
| 314 | -```bash | |
| 315 | -# 仅部署 Embedding | |
| 316 | -python deploy_models.py --embedding-only | |
| 317 | - | |
| 318 | -# 仅部署 Reranker | |
| 319 | -python deploy_models.py --reranker-only | |
| 320 | - | |
| 321 | -# 指定 GPU | |
| 322 | -python deploy_models.py --gpu 1 | |
| 323 | -``` | |
| 324 | - | |
| 325 | -#### 查看已部署模型 | |
| 326 | - | |
| 327 | -```bash | |
| 328 | -# 使用脚本 | |
| 329 | -python deploy_models.py --list | |
| 330 | - | |
| 331 | -# 使用 API | |
| 332 | -curl http://localhost:9997/v1/models | python -m json.tool | |
| 333 | -``` | |
| 334 | - | |
| 335 | -### 测试模型 | |
| 336 | - | |
| 337 | -#### 测试 Embedding | |
| 338 | - | |
| 339 | -```bash | |
| 340 | -xinference-env | |
| 341 | -python -c " | |
| 342 | -from xinference_client import RESTfulClient | |
| 343 | - | |
| 344 | -client = RESTfulClient('http://localhost:9997') | |
| 345 | -model = client.get_model('qwen3-embedding-4b') | |
| 346 | - | |
| 347 | -result = model.create_embedding('测试文本') | |
| 348 | -vector = result['data'][0]['embedding'] | |
| 349 | - | |
| 350 | -print(f'✅ 向量维度: {len(vector)}') | |
| 351 | -print(f'✅ 前5维: {vector[:5]}') | |
| 352 | -" | |
| 353 | -``` | |
| 354 | - | |
| 355 | -#### 测试 Reranker | |
| 356 | - | |
| 357 | -```bash | |
| 358 | -xinference-env | |
| 359 | -python -c " | |
| 360 | -from xinference_client import RESTfulClient | |
| 361 | - | |
| 362 | -client = RESTfulClient('http://localhost:9997') | |
| 363 | -model = client.get_model('qwen3-reranker-4b') | |
| 364 | - | |
| 365 | -query = '适合老人用的智能手机' | |
| 366 | -docs = ['华为畅享60 6000mAh', '小米手环8'] | |
| 367 | - | |
| 368 | -results = model.rerank([(query, doc) for doc in docs]) | |
| 369 | - | |
| 370 | -for doc, score in zip(docs, results): | |
| 371 | - print(f'[{score[\"relevance_score\"]:.4f}] {doc}') | |
| 372 | -" | |
| 373 | -``` | |
| 374 | - | |
| 375 | ---- | |
| 376 | - | |
| 377 | -## 使用示例 | |
| 378 | - | |
| 379 | -### 电商搜索两阶段架构 | |
| 380 | - | |
| 381 | -``` | |
| 382 | -用户查询: "适合老人用的智能手机大屏幕长续航" | |
| 383 | - ↓ | |
| 384 | -┌──────────────────────────────────────┐ | |
| 385 | -│ 阶段1: 密集检索 (Dense Retrieval) │ | |
| 386 | -│ Qwen3-Embedding (4B) │ | |
| 387 | -│ 召回 Top-200 │ | |
| 388 | -└──────────────────────────────────────┘ | |
| 389 | - ↓ | |
| 390 | -┌──────────────────────────────────────┐ | |
| 391 | -│ 阶段2: 精排 (Reranking) │ | |
| 392 | -│ Qwen3-Reranker (4B) │ | |
| 393 | -│ Top-200 → Top-10 │ | |
| 394 | -└──────────────────────────────────────┘ | |
| 395 | - ↓ | |
| 396 | -最终结果 | |
| 397 | -``` | |
| 398 | - | |
| 399 | -### 运行演示 | |
| 400 | - | |
| 401 | -```bash | |
| 402 | -xinference-env | |
| 403 | - | |
| 404 | -# 完整演示 | |
| 405 | -python ecommerce_demo.py | |
| 406 | - | |
| 407 | -# 简单演示 | |
| 408 | -python ecommerce_demo.py --simple | |
| 409 | - | |
| 410 | -# 指定模型 UID | |
| 411 | -python ecommerce_demo.py --embedding qwen3-embedding-4b --reranker qwen3-reranker-4b | |
| 412 | -``` | |
| 413 | - | |
| 414 | -**输出示例**: | |
| 415 | -``` | |
| 416 | -====================================================================== | |
| 417 | - 🛒 电商搜索实战演示 - Qwen3 双塔架构 | |
| 418 | -====================================================================== | |
| 419 | - | |
| 420 | -🔗 连接到 Xinference 服务... | |
| 421 | -✅ 连接成功! | |
| 422 | - | |
| 423 | -⏳ 加载模型... | |
| 424 | -✅ 模型加载完成 | |
| 425 | - | |
| 426 | -====================================================================== | |
| 427 | -🔍 搜索查询: 适合老人用的智能手机大屏幕长续航 | |
| 428 | -====================================================================== | |
| 429 | - | |
| 430 | -📊 阶段1: 密集检索(召回 Top-200) | |
| 431 | ----------------------------------------------------------------------- | |
| 432 | -⏱️ 密集检索耗时: 0.23秒 | |
| 433 | -✅ 召回 200 个候选商品 | |
| 434 | - | |
| 435 | -🎯 阶段2: 精排(Cross-Encoder 打分) | |
| 436 | ----------------------------------------------------------------------- | |
| 437 | -⏱️ 精排耗时: 0.15秒 | |
| 438 | - | |
| 439 | -🎯 搜索结果 (Top 5): | |
| 440 | ----------------------------------------------------------------------- | |
| 441 | -1. [0.9876] 华为畅享60 6000mAh超长续航 护眼大屏 鸿蒙系统 | |
| 442 | -2. [0.9654] OPPO A1 5000mAh电池 简易模式适合长辈 | |
| 443 | -3. [0.9432] vivo Y78 5000mAh大电池 120Hz高刷屏 | |
| 444 | -4. [0.9210] 荣耀Play7T 6000mAh巨量电池 双卡双待 | |
| 445 | -5. [0.8976] 诺基亚C31 5050mAh电池 耐用三防 | |
| 446 | -``` | |
| 447 | - | |
| 448 | -### 代码示例 | |
| 449 | - | |
| 450 | -#### Python SDK | |
| 451 | - | |
| 452 | -```python | |
| 453 | -from xinference_client import RESTfulClient | |
| 454 | - | |
| 455 | -# 连接服务 | |
| 456 | -client = RESTfulClient("http://localhost:9997") | |
| 457 | - | |
| 458 | -# 获取模型 | |
| 459 | -embedding_model = client.get_model("qwen3-embedding-4b") | |
| 460 | -reranker_model = client.get_model("qwen3-reranker-4b") | |
| 461 | - | |
| 462 | -# 1. 生成 Embedding | |
| 463 | -query = "高端智能手机" | |
| 464 | -query_vector = embedding_model.create_embedding(query)["data"][0]["embedding"] | |
| 465 | - | |
| 466 | -# 2. 搜索商品(假设有预计算的向量) | |
| 467 | -# products_with_vectors = [...] | |
| 468 | - | |
| 469 | -# 3. 精排 | |
| 470 | -results = reranker_model.rerank([ | |
| 471 | - (query, "华为 Mate 60 Pro 卫星通信"), | |
| 472 | - (query, "iPhone 15 Pro Max 钛金属"), | |
| 473 | - (query, "小米14 Pro 徕卡光学") | |
| 474 | -]) | |
| 475 | - | |
| 476 | -for result in results: | |
| 477 | - print(f"[{result['relevance_score']:.4f}] {result['index']}") | |
| 478 | -``` | |
| 479 | - | |
| 480 | -#### REST API | |
| 481 | - | |
| 482 | -```bash | |
| 483 | -# Embedding API | |
| 484 | -curl -X POST http://localhost:9997/v1/embeddings \ | |
| 485 | - -H "Content-Type: application/json" \ | |
| 486 | - -d '{ | |
| 487 | - "model": "qwen3-embedding-4b", | |
| 488 | - "input": ["测试文本"] | |
| 489 | - }' | |
| 490 | - | |
| 491 | -# Reranker API | |
| 492 | -curl -X POST http://localhost:9997/v1/rerank \ | |
| 493 | - -H "Content-Type: application/json" \ | |
| 494 | - -d '{ | |
| 495 | - "model": "qwen3-reranker-4b", | |
| 496 | - "query": "适合老人用的手机", | |
| 497 | - "documents": [ | |
| 498 | - "华为畅享60 6000mAh", | |
| 499 | - "小米手环8" | |
| 500 | - ], | |
| 501 | - "top_n": 5 | |
| 502 | - }' | |
| 503 | -``` | |
| 504 | - | |
| 505 | ---- | |
| 506 | - | |
| 507 | -## 故障排除 | |
| 508 | - | |
| 509 | -### 问题 1: GPU 不可用 | |
| 510 | - | |
| 511 | -**症状**: | |
| 512 | -``` | |
| 513 | -Worker can only see these GPUs: []. | |
| 514 | -``` | |
| 515 | - | |
| 516 | -**解决方案**: | |
| 517 | - | |
| 518 | -```bash | |
| 519 | -# 1. 安装 NVIDIA Container Toolkit | |
| 520 | -bash install_nvidia_container_toolkit.sh | |
| 521 | - | |
| 522 | -# 2. 重启服务 | |
| 523 | -sudo docker stop xinference | |
| 524 | -sudo docker rm xinference | |
| 525 | -bash start.sh | |
| 526 | - | |
| 527 | -# 3. 验证 | |
| 528 | -nvidia-smi | |
| 529 | -``` | |
| 530 | - | |
| 531 | -### 问题 2: 模型下载失败 | |
| 532 | - | |
| 533 | -**症状**: | |
| 534 | -``` | |
| 535 | -Failed to download model | |
| 536 | -``` | |
| 537 | - | |
| 538 | -**解决方案**: | |
| 539 | - | |
| 540 | -```bash | |
| 541 | -# 使用国内镜像 | |
| 542 | -export HF_ENDPOINT=https://hf-mirror.com | |
| 543 | -python deploy_models.py | |
| 544 | -``` | |
| 545 | - | |
| 546 | -### 问题 3: 显存不足 | |
| 547 | - | |
| 548 | -**症状**: | |
| 549 | -``` | |
| 550 | -CUDA out of memory | |
| 551 | -``` | |
| 552 | - | |
| 553 | -**解决方案**: | |
| 554 | - | |
| 555 | -```bash | |
| 556 | -# 只部署一个模型 | |
| 557 | -python deploy_models.py --embedding-only | |
| 558 | - | |
| 559 | -# 或使用不同的 GPU | |
| 560 | -python deploy_models.py --gpu 1 | |
| 561 | -``` | |
| 562 | - | |
| 563 | -### 问题 4: 导入错误 | |
| 564 | - | |
| 565 | -**症状**: | |
| 566 | -``` | |
| 567 | -ModuleNotFoundError: No module named 'xinference' | |
| 568 | -``` | |
| 569 | - | |
| 570 | -**解决方案**: | |
| 571 | - | |
| 572 | -```bash | |
| 573 | -# 激活正确的环境 | |
| 574 | -xinference-env | |
| 575 | - | |
| 576 | -# 重新安装依赖 | |
| 577 | -pip install xinference-client --force-reinstall | |
| 578 | -``` | |
| 579 | - | |
| 580 | -### 问题 5: 服务连接失败 | |
| 581 | - | |
| 582 | -**解决方案**: | |
| 583 | - | |
| 584 | -```bash | |
| 585 | -# 检查服务状态 | |
| 586 | -curl http://localhost:9997/v1/models | |
| 587 | - | |
| 588 | -# 查看日志 | |
| 589 | -sudo docker logs -f xinference | |
| 590 | - | |
| 591 | -# 重启服务 | |
| 592 | -sudo docker restart xinference | |
| 593 | -``` | |
| 594 | - | |
| 595 | ---- | |
| 596 | - | |
| 597 | -## 进阶配置 | |
| 598 | - | |
| 599 | -### 性能优化 | |
| 600 | - | |
| 601 | -1. **离线向量预计算** | |
| 602 | -2. **向量数据库集成**(Milvus/Pinecone) | |
| 603 | -3. **Redis 缓存** | |
| 604 | -4. **批量处理** | |
| 605 | - | |
| 606 | -### 生产部署 | |
| 607 | - | |
| 608 | -``` | |
| 609 | - ┌─────────────┐ | |
| 610 | - │ 负载均衡 │ | |
| 611 | - └──────┬──────┘ | |
| 612 | - │ | |
| 613 | - ┌────────────┼────────────┐ | |
| 614 | - │ │ │ | |
| 615 | - ┌────▼─────┐ ┌────▼─────┐ ┌────▼─────┐ | |
| 616 | - │Xinference│ │Xinference│ │Xinference│ | |
| 617 | - │ Instance1│ │ Instance2│ │ Instance3│ | |
| 618 | - └────┬─────┘ └────┬─────┘ └────┬─────┘ | |
| 619 | - │ │ │ | |
| 620 | - └────────────┼────────────┘ | |
| 621 | - │ | |
| 622 | - ┌──────▼──────┐ | |
| 623 | - │ 向量数据库 │ | |
| 624 | - │ (Milvus) │ | |
| 625 | - └─────────────┘ | |
| 626 | -``` | |
| 627 | - | |
| 628 | -### 监控指标 | |
| 629 | - | |
| 630 | -```python | |
| 631 | -metrics = { | |
| 632 | - "latency_p50": "< 100ms", | |
| 633 | - "latency_p99": "< 300ms", | |
| 634 | - "qps": "100+", | |
| 635 | - "gpu_utilization": "< 80%", | |
| 636 | - "cache_hit_rate": "> 60%" | |
| 637 | -} | |
| 638 | -``` | |
| 639 | - | |
| 640 | ---- | |
| 641 | - | |
| 642 | -## 附录 | |
| 643 | - | |
| 644 | -### 常用命令 | |
| 645 | - | |
| 646 | -```bash | |
| 647 | -# 服务管理 | |
| 648 | -bash start.sh # 启动服务 | |
| 649 | -sudo docker logs -f xinference # 查看日志 | |
| 650 | -sudo docker restart xinference # 重启服务 | |
| 651 | -sudo docker stop xinference # 停止服务 | |
| 652 | - | |
| 653 | -# 环境管理 | |
| 654 | -xinference-env # 激活环境 | |
| 655 | -conda deactivate # 退出环境 | |
| 656 | - | |
| 657 | -# 模型管理 | |
| 658 | -python deploy_models.py # 部署模型 | |
| 659 | -python deploy_models.py --list # 列出模型 | |
| 660 | -curl http://localhost:9997/v1/models # API 查询 | |
| 661 | - | |
| 662 | -# 演示示例 | |
| 663 | -python ecommerce_demo.py # 运行演示 | |
| 664 | -python ecommerce_demo.py --simple # 简单示例 | |
| 665 | -``` | |
| 666 | - | |
| 667 | -### 目录结构 | |
| 668 | - | |
| 669 | -``` | |
| 670 | -xinference/ | |
| 671 | -├── README.md # 本文档 | |
| 672 | -├── QUICKSTART.md # 快速开始 | |
| 673 | -├── ENV_SETUP.md # 环境配置详解 | |
| 674 | -├── start.sh # 服务启动脚本 | |
| 675 | -├── install_nvidia_container_toolkit.sh # NVIDIA Toolkit 安装 | |
| 676 | -├── setup_env.sh # Conda 环境创建 | |
| 677 | -├── setup_alias.sh # Shell 别名配置 | |
| 678 | -├── activate.sh # 环境激活脚本 | |
| 679 | -├── deploy_models.py # 模型部署脚本 | |
| 680 | -├── ecommerce_demo.py # 电商搜索示例 | |
| 681 | -├── docker-compose.yml # Docker Compose 配置 | |
| 682 | -└── models/ # 模型存储目录 | |
| 683 | -``` | |
| 684 | - | |
| 685 | -### 参考资源 | |
| 686 | - | |
| 687 | -- [Xinference 官方文档](https://inference.readthedocs.io/) | |
| 688 | -- [Qwen3 模型介绍](https://github.com/QwenLM/Qwen3) | |
| 689 | -- [Docker 部署指南](https://docs.docker.com/) | |
| 690 | -- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/) | |
| 691 | - | |
| 692 | ---- | |
| 693 | - | |
| 694 | -## 总结 | |
| 695 | - | |
| 696 | -本文档提供了 Xinference 电商搜索系统的完整部署指南: | |
| 697 | - | |
| 698 | -✅ **环境准备**: Docker + Conda + GPU 驱动 | |
| 699 | -✅ **服务安装**: CPU/GPU 两种模式 | |
| 700 | -✅ **环境配置**: 自动化脚本 + 快捷命令 | |
| 701 | -✅ **模型部署**: Qwen3-Embedding 4B + Qwen3-Reranker 4B | |
| 702 | -✅ **使用示例**: 两阶段搜索架构 | |
| 703 | -✅ **故障排除**: 常见问题及解决方案 | |
| 704 | - | |
| 705 | -**下一步**: | |
| 706 | -1. 根据需求选择 CPU 或 GPU 模式 | |
| 707 | -2. 部署模型并运行演示 | |
| 708 | -3. 根据实际数据集调整参数 | |
| 709 | -4. 集成向量数据库进行优化 | |
| 710 | - | |
| 711 | ---- | |
| 712 | - | |
| 713 | -**文档版本**: v1.0 | |
| 714 | -**最后更新**: 2025-12-25 | |
| 715 | -**维护者**: Search Engine Team |
third-party/xinference/STATUS.md deleted
| ... | ... | @@ -1,158 +0,0 @@ |
| 1 | -# 当前状态总结 | |
| 2 | - | |
| 3 | -## ✅ 已完成 | |
| 4 | - | |
| 5 | -1. **Xinference 服务**:正常运行(CPU 模式) | |
| 6 | -2. **API 测试**:正常工作(http://localhost:9997) | |
| 7 | -3. **Python 环境**:已配置(xinference conda 环境) | |
| 8 | -4. **代码修复**:所有脚本已更新 | |
| 9 | -5. **GPU 设备**:硬件正常(Tesla T4) | |
| 10 | -6. **NVIDIA 驱动**:已安装(570.86.10) | |
| 11 | - | |
| 12 | -## ❌ 当前问题 | |
| 13 | - | |
| 14 | -**GPU 检测失败**:Xinference worker 无法检测到 GPU | |
| 15 | - | |
| 16 | -``` | |
| 17 | -Worker cannot use the GPUs with these indexes: [0] | |
| 18 | -Worker can only see these GPUs: [] | |
| 19 | -``` | |
| 20 | - | |
| 21 | -**根本原因**:缺少 `nvidia-container-toolkit`,Docker 容器无法正确暴露 GPU 给 Xinference | |
| 22 | - | |
| 23 | -## 🔧 解决方案(三选一) | |
| 24 | - | |
| 25 | -### 方案 1:安装 nvidia-container-toolkit(推荐,但需要网络) | |
| 26 | - | |
| 27 | -**当有网络时执行**: | |
| 28 | - | |
| 29 | -```bash | |
| 30 | -# 下载并安装 RPM 包(需要访问 GitHub 或 NVIDIA 镜像) | |
| 31 | -# 方法 A: 从 NVIDIA 仓库 | |
| 32 | -distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed 's/\.//') | |
| 33 | -curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | \ | |
| 34 | - sudo tee /etc/yum.repos.d/nvidia-docker.repo | |
| 35 | - | |
| 36 | -sudo yum install -y nvidia-container-toolkit | |
| 37 | -sudo nvidia-ctk runtime configure --runtime=docker | |
| 38 | -sudo systemctl restart docker | |
| 39 | - | |
| 40 | -# 重启 Xinference | |
| 41 | -sudo docker stop xinference | |
| 42 | -sudo docker rm xinference | |
| 43 | -bash start.sh # 会自动使用 GPU | |
| 44 | - | |
| 45 | -# 部署模型 | |
| 46 | -python deploy_models.py | |
| 47 | -``` | |
| 48 | - | |
| 49 | -### 方案 2:使用 CPU 模式(当前可用,但速度慢) | |
| 50 | - | |
| 51 | -**立即可用**: | |
| 52 | - | |
| 53 | -```bash | |
| 54 | -# 使用 start.sh(CPU 模式) | |
| 55 | -bash start.sh | |
| 56 | - | |
| 57 | -# 部署较小的模型(修改 deploy_models.py 中的模型大小) | |
| 58 | -# 将 model_size_in_billions=4 改为 model_size_in_billions=0 | |
| 59 | -python deploy_models.py | |
| 60 | - | |
| 61 | -# 或直接运行演示(使用默认小模型) | |
| 62 | -python ecommerce_demo.py | |
| 63 | -``` | |
| 64 | - | |
| 65 | -**优点**: | |
| 66 | -- ✅ 立即可用 | |
| 67 | -- ✅ 无需额外配置 | |
| 68 | -- ✅ 功能完整 | |
| 69 | - | |
| 70 | -**缺点**: | |
| 71 | -- ❌ 速度慢(10-50倍) | |
| 72 | -- ❌ 仅适合测试和演示 | |
| 73 | - | |
| 74 | -### 方案 3:等待网络恢复后安装 | |
| 75 | - | |
| 76 | -**保存以下命令供后续使用**: | |
| 77 | - | |
| 78 | -```bash | |
| 79 | -# NVIDIA Container Toolkit 安装命令 | |
| 80 | -# 保存到文件: ~/install_nvidia_toolkit_when_network_available.sh | |
| 81 | - | |
| 82 | -#!/bin/bash | |
| 83 | -# AliLinux/RHEL8 安装脚本 | |
| 84 | - | |
| 85 | -# 1. 添加仓库 | |
| 86 | -curl -s -L https://nvidia.github.io/nvidia-docker/rhel8/nvidia-docker.repo | \ | |
| 87 | - sudo tee /etc/yum.repos.d/nvidia-docker.repo | |
| 88 | - | |
| 89 | -# 2. 安装 | |
| 90 | -sudo yum install -y nvidia-container-toolkit | |
| 91 | - | |
| 92 | -# 3. 配置 Docker | |
| 93 | -sudo nvidia-ctk runtime configure --runtime=docker | |
| 94 | - | |
| 95 | -# 4. 重启 Docker | |
| 96 | -sudo systemctl restart docker | |
| 97 | - | |
| 98 | -# 5. 验证 | |
| 99 | -sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi | |
| 100 | -``` | |
| 101 | - | |
| 102 | -## 📊 当前系统信息 | |
| 103 | - | |
| 104 | -```bash | |
| 105 | -# 硬件 | |
| 106 | -GPU: Tesla T4 16GB | |
| 107 | -驱动: 570.86.10 | |
| 108 | - | |
| 109 | -# 软件 | |
| 110 | -Docker: 26.1.3 | |
| 111 | -Python: 3.10.19 (xinference 环境) | |
| 112 | -Xinference: 运行中(端口 9997/9998) | |
| 113 | - | |
| 114 | -# 服务状态 | |
| 115 | -API: http://localhost:9997 ✓ | |
| 116 | -Dashboard: http://localhost:9998 | |
| 117 | -``` | |
| 118 | - | |
| 119 | -## 🎯 推荐操作 | |
| 120 | - | |
| 121 | -**立即可做**: | |
| 122 | -```bash | |
| 123 | -# 使用 CPU 模式测试功能 | |
| 124 | -# 1. 确保服务运行 | |
| 125 | -curl http://localhost:9997/v1/models | |
| 126 | - | |
| 127 | -# 2. 运行简单演示 | |
| 128 | -cd /data/tw/SearchEngine/third-party/xinference | |
| 129 | -/home/tw/miniconda3/envs/xinference/bin/python ecommerce_demo.py --simple | |
| 130 | -``` | |
| 131 | - | |
| 132 | -**后续优化**: | |
| 133 | -1. 等待网络恢复,安装 nvidia-container-toolkit | |
| 134 | -2. 重启服务使用 GPU | |
| 135 | -3. 部署 Qwen3 4B 模型 | |
| 136 | - | |
| 137 | -## 📝 已创建的文件 | |
| 138 | - | |
| 139 | -- `start_gpu_manual.sh` - 手动 GPU 启动脚本(需要 nvidia-container-toolkit) | |
| 140 | -- `install_nvidia_container_toolkit.sh` - 自动安装脚本(不支持 AliLinux) | |
| 141 | -- `README.md` - 完整文档 | |
| 142 | -- `QUICKSTART.md` - 快速开始 | |
| 143 | -- `ENV_SETUP.md` - 环境配置 | |
| 144 | - | |
| 145 | -## 💡 技术说明 | |
| 146 | - | |
| 147 | -**为什么需要 nvidia-container-toolkit**? | |
| 148 | - | |
| 149 | -Docker 容器需要通过 NVIDIA Container Runtime 来访问 GPU,这包括: | |
| 150 | -1. NVIDIA 驱动库的映射 | |
| 151 | -2. GPU 设备的暴露 | |
| 152 | -3. CUDA 环境的配置 | |
| 153 | - | |
| 154 | -手动映射设备和库(我们尝试的方法)对于简单的容器可以工作,但 Xinference 使用 vLLM 引擎,它需要完整的 CUDA 运行时环境。 | |
| 155 | - | |
| 156 | -**为什么 CPU 模式可以工作**? | |
| 157 | - | |
| 158 | -Xinference 支持降级到 CPU 模式,通过设置环境变量或自动检测。虽然速度慢,但功能完整。 |
third-party/xinference/activate.sh
| 1 | 1 | #!/bin/bash |
| 2 | 2 | |
| 3 | -# 快速切换到 Xinference 环境的便捷脚本 | |
| 4 | - | |
| 5 | -ENV_NAME="xinference" | |
| 6 | - | |
| 7 | -# 初始化 conda | |
| 8 | -eval "$(conda shell.bash hook)" | |
| 9 | - | |
| 10 | -# 激活环境 | |
| 11 | -conda activate $ENV_NAME | |
| 12 | - | |
| 13 | -echo "✅ 已切换到 Xinference 环境" | |
| 14 | -echo "" | |
| 15 | -echo "可用命令:" | |
| 16 | -echo " - python ecommerce_demo.py # 运行电商搜索演示" | |
| 17 | -echo " - python deploy_models.py # 部署模型" | |
| 18 | -echo " - conda deactivate # 退出环境" | |
| 19 | -echo "" | |
| 20 | - | |
| 21 | -# 保持 shell 打开 | |
| 22 | -exec $SHELL | |
| 3 | +source /home/tw/miniconda3/etc/profile.d/conda.sh | |
| 4 | +conda activate xinference | ... | ... |
third-party/xinference/api_examples.sh deleted
| ... | ... | @@ -1,356 +0,0 @@ |
| 1 | -#!/bin/bash | |
| 2 | - | |
| 3 | -# Xinference REST API 调用示例 | |
| 4 | -# 演示如何通过 HTTP API 调用 Qwen3-Embedding 和 Qwen3-Reranker | |
| 5 | - | |
| 6 | -# 设置服务地址 | |
| 7 | -XINFERENCE_HOST="http://localhost:9997" | |
| 8 | -MODEL_EMBEDDING="qwen3-embedding" | |
| 9 | -MODEL_RERANKER="qwen3-reranker" | |
| 10 | - | |
| 11 | -echo "=========================================" | |
| 12 | -echo " Xinference REST API 调用示例" | |
| 13 | -echo "=========================================" | |
| 14 | -echo "" | |
| 15 | - | |
| 16 | -# 颜色定义 | |
| 17 | -GREEN='\033[0;32m' | |
| 18 | -BLUE='\033[0;34m' | |
| 19 | -YELLOW='\033[1;33m' | |
| 20 | -NC='\033[0m' # No Color | |
| 21 | - | |
| 22 | -print_section() { | |
| 23 | - echo "" | |
| 24 | - echo -e "${BLUE}=========================================${NC}" | |
| 25 | - echo -e "${BLUE} $1${NC}" | |
| 26 | - echo -e "${BLUE}=========================================${NC}" | |
| 27 | - echo "" | |
| 28 | -} | |
| 29 | - | |
| 30 | -print_info() { | |
| 31 | - echo -e "${GREEN}➜${NC} $1" | |
| 32 | -} | |
| 33 | - | |
| 34 | -# ============================================ | |
| 35 | -# 1. 查看服务状态 | |
| 36 | -# ============================================ | |
| 37 | -print_section "1. 查看服务状态和已部署模型" | |
| 38 | - | |
| 39 | -print_info "查看所有已部署的模型:" | |
| 40 | -curl -s "${XINFERENCE_HOST}/v1/models" | python3 -m json.tool | |
| 41 | - | |
| 42 | -echo "" | |
| 43 | -print_info "查看服务健康状态:" | |
| 44 | -curl -s "${XINFERENCE_HOST}/v1/models" > /dev/null && echo "✅ 服务健康" || echo "❌ 服务异常" | |
| 45 | - | |
| 46 | -# ============================================ | |
| 47 | -# 2. Embedding API 调用 | |
| 48 | -# ============================================ | |
| 49 | -print_section "2. Qwen3-Embedding API 调用" | |
| 50 | - | |
| 51 | -print_info "单个文本 embedding 生成:" | |
| 52 | -curl -X POST "${XINFERENCE_HOST}/v1/embeddings" \ | |
| 53 | - -H "Content-Type: application/json" \ | |
| 54 | - -d '{ | |
| 55 | - "model": "'${MODEL_EMBEDDING}'", | |
| 56 | - "input": ["适合老人用的智能手机大屏幕长续航"] | |
| 57 | - }' | python3 -m json.tool | |
| 58 | - | |
| 59 | -echo "" | |
| 60 | -print_info "批量文本 embedding 生成:" | |
| 61 | -curl -X POST "${XINFERENCE_HOST}/v1/embeddings" \ | |
| 62 | - -H "Content-Type: application/json" \ | |
| 63 | - -d '{ | |
| 64 | - "model": "'${MODEL_EMBEDDING}'", | |
| 65 | - "input": [ | |
| 66 | - "红米Note12 5000mAh大电量", | |
| 67 | - "华为畅享60 6000mAh超长续航", | |
| 68 | - "小米手环8 智能运动监测" | |
| 69 | - ] | |
| 70 | - }' | python3 -m json.tool | |
| 71 | - | |
| 72 | -# ============================================ | |
| 73 | -# 3. Reranker API 调用 | |
| 74 | -# ============================================ | |
| 75 | -print_section "3. Qwen3-Reranker API 调用" | |
| 76 | - | |
| 77 | -print_info "精排候选商品:" | |
| 78 | - | |
| 79 | -curl -X POST "${XINFERENCE_HOST}/v1/rerank" \ | |
| 80 | - -H "Content-Type: application/json" \ | |
| 81 | - -d '{ | |
| 82 | - "model": "'${MODEL_RERANKER}'", | |
| 83 | - "query": "适合老人用的智能手机大屏幕长续航", | |
| 84 | - "documents": [ | |
| 85 | - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", | |
| 86 | - "iPhone 15 Pro Max 专业摄影旗舰", | |
| 87 | - "华为畅享60 6000mAh超长续航 护眼大屏", | |
| 88 | - "OPPO A1 5000mAh电池 简易模式适合长辈", | |
| 89 | - "小米手环8 智能运动监测" | |
| 90 | - ], | |
| 91 | - "top_n": 5 | |
| 92 | - }' | python3 -m json.tool | |
| 93 | - | |
| 94 | -# ============================================ | |
| 95 | -# 4. 电商搜索实战:两阶段检索 | |
| 96 | -# ============================================ | |
| 97 | -print_section "4. 电商搜索实战:完整两阶段检索流程" | |
| 98 | - | |
| 99 | -# 阶段1: 密集检索 | |
| 100 | -print_info "阶段1: 为用户 query 生成向量" | |
| 101 | -echo "" | |
| 102 | -echo "Query: 适合老人用的智能手机大屏幕长续航" | |
| 103 | - | |
| 104 | -QUERY_VECTOR=$(curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \ | |
| 105 | - -H "Content-Type: application/json" \ | |
| 106 | - -d '{ | |
| 107 | - "model": "'${MODEL_EMBEDDING}'", | |
| 108 | - "input": ["适合老人用的智能手机大屏幕长续航"] | |
| 109 | - }' | python3 -c "import sys, json; print(json.dumps(json.load(sys.stdin)['data'][0]['embedding'])))") | |
| 110 | - | |
| 111 | -echo "Query 向量维度: $(echo $QUERY_VECTOR | python3 -c "import sys, json; print(len(json.load(sys.stdin)))")" | |
| 112 | - | |
| 113 | -echo "" | |
| 114 | -print_info "为候选商品生成向量(在实际应用中,这些向量应预计算并存储)" | |
| 115 | - | |
| 116 | -# 为简化演示,这里只显示部分候选商品的向量生成 | |
| 117 | -CANDIDATES=( | |
| 118 | - "红米Note12 5000mAh大电量" | |
| 119 | - "华为畅享60 6000mAh超长续航" | |
| 120 | - "小米手环8" | |
| 121 | -) | |
| 122 | - | |
| 123 | -for candidate in "${CANDIDATES[@]}"; do | |
| 124 | - echo "" | |
| 125 | - echo "商品: $candidate" | |
| 126 | - curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \ | |
| 127 | - -H "Content-Type: application/json" \ | |
| 128 | - -d '{ | |
| 129 | - "model": "'${MODEL_EMBEDDING}'", | |
| 130 | - "input": ["'${candidate}'"] | |
| 131 | - }' | python3 -c "import sys, json; data=json.load(sys.stdin); print(f\" 向量维度: {len(data['data'][0]['embedding'])}\")" | |
| 132 | -done | |
| 133 | - | |
| 134 | -# 阶段2: 精排 | |
| 135 | -echo "" | |
| 136 | -print_info "阶段2: 使用 Reranker 对召回结果进行精排" | |
| 137 | - | |
| 138 | -curl -X POST "${XINFERENCE_HOST}/v1/rerank" \ | |
| 139 | - -H "Content-Type: application/json" \ | |
| 140 | - -d '{ | |
| 141 | - "model": "'${MODEL_RERANKER}'", | |
| 142 | - "query": "适合老人用的智能手机大屏幕长续航", | |
| 143 | - "documents": [ | |
| 144 | - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", | |
| 145 | - "华为畅享60 6000mAh超长续航 护眼大屏", | |
| 146 | - "OPPO A1 5000mAh电池 简易模式适合长辈", | |
| 147 | - "iPhone 15 Pro Max 专业摄影旗舰", | |
| 148 | - "小米手环8 智能运动监测" | |
| 149 | - ], | |
| 150 | - "top_n": 3 | |
| 151 | - }' | python3 -m json.tool | |
| 152 | - | |
| 153 | -# ============================================ | |
| 154 | -# 5. 高级用法:批量处理 | |
| 155 | -# ============================================ | |
| 156 | -print_section "5. 批量 Embedding 生成(离线任务)" | |
| 157 | - | |
| 158 | -print_info "为大量商品生成 embedding(模拟离线任务)" | |
| 159 | -echo "注意: 实际生产环境中,批量大小建议为 100-1000" | |
| 160 | - | |
| 161 | -# 创建批量输入文件 | |
| 162 | -cat > /tmp/batch_input.json <<EOF | |
| 163 | -{ | |
| 164 | - "model": "${MODEL_EMBEDDING}", | |
| 165 | - "input": [ | |
| 166 | - "商品1: 红米Note12 5000mAh大电量 6.67英寸大屏", | |
| 167 | - "商品2: iPhone 15 Pro Max 专业摄影旗舰", | |
| 168 | - "商品3: 华为畅享60 6000mAh超长续航 护眼大屏", | |
| 169 | - "商品4: OPPO A1 5000mAh电池 简易模式适合长辈", | |
| 170 | - "商品5: 小米手环8 智能运动监测", | |
| 171 | - "商品6: vivo Y78 5000mAh大电池 120Hz高刷屏", | |
| 172 | - "商品7: 三星Galaxy A54 5000mAh 防水防尘", | |
| 173 | - "商品8: 荣耀Play7T 6000mAh巨量电池", | |
| 174 | - "商品9: 真我11 Pro 2亿像素 100W快充", | |
| 175 | - "商品10: 诺基亚C31 5050mAh电池 耐用三防" | |
| 176 | - ] | |
| 177 | -} | |
| 178 | -EOF | |
| 179 | - | |
| 180 | -echo "" | |
| 181 | -echo "批量生成 10 个商品的 embedding..." | |
| 182 | -curl -X POST "${XINFERENCE_HOST}/v1/embeddings" \ | |
| 183 | - -H "Content-Type: application/json" \ | |
| 184 | - -d @/tmp/batch_input.json | python3 -c "import sys, json; data=json.load(sys.stdin); print(f'✅ 成功生成 {len(data[\"data\"])} 个向量'); print(f'向量维度: {len(data[\"data\"][0][\"embedding\"])}')" | |
| 185 | - | |
| 186 | -# ============================================ | |
| 187 | -# 6. Python 调用示例 | |
| 188 | -# ============================================ | |
| 189 | -print_section "6. Python 调用示例" | |
| 190 | - | |
| 191 | -cat << 'EOF' | |
| 192 | - | |
| 193 | -# Python 调用示例代码: | |
| 194 | - | |
| 195 | -import requests | |
| 196 | -import numpy as np | |
| 197 | - | |
| 198 | -XINFERENCE_HOST = "http://localhost:9997" | |
| 199 | - | |
| 200 | -# Embedding 调用 | |
| 201 | -def get_embedding(text: str) -> list: | |
| 202 | - """获取文本的 embedding 向量""" | |
| 203 | - response = requests.post( | |
| 204 | - f"{XINFERENCE_HOST}/v1/embeddings", | |
| 205 | - json={ | |
| 206 | - "model": "qwen3-embedding", | |
| 207 | - "input": [text] | |
| 208 | - } | |
| 209 | - ) | |
| 210 | - return response.json()["data"][0]["embedding"] | |
| 211 | - | |
| 212 | -# 批量 Embedding | |
| 213 | -def get_embeddings(texts: list) -> list: | |
| 214 | - """批量获取 embedding 向量""" | |
| 215 | - response = requests.post( | |
| 216 | - f"{XINFERENCE_HOST}/v1/embeddings", | |
| 217 | - json={ | |
| 218 | - "model": "qwen3-embedding", | |
| 219 | - "input": texts | |
| 220 | - } | |
| 221 | - ) | |
| 222 | - return [item["embedding"] for item in response.json()["data"]] | |
| 223 | - | |
| 224 | -# Reranker 调用 | |
| 225 | -def rerank(query: str, documents: list, top_n: int = 10) -> list: | |
| 226 | - """使用 reranker 对文档排序""" | |
| 227 | - response = requests.post( | |
| 228 | - f"{XINFERENCE_HOST}/v1/rerank", | |
| 229 | - json={ | |
| 230 | - "model": "qwen3-reranker", | |
| 231 | - "query": query, | |
| 232 | - "documents": documents, | |
| 233 | - "top_n": top_n | |
| 234 | - } | |
| 235 | - ) | |
| 236 | - return response.json()["results"] | |
| 237 | - | |
| 238 | -# 计算余弦相似度 | |
| 239 | -def cosine_similarity(vec1: list, vec2: list) -> float: | |
| 240 | - """计算两个向量的余弦相似度""" | |
| 241 | - v1 = np.array(vec1) | |
| 242 | - v2 = np.array(vec2) | |
| 243 | - return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))) | |
| 244 | - | |
| 245 | -# 完整搜索流程 | |
| 246 | -def search(query: str, products: list) -> list: | |
| 247 | - """两阶段搜索""" | |
| 248 | - # 阶段1: 密集检索(简化示例) | |
| 249 | - query_vec = get_embedding(query) | |
| 250 | - similarities = [] | |
| 251 | - for product in products: | |
| 252 | - prod_vec = get_embedding(product) | |
| 253 | - sim = cosine_similarity(query_vec, prod_vec) | |
| 254 | - similarities.append((product, sim)) | |
| 255 | - | |
| 256 | - # 取 Top-200 | |
| 257 | - similarities.sort(key=lambda x: x[1], reverse=True) | |
| 258 | - top_200 = [p for p, s in similarities[:200]] | |
| 259 | - | |
| 260 | - # 阶段2: 精排 | |
| 261 | - reranked = rerank(query, top_200, top_n=10) | |
| 262 | - return reranked | |
| 263 | - | |
| 264 | -# 使用示例 | |
| 265 | -if __name__ == "__main__": | |
| 266 | - query = "适合老人用的智能手机大屏幕长续航" | |
| 267 | - products = [ | |
| 268 | - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", | |
| 269 | - "华为畅享60 6000mAh超长续航 护眼大屏", | |
| 270 | - "小米手环8 智能运动监测" | |
| 271 | - ] | |
| 272 | - | |
| 273 | - results = search(query, products) | |
| 274 | - for r in results: | |
| 275 | - print(f"[{r['relevance_score']:.4f}] {r['document']}") | |
| 276 | - | |
| 277 | -EOF | |
| 278 | - | |
| 279 | -# ============================================ | |
| 280 | -# 7. 性能测试 | |
| 281 | -# ============================================ | |
| 282 | -print_section "7. 性能测试" | |
| 283 | - | |
| 284 | -print_info "测试 Embedding API 响应时间:" | |
| 285 | -echo "" | |
| 286 | - | |
| 287 | -for i in {1..5}; do | |
| 288 | - START=$(date +%s%N) | |
| 289 | - curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \ | |
| 290 | - -H "Content-Type: application/json" \ | |
| 291 | - -d '{ | |
| 292 | - "model": "'${MODEL_EMBEDDING}'", | |
| 293 | - "input": ["测试文本"] | |
| 294 | - }' > /dev/null | |
| 295 | - END=$(date +%s%N) | |
| 296 | - ELAPSED=$((($END - $START) / 1000000)) | |
| 297 | - echo " 请求 $i: ${ELAPSED}ms" | |
| 298 | -done | |
| 299 | - | |
| 300 | -echo "" | |
| 301 | -print_info "测试 Reranker API 响应时间:" | |
| 302 | - | |
| 303 | -for i in {1..5}; do | |
| 304 | - START=$(date +%s%N) | |
| 305 | - curl -s -X POST "${XINFERENCE_HOST}/v1/rerank" \ | |
| 306 | - -H "Content-Type: application/json" \ | |
| 307 | - -d '{ | |
| 308 | - "model": "'${MODEL_RERANKER}'", | |
| 309 | - "query": "测试查询", | |
| 310 | - "documents": ["文档1", "文档2", "文档3"], | |
| 311 | - "top_n": 3 | |
| 312 | - }' > /dev/null | |
| 313 | - END=$(date +%s%N) | |
| 314 | - ELAPSED=$((($END - $START) / 1000000)) | |
| 315 | - echo " 请求 $i: ${ELAPSED}ms" | |
| 316 | -done | |
| 317 | - | |
| 318 | -# ============================================ | |
| 319 | -# 8. 常见问题排查 | |
| 320 | -# ============================================ | |
| 321 | -print_section "8. 常见问题排查" | |
| 322 | - | |
| 323 | -print_info "检查服务是否运行:" | |
| 324 | -curl -s "${XINFERENCE_HOST}/v1/models" > /dev/null && echo "✅ 服务正常" || echo "❌ 服务未启动,请运行: ./start.sh" | |
| 325 | - | |
| 326 | -echo "" | |
| 327 | -print_info "检查模型是否部署:" | |
| 328 | -MODELS=$(curl -s "${XINFERENCE_HOST}/v1/models") | |
| 329 | -echo "$MODELS" | python3 -c " | |
| 330 | -import sys, json | |
| 331 | -try: | |
| 332 | - models = json.load(sys.stdin) | |
| 333 | - if models: | |
| 334 | - print('✅ 已部署模型:') | |
| 335 | - for m in models: | |
| 336 | - print(f' - {m.get(\"model_type\")}: {m.get(\"model_uid\")}') | |
| 337 | - else: | |
| 338 | - print('❌ 没有已部署的模型,请运行: python deploy_models.py') | |
| 339 | -except: | |
| 340 | - print('❌ 无法获取模型信息') | |
| 341 | -" | |
| 342 | - | |
| 343 | -echo "" | |
| 344 | -print_info "查看 GPU 使用情况:" | |
| 345 | -if command -v nvidia-smi &> /dev/null; then | |
| 346 | - nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader | while read line; do | |
| 347 | - echo " GPU $line" | |
| 348 | - done | |
| 349 | -else | |
| 350 | - echo " ⚠️ nvidia-smi 未安装,无法查看 GPU 信息" | |
| 351 | -fi | |
| 352 | - | |
| 353 | -echo "" | |
| 354 | -echo "=========================================" | |
| 355 | -echo " ✅ API 调用示例演示完成" | |
| 356 | -echo "=========================================" |
third-party/xinference/deploy_models.py deleted
| ... | ... | @@ -1,192 +0,0 @@ |
| 1 | -#!/usr/bin/env python3 | |
| 2 | -""" | |
| 3 | -Qwen3 模型部署脚本 | |
| 4 | -自动部署 Qwen3-Embedding 和 Qwen3-Reranker 模型 | |
| 5 | -""" | |
| 6 | - | |
| 7 | -import time | |
| 8 | -import sys | |
| 9 | -from xinference_client import RESTfulClient as Client | |
| 10 | - | |
| 11 | - | |
| 12 | -def print_section(title): | |
| 13 | - """打印分节标题""" | |
| 14 | - print("\n" + "="*60) | |
| 15 | - print(f" {title}") | |
| 16 | - print("="*60 + "\n") | |
| 17 | - | |
| 18 | - | |
| 19 | -def deploy_qwen3_models(host="http://localhost:9997", gpu_idx=[0]): | |
| 20 | - """ | |
| 21 | - 部署 Qwen3 模型 | |
| 22 | - | |
| 23 | - Args: | |
| 24 | - host: Xinference 服务地址 | |
| 25 | - gpu_idx: GPU 索引 | |
| 26 | - """ | |
| 27 | - print_section("Qwen3 模型自动部署") | |
| 28 | - | |
| 29 | - # 连接到 Xinference 服务 | |
| 30 | - print(f"🔗 连接到 Xinference 服务: {host}") | |
| 31 | - try: | |
| 32 | - client = Client(host) | |
| 33 | - print("✅ 连接成功!\n") | |
| 34 | - except Exception as e: | |
| 35 | - print(f"❌ 连接失败: {e}") | |
| 36 | - print("\n💡 请确保 Xinference 服务已启动:") | |
| 37 | - print(" ./start.sh") | |
| 38 | - sys.exit(1) | |
| 39 | - | |
| 40 | - # 部署 Qwen3-Embedding 模型 | |
| 41 | - print_section("部署 Qwen3-Embedding 模型 (4B)") | |
| 42 | - print("⏳ 正在部署,首次运行需要下载模型,请耐心等待...") | |
| 43 | - print(" 模型大小: ~8GB") | |
| 44 | - print(" 上下文长度: 8192 tokens") | |
| 45 | - print(" 向量维度: 1024\n") | |
| 46 | - | |
| 47 | - try: | |
| 48 | - embedding_uid = client.launch_model( | |
| 49 | - model_name="qwen3-embedding", | |
| 50 | - model_size_in_billions=4, | |
| 51 | - model_type="embedding", | |
| 52 | - engine="vllm", | |
| 53 | - gpu_idx=gpu_idx, | |
| 54 | - ) | |
| 55 | - print(f"✅ Qwen3-Embedding 部署成功!") | |
| 56 | - print(f" 模型 UID: {embedding_uid}\n") | |
| 57 | - | |
| 58 | - # 等待模型加载完成 | |
| 59 | - print("⏳ 等待模型完全加载...") | |
| 60 | - time.sleep(5) | |
| 61 | - | |
| 62 | - # 测试模型 | |
| 63 | - embedding_model = client.get_model(embedding_uid) | |
| 64 | - test_result = embedding_model.create_embedding("测试文本") | |
| 65 | - if test_result and "data" in test_result: | |
| 66 | - vector_dim = len(test_result["data"][0]["embedding"]) | |
| 67 | - print(f"✅ 模型测试成功!向量维度: {vector_dim}\n") | |
| 68 | - else: | |
| 69 | - print("⚠️ 模型部署成功但测试失败\n") | |
| 70 | - | |
| 71 | - except Exception as e: | |
| 72 | - print(f"❌ Qwen3-Embedding 部署失败: {e}\n") | |
| 73 | - return None | |
| 74 | - | |
| 75 | - # 部署 Qwen3-Reranker 模型 | |
| 76 | - print_section("部署 Qwen3-Reranker 模型 (4B)") | |
| 77 | - print("⏳ 正在部署,首次运行需要下载模型,请耐心等待...") | |
| 78 | - print(" 模型大小: ~8GB") | |
| 79 | - print(" 架构: Cross-Encoder\n") | |
| 80 | - | |
| 81 | - try: | |
| 82 | - reranker_uid = client.launch_model( | |
| 83 | - model_name="qwen3-reranker", | |
| 84 | - model_size_in_billions=4, | |
| 85 | - model_type="rerank", | |
| 86 | - engine="vllm", | |
| 87 | - gpu_idx=gpu_idx, | |
| 88 | - ) | |
| 89 | - print(f"✅ Qwen3-Reranker 部署成功!") | |
| 90 | - print(f" 模型 UID: {reranker_uid}\n") | |
| 91 | - | |
| 92 | - # 等待模型加载完成 | |
| 93 | - print("⏳ 等待模型完全加载...") | |
| 94 | - time.sleep(5) | |
| 95 | - | |
| 96 | - # 测试模型 | |
| 97 | - reranker_model = client.get_model(reranker_uid) | |
| 98 | - test_result = reranker_model.rerank( | |
| 99 | - [("测试查询", "测试文档")] | |
| 100 | - ) | |
| 101 | - if test_result and len(test_result) > 0: | |
| 102 | - print(f"✅ 模型测试成功!\n") | |
| 103 | - else: | |
| 104 | - print("⚠️ 模型部署成功但测试失败\n") | |
| 105 | - | |
| 106 | - except Exception as e: | |
| 107 | - print(f"❌ Qwen3-Reranker 部署失败: {e}") | |
| 108 | - print("💡 可能的原因: GPU 显存不足,请尝试:") | |
| 109 | - print(" 1. 使用不同的 GPU 索引: python deploy_models.py --gpu 1") | |
| 110 | - print(" 2. 只部署 embedding 模型: python deploy_models.py --embedding-only") | |
| 111 | - return None | |
| 112 | - | |
| 113 | - # 显示部署摘要 | |
| 114 | - print_section("🎉 模型部署完成!") | |
| 115 | - print(f"✅ Qwen3-Embedding UID: {embedding_uid}") | |
| 116 | - print(f"✅ Qwen3-Reranker UID: {reranker_uid}") | |
| 117 | - print("\n📝 下一步:") | |
| 118 | - print(" 1. 运行电商搜索示例: python ecommerce_demo.py") | |
| 119 | - print(" 2. 查看 API 调用示例: cat api_examples.sh") | |
| 120 | - print(" 3. 查看 Dashboard: http://localhost:9998") | |
| 121 | - print(" 4. 查看所有模型: curl http://localhost:9997/v1/models") | |
| 122 | - print("") | |
| 123 | - | |
| 124 | - return { | |
| 125 | - "embedding_uid": embedding_uid, | |
| 126 | - "reranker_uid": reranker_uid | |
| 127 | - } | |
| 128 | - | |
| 129 | - | |
| 130 | -def list_models(host="http://localhost:9997"): | |
| 131 | - """列出所有已部署的模型""" | |
| 132 | - print_section("已部署模型列表") | |
| 133 | - try: | |
| 134 | - client = Client(host) | |
| 135 | - models = client.list_models() | |
| 136 | - | |
| 137 | - if not models: | |
| 138 | - print("📭 当前没有已部署的模型") | |
| 139 | - else: | |
| 140 | - for model in models: | |
| 141 | - model_type = model.get("model_type", "unknown") | |
| 142 | - model_uid = model.get("model_uid", "unknown") | |
| 143 | - print(f"📦 {model_type.upper()}: {model_uid}") | |
| 144 | - print() | |
| 145 | - except Exception as e: | |
| 146 | - print(f"❌ 获取模型列表失败: {e}\n") | |
| 147 | - | |
| 148 | - | |
| 149 | -if __name__ == "__main__": | |
| 150 | - import argparse | |
| 151 | - | |
| 152 | - parser = argparse.ArgumentParser(description="部署 Qwen3 模型到 Xinference") | |
| 153 | - parser.add_argument("--host", default="http://localhost:9997", help="Xinference 服务地址") | |
| 154 | - parser.add_argument("--gpu", default="0", help="GPU 索引(逗号分隔,如: 0 或 0,1)") | |
| 155 | - parser.add_argument("--embedding-only", action="store_true", help="仅部署 embedding 模型") | |
| 156 | - parser.add_argument("--reranker-only", action="store_true", help="仅部署 reranker 模型") | |
| 157 | - parser.add_argument("--list", action="store_true", help="列出已部署的模型") | |
| 158 | - | |
| 159 | - args = parser.parse_args() | |
| 160 | - | |
| 161 | - # 将 GPU 字符串转换为列表 | |
| 162 | - gpu_idx = [int(x.strip()) for x in args.gpu.split(",")] | |
| 163 | - | |
| 164 | - if args.list: | |
| 165 | - list_models(args.host) | |
| 166 | - elif args.embedding_only: | |
| 167 | - # 仅部署 embedding | |
| 168 | - print_section("部署 Qwen3-Embedding 模型 (4B)") | |
| 169 | - client = Client(args.host) | |
| 170 | - embedding_uid = client.launch_model( | |
| 171 | - model_name="qwen3-embedding", | |
| 172 | - model_size_in_billions=4, | |
| 173 | - model_type="embedding", | |
| 174 | - engine="vllm", | |
| 175 | - gpu_idx=gpu_idx, | |
| 176 | - ) | |
| 177 | - print(f"✅ Embedding 模型部署成功: {embedding_uid}") | |
| 178 | - elif args.reranker_only: | |
| 179 | - # 仅部署 reranker | |
| 180 | - print_section("部署 Qwen3-Reranker 模型 (4B)") | |
| 181 | - client = Client(args.host) | |
| 182 | - reranker_uid = client.launch_model( | |
| 183 | - model_name="qwen3-reranker", | |
| 184 | - model_size_in_billions=4, | |
| 185 | - model_type="rerank", | |
| 186 | - engine="vllm", | |
| 187 | - gpu_idx=gpu_idx, | |
| 188 | - ) | |
| 189 | - print(f"✅ Reranker 模型部署成功: {reranker_uid}") | |
| 190 | - else: | |
| 191 | - # 部署所有模型 | |
| 192 | - deploy_qwen3_models(args.host, gpu_idx) |
third-party/xinference/docker-compose.yml deleted
| ... | ... | @@ -1,28 +0,0 @@ |
| 1 | -version: '3.8' | |
| 2 | - | |
| 3 | -services: | |
| 4 | - xinference: | |
| 5 | - image: xprobe/xinference:latest | |
| 6 | - container_name: xinference | |
| 7 | - ports: | |
| 8 | - - "9997:9997" | |
| 9 | - - "9998:9998" | |
| 10 | - environment: | |
| 11 | - - XINFERENCE_HOME=/data | |
| 12 | - volumes: | |
| 13 | - - ./models:/data | |
| 14 | - deploy: | |
| 15 | - resources: | |
| 16 | - reservations: | |
| 17 | - devices: | |
| 18 | - - driver: nvidia | |
| 19 | - count: all | |
| 20 | - capabilities: [gpu] | |
| 21 | - restart: unless-stopped | |
| 22 | - command: xinference-local -H 0.0.0.0 | |
| 23 | - healthcheck: | |
| 24 | - test: ["CMD", "curl", "-f", "http://localhost:9997/v1/models"] | |
| 25 | - interval: 30s | |
| 26 | - timeout: 10s | |
| 27 | - retries: 3 | |
| 28 | - start_period: 40s |
third-party/xinference/ecommerce_demo.py deleted
| ... | ... | @@ -1,318 +0,0 @@ |
| 1 | -#!/usr/bin/env python3 | |
| 2 | -""" | |
| 3 | -电商搜索实战示例 | |
| 4 | -演示如何使用 Qwen3-Embedding 和 Qwen3-Reranker 构建两阶段搜索系统 | |
| 5 | -""" | |
| 6 | - | |
| 7 | -import time | |
| 8 | -from typing import List, Tuple | |
| 9 | -from xinference_client import RESTfulClient as Client | |
| 10 | - | |
| 11 | - | |
| 12 | -class EcommerceSearchEngine: | |
| 13 | - """电商搜索引擎""" | |
| 14 | - | |
| 15 | - def __init__(self, host="http://localhost:9997"): | |
| 16 | - """ | |
| 17 | - 初始化搜索引擎 | |
| 18 | - | |
| 19 | - Args: | |
| 20 | - host: Xinference 服务地址 | |
| 21 | - """ | |
| 22 | - print("🔗 连接到 Xinference 服务...") | |
| 23 | - self.client = Client(host) | |
| 24 | - self.embedding_model = None | |
| 25 | - self.reranker_model = None | |
| 26 | - print("✅ 连接成功!\n") | |
| 27 | - | |
| 28 | - def load_models(self, embedding_uid=None, reranker_uid=None): | |
| 29 | - """ | |
| 30 | - 加载模型 | |
| 31 | - | |
| 32 | - Args: | |
| 33 | - embedding_uid: Embedding 模型 UID | |
| 34 | - reranker_uid: Reranker 模型 UID | |
| 35 | - """ | |
| 36 | - # 列出所有模型 | |
| 37 | - models = self.client.list_models() | |
| 38 | - model_dict = {m.get("model_type"): m.get("model_uid") for m in models} | |
| 39 | - | |
| 40 | - # 使用提供的 UID 或自动查找 | |
| 41 | - self.embedding_uid = embedding_uid or model_dict.get("embedding") | |
| 42 | - self.reranker_uid = reranker_uid or model_dict.get("rerank") | |
| 43 | - | |
| 44 | - if not self.embedding_uid: | |
| 45 | - raise ValueError("❌ 未找到 Embedding 模型,请先运行: python deploy_models.py") | |
| 46 | - if not self.reranker_uid: | |
| 47 | - raise ValueError("❌ 未找到 Reranker 模型,请先运行: python deploy_models.py") | |
| 48 | - | |
| 49 | - print(f"📦 加载 Embedding 模型: {self.embedding_uid}") | |
| 50 | - self.embedding_model = self.client.get_model(self.embedding_uid) | |
| 51 | - print("✅ Embedding 模型加载完成\n") | |
| 52 | - | |
| 53 | - print(f"📦 加载 Reranker 模型: {self.reranker_uid}") | |
| 54 | - self.reranker_model = self.client.get_model(self.reranker_uid) | |
| 55 | - print("✅ Reranker 模型加载完成\n") | |
| 56 | - | |
| 57 | - def dense_retrieval(self, query: str, candidates: List[str], top_k: int = 200) -> List[Tuple[str, float]]: | |
| 58 | - """ | |
| 59 | - 密集检索阶段(第一阶:粗筛) | |
| 60 | - | |
| 61 | - 在实际生产环境中,这里会使用 Faiss 或向量数据库进行 ANN 搜索 | |
| 62 | - 从百万级商品中快速召回 Top-K 候选 | |
| 63 | - | |
| 64 | - Args: | |
| 65 | - query: 用户查询 | |
| 66 | - candidates: 候选商品列表 | |
| 67 | - top_k: 返回的数量 | |
| 68 | - | |
| 69 | - Returns: | |
| 70 | - [(商品, 相似度分数), ...] | |
| 71 | - """ | |
| 72 | - start_time = time.time() | |
| 73 | - | |
| 74 | - # 生成 query 向量 | |
| 75 | - query_embedding = self.embedding_model.create_embedding(query)["data"][0]["embedding"] | |
| 76 | - | |
| 77 | - # 为所有候选商品生成向量 | |
| 78 | - # 注意:生产环境中这些向量应该预计算并存储在向量数据库中 | |
| 79 | - candidate_embeddings = [] | |
| 80 | - for product in candidates: | |
| 81 | - emb = self.embedding_model.create_embedding(product)["data"][0]["embedding"] | |
| 82 | - candidate_embeddings.append((product, emb)) | |
| 83 | - | |
| 84 | - # 计算余弦相似度(简化版,生产环境使用 Faiss) | |
| 85 | - import numpy as np | |
| 86 | - query_vec = np.array(query_embedding) | |
| 87 | - query_vec = query_vec / np.linalg.norm(query_vec) # 归一化 | |
| 88 | - | |
| 89 | - similarities = [] | |
| 90 | - for product, emb in candidate_embeddings: | |
| 91 | - emb_vec = np.array(emb) | |
| 92 | - emb_vec = emb_vec / np.linalg.norm(emb_vec) | |
| 93 | - similarity = float(np.dot(query_vec, emb_vec)) | |
| 94 | - similarities.append((product, similarity)) | |
| 95 | - | |
| 96 | - # 按 similarity 排序,取 Top-K | |
| 97 | - similarities.sort(key=lambda x: x[1], reverse=True) | |
| 98 | - top_results = similarities[:top_k] | |
| 99 | - | |
| 100 | - elapsed = time.time() - start_time | |
| 101 | - print(f"⏱️ 密集检索耗时: {elapsed:.2f}秒") | |
| 102 | - | |
| 103 | - return top_results | |
| 104 | - | |
| 105 | - def cross_encoder_rerank(self, query: str, candidates: List[str]) -> List[Tuple[str, float]]: | |
| 106 | - """ | |
| 107 | - 精排阶段(第二阶:细排) | |
| 108 | - | |
| 109 | - 使用 Cross-Encoder 对密集检索的结果进行精确打分 | |
| 110 | - | |
| 111 | - Args: | |
| 112 | - query: 用户查询 | |
| 113 | - candidates: 候选商品列表 | |
| 114 | - | |
| 115 | - Returns: | |
| 116 | - [(商品, 相关性分数), ...] | |
| 117 | - """ | |
| 118 | - start_time = time.time() | |
| 119 | - | |
| 120 | - # 构建 query-document 对 | |
| 121 | - pairs = [(query, product) for product in candidates] | |
| 122 | - | |
| 123 | - # 批量打分 | |
| 124 | - rerank_results = self.reranker_model.rerank(pairs) | |
| 125 | - | |
| 126 | - # 组合结果 | |
| 127 | - results = list(zip(candidates, rerank_results)) | |
| 128 | - | |
| 129 | - # 按相关性分数排序 | |
| 130 | - results.sort(key=lambda x: x[1]["relevance_score"], reverse=True) | |
| 131 | - | |
| 132 | - elapsed = time.time() - start_time | |
| 133 | - print(f"⏱️ 精排耗时: {elapsed:.2f}秒") | |
| 134 | - | |
| 135 | - return results | |
| 136 | - | |
| 137 | - def search(self, query: str, product_catalog: List[str], top_k: int = 10) -> List[Tuple[str, float]]: | |
| 138 | - """ | |
| 139 | - 完整的两阶段搜索流程 | |
| 140 | - | |
| 141 | - Args: | |
| 142 | - query: 用户查询 | |
| 143 | - product_catalog: 商品目录(假设有数万到数百万商品) | |
| 144 | - top_k: 最终返回的结果数 | |
| 145 | - | |
| 146 | - Returns: | |
| 147 | - [(商品, 相关性分数), ...] | |
| 148 | - """ | |
| 149 | - print(f"\n{'='*70}") | |
| 150 | - print(f"🔍 搜索查询: {query}") | |
| 151 | - print(f"{'='*70}\n") | |
| 152 | - | |
| 153 | - # 阶段1:密集检索召回 Top-200 | |
| 154 | - print("📊 阶段1: 密集检索(召回 Top-200)") | |
| 155 | - print("-" * 70) | |
| 156 | - recall_top_k = min(200, len(product_catalog)) | |
| 157 | - retrieved = self.dense_retrieval(query, product_catalog, top_k=recall_top_k) | |
| 158 | - retrieved_products = [p for p, s in retrieved] | |
| 159 | - | |
| 160 | - print(f"✅ 召回 {len(retrieved)} 个候选商品\n") | |
| 161 | - | |
| 162 | - # 阶段2:Cross-Encoder 精排 | |
| 163 | - print("🎯 阶段2: 精排(Cross-Encoder 打分)") | |
| 164 | - print("-" * 70) | |
| 165 | - reranked = self.cross_encoder_rerank(query, retrieved_products) | |
| 166 | - | |
| 167 | - # 取最终 Top-K | |
| 168 | - final_results = reranked[:top_k] | |
| 169 | - | |
| 170 | - return final_results | |
| 171 | - | |
| 172 | - | |
def demo_ecommerce_search():
    """电商搜索演示

    Full pipeline demo: build the search engine against a local Xinference
    endpoint, run a few queries over a mock catalogue, print the top hits,
    then print production deployment advice.
    """

    banner = "=" * 70
    print("\n" + banner)
    print(" 🛒 电商搜索实战演示 - Qwen3 双塔架构")
    print(banner + "\n")

    # Build the engine against the local Xinference service.
    search_engine = EcommerceSearchEngine(host="http://localhost:9997")

    # Load embedding + reranker models before serving queries.
    print("⏳ 加载模型...")
    search_engine.load_models()

    # Mock product catalogue (a real deployment would hold millions of SKUs).
    product_catalog = [
        "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式",
        "iPhone 15 Pro Max 专业摄影旗舰 A17芯片",
        "华为畅享60 6000mAh超长续航 护眼大屏 鸿蒙系统",
        "OPPO A1 5000mAh电池 简易模式适合长辈",
        "小米手环8 智能运动监测 血氧心率",
        "vivo Y78 5000mAh大电池 120Hz高刷屏",
        "三星Galaxy A54 5000mAh 防水防尘",
        "荣耀Play7T 6000mAh巨量电池 双卡双待",
        "真我11 Pro 2亿像素 100W快充",
        "诺基亚C31 5050mAh电池 耐用三防",
        "联想拯救者Y70 8GB+256GB 骁龙8+",
        "摩托罗拉edge S30 骁龙888+ 144Hz",
        "一加Ace 2V 天玑9000 80W快充",
        "iQOO Neo8 独显芯片双芯 120W闪充",
        "Redmi K60 2K高光屏 骁龙8+",
        "华为Mate 60 Pro 卫星通信 鸿蒙4.0",
        "iPhone 14 128GB A15芯片",
        "OPPO Reno10 Pro 人像镜头 100W",
        "vivo X90s 天玑9200+ 蔡司影像",
        "小米13 Pro 徕卡光学镜头",
    ]

    # Queries exercised against the catalogue.
    test_queries = [
        "适合老人用的智能手机大屏幕长续航",
        "拍照效果好的手机推荐",
        "性价比高的游戏手机",
    ]

    # Run every query and show the ranked hits.
    for user_query in test_queries:
        top_hits = search_engine.search(user_query, product_catalog, top_k=5)

        print(f"\n🎯 搜索结果 (Top 5):")
        print("-" * 70)
        rank = 1
        # assumes search() yields (product, score_dict) pairs — TODO confirm
        for product, score in top_hits:
            print(f"{rank}. [{score['relevance_score']:.4f}] {product}")
            rank += 1
        print()

    # Deployment guidance for taking this pipeline to production.
    print("\n" + banner)
    print("📊 生产环境部署建议")
    print(banner)
    print("""
1. 离线批量处理:
   - 每天凌晨使用 Qwen3-Embedding 为全量商品生成向量
   - 存储到 Milvus/Pinecone 等向量数据库
   - 预计耗时: 2亿商品约 4-6 小时

2. 在线实时搜索:
   - 用户 query 实时生成 embedding
   - 向量数据库 ANN 检索召回 Top-1000 (耗时 < 50ms)
   - Qwen3-Reranker 精排 Top-1000 → Top-50 (耗时 < 200ms)
   - 总体延迟: < 300ms

3. 缓存优化:
   - Top 10000 热搜 query 的 embedding 和结果缓存到 Redis
   - QPS 提升 10-20 倍

4. 混合检索:
   - 结合 BM25 关键词召回(头部 Query 准确率更高)
   - 向量召回 + 关键词召回 → 合并去重 → 精排

5. A/B 测试建议:
   - 对照组: 纯 BM25 或传统 embedding 模型
   - 实验组: Qwen3-Embedding + Qwen3-Reranker
   - 核心指标: CTR, CVR, GMV, 用户停留时间
    """)
| 258 | - | |
| 259 | - | |
def demo_simple_usage():
    """简单的使用示例

    Lists the models deployed on a local Xinference server and, when an
    embedding / rerank model is found, runs one example call against each.

    BUG FIX: the reranker was previously called as
    ``model.rerank([(query, d) for d in docs])`` (a CrossEncoder-style pair
    list) and its return value zipped against ``docs``. The Xinference client
    API is ``rerank(documents, query)`` and returns a dict whose ``"results"``
    entries carry ``index`` and ``relevance_score``; the old call therefore
    sent malformed input and would misalign scores with documents.
    """
    print("\n" + "="*70)
    print(" 📝 快速使用示例")
    print("="*70 + "\n")

    # Connect to the local Xinference service.
    client = Client("http://localhost:9997")

    # Enumerate whatever is currently deployed.
    # NOTE(review): this assumes list_models() yields dict-like entries with
    # 'model_type'/'model_uid' keys — confirm against the client version used.
    models = client.list_models()
    print("可用模型:")
    for model in models:
        print(f" - {model.get('model_type')}: {model.get('model_uid')}")

    if models:
        # Pick the first embedding model and the first reranker, if any.
        embedding_model = next((m for m in models if m.get("model_type") == "embedding"), None)
        reranker_model = next((m for m in models if m.get("model_type") == "rerank"), None)

        if embedding_model:
            print(f"\n使用 Embedding 模型: {embedding_model['model_uid']}")
            model = client.get_model(embedding_model['model_uid'])
            result = model.create_embedding("测试文本")
            print(f"向量维度: {len(result['data'][0]['embedding'])}")

        if reranker_model:
            print(f"\n使用 Reranker 模型: {reranker_model['model_uid']}")
            model = client.get_model(reranker_model['model_uid'])
            query = "适合老人用的智能手机"
            docs = ["华为畅享60 6000mAh", "小米手环8"]
            # rerank(documents, query) -> {"results": [{"index", "relevance_score", ...}]}
            rerank_result = model.rerank(documents=docs, query=query)
            for item in rerank_result["results"]:
                print(f" [{item['relevance_score']:.4f}] {docs[item['index']]}")
| 294 | - | |
| 295 | - | |
if __name__ == "__main__":
    # CLI entry point: run the full e-commerce demo, or a quick smoke demo
    # when --simple is given.
    import argparse
    import sys

    cli = argparse.ArgumentParser(description="电商搜索实战演示")
    cli.add_argument("--host", default="http://localhost:9997", help="Xinference 服务地址")
    cli.add_argument("--simple", action="store_true", help="运行简单示例")
    cli.add_argument("--embedding", help="指定 Embedding 模型 UID")
    cli.add_argument("--reranker", help="指定 Reranker 模型 UID")

    # NOTE(review): --host/--embedding/--reranker are parsed but never passed
    # to the demo functions below — confirm the intended wiring.
    args = cli.parse_args()

    try:
        if args.simple:
            demo_simple_usage()
        else:
            demo_ecommerce_search()
    except Exception as err:
        print(f"\n❌ 错误: {err}")
        print("\n💡 请确保:")
        print(" 1. Xinference 服务正在运行: ./start.sh")
        print(" 2. 模型已部署: python deploy_models.py")
        sys.exit(1)
third-party/xinference/install_nvidia_container_toolkit.sh deleted
| ... | ... | @@ -1,137 +0,0 @@ |
#!/bin/bash

# NVIDIA Container Toolkit automated installer.
# Supports Ubuntu/Debian (apt) and CentOS/RHEL (yum).
#
# BUG FIX: the distribution string previously stripped the dot from
# VERSION_ID (e.g. "ubuntu2204"), but NVIDIA's repository paths use the
# dotted form (e.g. "ubuntu22.04"), so the repo URLs 404'd on Ubuntu/Debian.
# NVIDIA's documented form is "$ID$VERSION_ID" verbatim.

set -e

echo "========================================="
echo " NVIDIA Container Toolkit 安装脚本"
echo "========================================="
echo ""

# Detect the Linux distribution from os-release.
if [ -f /etc/os-release ]; then
    . /etc/os-release
    OS=$ID
    OS_VERSION=$VERSION_ID
else
    echo "❌ 无法检测系统类型"
    exit 1
fi

echo "检测到系统: $OS $OS_VERSION"
echo ""

# The toolkit is useless without a working NVIDIA driver.
echo "🔍 检查 NVIDIA 驱动..."
if ! command -v nvidia-smi &> /dev/null; then
    echo "❌ 未找到 NVIDIA 驱动,请先安装 NVIDIA 驱动"
    echo " 访问: https://www.nvidia.com/Download/index.aspx"
    exit 1
fi

echo "✅ NVIDIA 驱动已安装:"
nvidia-smi --query-gpu=name,driver_version --format=csv,noheader
echo ""

# Docker must already be present; this script only configures its runtime.
echo "🔍 检查 Docker..."
if ! command -v docker &> /dev/null; then
    echo "❌ 未找到 Docker,请先安装 Docker"
    exit 1
fi

echo "✅ Docker 已安装"
docker --version
echo ""

echo "📦 添加 NVIDIA Container Toolkit 仓库..."

# Official form: dotted VERSION_ID is preserved (e.g. "ubuntu22.04").
distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")

if [ "$OS" = "ubuntu" ] || [ "$OS" = "debian" ]; then
    # NOTE(review): apt-key is deprecated; NVIDIA's current guide installs the
    # key into /usr/share/keyrings — consider migrating.
    curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
    curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \
        sudo tee /etc/apt/sources.list.d/nvidia-docker.list

    echo "✅ 仓库配置完成"
elif [ "$OS" = "centos" ] || [ "$OS" = "rhel" ]; then
    curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | \
        sudo tee /etc/yum.repos.d/nvidia-docker.repo

    echo "✅ 仓库配置完成"
else
    echo "⚠️ 不支持的系统: $OS"
    echo " 请手动安装,参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html"
    exit 1
fi

echo ""

# Refresh package metadata so the new repo is visible.
echo "🔄 更新包列表..."
if [ "$OS" = "ubuntu" ] || [ "$OS" = "debian" ]; then
    sudo apt-get update
elif [ "$OS" = "centos" ] || [ "$OS" = "rhel" ]; then
    sudo yum makecache
fi
echo ""

echo "🔨 安装 NVIDIA Container Toolkit..."
if [ "$OS" = "ubuntu" ] || [ "$OS" = "debian" ]; then
    sudo apt-get install -y nvidia-container-toolkit
elif [ "$OS" = "centos" ] || [ "$OS" = "rhel" ]; then
    sudo yum install -y nvidia-container-toolkit
fi
echo ""

# Register the NVIDIA runtime with Docker's daemon config.
echo "⚙️ 配置 Docker 使用 NVIDIA 运行时..."
sudo nvidia-ctk runtime configure --runtime=docker
echo "✅ Docker 配置完成"
echo ""

echo "🔄 重启 Docker 服务..."
sudo systemctl restart docker
echo "✅ Docker 重启完成"
echo ""

# Sanity check: run nvidia-smi inside a CUDA base container.
echo "🧪 验证安装..."
echo "测试 Docker GPU 访问..."
# NOTE(review): the nvidia/cuda:11.0-base tag has been removed from Docker
# Hub; if the pull fails, substitute a current tag such as
# nvidia/cuda:12.3.2-base-ubuntu22.04.
if sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi &> /dev/null; then
    echo "✅ NVIDIA Container Toolkit 安装成功!"
    echo ""
    echo "GPU 可用于 Docker 容器!"
else
    echo "❌ 验证失败,请检查日志"
    exit 1
fi

echo ""
echo "========================================="
echo " 🎉 安装完成!"
echo "========================================="
echo ""
echo "📝 下一步:"
echo " 1. 停止当前的 Xinference 容器(如果在运行):"
echo " sudo docker stop xinference"
echo " sudo docker rm xinference"
echo ""
echo " 2. 重新启动 Xinference 服务(会自动使用 GPU):"
echo " cd $(pwd)"
echo " bash start.sh"
echo ""
echo " 3. 部署模型:"
echo " python deploy_models.py"
echo ""
echo "✅ 现在可以使用 GPU 加速了!"
echo ""
| ... | ... | @@ -0,0 +1,249 @@ |
| 1 | +import openai | |
| 2 | +import time | |
| 3 | +import statistics | |
| 4 | +from concurrent.futures import ThreadPoolExecutor, as_completed | |
| 5 | +from typing import List, Dict, Tuple | |
| 6 | +import json | |
| 7 | + | |
| 8 | + | |
class EmbeddingPerformanceTester:
    """Benchmarks an OpenAI-compatible ``/v1/embeddings`` endpoint.

    Issues embedding requests either sequentially or through a thread pool,
    measures per-request latency, and aggregates the results into a summary
    statistics dict.

    BUG FIX: ``save_results`` previously printed a literal placeholder instead
    of interpolating the output filename. Also, the percentile computation
    sorted the latency list twice; it is now sorted once and reused.
    """

    def __init__(self, base_url: str = "http://127.0.0.1:9997/v1"):
        """Create a tester bound to ``base_url``.

        The API key is a placeholder: the server here does not validate it,
        but the OpenAI SDK rejects an empty string.
        """
        self.client = openai.Client(
            api_key="cannot be empty",  # replace with a real API key in production
            base_url=base_url
        )

    def test_single_request(self, model: str, input_text: List[str]) -> Tuple[bool, float]:
        """Issue one embedding request; return ``(success, elapsed_seconds)``.

        On exception the elapsed time is reported as 0.0 and the error is
        printed rather than raised.
        """
        try:
            start_time = time.perf_counter()
            response = self.client.embeddings.create(
                model=model,
                input=input_text
            )
            end_time = time.perf_counter()

            # A well-formed SDK response exposes a `.data` attribute.
            if response and hasattr(response, 'data'):
                return True, end_time - start_time
            else:
                return False, end_time - start_time

        except Exception as e:
            print(f"请求失败 - 模型 {model}: {str(e)}")
            return False, 0.0

    def test_model_sequential(self, model: str, input_text: List[str],
                              iterations: int = 1000) -> Dict:
        """Run ``iterations`` requests back-to-back and aggregate the stats."""
        print(f"\n开始顺序测试模型: {model}")
        print(f"测试次数: {iterations}")

        successes = 0
        failures = 0
        latencies = []

        for i in range(iterations):
            # Progress heartbeat every 100 requests.
            if i % 100 == 0 and i > 0:
                print(f" 已完成 {i}/{iterations} 次请求...")

            success, latency = self.test_single_request(model, input_text)

            if success:
                successes += 1
                latencies.append(latency)
            else:
                failures += 1

        return self._calculate_stats(model, successes, failures, latencies)

    def test_model_concurrent(self, model: str, input_text: List[str],
                              iterations: int = 1000, max_workers: int = 10) -> Dict:
        """Run ``iterations`` requests through a thread pool and aggregate stats."""
        print(f"\n开始并发测试模型: {model}")
        print(f"测试次数: {iterations}, 并发数: {max_workers}")

        successes = 0
        failures = 0
        latencies = []

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit every request up front.
            future_to_request = {
                executor.submit(self.test_single_request, model, input_text): i
                for i in range(iterations)
            }

            # Drain results as they complete.
            completed = 0
            for future in as_completed(future_to_request):
                completed += 1
                if completed % 100 == 0:
                    print(f" 已完成 {completed}/{iterations} 次请求...")

                try:
                    success, latency = future.result()
                    if success:
                        successes += 1
                        latencies.append(latency)
                    else:
                        failures += 1
                except Exception as e:
                    print(f"请求异常: {str(e)}")
                    failures += 1

        return self._calculate_stats(model, successes, failures, latencies)

    def _calculate_stats(self, model: str, successes: int,
                         failures: int, latencies: List[float]) -> Dict:
        """Aggregate raw latencies (seconds) into a summary dict."""
        if not latencies:
            return {
                "model": model,
                "total_requests": successes + failures,
                "successful_requests": successes,
                "failed_requests": failures,
                "success_rate": 0.0,
                "error": "无成功请求"
            }

        # Sort once and reuse for min/max/percentiles (was sorted twice).
        ordered = sorted(latencies)
        total_time = sum(ordered)

        stats = {
            "model": model,
            "total_requests": successes + failures,
            "successful_requests": successes,
            "failed_requests": failures,
            "success_rate": successes / (successes + failures) * 100,
            "total_time": total_time,
            "avg_latency": statistics.mean(ordered),
            "min_latency": ordered[0],
            "max_latency": ordered[-1],
            "p50_latency": statistics.median(ordered),
            "p95_latency": ordered[int(len(ordered) * 0.95)],
            "p99_latency": ordered[int(len(ordered) * 0.99)],
            # NOTE: latencies are summed, so under concurrency this reflects
            # per-request throughput, not wall-clock QPS.
            "requests_per_second": len(ordered) / total_time if total_time > 0 else 0
        }

        # Standard deviation needs at least two samples.
        if len(ordered) > 1:
            stats["std_dev"] = statistics.stdev(ordered)

        return stats

    def print_results(self, results: Dict):
        """Pretty-print a stats dict produced by ``_calculate_stats``."""
        print("\n" + "="*60)
        print(f"性能测试结果 - {results['model']}")
        print("="*60)

        if "error" in results:
            print(f"错误: {results['error']}")
            return

        print(f"总请求数: {results['total_requests']}")
        print(f"成功请求: {results['successful_requests']}")
        print(f"失败请求: {results['failed_requests']}")
        print(f"成功率: {results['success_rate']:.2f}%")
        print(f"总耗时: {results['total_time']:.4f}秒")
        print(f"平均延迟: {results['avg_latency']:.4f}秒")
        print(f"最小延迟: {results['min_latency']:.4f}秒")
        print(f"最大延迟: {results['max_latency']:.4f}秒")
        print(f"P50延迟: {results['p50_latency']:.4f}秒")
        print(f"P95延迟: {results['p95_latency']:.4f}秒")
        print(f"P99延迟: {results['p99_latency']:.4f}秒")

        if "std_dev" in results:
            print(f"标准差: {results['std_dev']:.4f}秒")

        print(f"QPS: {results['requests_per_second']:.2f} 请求/秒")
        print("="*60)

    def save_results(self, results_list: List[Dict], filename: str = "performance_results.json"):
        """Persist all collected results to ``filename`` as pretty JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results_list, f, indent=2, ensure_ascii=False)
        # BUG FIX: previously printed a placeholder instead of the path.
        print(f"\n结果已保存到: {filename}")
| 166 | + | |
| 167 | + | |
def main():
    """主函数

    Interactive entry point: ask for a test mode, benchmark every configured
    model, save the results to JSON, then print a summary comparison.
    """
    tester = EmbeddingPerformanceTester()

    # Benchmark configuration.
    test_input = ["What is the capital of China?"]
    iterations = 1000
    test_models = ['bge-m3', 'Qwen3-Embedding-0.6B']

    divider = "=" * 60
    print(divider)
    print("Embedding API 性能测试")
    print(divider)

    all_results = []

    # Ask which mode(s) to run; empty input defaults to sequential.
    print("\n选择测试模式:")
    print("1. 顺序测试 (Sequential)")
    print("2. 并发测试 (Concurrent)")
    print("3. 两种模式都测试")

    mode = input("请输入选择 (1/2/3, 默认1): ").strip()
    run_concurrent = mode in ('2', '3')
    run_sequential = mode in ('1', '3') or not mode

    for model in test_models:
        print(f"\n{divider}")
        print(f"测试模型: {model}")
        print(f"{divider}")

        if run_concurrent:
            report = tester.test_model_concurrent(
                model=model,
                input_text=test_input,
                iterations=iterations,
                max_workers=10,  # adjust the pool size as needed
            )
            tester.print_results(report)
            report["test_mode"] = "concurrent"
            all_results.append(report)

        if run_sequential:
            report = tester.test_model_sequential(
                model=model,
                input_text=test_input,
                iterations=iterations,
            )
            tester.print_results(report)
            report["test_mode"] = "sequential"
            all_results.append(report)

    # Persist everything for later comparison.
    tester.save_results(all_results)

    # Side-by-side summary of all successful runs.
    print("\n" + divider)
    print("性能测试汇总对比")
    print(divider)

    for result in all_results:
        if "error" not in result:
            print(f"\n模型: {result['model']} ({result['test_mode']})")
            print(f" QPS: {result['requests_per_second']:.2f}")
            print(f" 平均延迟: {result['avg_latency']:.4f}秒")
            print(f" 成功率: {result['success_rate']:.2f}%")
| 234 | + | |
| 235 | + | |
if __name__ == "__main__":
    # Smoke-test the endpoint before launching the full benchmark.
    try:
        probe = EmbeddingPerformanceTester()
        ok, _elapsed = probe.test_single_request('bge-m3', ["test"])
        if ok:
            print("API连接正常,开始性能测试...")
            main()
        else:
            print("API连接失败,请检查服务是否正常运行")
    except Exception as e:
        print(f"初始化失败: {str(e)}")
        print("请确保OpenAI客户端已安装: pip install openai")
| ... | ... | @@ -0,0 +1,265 @@ |
| 1 | +import requests | |
| 2 | +import time | |
| 3 | +import statistics | |
| 4 | +from concurrent.futures import ThreadPoolExecutor, as_completed | |
| 5 | +from typing import List, Dict, Tuple | |
| 6 | +import json | |
| 7 | + | |
| 8 | + | |
class EmbeddingPerformanceTester:
    """Benchmarks an embeddings endpoint over raw HTTP (``requests``).

    Same measurement logic as the OpenAI-SDK variant, but posts directly to
    ``{base_url}/embeddings`` so no SDK is required.

    BUG FIX: ``save_results`` previously printed a literal placeholder instead
    of interpolating the output filename. Also, the percentile computation
    sorted the latency list twice; it is now sorted once and reused.
    """

    def __init__(self, base_url: str = "http://127.0.0.1:9997/v1"):
        """Remember the endpoint URLs; no connection is opened here."""
        self.base_url = base_url
        self.embeddings_url = f"{base_url}/embeddings"

    def test_single_request(self, model: str, input_text: List[str]) -> Tuple[bool, float]:
        """Issue one embedding request; return ``(success, elapsed_seconds)``.

        On exception the elapsed time is reported as 0.0 and the error is
        printed rather than raised.
        """
        try:
            start_time = time.perf_counter()

            # Only a single string is sent over HTTP here: when a non-empty
            # list is passed, its first element is used.
            input_value = input_text[0] if isinstance(input_text, list) and len(input_text) > 0 else input_text

            response = requests.post(
                self.embeddings_url,
                headers={
                    'accept': 'application/json',
                    'Content-Type': 'application/json'
                },
                json={
                    "model": model,
                    "input": input_value
                },
                timeout=60  # avoid hanging forever on a stuck server
            )

            end_time = time.perf_counter()

            # Success requires HTTP 200 and at least one embedding in `data`.
            if response.status_code == 200:
                result = response.json()
                if result and 'data' in result and len(result['data']) > 0:
                    return True, end_time - start_time
                else:
                    return False, end_time - start_time
            else:
                return False, end_time - start_time

        except Exception as e:
            print(f"请求失败 - 模型 {model}: {str(e)}")
            return False, 0.0

    def test_model_sequential(self, model: str, input_text: List[str],
                              iterations: int = 1000) -> Dict:
        """Run ``iterations`` requests back-to-back and aggregate the stats."""
        print(f"\n开始顺序测试模型: {model}")
        print(f"测试次数: {iterations}")

        successes = 0
        failures = 0
        latencies = []

        for i in range(iterations):
            # Progress heartbeat every 100 requests.
            if i % 100 == 0 and i > 0:
                print(f" 已完成 {i}/{iterations} 次请求...")

            success, latency = self.test_single_request(model, input_text)

            if success:
                successes += 1
                latencies.append(latency)
            else:
                failures += 1

        return self._calculate_stats(model, successes, failures, latencies)

    def test_model_concurrent(self, model: str, input_text: List[str],
                              iterations: int = 1000, max_workers: int = 10) -> Dict:
        """Run ``iterations`` requests through a thread pool and aggregate stats."""
        print(f"\n开始并发测试模型: {model}")
        print(f"测试次数: {iterations}, 并发数: {max_workers}")

        successes = 0
        failures = 0
        latencies = []

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit every request up front.
            future_to_request = {
                executor.submit(self.test_single_request, model, input_text): i
                for i in range(iterations)
            }

            # Drain results as they complete.
            completed = 0
            for future in as_completed(future_to_request):
                completed += 1
                if completed % 100 == 0:
                    print(f" 已完成 {completed}/{iterations} 次请求...")

                try:
                    success, latency = future.result()
                    if success:
                        successes += 1
                        latencies.append(latency)
                    else:
                        failures += 1
                except Exception as e:
                    print(f"请求异常: {str(e)}")
                    failures += 1

        return self._calculate_stats(model, successes, failures, latencies)

    def _calculate_stats(self, model: str, successes: int,
                         failures: int, latencies: List[float]) -> Dict:
        """Aggregate raw latencies (seconds) into a summary dict."""
        if not latencies:
            return {
                "model": model,
                "total_requests": successes + failures,
                "successful_requests": successes,
                "failed_requests": failures,
                "success_rate": 0.0,
                "error": "无成功请求"
            }

        # Sort once and reuse for min/max/percentiles (was sorted twice).
        ordered = sorted(latencies)
        total_time = sum(ordered)

        stats = {
            "model": model,
            "total_requests": successes + failures,
            "successful_requests": successes,
            "failed_requests": failures,
            "success_rate": successes / (successes + failures) * 100,
            "total_time": total_time,
            "avg_latency": statistics.mean(ordered),
            "min_latency": ordered[0],
            "max_latency": ordered[-1],
            "p50_latency": statistics.median(ordered),
            "p95_latency": ordered[int(len(ordered) * 0.95)],
            "p99_latency": ordered[int(len(ordered) * 0.99)],
            # NOTE: latencies are summed, so under concurrency this reflects
            # per-request throughput, not wall-clock QPS.
            "requests_per_second": len(ordered) / total_time if total_time > 0 else 0
        }

        # Standard deviation needs at least two samples.
        if len(ordered) > 1:
            stats["std_dev"] = statistics.stdev(ordered)

        return stats

    def print_results(self, results: Dict):
        """Pretty-print a stats dict produced by ``_calculate_stats``."""
        print("\n" + "="*60)
        print(f"性能测试结果 - {results['model']}")
        print("="*60)

        if "error" in results:
            print(f"错误: {results['error']}")
            return

        print(f"总请求数: {results['total_requests']}")
        print(f"成功请求: {results['successful_requests']}")
        print(f"失败请求: {results['failed_requests']}")
        print(f"成功率: {results['success_rate']:.2f}%")
        print(f"总耗时: {results['total_time']:.4f}秒")
        print(f"平均延迟: {results['avg_latency']:.4f}秒")
        print(f"最小延迟: {results['min_latency']:.4f}秒")
        print(f"最大延迟: {results['max_latency']:.4f}秒")
        print(f"P50延迟: {results['p50_latency']:.4f}秒")
        print(f"P95延迟: {results['p95_latency']:.4f}秒")
        print(f"P99延迟: {results['p99_latency']:.4f}秒")

        if "std_dev" in results:
            print(f"标准差: {results['std_dev']:.4f}秒")

        print(f"QPS: {results['requests_per_second']:.2f} 请求/秒")
        print("="*60)

    def save_results(self, results_list: List[Dict], filename: str = "performance_results.json"):
        """Persist all collected results to ``filename`` as pretty JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results_list, f, indent=2, ensure_ascii=False)
        # BUG FIX: previously printed a placeholder instead of the path.
        print(f"\n结果已保存到: {filename}")
| 182 | + | |
| 183 | + | |
def main():
    """主函数

    Interactive entry point for the raw-HTTP variant: ask for a test mode,
    benchmark every configured model, save the results to JSON, then print a
    summary comparison.
    """
    tester = EmbeddingPerformanceTester()

    # Benchmark configuration.
    test_input = ["What is the capital of China?"]
    iterations = 1000
    test_models = ['bge-m3', 'Qwen3-Embedding-0.6B']

    divider = "=" * 60
    print(divider)
    print("Embedding API 性能测试 (HTTP)")
    print(divider)

    all_results = []

    # Ask which mode(s) to run; empty input defaults to sequential.
    print("\n选择测试模式:")
    print("1. 顺序测试 (Sequential)")
    print("2. 并发测试 (Concurrent)")
    print("3. 两种模式都测试")

    mode = input("请输入选择 (1/2/3, 默认1): ").strip()
    run_concurrent = mode in ('2', '3')
    run_sequential = mode in ('1', '3') or not mode

    for model in test_models:
        print(f"\n{divider}")
        print(f"测试模型: {model}")
        print(f"{divider}")

        if run_concurrent:
            report = tester.test_model_concurrent(
                model=model,
                input_text=test_input,
                iterations=iterations,
                max_workers=10,  # adjust the pool size as needed
            )
            tester.print_results(report)
            report["test_mode"] = "concurrent"
            all_results.append(report)

        if run_sequential:
            report = tester.test_model_sequential(
                model=model,
                input_text=test_input,
                iterations=iterations,
            )
            tester.print_results(report)
            report["test_mode"] = "sequential"
            all_results.append(report)

    # Persist everything for later comparison.
    tester.save_results(all_results)

    # Side-by-side summary of all successful runs.
    print("\n" + divider)
    print("性能测试汇总对比")
    print(divider)

    for result in all_results:
        if "error" not in result:
            print(f"\n模型: {result['model']} ({result['test_mode']})")
            print(f" QPS: {result['requests_per_second']:.2f}")
            print(f" 平均延迟: {result['avg_latency']:.4f}秒")
            print(f" 成功率: {result['success_rate']:.2f}%")
| 251 | + | |
if __name__ == "__main__":
    # Smoke-test the endpoint before launching the full benchmark.
    try:
        probe = EmbeddingPerformanceTester()
        ok, _elapsed = probe.test_single_request('bge-m3', ["test"])
        if ok:
            print("API连接正常,开始性能测试...")
            main()
        else:
            print("API连接失败,请检查服务是否正常运行")
    except Exception as e:
        print(f"初始化失败: {str(e)}")
        print("请确保 requests 库已安装: pip install requests")
| 0 | 266 | \ No newline at end of file | ... | ... |
| ... | ... | @@ -0,0 +1,108 @@ |
| 1 | +import openai | |
| 2 | +import time | |
| 3 | +import requests | |
| 4 | +import json | |
| 5 | + | |
| 6 | + | |
| 7 | +client = openai.Client( | |
| 8 | + api_key="cannot be empty", | |
| 9 | + base_url="http://127.0.0.1:9997/v1" | |
| 10 | +) | |
| 11 | + | |
| 12 | +# 记录开始时间 | |
| 13 | +start_time = time.time() | |
| 14 | + | |
| 15 | +a = client.embeddings.create( | |
| 16 | + model='bge-m3', | |
| 17 | + input=["What is the capital of China?"] | |
| 18 | +) | |
| 19 | + | |
| 20 | +# 记录结束时间 | |
| 21 | +end_time = time.time() | |
| 22 | + | |
| 23 | +#print(a) | |
| 24 | +print(f"\n耗时: {end_time - start_time:.4f} 秒") | |
| 25 | + | |
| 26 | +# 记录开始时间 | |
| 27 | +start_time = time.time() | |
| 28 | + | |
| 29 | +a = client.embeddings.create( | |
| 30 | + model='Qwen3-Embedding-0.6B', | |
| 31 | + input=["What is the capital of China?"] | |
| 32 | +) | |
| 33 | + | |
| 34 | +# 记录结束时间 | |
| 35 | +end_time = time.time() | |
| 36 | + | |
| 37 | +#print(a) | |
| 38 | +print(f"\n耗时: {end_time - start_time:.4f} 秒") | |
| 39 | + | |
| 40 | +# ========== HTTP API 测试 ========== | |
| 41 | +print("\n" + "="*50) | |
| 42 | +print("HTTP API 测试") | |
| 43 | +print("="*50) | |
| 44 | + | |
| 45 | +# 配置 | |
| 46 | +XINFERENCE_HOST = "127.0.0.1" | |
| 47 | +XINFERENCE_PORT = "9997" | |
| 48 | +base_url = f"http://{XINFERENCE_HOST}:{XINFERENCE_PORT}/v1/embeddings" | |
| 49 | + | |
| 50 | +# 测试 bge-m3 模型 | |
| 51 | +print("\n测试模型: bge-m3") | |
| 52 | +start_time = time.time() | |
| 53 | + | |
| 54 | +response = requests.post( | |
| 55 | + base_url, | |
| 56 | + headers={ | |
| 57 | + 'accept': 'application/json', | |
| 58 | + 'Content-Type': 'application/json' | |
| 59 | + }, | |
| 60 | + json={ | |
| 61 | + "model": "bge-m3", | |
| 62 | + "input": "What is the capital of China?" | |
| 63 | + } | |
| 64 | +) | |
| 65 | + | |
| 66 | +end_time = time.time() | |
| 67 | + | |
| 68 | +if response.status_code == 200: | |
| 69 | + result = response.json() | |
| 70 | + print(f"状态码: {response.status_code}") | |
| 71 | + print(f"模型: {result.get('model', 'N/A')}") | |
| 72 | + print(f"使用token数: {result.get('usage', {}).get('total_tokens', 'N/A')}") | |
| 73 | + print(f"嵌入向量维度: {len(result.get('data', [{}])[0].get('embedding', []))}") | |
| 74 | + print(f"耗时: {end_time - start_time:.4f} 秒") | |
| 75 | +else: | |
| 76 | + print(f"请求失败,状态码: {response.status_code}") | |
| 77 | + print(f"错误信息: {response.text}") | |
| 78 | + | |
| 79 | +# 测试 Qwen3-Embedding-0.6B 模型 | |
| 80 | +print("\n测试模型: Qwen3-Embedding-0.6B") | |
| 81 | +start_time = time.time() | |
| 82 | + | |
| 83 | +response = requests.post( | |
| 84 | + base_url, | |
| 85 | + headers={ | |
| 86 | + 'accept': 'application/json', | |
| 87 | + 'Content-Type': 'application/json' | |
| 88 | + }, | |
| 89 | + json={ | |
| 90 | + "model": "Qwen3-Embedding-0.6B", | |
| 91 | + "input": "What is the capital of China?" | |
| 92 | + } | |
| 93 | +) | |
| 94 | + | |
| 95 | +end_time = time.time() | |
| 96 | + | |
| 97 | +if response.status_code == 200: | |
| 98 | + result = response.json() | |
| 99 | + print(f"状态码: {response.status_code}") | |
| 100 | + print(f"模型: {result.get('model', 'N/A')}") | |
| 101 | + print(f"使用token数: {result.get('usage', {}).get('total_tokens', 'N/A')}") | |
| 102 | + print(f"嵌入向量维度: {len(result.get('data', [{}])[0].get('embedding', []))}") | |
| 103 | + print(f"耗时: {end_time - start_time:.4f} 秒") | |
| 104 | +else: | |
| 105 | + print(f"请求失败,状态码: {response.status_code}") | |
| 106 | + print(f"错误信息: {response.text}") | |
| 107 | + | |
| 108 | + | ... | ... |
third-party/xinference/requirements.txt deleted
third-party/xinference/setup_alias.sh deleted
| ... | ... | @@ -1,75 +0,0 @@ |
| 1 | -#!/bin/bash | |
| 2 | - | |
| 3 | -# 自动为 Xinference 设置别名到 Shell 配置文件 | |
| 4 | - | |
| 5 | -ENV_NAME="xinference" | |
| 6 | -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | |
| 7 | - | |
| 8 | -echo "=========================================" | |
| 9 | -echo " 设置 Xinference 快捷别名" | |
| 10 | -echo "=========================================" | |
| 11 | -echo "" | |
| 12 | - | |
| 13 | -# 检测 Shell 类型 | |
| 14 | -if [ -n "$ZSH_VERSION" ]; then | |
| 15 | - SHELL_CONFIG="$HOME/.zshrc" | |
| 16 | - SHELL_NAME="zsh" | |
| 17 | -elif [ -n "$BASH_VERSION" ]; then | |
| 18 | - SHELL_CONFIG="$HOME/.bashrc" | |
| 19 | - SHELL_NAME="bash" | |
| 20 | -else | |
| 21 | - echo "⚠️ 无法检测 Shell 类型,请手动添加别名" | |
| 22 | - exit 1 | |
| 23 | -fi | |
| 24 | - | |
| 25 | -echo "检测到 Shell: $SHELL_NAME" | |
| 26 | -echo "配置文件: $SHELL_CONFIG" | |
| 27 | -echo "" | |
| 28 | - | |
| 29 | -# 别名内容 | |
| 30 | -ALIAS_CONTENT=" | |
| 31 | -# Xinference 环境快捷命令 | |
| 32 | -alias xinference-env='source ${SCRIPT_DIR}/activate.sh' | |
| 33 | -alias xinference-activate='conda activate ${ENV_NAME}' | |
| 34 | -alias xinference-cd='cd ${SCRIPT_DIR}' | |
| 35 | -" | |
| 36 | - | |
| 37 | -# 检查是否已存在 | |
| 38 | -if grep -q "xinference-env" "$SHELL_CONFIG" 2>/dev/null; then | |
| 39 | - echo "⚠️ 别名已存在于 $SHELL_CONFIG" | |
| 40 | - read -p "是否重新添加? (y/N): " -n 1 -r | |
| 41 | - echo | |
| 42 | - if [[ ! $REPLY =~ ^[Yy]$ ]]; then | |
| 43 | - echo "跳过设置" | |
| 44 | - exit 0 | |
| 45 | - fi | |
| 46 | - # 删除旧的别名 | |
| 47 | - sed -i '/# Xinference 环境快捷命令/,/xinference-cd/d' "$SHELL_CONFIG" | |
| 48 | -fi | |
| 49 | - | |
| 50 | -# 添加别名 | |
| 51 | -echo "" >> "$SHELL_CONFIG" | |
| 52 | -echo "$ALIAS_CONTENT" >> "$SHELL_CONFIG" | |
| 53 | - | |
| 54 | -echo "✅ 别名已添加到 $SHELL_CONFIG" | |
| 55 | -echo "" | |
| 56 | -echo "=========================================" | |
| 57 | -echo " 可用快捷命令" | |
| 58 | -echo "=========================================" | |
| 59 | -echo "" | |
| 60 | -echo " 1. xinference-env - 激活环境并切换到目录" | |
| 61 | -echo " 2. xinference-activate - 仅激活环境" | |
| 62 | -echo " 3. xinference-cd - 切换到 Xinference 目录" | |
| 63 | -echo "" | |
| 64 | -echo "=========================================" | |
| 65 | -echo " 使配置生效" | |
| 66 | -echo "=========================================" | |
| 67 | -echo "" | |
| 68 | -echo "运行以下命令使配置生效:" | |
| 69 | -echo "" | |
| 70 | -echo " source $SHELL_CONFIG" | |
| 71 | -echo "" | |
| 72 | -echo "或者重新打开终端" | |
| 73 | -echo "" | |
| 74 | -echo "然后就可以直接使用 xinference-env 命令了!" | |
| 75 | -echo "" |
third-party/xinference/setup_env.sh deleted
| ... | ... | @@ -1,80 +0,0 @@ |
| 1 | -#!/bin/bash | |
| 2 | - | |
| 3 | -# Xinference Conda 环境创建脚本 | |
| 4 | -# 用于创建独立的 Python 环境来运行 Xinference 客户端 | |
| 5 | - | |
| 6 | -set -e | |
| 7 | - | |
| 8 | -ENV_NAME="xinference" | |
| 9 | -PYTHON_VERSION="3.10" | |
| 10 | - | |
| 11 | -echo "=========================================" | |
| 12 | -echo " 创建 Xinference Conda 环境" | |
| 13 | -echo "=========================================" | |
| 14 | -echo "" | |
| 15 | - | |
| 16 | -# 检查 conda 是否可用 | |
| 17 | -if ! command -v conda &> /dev/null; then | |
| 18 | - echo "❌ 错误: conda 未安装或未在 PATH 中" | |
| 19 | - echo " 请先安装 Miniconda 或 Anaconda" | |
| 20 | - exit 1 | |
| 21 | -fi | |
| 22 | - | |
| 23 | -# 检查环境是否已存在 | |
| 24 | -if conda env list | grep -q "^${ENV_NAME} "; then | |
| 25 | - echo "⚠️ 环境 '${ENV_NAME}' 已存在" | |
| 26 | - read -p "是否删除并重新创建? (y/N): " -n 1 -r | |
| 27 | - echo | |
| 28 | - if [[ $REPLY =~ ^[Yy]$ ]]; then | |
| 29 | - echo "🗑️ 删除旧环境..." | |
| 30 | - conda env remove -n $ENV_NAME -y | |
| 31 | - else | |
| 32 | - echo "✅ 使用现有环境" | |
| 33 | - echo " 激活命令: conda activate $ENV_NAME" | |
| 34 | - exit 0 | |
| 35 | - fi | |
| 36 | -fi | |
| 37 | - | |
| 38 | -echo "🔨 创建 conda 环境: ${ENV_NAME} (Python ${PYTHON_VERSION})" | |
| 39 | -conda create -n $ENV_NAME python=$PYTHON_VERSION -y | |
| 40 | - | |
| 41 | -echo "" | |
| 42 | -echo "📦 安装依赖包..." | |
| 43 | - | |
| 44 | -# 激活环境并安装依赖 | |
| 45 | -eval "$(conda shell.bash hook)" | |
| 46 | -conda activate $ENV_NAME | |
| 47 | - | |
| 48 | -# 基础依赖 | |
| 49 | -pip install -U pip | |
| 50 | - | |
| 51 | -# Xinference 客户端 | |
| 52 | -pip install xinference-client | |
| 53 | - | |
| 54 | -# 其他可能需要的依赖 | |
| 55 | -pip install numpy | |
| 56 | -pip install requests | |
| 57 | - | |
| 58 | -echo "" | |
| 59 | -echo "✅ 环境创建完成!" | |
| 60 | -echo "" | |
| 61 | -echo "=========================================" | |
| 62 | -echo " 环境信息" | |
| 63 | -echo "=========================================" | |
| 64 | -echo "环境名称: ${ENV_NAME}" | |
| 65 | -echo "Python 版本: ${PYTHON_VERSION}" | |
| 66 | -echo "" | |
| 67 | -echo "🚀 使用方法:" | |
| 68 | -echo "" | |
| 69 | -echo " 1. 激活环境:" | |
| 70 | -echo " conda activate ${ENV_NAME}" | |
| 71 | -echo "" | |
| 72 | -echo " 2. 运行演示:" | |
| 73 | -echo " python ecommerce_demo.py" | |
| 74 | -echo "" | |
| 75 | -echo " 3. 退出环境:" | |
| 76 | -echo " conda deactivate" | |
| 77 | -echo "" | |
| 78 | -echo "💡 提示: 将以下别名添加到 ~/.bashrc 或 ~/.zshrc 以快速切换:" | |
| 79 | -echo " alias xinference='conda activate ${ENV_NAME}'" | |
| 80 | -echo "" |
third-party/xinference/start.sh deleted
| ... | ... | @@ -1,164 +0,0 @@ |
| 1 | -#!/bin/bash | |
| 2 | - | |
| 3 | -# Xinference Docker 部署脚本 | |
| 4 | -# 用于启动 Xinference 服务 | |
| 5 | - | |
| 6 | -set -e | |
| 7 | - | |
| 8 | -# 检查是否需要 sudo | |
| 9 | -if sudo -n docker info &> /dev/null 2>&1; then | |
| 10 | - DOCKER="sudo docker" | |
| 11 | - # 检测 Docker Compose 的形式(V2: docker compose 或 V1: docker-compose) | |
| 12 | - if sudo docker compose version &> /dev/null 2>&1; then | |
| 13 | - DOCKER_COMPOSE="sudo docker compose" | |
| 14 | - echo "✅ 检测到 Docker Compose V2" | |
| 15 | - elif command -v docker-compose &> /dev/null; then | |
| 16 | - DOCKER_COMPOSE="sudo docker-compose" | |
| 17 | - echo "✅ 检测到 Docker Compose V1" | |
| 18 | - else | |
| 19 | - echo "❌ 错误: 未找到 Docker Compose" | |
| 20 | - exit 1 | |
| 21 | - fi | |
| 22 | -else | |
| 23 | - DOCKER="docker" | |
| 24 | - # 检测 Docker Compose 的形式 | |
| 25 | - if docker compose version &> /dev/null 2>&1; then | |
| 26 | - DOCKER_COMPOSE="docker compose" | |
| 27 | - echo "✅ 检测到 Docker Compose V2" | |
| 28 | - elif command -v docker-compose &> /dev/null; then | |
| 29 | - DOCKER_COMPOSE="docker-compose" | |
| 30 | - echo "✅ 检测到 Docker Compose V1" | |
| 31 | - else | |
| 32 | - echo "❌ 错误: 未找到 Docker Compose" | |
| 33 | - exit 1 | |
| 34 | - fi | |
| 35 | -fi | |
| 36 | - | |
| 37 | -echo "=========================================" | |
| 38 | -echo " Xinference Docker 部署脚本" | |
| 39 | -echo "=========================================" | |
| 40 | -echo "" | |
| 41 | - | |
| 42 | -# 检查 GPU | |
| 43 | -echo "🔍 检查 GPU 可用性..." | |
| 44 | -if command -v nvidia-smi &> /dev/null; then | |
| 45 | - echo "✅ 检测到 NVIDIA GPU:" | |
| 46 | - nvidia-smi --query-gpu=name,memory.total --format=csv,noheader | head -n 1 | |
| 47 | - | |
| 48 | - # 检查 NVIDIA Container Toolkit | |
| 49 | - if $DOCKER run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi &> /dev/null; then | |
| 50 | - GPU_AVAILABLE=true | |
| 51 | - echo "✅ NVIDIA Container Toolkit 已安装,GPU 可用" | |
| 52 | - else | |
| 53 | - GPU_AVAILABLE=false | |
| 54 | - echo "⚠️ 检测到 GPU 但 NVIDIA Container Toolkit 未安装" | |
| 55 | - echo "⚠️ 将使用 CPU 模式启动" | |
| 56 | - echo "💡 如需 GPU 加速,请安装 NVIDIA Container Toolkit(见下方说明)" | |
| 57 | - fi | |
| 58 | -else | |
| 59 | - echo "⚠️ 未检测到 NVIDIA GPU,将使用 CPU 模式" | |
| 60 | - GPU_AVAILABLE=false | |
| 61 | -fi | |
| 62 | - | |
| 63 | -# 创建模型目录 | |
| 64 | -echo "" | |
| 65 | -echo "📁 创建模型存储目录..." | |
| 66 | -mkdir -p models | |
| 67 | - | |
| 68 | -# 拉取镜像 | |
| 69 | -echo "" | |
| 70 | -echo "🐳 拉取 Xinference Docker 镜像..." | |
| 71 | -if [ "$GPU_AVAILABLE" = true ]; then | |
| 72 | - $DOCKER pull xprobe/xinference:latest | |
| 73 | -else | |
| 74 | - echo "⚠️ CPU 模式:如需 GPU 支持,请配置好 NVIDIA Docker 运行时" | |
| 75 | - $DOCKER pull xprobe/xinference:latest | |
| 76 | -fi | |
| 77 | - | |
| 78 | -# 停止并删除旧容器 | |
| 79 | -echo "" | |
| 80 | -echo "🛑 停止并删除旧容器..." | |
| 81 | -$DOCKER stop xinference 2>/dev/null || true | |
| 82 | -$DOCKER rm xinference 2>/dev/null || true | |
| 83 | -$DOCKER_COMPOSE down 2>/dev/null || true | |
| 84 | - | |
| 85 | -# 启动服务 | |
| 86 | -echo "" | |
| 87 | -echo "🚀 启动 Xinference 服务..." | |
| 88 | -if [ "$GPU_AVAILABLE" = true ]; then | |
| 89 | - echo "🔥 使用 GPU 模式启动..." | |
| 90 | - $DOCKER_COMPOSE up -d | |
| 91 | -else | |
| 92 | - # CPU 模式:直接使用 docker run | |
| 93 | - echo "💻 使用 CPU 模式启动..." | |
| 94 | - $DOCKER run -d \ | |
| 95 | - --name xinference \ | |
| 96 | - -p 9997:9997 -p 9998:9998 \ | |
| 97 | - -v "$(pwd)/models:/data" \ | |
| 98 | - -e XINFERENCE_HOME=/data \ | |
| 99 | - --restart unless-stopped \ | |
| 100 | - xprobe/xinference:latest \ | |
| 101 | - xinference-local -H 0.0.0.0 | |
| 102 | -fi | |
| 103 | - | |
| 104 | -# 等待服务启动 | |
| 105 | -echo "" | |
| 106 | -echo "⏳ 等待服务启动..." | |
| 107 | -for i in {1..30}; do | |
| 108 | - if curl -s http://localhost:9997/v1/models > /dev/null 2>&1; then | |
| 109 | - echo "✅ Xinference 服务启动成功!" | |
| 110 | - break | |
| 111 | - fi | |
| 112 | - echo " 等待中... ($i/30)" | |
| 113 | - sleep 2 | |
| 114 | -done | |
| 115 | - | |
| 116 | -# 检查服务状态 | |
| 117 | -echo "" | |
| 118 | -echo "📊 服务状态检查..." | |
| 119 | -if curl -s http://localhost:9997/v1/models > /dev/null 2>&1; then | |
| 120 | - echo "✅ 服务健康检查通过" | |
| 121 | - echo "" | |
| 122 | - echo "=========================================" | |
| 123 | - echo " 🎉 部署成功!" | |
| 124 | - echo "=========================================" | |
| 125 | - echo "" | |
| 126 | - echo "📍 服务地址:" | |
| 127 | - echo " - API: http://localhost:9997" | |
| 128 | - echo " - Dashboard: http://localhost:9998" | |
| 129 | - echo "" | |
| 130 | - echo "📝 下一步操作:" | |
| 131 | - echo " 1. 查看日志: $DOCKER logs -f xinference" | |
| 132 | - echo " 2. 部署模型: python deploy_models.py" | |
| 133 | - echo " 3. 测试搜索: python ecommerce_demo.py" | |
| 134 | - echo "" | |
| 135 | - | |
| 136 | - if [ "$GPU_AVAILABLE" = false ]; then | |
| 137 | - echo "💡 启用 GPU 加速(可选):" | |
| 138 | - echo " 如果你想使用 GPU 加速,请安装 NVIDIA Container Toolkit:" | |
| 139 | - echo "" | |
| 140 | - echo " # 1. 添加 NVIDIA 仓库" | |
| 141 | - echo " curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -" | |
| 142 | - echo " distribution=\$(. /etc/os-release;echo \$ID\$VERSION_ID)" | |
| 143 | - echo " curl -s -L https://nvidia.github.io/nvidia-docker/\$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list" | |
| 144 | - echo "" | |
| 145 | - echo " # 2. 安装 NVIDIA Container Toolkit" | |
| 146 | - echo " sudo apt-get update" | |
| 147 | - echo " sudo apt-get install -y nvidia-container-toolkit" | |
| 148 | - echo "" | |
| 149 | - echo " # 3. 重启 Docker" | |
| 150 | - echo " sudo systemctl restart docker" | |
| 151 | - echo "" | |
| 152 | - echo " # 4. 重新运行此脚本" | |
| 153 | - echo " bash start.sh" | |
| 154 | - echo "" | |
| 155 | - fi | |
| 156 | - | |
| 157 | - echo "📚 查看所有已部署模型:" | |
| 158 | - echo " curl http://localhost:9997/v1/models" | |
| 159 | - echo "" | |
| 160 | -else | |
| 161 | - echo "❌ 服务启动失败,请查看日志:" | |
| 162 | - echo " $DOCKER logs xinference" | |
| 163 | - exit 1 | |
| 164 | -fi |
third-party/xinference/start_gpu_manual.sh deleted
| ... | ... | @@ -1,92 +0,0 @@ |
| 1 | -#!/bin/bash | |
| 2 | - | |
| 3 | -# Xinference GPU 模式启动脚本(手动设备映射) | |
| 4 | -# 不依赖 nvidia-container-toolkit | |
| 5 | - | |
| 6 | -set -e | |
| 7 | - | |
| 8 | -echo "=========================================" | |
| 9 | -echo " Xinference GPU 模式启动(手动配置)" | |
| 10 | -echo "=========================================" | |
| 11 | -echo "" | |
| 12 | - | |
| 13 | -# 检查 GPU 设备 | |
| 14 | -if [ ! -e /dev/nvidia0 ]; then | |
| 15 | - echo "❌ 错误: 未找到 NVIDIA 设备文件" | |
| 16 | - echo " 请确保已安装 NVIDIA 驱动" | |
| 17 | - exit 1 | |
| 18 | -fi | |
| 19 | - | |
| 20 | -echo "✅ 检测到 NVIDIA 设备文件:" | |
| 21 | -ls -la /dev/nvidia* | head -5 | |
| 22 | -echo "" | |
| 23 | - | |
| 24 | -# 停止旧容器 | |
| 25 | -echo "🛑 停止旧容器..." | |
| 26 | -sudo docker stop xinference 2>/dev/null || true | |
| 27 | -sudo docker rm xinference 2>/dev/null || true | |
| 28 | -echo "" | |
| 29 | - | |
| 30 | -# 创建模型目录 | |
| 31 | -mkdir -p models | |
| 32 | - | |
| 33 | -# 拉取镜像(如果需要) | |
| 34 | -echo "🐳 检查镜像..." | |
| 35 | -if ! sudo docker image inspect xprobe/xinference:latest &> /dev/null; then | |
| 36 | - echo "拉取 Xinference 镜像..." | |
| 37 | - sudo docker pull xprobe/xinference:latest | |
| 38 | -fi | |
| 39 | -echo "" | |
| 40 | - | |
| 41 | -# 启动容器(手动映射 GPU 设备和库) | |
| 42 | -echo "🚀 启动 Xinference 容器(GPU 模式)..." | |
| 43 | -echo "映射设备: /dev/nvidia0, /dev/nvidiactl, /dev/nvidia-uvm" | |
| 44 | -echo "映射库: /usr/lib/libcuda.so*" | |
| 45 | -echo "" | |
| 46 | - | |
| 47 | -sudo docker run -d \ | |
| 48 | - --name xinference \ | |
| 49 | - --restart unless-stopped \ | |
| 50 | - -p 9997:9997 \ | |
| 51 | - -p 9998:9998 \ | |
| 52 | - -v "$(pwd)/models:/data" \ | |
| 53 | - -v /usr/lib/libcuda.so.1:/usr/lib/x86_64-linux-gnu/libcuda.so.1 \ | |
| 54 | - -v /usr/lib/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 \ | |
| 55 | - -e XINFERENCE_HOME=/data \ | |
| 56 | - -e LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64 \ | |
| 57 | - --device /dev/nvidia0 \ | |
| 58 | - --device /dev/nvidiactl \ | |
| 59 | - --device /dev/nvidia-uvm \ | |
| 60 | - --device /dev/nvidia-uvm-tools \ | |
| 61 | - --security-opt "label=disable" \ | |
| 62 | - --privileged \ | |
| 63 | - xprobe/xinference:latest \ | |
| 64 | - xinference-local -H 0.0.0.0 | |
| 65 | - | |
| 66 | -echo "" | |
| 67 | -echo "⏳ 等待服务启动..." | |
| 68 | -sleep 3 | |
| 69 | - | |
| 70 | -# 检查容器状态 | |
| 71 | -if sudo docker ps | grep -q xinference; then | |
| 72 | - echo "✅ 容器启动成功!" | |
| 73 | - echo "" | |
| 74 | - echo "=========================================" | |
| 75 | - echo " 🎉 启动成功!" | |
| 76 | - echo "=========================================" | |
| 77 | - echo "" | |
| 78 | - echo "📍 服务地址:" | |
| 79 | - echo " - API: http://localhost:9997" | |
| 80 | - echo " - Dashboard: http://localhost:9998" | |
| 81 | - echo "" | |
| 82 | - echo "📝 查看日志:" | |
| 83 | - echo " sudo docker logs -f xinference" | |
| 84 | - echo "" | |
| 85 | - echo "📝 测试 GPU:" | |
| 86 | - echo " sudo docker exec xinference nvidia-smi" | |
| 87 | - echo "" | |
| 88 | -else | |
| 89 | - echo "❌ 容器启动失败,查看日志:" | |
| 90 | - sudo docker logs xinference | |
| 91 | - exit 1 | |
| 92 | -fi |
| 1 | -#!/bin/bash | |
| 2 | 1 | |
| 3 | -# Xinference 停止脚本 | |
| 4 | - | |
| 5 | -echo "🛑 停止 Xinference 服务..." | |
| 6 | - | |
| 7 | -# 停止 Docker Compose | |
| 8 | -docker-compose down 2>/dev/null || true | |
| 9 | - | |
| 10 | -# 停止并删除容器 | |
| 11 | -docker stop xinference 2>/dev/null || true | |
| 12 | -docker rm xinference 2>/dev/null || true | |
| 13 | - | |
| 14 | -echo "✅ Xinference 服务已停止" | |
| 2 | +xinference terminate --model-uid Qwen3-Reranker-0.6B --model-uid bge-m3 --model-uid Qwen3-Embedding-0.6B | ... | ... |
third-party/xinference/xinference使用文档.md deleted
| ... | ... | @@ -1,255 +0,0 @@ |
| 1 | -根据搜索结果分析,**Xinference从v1.7.0版本开始正式支持Qwen3-Embedding和Qwen3-Reranker模型**。但需要注意早期版本存在一些部署问题,建议升级到最新稳定版。 | |
| 2 | - | |
| 3 | ---- | |
| 4 | - | |
| 5 | -## **一、支持情况确认** | |
| 6 | - | |
| 7 | -### **✅ 已支持(v1.7.0+)** | |
| 8 | -- **Qwen3-Embedding**:支持0.6B等规格,最大上下文8192 tokens,输出1024维向量 | |
| 9 | -- **Qwen3-Reranker**:支持0.6B等规格,Cross-Encoder架构用于精排 | |
| 10 | - | |
| 11 | -### **⚠️ 已知问题与解决方案** | |
| 12 | -1. **Batch Size限制**:早期版本Qwen3-Reranker处理batch>1时报错`no padding token defined` | |
| 13 | - - **解决**:升级到v1.7.0.post1或更高版本 | |
| 14 | -2. **GPU显存异常**:vLLM引擎加载时显存占用过高 | |
| 15 | - - **解决**:升级到v1.7.1+,或使用`--cpu-offload-gb`参数 | |
| 16 | - | |
| 17 | ---- | |
| 18 | - | |
| 19 | -## **二、部署方案** | |
| 20 | - | |
| 21 | -### **方案1:Docker部署(推荐)** | |
| 22 | - | |
| 23 | -```bash | |
| 24 | -# 1. 拉取最新镜像(v1.7.1+) | |
| 25 | -docker pull xprobe/xinference:latest | |
| 26 | - | |
| 27 | -# 2. 启动Xinference服务 | |
| 28 | -docker run -d --name xinference \ | |
| 29 | - -p 9997:9997 -p 9998:9998 \ | |
| 30 | - --gpus all \ | |
| 31 | - -v /data/models:/data \ | |
| 32 | - -e XINFERENCE_HOME=/data \ | |
| 33 | - xprobe/xinference:latest xinference-local -H 0.0.0.0 | |
| 34 | - | |
| 35 | -# 3. 查看服务状态 | |
| 36 | -curl http://localhost:9997/v1/models | |
| 37 | -``` | |
| 38 | - | |
| 39 | -### **方案2:pip部署** | |
| 40 | - | |
| 41 | -```bash | |
| 42 | -# 1. 安装最新版本(务必v1.7.1+) | |
| 43 | -pip install "xinference>=1.7.1" | |
| 44 | - | |
| 45 | -# 2. 启动本地服务 | |
| 46 | -xinference-local --host 0.0.0.0 --port 9997 | |
| 47 | - | |
| 48 | -# 3. 或启动集群模式 | |
| 49 | -xinference-supervisor -H ${SUPERVISOR_HOST} | |
| 50 | -xinference-worker -e "http://${SUPERVISOR_HOST}:9997" | |
| 51 | -``` | |
| 52 | - | |
| 53 | ---- | |
| 54 | - | |
| 55 | -## **三、模型部署与使用** | |
| 56 | - | |
| 57 | -### **步骤1:部署Qwen3-Embedding模型** | |
| 58 | - | |
| 59 | -```python | |
| 60 | -from xinference.client import Client | |
| 61 | - | |
| 62 | -# 连接Xinference服务 | |
| 63 | -client = Client("http://localhost:9997") | |
| 64 | - | |
| 65 | -# 启动Qwen3-Embedding模型 | |
| 66 | -model_uid = client.launch_model( | |
| 67 | - model_name="qwen3-embedding", | |
| 68 | - model_size_in_billions=0, # 0表示自动选择可用版本 | |
| 69 | - model_type="embedding", | |
| 70 | - engine="vllm", # 推荐vLLM引擎 | |
| 71 | - gpu_idx="0", # 指定GPU | |
| 72 | -) | |
| 73 | -print(f"Embedding模型UID: {model_uid}") | |
| 74 | -``` | |
| 75 | - | |
| 76 | -### **步骤2:部署Qwen3-Reranker模型** | |
| 77 | - | |
| 78 | -```python | |
| 79 | -# 启动Qwen3-Reranker模型 | |
| 80 | -reranker_uid = client.launch_model( | |
| 81 | - model_name="qwen3-reranker", | |
| 82 | - model_size_in_billions=0, | |
| 83 | - model_type="rerank", # 明确指定为reranker | |
| 84 | - engine="vllm", | |
| 85 | - gpu_idx="0", # 可与embedding同卡,注意显存 | |
| 86 | -) | |
| 87 | -print(f"Reranker模型UID: {reranker_uid}") | |
| 88 | -``` | |
| 89 | - | |
| 90 | -### **步骤3:电商搜索实战使用** | |
| 91 | - | |
| 92 | -```python | |
| 93 | -# 获取模型实例 | |
| 94 | -embedding_model = client.get_model(model_uid) | |
| 95 | -reranker_model = client.get_model(reranker_uid) | |
| 96 | - | |
| 97 | -# 示例:电商搜索query和商品标题 | |
| 98 | -query = "适合老人用的智能手机大屏幕长续航" | |
| 99 | -candidate_products = [ | |
| 100 | - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", | |
| 101 | - "iPhone 15 Pro Max 专业摄影旗舰", | |
| 102 | - "华为畅享60 6000mAh超长续航 护眼大屏", | |
| 103 | - "OPPO A1 5000mAh电池 简易模式适合长辈", | |
| 104 | - "小米手环8 智能运动监测", | |
| 105 | -] | |
| 106 | - | |
| 107 | -# 阶段1:密集检索召回Top-200 | |
| 108 | -query_embedding = embedding_model.create_embedding(query)["data"][0]["embedding"] | |
| 109 | -# 使用Faiss或向量数据库批量检索(伪代码) | |
| 110 | -# candidate_embeddings = [embedding_model.create_embedding(p)["data"][0]["embedding"] for p in candidate_products] | |
| 111 | -# top200_ids = faiss_search(query_embedding, candidate_embeddings, k=200) | |
| 112 | - | |
| 113 | -# 阶段2:精排Top-200为Top-50(实际场景) | |
| 114 | -# 为演示简化为精排全部5个 | |
| 115 | -pairs = [(query, product) for product in candidate_products] | |
| 116 | -rerank_scores = reranker_model.rerank(pairs) | |
| 117 | - | |
| 118 | -# 按重排序分数排序 | |
| 119 | -sorted_results = sorted( | |
| 120 | - zip(candidate_products, rerank_scores), | |
| 121 | - key=lambda x: x[1]["relevance_score"], | |
| 122 | - reverse=True | |
| 123 | -) | |
| 124 | - | |
| 125 | -# 输出结果 | |
| 126 | -for product, score in sorted_results: | |
| 127 | - print(f"相似度: {score['relevance_score']:.4f} | 商品: {product}") | |
| 128 | - | |
| 129 | -# 预期输出: | |
| 130 | -# 相似度: 0.9234 | 商品: 华为畅享60 6000mAh超长续航 护眼大屏 | |
| 131 | -# 相似度: 0.8912 | 商品: OPPO A1 5000mAh电池 简易模式适合长辈 | |
| 132 | -# 相似度: 0.8567 | 商品: 红米Note12 5000mAh大电量 6.67英寸大屏 老人模式 | |
| 133 | -# 相似度: 0.2345 | 商品: iPhone 15 Pro Max 专业摄影旗舰 | |
| 134 | -# 相似度: 0.1234 | 商品: 小米手环8 智能运动监测 | |
| 135 | -``` | |
| 136 | - | |
| 137 | ---- | |
| 138 | - | |
| 139 | -## **四、REST API调用方式** | |
| 140 | - | |
| 141 | -### **Embedding API(兼容OpenAI格式)** | |
| 142 | - | |
| 143 | -```bash | |
| 144 | -# 实时生成query向量 | |
| 145 | -curl -X POST http://localhost:9997/v1/embeddings \ | |
| 146 | - -H "Content-Type: application/json" \ | |
| 147 | - -d '{ | |
| 148 | - "model": "qwen3-embedding", | |
| 149 | - "input": ["适合老人用的智能手机大屏幕长续航"] | |
| 150 | - }' | |
| 151 | - | |
| 152 | -# 批量生成商品向量(离线任务) | |
| 153 | -curl -X POST http://localhost:9997/v1/embeddings \ | |
| 154 | - -H "Content-Type: application/json" \ | |
| 155 | - -d '{ | |
| 156 | - "model": "qwen3-embedding", | |
| 157 | - "input": ["商品标题1", "商品标题2", ..., "商品标题10000"] | |
| 158 | - }' | |
| 159 | -``` | |
| 160 | - | |
| 161 | -### **Reranker API** | |
| 162 | - | |
| 163 | -```bash | |
| 164 | -# 精排候选商品 | |
| 165 | -curl -X POST http://localhost:9997/v1/rerank \ | |
| 166 | - -H "Content-Type: application/json" \ | |
| 167 | - -d '{ | |
| 168 | - "model": "qwen3-reranker", | |
| 169 | - "query": "适合老人用的智能手机大屏幕长续航", | |
| 170 | - "documents": [ | |
| 171 | - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", | |
| 172 | - "华为畅享60 6000mAh超长续航 护眼大屏" | |
| 173 | - ], | |
| 174 | - "top_n": 10 | |
| 175 | - }' | |
| 176 | -``` | |
| 177 | - | |
| 178 | ---- | |
| 179 | - | |
| 180 | -## **五、关键参数与优化** | |
| 181 | - | |
| 182 | -### **1. 显存配置建议** | |
| 183 | -```python | |
| 184 | -# 对于显存有限的场景,使用CPU卸载 | |
| 185 | -reranker_uid = client.launch_model( | |
| 186 | - model_name="qwen3-reranker", | |
| 187 | - model_type="rerank", | |
| 188 | - engine="vllm", | |
| 189 | - cpu_offload_gb=16, # 将部分计算卸载到CPU | |
| 190 | - gpu_memory_utilization=0.6, # 限制显存使用 | |
| 191 | -) | |
| 192 | -``` | |
| 193 | - | |
| 194 | -### **2. 性能调优** | |
| 195 | -```python | |
| 196 | -# Embedding批量处理优化 | |
| 197 | -embedding_model.create_embedding( | |
| 198 | - input_texts, | |
| 199 | - batch_size=100, # 增大批量提升吞吐 | |
| 200 | - normalize=True # 归一化向量用于余弦相似度 | |
| 201 | -) | |
| 202 | - | |
| 203 | -# Reranker并发控制(避免OOM) | |
| 204 | -reranker_model.rerank( | |
| 205 | - pairs, | |
| 206 | - batch_size=8, # 根据显存调整,A10G建议8-16 | |
| 207 | - max_length=512 # 限制输入长度 | |
| 208 | -) | |
| 209 | -``` | |
| 210 | - | |
| 211 | -### **3. 电商搜索最佳实践** | |
| 212 | -- **离线批量**:每天凌晨全量重新生成2亿商品embedding,存入向量数据库(Milvus/Pinecone) | |
| 213 | -- **在线实时**:用户query实时embedding,召回Top-1000 | |
| 214 | -- **精排阶段**:对Top-1000使用Qwen3-Reranker打分,取Top-50返回 | |
| 215 | -- **缓存策略**:TOP 10000热搜query的embedding和rerank结果缓存Redis,QPS提升10倍 | |
| 216 | -- **混合检索**:结合BM25关键词召回,提升头部Query准确率 | |
| 217 | - | |
| 218 | ---- | |
| 219 | - | |
| 220 | -## **六、版本兼容性矩阵** | |
| 221 | - | |
| 222 | -| Xinference版本 | Qwen3-Embedding | Qwen3-Reranker | 推荐度 | | |
| 223 | -|----------------|-----------------|----------------|--------| | |
| 224 | -| < v1.7.0 | ❌ 不支持 | ❌ 不支持 | 必须升级| | |
| 225 | -| v1.7.0 | ✅ 支持 | ⚠️ 有batch bug | 慎用 | | |
| 226 | -| v1.7.0.post1 | ✅ 支持 | ✅ 基本支持 | 可用 | | |
| 227 | -| **≥ v1.7.1** | **✅ 完美支持** | **✅ 完美支持** | **强烈推荐** | | |
| 228 | - | |
| 229 | -**建议**:生产环境务必使用 **v1.7.1或更高版本** | |
| 230 | - | |
| 231 | ---- | |
| 232 | - | |
| 233 | -## **七、监控与运维** | |
| 234 | - | |
| 235 | -```bash | |
| 236 | -# 查看模型运行状态 | |
| 237 | -curl http://localhost:9997/v1/models | |
| 238 | - | |
| 239 | -# 监控GPU显存(推荐部署Prometheus+Grafana) | |
| 240 | -nvidia-smi -l 1 | |
| 241 | - | |
| 242 | -# 日志排查 | |
| 243 | -docker logs -f xinference --tail 100 | |
| 244 | -``` | |
| 245 | - | |
| 246 | ---- | |
| 247 | - | |
| 248 | -## **总结** | |
| 249 | - | |
| 250 | -Xinference + Qwen3-Embedding + Qwen3-Reranker是**完全可行且已生产可用**的组合,特别适合电商搜索场景。只需注意: | |
| 251 | -1. **版本必须≥1.7.1** | |
| 252 | -2. **Reranker建议单独部署以避免显存争抢** | |
| 253 | -3. **两阶段检索(海选+精排)是最佳实践** | |
| 254 | - | |
| 255 | -如需具体压测数据或故障排查,可进一步咨询! | |
| 256 | 0 | \ No newline at end of file |
third-party/xinference/测试结果-perfermance_test_http.txt
0 → 100644
| ... | ... | @@ -0,0 +1,167 @@ |
| 1 | +$ p perfermance_test_http.py | |
| 2 | +API连接正常,开始性能测试... | |
| 3 | +============================================================ | |
| 4 | +Embedding API 性能测试 (HTTP) | |
| 5 | +============================================================ | |
| 6 | + | |
| 7 | +选择测试模式: | |
| 8 | +1. 顺序测试 (Sequential) | |
| 9 | +2. 并发测试 (Concurrent) | |
| 10 | +3. 两种模式都测试 | |
| 11 | +请输入选择 (1/2/3, 默认1): 3 | |
| 12 | + | |
| 13 | +============================================================ | |
| 14 | +测试模型: bge-m3 | |
| 15 | +============================================================ | |
| 16 | + | |
| 17 | +开始并发测试模型: bge-m3 | |
| 18 | +测试次数: 1000, 并发数: 10 | |
| 19 | + 已完成 100/1000 次请求... | |
| 20 | + 已完成 200/1000 次请求... | |
| 21 | + 已完成 300/1000 次请求... | |
| 22 | + 已完成 400/1000 次请求... | |
| 23 | + 已完成 500/1000 次请求... | |
| 24 | + 已完成 600/1000 次请求... | |
| 25 | + 已完成 700/1000 次请求... | |
| 26 | + 已完成 800/1000 次请求... | |
| 27 | + 已完成 900/1000 次请求... | |
| 28 | + 已完成 1000/1000 次请求... | |
| 29 | + | |
| 30 | +============================================================ | |
| 31 | +性能测试结果 - bge-m3 | |
| 32 | +============================================================ | |
| 33 | +总请求数: 1000 | |
| 34 | +成功请求: 1000 | |
| 35 | +失败请求: 0 | |
| 36 | +成功率: 100.00% | |
| 37 | +总耗时: 145.1439秒 | |
| 38 | +平均延迟: 0.1451秒 | |
| 39 | +最小延迟: 0.0311秒 | |
| 40 | +最大延迟: 0.5770秒 | |
| 41 | +P50延迟: 0.0599秒 | |
| 42 | +P95延迟: 0.5151秒 | |
| 43 | +P99延迟: 0.5704秒 | |
| 44 | +标准差: 0.1789秒 | |
| 45 | +QPS: 6.89 请求/秒 | |
| 46 | +============================================================ | |
| 47 | + | |
| 48 | +开始顺序测试模型: bge-m3 | |
| 49 | +测试次数: 1000 | |
| 50 | + 已完成 100/1000 次请求... | |
| 51 | + 已完成 200/1000 次请求... | |
| 52 | + 已完成 300/1000 次请求... | |
| 53 | + 已完成 400/1000 次请求... | |
| 54 | + 已完成 500/1000 次请求... | |
| 55 | + 已完成 600/1000 次请求... | |
| 56 | + 已完成 700/1000 次请求... | |
| 57 | + 已完成 800/1000 次请求... | |
| 58 | + 已完成 900/1000 次请求... | |
| 59 | + | |
| 60 | +============================================================ | |
| 61 | +性能测试结果 - bge-m3 | |
| 62 | +============================================================ | |
| 63 | +总请求数: 1000 | |
| 64 | +成功请求: 1000 | |
| 65 | +失败请求: 0 | |
| 66 | +成功率: 100.00% | |
| 67 | +总耗时: 74.5284秒 | |
| 68 | +平均延迟: 0.0745秒 | |
| 69 | +最小延迟: 0.0271秒 | |
| 70 | +最大延迟: 0.5767秒 | |
| 71 | +P50延迟: 0.0286秒 | |
| 72 | +P95延迟: 0.4797秒 | |
| 73 | +P99延迟: 0.5037秒 | |
| 74 | +标准差: 0.1364秒 | |
| 75 | +QPS: 13.42 请求/秒 | |
| 76 | +============================================================ | |
| 77 | + | |
| 78 | +============================================================ | |
| 79 | +测试模型: Qwen3-Embedding-0.6B | |
| 80 | +============================================================ | |
| 81 | + | |
| 82 | +开始并发测试模型: Qwen3-Embedding-0.6B | |
| 83 | +测试次数: 1000, 并发数: 10 | |
| 84 | + 已完成 100/1000 次请求... | |
| 85 | + 已完成 200/1000 次请求... | |
| 86 | + 已完成 300/1000 次请求... | |
| 87 | + 已完成 400/1000 次请求... | |
| 88 | + 已完成 500/1000 次请求... | |
| 89 | + 已完成 600/1000 次请求... | |
| 90 | + 已完成 700/1000 次请求... | |
| 91 | + 已完成 800/1000 次请求... | |
| 92 | + 已完成 900/1000 次请求... | |
| 93 | + 已完成 1000/1000 次请求... | |
| 94 | + | |
| 95 | +============================================================ | |
| 96 | +性能测试结果 - Qwen3-Embedding-0.6B | |
| 97 | +============================================================ | |
| 98 | +总请求数: 1000 | |
| 99 | +成功请求: 1000 | |
| 100 | +失败请求: 0 | |
| 101 | +成功率: 100.00% | |
| 102 | +总耗时: 195.7997秒 | |
| 103 | +平均延迟: 0.1958秒 | |
| 104 | +最小延迟: 0.0564秒 | |
| 105 | +最大延迟: 0.6201秒 | |
| 106 | +P50延迟: 0.1050秒 | |
| 107 | +P95延迟: 0.5674秒 | |
| 108 | +P99延迟: 0.5994秒 | |
| 109 | +标准差: 0.1829秒 | |
| 110 | +QPS: 5.11 请求/秒 | |
| 111 | +============================================================ | |
| 112 | + | |
| 113 | +开始顺序测试模型: Qwen3-Embedding-0.6B | |
| 114 | +测试次数: 1000 | |
| 115 | + 已完成 100/1000 次请求... | |
| 116 | + 已完成 200/1000 次请求... | |
| 117 | + 已完成 300/1000 次请求... | |
| 118 | + 已完成 400/1000 次请求... | |
| 119 | + 已完成 500/1000 次请求... | |
| 120 | + 已完成 600/1000 次请求... | |
| 121 | + 已完成 700/1000 次请求... | |
| 122 | + 已完成 800/1000 次请求... | |
| 123 | + 已完成 900/1000 次请求... | |
| 124 | + | |
| 125 | +============================================================ | |
| 126 | +性能测试结果 - Qwen3-Embedding-0.6B | |
| 127 | +============================================================ | |
| 128 | +总请求数: 1000 | |
| 129 | +成功请求: 1000 | |
| 130 | +失败请求: 0 | |
| 131 | +成功率: 100.00% | |
| 132 | +总耗时: 100.2533秒 | |
| 133 | +平均延迟: 0.1003秒 | |
| 134 | +最小延迟: 0.0513秒 | |
| 135 | +最大延迟: 0.6249秒 | |
| 136 | +P50延迟: 0.0539秒 | |
| 137 | +P95延迟: 0.4993秒 | |
| 138 | +P99延迟: 0.5180秒 | |
| 139 | +标准差: 0.1354秒 | |
| 140 | +QPS: 9.97 请求/秒 | |
| 141 | +============================================================ | |
| 142 | + | |
| 143 | +结果已保存到: performance_results.json | |
| 144 | + | |
| 145 | +============================================================ | |
| 146 | +性能测试汇总对比 | |
| 147 | +============================================================ | |
| 148 | + | |
| 149 | +模型: bge-m3 (concurrent) | |
| 150 | + QPS: 6.89 | |
| 151 | + 平均延迟: 0.1451秒 | |
| 152 | + 成功率: 100.00% | |
| 153 | + | |
| 154 | +模型: bge-m3 (sequential) | |
| 155 | + QPS: 13.42 | |
| 156 | + 平均延迟: 0.0745秒 | |
| 157 | + 成功率: 100.00% | |
| 158 | + | |
| 159 | +模型: Qwen3-Embedding-0.6B (concurrent) | |
| 160 | + QPS: 5.11 | |
| 161 | + 平均延迟: 0.1958秒 | |
| 162 | + 成功率: 100.00% | |
| 163 | + | |
| 164 | +模型: Qwen3-Embedding-0.6B (sequential) | |
| 165 | + QPS: 9.97 | |
| 166 | + 平均延迟: 0.1003秒 | |
| 167 | + 成功率: 100.00% | ... | ... |