From 775db2b03caf3e78999e91e24b497e8352a6cf1d Mon Sep 17 00:00:00 2001 From: tangwang Date: Fri, 26 Dec 2025 23:29:24 +0800 Subject: [PATCH] xinfer --- third-party/xinference/ENV_SETUP.md | 162 ------------------------------------------------------------------------------------------------------------------------------------------------------------------ third-party/xinference/QUICKSTART.md | 137 ----------------------------------------------------------------------------------------------------------------------------------------- third-party/xinference/README.md | 715 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- third-party/xinference/STATUS.md | 158 -------------------------------------------------------------------------------------------------------------------------------------------------------------- third-party/xinference/activate.sh | 22 ++-------------------- third-party/xinference/api_examples.sh | 356 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- third-party/xinference/deploy_models.py | 
192 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ third-party/xinference/docker-compose.yml | 28 ---------------------------- third-party/xinference/ecommerce_demo.py | 318 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ third-party/xinference/install_nvidia_container_toolkit.sh | 137 ----------------------------------------------------------------------------------------------------------------------------------------- third-party/xinference/perfermance_test.py | 249 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ third-party/xinference/perfermance_test_http.py | 265 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ third-party/xinference/perfermance_test_single.py | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ third-party/xinference/requirements.txt | 3 --- third-party/xinference/setup_alias.sh | 75 --------------------------------------------------------------------------- third-party/xinference/setup_env.sh | 80 -------------------------------------------------------------------------------- third-party/xinference/start.sh | 164 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------- third-party/xinference/start_gpu_manual.sh | 92 -------------------------------------------------------------------------------------------- third-party/xinference/stop.sh | 14 +------------- third-party/xinference/test.sh | 10 ++++++++++ third-party/xinference/xinference使用文档.md | 255 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- third-party/xinference/测试结果-perfermance_test.txt | 0 third-party/xinference/测试结果-perfermance_test_http.txt | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 23 files changed, 802 insertions(+), 2905 deletions(-) delete mode 100644 third-party/xinference/ENV_SETUP.md delete mode 100644 third-party/xinference/QUICKSTART.md delete mode 100644 third-party/xinference/README.md delete mode 100644 third-party/xinference/STATUS.md delete mode 100755 third-party/xinference/api_examples.sh delete mode 100644 third-party/xinference/deploy_models.py delete mode 100644 third-party/xinference/docker-compose.yml delete mode 100644 third-party/xinference/ecommerce_demo.py delete mode 100755 third-party/xinference/install_nvidia_container_toolkit.sh create mode 100644 third-party/xinference/perfermance_test.py create mode 100644 third-party/xinference/perfermance_test_http.py create mode 100644 third-party/xinference/perfermance_test_single.py delete mode 100644 third-party/xinference/requirements.txt delete mode 100755 third-party/xinference/setup_alias.sh delete mode 100755 third-party/xinference/setup_env.sh delete mode 100755 third-party/xinference/start.sh 
delete mode 100755 third-party/xinference/start_gpu_manual.sh mode change 100755 => 100644 third-party/xinference/stop.sh create mode 100644 third-party/xinference/test.sh delete mode 100644 third-party/xinference/xinference使用文档.md create mode 100644 third-party/xinference/测试结果-perfermance_test.txt create mode 100644 third-party/xinference/测试结果-perfermance_test_http.txt diff --git a/third-party/xinference/ENV_SETUP.md b/third-party/xinference/ENV_SETUP.md deleted file mode 100644 index 8214c61..0000000 --- a/third-party/xinference/ENV_SETUP.md +++ /dev/null @@ -1,162 +0,0 @@ -# Xinference 环境配置指南 - -独立的 Conda 环境用于运行 Xinference 客户端和演示脚本。 - -## 📋 快速开始 - -### 方法1:一键安装和配置(推荐) - -```bash -cd /data/tw/SearchEngine/third-party/xinference - -# 1. 创建环境 -bash setup_env.sh - -# 2. 设置快捷别名 -bash setup_alias.sh - -# 3. 使配置生效 -source ~/.bashrc # 或 source ~/.zshrc - -# 4. 激活环境 -xinference-env -``` - -### 方法2:手动配置 - -```bash -# 1. 创建环境 -conda create -n xinference python=3.10 -y - -# 2. 激活环境 -conda activate xinference - -# 3. 安装依赖 -pip install xinference-client numpy requests - -# 4. 
运行演示 -python ecommerce_demo.py -``` - -## 🚀 快捷命令 - -配置别名后,可以使用以下快捷命令: - -| 命令 | 说明 | -|------|------| -| `xinference-env` | 激活环境并切换到 Xinference 目录 | -| `xinference-activate` | 仅激活 Xinference 环境 | -| `xinference-cd` | 切换到 Xinference 目录 | - -## 📦 已安装的包 - -- `xinference-client` - Xinference 客户端库 -- `numpy` - 数值计算库 -- `requests` - HTTP 请求库 - -## 💡 使用示例 - -### 激活环境后运行演示 - -```bash -# 方式1: 使用快捷命令 -xinference-env -python ecommerce_demo.py - -# 方式2: 手动激活 -conda activate xinference -python ecommerce_demo.py - -# 方式3: 使用激活脚本 -source activate.sh -python ecommerce_demo.py -``` - -### 部署模型 - -```bash -# 确保环境已激活 -xinference-env - -# 运行部署脚本 -python deploy_models.py -``` - -### 运行简单测试 - -```bash -xinference-env -python ecommerce_demo.py --simple -``` - -## 🛠️ 脚本说明 - -| 脚本 | 说明 | -|------|------| -| `setup_env.sh` | 创建 Xinference Conda 环境 | -| `setup_alias.sh` | 设置 Shell 快捷别名 | -| `activate.sh` | 快速激活环境的脚本 | -| `start.sh` | 启动 Xinference Docker 服务 | - -## 🔧 环境管理 - -### 更新环境 - -```bash -conda activate xinference -pip install --upgrade xinference-client numpy -``` - -### 删除环境 - -```bash -conda env remove -n xinference -y -``` - -### 查看已安装的包 - -```bash -conda activate xinference -pip list -``` - -## 📝 注意事项 - -1. **Docker 服务**:Xinference 服务本身运行在 Docker 中,不需要 Python 环境 -2. **客户端**:Python 环境仅用于运行客户端脚本(如 `ecommerce_demo.py`) -3. 
**独立环境**:使用独立环境避免与其他项目的依赖冲突 - -## 🐛 故障排除 - -### 问题:找不到 xinference-client - -```bash -# 重新安装 -conda activate xinference -pip install xinference-client --force-reinstall -``` - -### 问题:环境激活失败 - -```bash -# 初始化 conda -conda init bash -source ~/.bashrc - -# 然后再激活 -conda activate xinference -``` - -### 问题:快捷命令不生效 - -```bash -# 手动添加别名到 ~/.bashrc -echo 'alias xinference-env="source /data/tw/SearchEngine/third-party/xinference/activate.sh"' >> ~/.bashrc -source ~/.bashrc -``` - -## 📚 相关文档 - -- [Xinference 官方文档](https://inference.readthedocs.io/) -- [电商搜索演示](ecommerce_demo.py) -- [模型部署脚本](deploy_models.py) diff --git a/third-party/xinference/QUICKSTART.md b/third-party/xinference/QUICKSTART.md deleted file mode 100644 index 5b6e946..0000000 --- a/third-party/xinference/QUICKSTART.md +++ /dev/null @@ -1,137 +0,0 @@ -# Xinference 快速使用指南 - -## ✅ 环境已配置完成! - -您的 **xinference** conda 环境已创建并配置好了所有依赖。 - -## 🚀 快速开始 - -### 方式1:使用快捷命令(推荐) - -在新的终端窗口中: - -```bash -# 首次使用需要重新加载配置 -source ~/.bashrc - -# 然后就可以直接使用快捷命令 -xinference-env - -# 运行演示 -python ecommerce_demo.py -``` - -### 方式2:手动激活 - -```bash -# 激活环境 -conda activate xinference - -# 运行演示 -python ecommerce_demo.py -``` - -### 方式3:使用激活脚本 - -```bash -# 切换到目录 -cd /data/tw/SearchEngine/third-party/xinference - -# 使用激活脚本 -source activate.sh - -# 运行演示 -python ecommerce_demo.py -``` - -## 📝 可用命令 - -| 快捷命令 | 说明 | -|---------|------| -| `xinference-env` | 激活环境并切换到 Xinference 目录 | -| `xinference-activate` | 仅激活 xinference 环境 | -| `xinference-cd` | 切换到 Xinference 目录 | - -## 🧪 测试环境 - -激活环境后,运行以下命令测试: - -```bash -python -c "from xinference_client import RESTfulClient; print('✅ 环境配置成功!')" -``` - -应该显示:`✅ 环境配置成功!` - -## 🔧 环境信息 - -- **环境名**: xinference -- **Python 版本**: 3.10.19 -- **已安装包**: - - xinference-client 1.15.0 - - numpy 2.2.6 - - requests 2.32.5 - -## 📚 下一步 - -1. **启动 Xinference 服务**(如果还没启动): - ```bash - bash start.sh - ``` - -2. **部署模型**: - ```bash - xinference-env - python deploy_models.py - ``` - -3. 
**运行演示**: - ```bash - xinference-env - python ecommerce_demo.py - ``` - -4. **退出环境**: - ```bash - conda deactivate - ``` - -## 🐛 常见问题 - -### 问题1:快捷命令不生效 - -```bash -# 重新加载配置 -source ~/.bashrc - -# 或者打开新的终端窗口 -``` - -### 问题2:找不到 xinference_client - -```bash -# 确认环境已激活 -conda activate xinference - -# 检查安装 -pip list | grep xinference - -# 如果没有,重新安装 -pip install xinference-client -``` - -### 问题3:启动脚本找不到命令 - -确保使用正确的 Python 环境: -```bash -# 激活环境后再运行 -conda activate xinference -python ecommerce_demo.py -``` - -## 💡 提示 - -- Xinference 服务运行在 Docker 中(端口 9997/9998) -- Python 环境仅用于运行客户端脚本 -- 使用 `xinference-env` 快捷命令最方便 - -详细文档请参考: [ENV_SETUP.md](ENV_SETUP.md) diff --git a/third-party/xinference/README.md b/third-party/xinference/README.md deleted file mode 100644 index d03c3b5..0000000 --- a/third-party/xinference/README.md +++ /dev/null @@ -1,715 +0,0 @@ -# Xinference 电商搜索部署完整指南 - -使用 Qwen3-Embedding 和 Qwen3-Reranker 构建两阶段搜索系统(密集检索 + 精排) - -## 📋 快速导航 - -- [快速开始](#快速开始) -- [系统要求](#系统要求) -- [完整安装步骤](#完整安装步骤) -- [GPU 配置](#gpu-配置) -- [模型部署](#模型部署) -- [使用示例](#使用示例) -- [故障排除](#故障排除) - ---- - -## 快速开始 - -### 最快 5 分钟上手(CPU 模式) - -```bash -cd /data/tw/SearchEngine/third-party/xinference - -# 1. 启动 Xinference 服务(自动 CPU 模式) -bash start.sh - -# 2. 创建 Python 环境 -bash setup_env.sh -bash setup_alias.sh -source ~/.bashrc - -# 3. 激活环境 -xinference-env - -# 4. 部署模型(CPU 模式,慢但可用) -python deploy_models.py - -# 5. 
运行演示 -python ecommerce_demo.py -``` - -**注意**:CPU 模式仅用于测试,生产环境请使用 GPU 模式。 - ---- - -## 系统要求 - -### 硬件配置 - -| 组件 | 最低配置 | 推荐配置 | -|------|---------|---------| -| CPU | 8核 | 16核+ | -| 内存 | 16GB | 64GB+ | -| GPU | 无(CPU 模式) | NVIDIA Tesla T4 16GB+ | -| 存储 | 30GB | 100GB+ SSD | - -**GPU 显存需求**: -- Qwen3-Embedding (4B): ~8GB -- Qwen3-Reranker (4B): ~8GB -- 两个模型同时运行: ~16GB+ - -### 软件要求 - -- **操作系统**: Linux (Ubuntu 20.04+, CentOS 7+) -- **Docker**: 20.10+ -- **Docker Compose**: v2.0+ -- **NVIDIA Driver**: 525.0+ (GPU 模式) -- **Conda**: Miniconda3 或 Anaconda3 -- **Python**: 3.10 - ---- - -## 完整安装步骤 - -### 步骤 1: 环境检查 - -```bash -# 检查系统信息 -cat /etc/os-release - -# 检查 GPU(如果有) -nvidia-smi - -# 检查 Docker -docker --version -docker compose version - -# 检查 Conda -conda --version -``` - -### 步骤 2: 启动 Xinference 服务 - -#### 选项 A: CPU 模式(快速测试) - -```bash -cd /data/tw/SearchEngine/third-party/xinference -bash start.sh -``` - -**特点**: -- ✅ 无需额外配置 -- ✅ 快速启动 -- ❌ 速度慢(10-50倍) -- ❌ 仅适合测试 - -#### 选项 B: GPU 模式(生产环境) - -**自动安装 NVIDIA Container Toolkit**: - -```bash -cd /data/tw/SearchEngine/third-party/xinference -bash install_nvidia_container_toolkit.sh -``` - -脚本会自动: -1. 检测系统类型 -2. 添加 NVIDIA 仓库 -3. 安装 nvidia-container-toolkit -4. 配置 Docker 运行时 -5. 重启 Docker 服务 -6. 验证安装 - -**手动安装(可选)**: - -```bash -# Ubuntu/Debian -distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) -curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - -curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \ - sudo tee /etc/apt/sources.list.d/nvidia-docker.list - -sudo apt-get update -sudo apt-get install -y nvidia-container-toolkit - -sudo nvidia-ctk runtime configure --runtime=docker -sudo systemctl restart docker -``` - -**安装完成后,重新启动服务**: - -```bash -# 停止旧容器 -sudo docker stop xinference -sudo docker rm xinference - -# 重新启动(自动使用 GPU) -bash start.sh -``` - -**验证 GPU 模式**: - -```bash -# 检查服务日志 -sudo docker logs xinference | grep -i gpu - -# 或者检查 GPU 使用 -nvidia-smi -``` - -### 步骤 3: 配置 Python 环境 - -```bash -cd /data/tw/SearchEngine/third-party/xinference - -# 自动创建环境 -bash setup_env.sh - -# 设置快捷别名 -bash setup_alias.sh - -# 使配置生效 -source ~/.bashrc -``` - -**环境信息**: -- 环境名: `xinference` -- Python: 3.10.19 -- 包: xinference-client 1.15.0, numpy 2.2.6 - -**快捷命令**: -```bash -xinference-env # 激活环境并切换目录 -xinference-activate # 仅激活环境 -xinference-cd # 切换到 xinference 目录 -``` - -### 步骤 4: 验证安装 - -```bash -# 检查 Xinference 服务 -curl http://localhost:9997/v1/models - -# 预期输出 -{"object":"list","data":[]} - -# 检查 Python 环境 -xinference-env -python -c "from xinference_client import RESTfulClient; print('✅ 环境配置成功!')" - -# 查看 Dashboard -open http://localhost:9998 -# 或在浏览器访问 -``` - ---- - -## GPU 配置 - -### 检查 GPU 状态 - -```bash -# 检查 NVIDIA 驱动 -nvidia-smi - -# 检查 Docker GPU 支持 -sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi - -# 预期输出(成功) -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 11.0 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. 
| -|===============================+======================+======================| -| 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | -| 34% 42C P8 16W / 70W | 0MiB / 16384MiB | 0% Default | -+-------------------------------+----------------------+----------------------+ -``` - -### GPU 可用性测试 - -```bash -# 测试 1: 检查 Xinference 是否能看到 GPU -curl http://localhost:9997/v1/models - -# 测试 2: 尝试部署一个小模型测试 GPU -xinference-env -python -c " -from xinference_client import RESTfulClient -client = RESTfulClient('http://localhost:9997') -# 列出支持的 GPU 模型 -# 如果成功,说明 GPU 可用 -" -``` - -### 多 GPU 配置 - -```bash -# 使用 GPU 0 -python deploy_models.py --gpu 0 - -# 使用 GPU 1 -python deploy_models.py --gpu 1 - -# 使用多个 GPU -python deploy_models.py --gpu 0,1 -``` - ---- - -## 模型部署 - -### 部署 Qwen3 4B 模型 - -#### 完整部署(Embedding + Reranker) - -```bash -xinference-env - -# 部署所有模型 -python deploy_models.py -``` - -**预期输出**: -``` -============================================================ - Qwen3 模型自动部署 -============================================================ - -🔗 连接到 Xinference 服务: http://localhost:9997 -✅ 连接成功! - -============================================================ - 部署 Qwen3-Embedding 模型 (4B) -============================================================ - -⏳ 正在部署,首次运行需要下载模型,请耐心等待... - 模型大小: ~8GB - 上下文长度: 8192 tokens - 向量维度: 1024 - -✅ Qwen3-Embedding 部署成功! - 模型 UID: qwen3-embedding-4b - -============================================================ - 部署 Qwen3-Reranker 模型 (4B) -============================================================ - -✅ Qwen3-Reranker 部署成功! - 模型 UID: qwen3-reranker-4b - -============================================================ - 🎉 模型部署完成! 
-============================================================ -``` - -**预计时间**(GPU 模式): -- 首次下载: 5-15 分钟(取决于网络) -- 模型加载: 2-5 分钟 - -#### 单独部署 - -```bash -# 仅部署 Embedding -python deploy_models.py --embedding-only - -# 仅部署 Reranker -python deploy_models.py --reranker-only - -# 指定 GPU -python deploy_models.py --gpu 1 -``` - -#### 查看已部署模型 - -```bash -# 使用脚本 -python deploy_models.py --list - -# 使用 API -curl http://localhost:9997/v1/models | python -m json.tool -``` - -### 测试模型 - -#### 测试 Embedding - -```bash -xinference-env -python -c " -from xinference_client import RESTfulClient - -client = RESTfulClient('http://localhost:9997') -model = client.get_model('qwen3-embedding-4b') - -result = model.create_embedding('测试文本') -vector = result['data'][0]['embedding'] - -print(f'✅ 向量维度: {len(vector)}') -print(f'✅ 前5维: {vector[:5]}') -" -``` - -#### 测试 Reranker - -```bash -xinference-env -python -c " -from xinference_client import RESTfulClient - -client = RESTfulClient('http://localhost:9997') -model = client.get_model('qwen3-reranker-4b') - -query = '适合老人用的智能手机' -docs = ['华为畅享60 6000mAh', '小米手环8'] - -results = model.rerank([(query, doc) for doc in docs]) - -for doc, score in zip(docs, results): - print(f'[{score[\"relevance_score\"]:.4f}] {doc}') -" -``` - ---- - -## 使用示例 - -### 电商搜索两阶段架构 - -``` -用户查询: "适合老人用的智能手机大屏幕长续航" - ↓ -┌──────────────────────────────────────┐ -│ 阶段1: 密集检索 (Dense Retrieval) │ -│ Qwen3-Embedding (4B) │ -│ 召回 Top-200 │ -└──────────────────────────────────────┘ - ↓ -┌──────────────────────────────────────┐ -│ 阶段2: 精排 (Reranking) │ -│ Qwen3-Reranker (4B) │ -│ Top-200 → Top-10 │ -└──────────────────────────────────────┘ - ↓ -最终结果 -``` - -### 运行演示 - -```bash -xinference-env - -# 完整演示 -python ecommerce_demo.py - -# 简单演示 -python ecommerce_demo.py --simple - -# 指定模型 UID -python ecommerce_demo.py --embedding qwen3-embedding-4b --reranker qwen3-reranker-4b -``` - -**输出示例**: -``` -====================================================================== - 🛒 电商搜索实战演示 - 
Qwen3 双塔架构 -====================================================================== - -🔗 连接到 Xinference 服务... -✅ 连接成功! - -⏳ 加载模型... -✅ 模型加载完成 - -====================================================================== -🔍 搜索查询: 适合老人用的智能手机大屏幕长续航 -====================================================================== - -📊 阶段1: 密集检索(召回 Top-200) ----------------------------------------------------------------------- -⏱️ 密集检索耗时: 0.23秒 -✅ 召回 200 个候选商品 - -🎯 阶段2: 精排(Cross-Encoder 打分) ----------------------------------------------------------------------- -⏱️ 精排耗时: 0.15秒 - -🎯 搜索结果 (Top 5): ----------------------------------------------------------------------- -1. [0.9876] 华为畅享60 6000mAh超长续航 护眼大屏 鸿蒙系统 -2. [0.9654] OPPO A1 5000mAh电池 简易模式适合长辈 -3. [0.9432] vivo Y78 5000mAh大电池 120Hz高刷屏 -4. [0.9210] 荣耀Play7T 6000mAh巨量电池 双卡双待 -5. [0.8976] 诺基亚C31 5050mAh电池 耐用三防 -``` - -### 代码示例 - -#### Python SDK - -```python -from xinference_client import RESTfulClient - -# 连接服务 -client = RESTfulClient("http://localhost:9997") - -# 获取模型 -embedding_model = client.get_model("qwen3-embedding-4b") -reranker_model = client.get_model("qwen3-reranker-4b") - -# 1. 生成 Embedding -query = "高端智能手机" -query_vector = embedding_model.create_embedding(query)["data"][0]["embedding"] - -# 2. 搜索商品(假设有预计算的向量) -# products_with_vectors = [...] - -# 3. 
精排 -results = reranker_model.rerank([ - (query, "华为 Mate 60 Pro 卫星通信"), - (query, "iPhone 15 Pro Max 钛金属"), - (query, "小米14 Pro 徕卡光学") -]) - -for result in results: - print(f"[{result['relevance_score']:.4f}] {result['index']}") -``` - -#### REST API - -```bash -# Embedding API -curl -X POST http://localhost:9997/v1/embeddings \ - -H "Content-Type: application/json" \ - -d '{ - "model": "qwen3-embedding-4b", - "input": ["测试文本"] - }' - -# Reranker API -curl -X POST http://localhost:9997/v1/rerank \ - -H "Content-Type: application/json" \ - -d '{ - "model": "qwen3-reranker-4b", - "query": "适合老人用的手机", - "documents": [ - "华为畅享60 6000mAh", - "小米手环8" - ], - "top_n": 5 - }' -``` - ---- - -## 故障排除 - -### 问题 1: GPU 不可用 - -**症状**: -``` -Worker can only see these GPUs: []. -``` - -**解决方案**: - -```bash -# 1. 安装 NVIDIA Container Toolkit -bash install_nvidia_container_toolkit.sh - -# 2. 重启服务 -sudo docker stop xinference -sudo docker rm xinference -bash start.sh - -# 3. 验证 -nvidia-smi -``` - -### 问题 2: 模型下载失败 - -**症状**: -``` -Failed to download model -``` - -**解决方案**: - -```bash -# 使用国内镜像 -export HF_ENDPOINT=https://hf-mirror.com -python deploy_models.py -``` - -### 问题 3: 显存不足 - -**症状**: -``` -CUDA out of memory -``` - -**解决方案**: - -```bash -# 只部署一个模型 -python deploy_models.py --embedding-only - -# 或使用不同的 GPU -python deploy_models.py --gpu 1 -``` - -### 问题 4: 导入错误 - -**症状**: -``` -ModuleNotFoundError: No module named 'xinference' -``` - -**解决方案**: - -```bash -# 激活正确的环境 -xinference-env - -# 重新安装依赖 -pip install xinference-client --force-reinstall -``` - -### 问题 5: 服务连接失败 - -**解决方案**: - -```bash -# 检查服务状态 -curl http://localhost:9997/v1/models - -# 查看日志 -sudo docker logs -f xinference - -# 重启服务 -sudo docker restart xinference -``` - ---- - -## 进阶配置 - -### 性能优化 - -1. **离线向量预计算** -2. **向量数据库集成**(Milvus/Pinecone) -3. **Redis 缓存** -4. 
**批量处理** - -### 生产部署 - -``` - ┌─────────────┐ - │ 负载均衡 │ - └──────┬──────┘ - │ - ┌────────────┼────────────┐ - │ │ │ - ┌────▼────┐ ┌───▼────┐ ┌───▼────┐ - │Xinference│ │Xinference│ │Xinference│ - │ Instance1│ │ Instance2│ │ Instance3│ - └────┬────┘ └───┬────┘ └───┬────┘ - │ │ │ - └────────────┼────────────┘ - │ - ┌──────▼──────┐ - │ 向量数据库 │ - │ (Milvus) │ - └─────────────┘ -``` - -### 监控指标 - -```python -metrics = { - "latency_p50": "< 100ms", - "latency_p99": "< 300ms", - "qps": "100+", - "gpu_utilization": "< 80%", - "cache_hit_rate": "> 60%" -} -``` - ---- - -## 附录 - -### 常用命令 - -```bash -# 服务管理 -bash start.sh # 启动服务 -sudo docker logs -f xinference # 查看日志 -sudo docker restart xinference # 重启服务 -sudo docker stop xinference # 停止服务 - -# 环境管理 -xinference-env # 激活环境 -conda deactivate # 退出环境 - -# 模型管理 -python deploy_models.py # 部署模型 -python deploy_models.py --list # 列出模型 -curl http://localhost:9997/v1/models # API 查询 - -# 演示示例 -python ecommerce_demo.py # 运行演示 -python ecommerce_demo.py --simple # 简单示例 -``` - -### 目录结构 - -``` -xinference/ -├── README.md # 本文档 -├── QUICKSTART.md # 快速开始 -├── ENV_SETUP.md # 环境配置详解 -├── start.sh # 服务启动脚本 -├── install_nvidia_container_toolkit.sh # NVIDIA Toolkit 安装 -├── setup_env.sh # Conda 环境创建 -├── setup_alias.sh # Shell 别名配置 -├── activate.sh # 环境激活脚本 -├── deploy_models.py # 模型部署脚本 -├── ecommerce_demo.py # 电商搜索示例 -├── docker-compose.yml # Docker Compose 配置 -└── models/ # 模型存储目录 -``` - -### 参考资源 - -- [Xinference 官方文档](https://inference.readthedocs.io/) -- [Qwen3 模型介绍](https://github.com/QwenLM/Qwen3) -- [Docker 部署指南](https://docs.docker.com/) -- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/) - ---- - -## 总结 - -本文档提供了 Xinference 电商搜索系统的完整部署指南: - -✅ **环境准备**: Docker + Conda + GPU 驱动 -✅ **服务安装**: CPU/GPU 两种模式 -✅ **环境配置**: 自动化脚本 + 快捷命令 -✅ **模型部署**: Qwen3-Embedding 4B + Qwen3-Reranker 4B -✅ **使用示例**: 两阶段搜索架构 -✅ **故障排除**: 常见问题及解决方案 - -**下一步**: -1. 根据需求选择 CPU 或 GPU 模式 -2. 部署模型并运行演示 -3. 根据实际数据集调整参数 -4. 
集成向量数据库进行优化 - ---- - -**文档版本**: v1.0 -**最后更新**: 2025-12-25 -**维护者**: Search Engine Team diff --git a/third-party/xinference/STATUS.md b/third-party/xinference/STATUS.md deleted file mode 100644 index 3e5a3c1..0000000 --- a/third-party/xinference/STATUS.md +++ /dev/null @@ -1,158 +0,0 @@ -# 当前状态总结 - -## ✅ 已完成 - -1. **Xinference 服务**:正常运行(CPU 模式) -2. **API 测试**:正常工作(http://localhost:9997) -3. **Python 环境**:已配置(xinference conda 环境) -4. **代码修复**:所有脚本已更新 -5. **GPU 设备**:硬件正常(Tesla T4) -6. **NVIDIA 驱动**:已安装(570.86.10) - -## ❌ 当前问题 - -**GPU 检测失败**:Xinference worker 无法检测到 GPU - -``` -Worker cannot use the GPUs with these indexes: [0] -Worker can only see these GPUs: [] -``` - -**根本原因**:缺少 `nvidia-container-toolkit`,Docker 容器无法正确暴露 GPU 给 Xinference - -## 🔧 解决方案(三选一) - -### 方案 1:安装 nvidia-container-toolkit(推荐,但需要网络) - -**当有网络时执行**: - -```bash -# 下载并安装 RPM 包(需要访问 GitHub 或 NVIDIA 镜像) -# 方法 A: 从 NVIDIA 仓库 -distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed 's/\.//') -curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | \ - sudo tee /etc/yum.repos.d/nvidia-docker.repo - -sudo yum install -y nvidia-container-toolkit -sudo nvidia-ctk runtime configure --runtime=docker -sudo systemctl restart docker - -# 重启 Xinference -sudo docker stop xinference -sudo docker rm xinference -bash start.sh # 会自动使用 GPU - -# 部署模型 -python deploy_models.py -``` - -### 方案 2:使用 CPU 模式(当前可用,但速度慢) - -**立即可用**: - -```bash -# 使用 start.sh(CPU 模式) -bash start.sh - -# 部署较小的模型(修改 deploy_models.py 中的模型大小) -# 将 model_size_in_billions=4 改为 model_size_in_billions=0 -python deploy_models.py - -# 或直接运行演示(使用默认小模型) -python ecommerce_demo.py -``` - -**优点**: -- ✅ 立即可用 -- ✅ 无需额外配置 -- ✅ 功能完整 - -**缺点**: -- ❌ 速度慢(10-50倍) -- ❌ 仅适合测试和演示 - -### 方案 3:等待网络恢复后安装 - -**保存以下命令供后续使用**: - -```bash -# NVIDIA Container Toolkit 安装命令 -# 保存到文件: ~/install_nvidia_toolkit_when_network_available.sh - -#!/bin/bash -# AliLinux/RHEL8 安装脚本 - -# 1. 
添加仓库 -curl -s -L https://nvidia.github.io/nvidia-docker/rhel8/nvidia-docker.repo | \ - sudo tee /etc/yum.repos.d/nvidia-docker.repo - -# 2. 安装 -sudo yum install -y nvidia-container-toolkit - -# 3. 配置 Docker -sudo nvidia-ctk runtime configure --runtime=docker - -# 4. 重启 Docker -sudo systemctl restart docker - -# 5. 验证 -sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi -``` - -## 📊 当前系统信息 - -```bash -# 硬件 -GPU: Tesla T4 16GB -驱动: 570.86.10 - -# 软件 -Docker: 26.1.3 -Python: 3.10.19 (xinference 环境) -Xinference: 运行中(端口 9997/9998) - -# 服务状态 -API: http://localhost:9997 ✓ -Dashboard: http://localhost:9998 -``` - -## 🎯 推荐操作 - -**立即可做**: -```bash -# 使用 CPU 模式测试功能 -# 1. 确保服务运行 -curl http://localhost:9997/v1/models - -# 2. 运行简单演示 -cd /data/tw/SearchEngine/third-party/xinference -/home/tw/miniconda3/envs/xinference/bin/python ecommerce_demo.py --simple -``` - -**后续优化**: -1. 等待网络恢复,安装 nvidia-container-toolkit -2. 重启服务使用 GPU -3. 部署 Qwen3 4B 模型 - -## 📝 已创建的文件 - -- `start_gpu_manual.sh` - 手动 GPU 启动脚本(需要 nvidia-container-toolkit) -- `install_nvidia_container_toolkit.sh` - 自动安装脚本(不支持 AliLinux) -- `README.md` - 完整文档 -- `QUICKSTART.md` - 快速开始 -- `ENV_SETUP.md` - 环境配置 - -## 💡 技术说明 - -**为什么需要 nvidia-container-toolkit**? - -Docker 容器需要通过 NVIDIA Container Runtime 来访问 GPU,这包括: -1. NVIDIA 驱动库的映射 -2. GPU 设备的暴露 -3. CUDA 环境的配置 - -手动映射设备和库(我们尝试的方法)对于简单的容器可以工作,但 Xinference 使用 vLLM 引擎,它需要完整的 CUDA 运行时环境。 - -**为什么 CPU 模式可以工作**? 
- -Xinference 支持降级到 CPU 模式,通过设置环境变量或自动检测。虽然速度慢,但功能完整。 diff --git a/third-party/xinference/activate.sh b/third-party/xinference/activate.sh index 13b9cb9..5c9413a 100755 --- a/third-party/xinference/activate.sh +++ b/third-party/xinference/activate.sh @@ -1,22 +1,4 @@ #!/bin/bash -# 快速切换到 Xinference 环境的便捷脚本 - -ENV_NAME="xinference" - -# 初始化 conda -eval "$(conda shell.bash hook)" - -# 激活环境 -conda activate $ENV_NAME - -echo "✅ 已切换到 Xinference 环境" -echo "" -echo "可用命令:" -echo " - python ecommerce_demo.py # 运行电商搜索演示" -echo " - python deploy_models.py # 部署模型" -echo " - conda deactivate # 退出环境" -echo "" - -# 保持 shell 打开 -exec $SHELL +source /home/tw/miniconda3/etc/profile.d/conda.sh +conda activate xinference diff --git a/third-party/xinference/api_examples.sh b/third-party/xinference/api_examples.sh deleted file mode 100755 index f8b25e9..0000000 --- a/third-party/xinference/api_examples.sh +++ /dev/null @@ -1,356 +0,0 @@ -#!/bin/bash - -# Xinference REST API 调用示例 -# 演示如何通过 HTTP API 调用 Qwen3-Embedding 和 Qwen3-Reranker - -# 设置服务地址 -XINFERENCE_HOST="http://localhost:9997" -MODEL_EMBEDDING="qwen3-embedding" -MODEL_RERANKER="qwen3-reranker" - -echo "=========================================" -echo " Xinference REST API 调用示例" -echo "=========================================" -echo "" - -# 颜色定义 -GREEN='\033[0;32m' -BLUE='\033[0;34m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -print_section() { - echo "" - echo -e "${BLUE}=========================================${NC}" - echo -e "${BLUE} $1${NC}" - echo -e "${BLUE}=========================================${NC}" - echo "" -} - -print_info() { - echo -e "${GREEN}➜${NC} $1" -} - -# ============================================ -# 1. 查看服务状态 -# ============================================ -print_section "1. 
查看服务状态和已部署模型" - -print_info "查看所有已部署的模型:" -curl -s "${XINFERENCE_HOST}/v1/models" | python3 -m json.tool - -echo "" -print_info "查看服务健康状态:" -curl -s "${XINFERENCE_HOST}/v1/models" > /dev/null && echo "✅ 服务健康" || echo "❌ 服务异常" - -# ============================================ -# 2. Embedding API 调用 -# ============================================ -print_section "2. Qwen3-Embedding API 调用" - -print_info "单个文本 embedding 生成:" -curl -X POST "${XINFERENCE_HOST}/v1/embeddings" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL_EMBEDDING}'", - "input": ["适合老人用的智能手机大屏幕长续航"] - }' | python3 -m json.tool - -echo "" -print_info "批量文本 embedding 生成:" -curl -X POST "${XINFERENCE_HOST}/v1/embeddings" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL_EMBEDDING}'", - "input": [ - "红米Note12 5000mAh大电量", - "华为畅享60 6000mAh超长续航", - "小米手环8 智能运动监测" - ] - }' | python3 -m json.tool - -# ============================================ -# 3. Reranker API 调用 -# ============================================ -print_section "3. Qwen3-Reranker API 调用" - -print_info "精排候选商品:" - -curl -X POST "${XINFERENCE_HOST}/v1/rerank" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL_RERANKER}'", - "query": "适合老人用的智能手机大屏幕长续航", - "documents": [ - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", - "iPhone 15 Pro Max 专业摄影旗舰", - "华为畅享60 6000mAh超长续航 护眼大屏", - "OPPO A1 5000mAh电池 简易模式适合长辈", - "小米手环8 智能运动监测" - ], - "top_n": 5 - }' | python3 -m json.tool - -# ============================================ -# 4. 电商搜索实战:两阶段检索 -# ============================================ -print_section "4. 
电商搜索实战:完整两阶段检索流程" - -# 阶段1: 密集检索 -print_info "阶段1: 为用户 query 生成向量" -echo "" -echo "Query: 适合老人用的智能手机大屏幕长续航" - -QUERY_VECTOR=$(curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL_EMBEDDING}'", - "input": ["适合老人用的智能手机大屏幕长续航"] - }' | python3 -c "import sys, json; print(json.dumps(json.load(sys.stdin)['data'][0]['embedding'])))") - -echo "Query 向量维度: $(echo $QUERY_VECTOR | python3 -c "import sys, json; print(len(json.load(sys.stdin)))")" - -echo "" -print_info "为候选商品生成向量(在实际应用中,这些向量应预计算并存储)" - -# 为简化演示,这里只显示部分候选商品的向量生成 -CANDIDATES=( - "红米Note12 5000mAh大电量" - "华为畅享60 6000mAh超长续航" - "小米手环8" -) - -for candidate in "${CANDIDATES[@]}"; do - echo "" - echo "商品: $candidate" - curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL_EMBEDDING}'", - "input": ["'${candidate}'"] - }' | python3 -c "import sys, json; data=json.load(sys.stdin); print(f\" 向量维度: {len(data['data'][0]['embedding'])}\")" -done - -# 阶段2: 精排 -echo "" -print_info "阶段2: 使用 Reranker 对召回结果进行精排" - -curl -X POST "${XINFERENCE_HOST}/v1/rerank" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL_RERANKER}'", - "query": "适合老人用的智能手机大屏幕长续航", - "documents": [ - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", - "华为畅享60 6000mAh超长续航 护眼大屏", - "OPPO A1 5000mAh电池 简易模式适合长辈", - "iPhone 15 Pro Max 专业摄影旗舰", - "小米手环8 智能运动监测" - ], - "top_n": 3 - }' | python3 -m json.tool - -# ============================================ -# 5. 高级用法:批量处理 -# ============================================ -print_section "5. 
批量 Embedding 生成(离线任务)" - -print_info "为大量商品生成 embedding(模拟离线任务)" -echo "注意: 实际生产环境中,批量大小建议为 100-1000" - -# 创建批量输入文件 -cat > /tmp/batch_input.json < list: - """获取文本的 embedding 向量""" - response = requests.post( - f"{XINFERENCE_HOST}/v1/embeddings", - json={ - "model": "qwen3-embedding", - "input": [text] - } - ) - return response.json()["data"][0]["embedding"] - -# 批量 Embedding -def get_embeddings(texts: list) -> list: - """批量获取 embedding 向量""" - response = requests.post( - f"{XINFERENCE_HOST}/v1/embeddings", - json={ - "model": "qwen3-embedding", - "input": texts - } - ) - return [item["embedding"] for item in response.json()["data"]] - -# Reranker 调用 -def rerank(query: str, documents: list, top_n: int = 10) -> list: - """使用 reranker 对文档排序""" - response = requests.post( - f"{XINFERENCE_HOST}/v1/rerank", - json={ - "model": "qwen3-reranker", - "query": query, - "documents": documents, - "top_n": top_n - } - ) - return response.json()["results"] - -# 计算余弦相似度 -def cosine_similarity(vec1: list, vec2: list) -> float: - """计算两个向量的余弦相似度""" - v1 = np.array(vec1) - v2 = np.array(vec2) - return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))) - -# 完整搜索流程 -def search(query: str, products: list) -> list: - """两阶段搜索""" - # 阶段1: 密集检索(简化示例) - query_vec = get_embedding(query) - similarities = [] - for product in products: - prod_vec = get_embedding(product) - sim = cosine_similarity(query_vec, prod_vec) - similarities.append((product, sim)) - - # 取 Top-200 - similarities.sort(key=lambda x: x[1], reverse=True) - top_200 = [p for p, s in similarities[:200]] - - # 阶段2: 精排 - reranked = rerank(query, top_200, top_n=10) - return reranked - -# 使用示例 -if __name__ == "__main__": - query = "适合老人用的智能手机大屏幕长续航" - products = [ - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", - "华为畅享60 6000mAh超长续航 护眼大屏", - "小米手环8 智能运动监测" - ] - - results = search(query, products) - for r in results: - print(f"[{r['relevance_score']:.4f}] {r['document']}") - -EOF - -# 
============================================ -# 7. 性能测试 -# ============================================ -print_section "7. 性能测试" - -print_info "测试 Embedding API 响应时间:" -echo "" - -for i in {1..5}; do - START=$(date +%s%N) - curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL_EMBEDDING}'", - "input": ["测试文本"] - }' > /dev/null - END=$(date +%s%N) - ELAPSED=$((($END - $START) / 1000000)) - echo " 请求 $i: ${ELAPSED}ms" -done - -echo "" -print_info "测试 Reranker API 响应时间:" - -for i in {1..5}; do - START=$(date +%s%N) - curl -s -X POST "${XINFERENCE_HOST}/v1/rerank" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL_RERANKER}'", - "query": "测试查询", - "documents": ["文档1", "文档2", "文档3"], - "top_n": 3 - }' > /dev/null - END=$(date +%s%N) - ELAPSED=$((($END - $START) / 1000000)) - echo " 请求 $i: ${ELAPSED}ms" -done - -# ============================================ -# 8. 常见问题排查 -# ============================================ -print_section "8. 
常见问题排查" - -print_info "检查服务是否运行:" -curl -s "${XINFERENCE_HOST}/v1/models" > /dev/null && echo "✅ 服务正常" || echo "❌ 服务未启动,请运行: ./start.sh" - -echo "" -print_info "检查模型是否部署:" -MODELS=$(curl -s "${XINFERENCE_HOST}/v1/models") -echo "$MODELS" | python3 -c " -import sys, json -try: - models = json.load(sys.stdin) - if models: - print('✅ 已部署模型:') - for m in models: - print(f' - {m.get(\"model_type\")}: {m.get(\"model_uid\")}') - else: - print('❌ 没有已部署的模型,请运行: python deploy_models.py') -except: - print('❌ 无法获取模型信息') -" - -echo "" -print_info "查看 GPU 使用情况:" -if command -v nvidia-smi &> /dev/null; then - nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader | while read line; do - echo " GPU $line" - done -else - echo " ⚠️ nvidia-smi 未安装,无法查看 GPU 信息" -fi - -echo "" -echo "=========================================" -echo " ✅ API 调用示例演示完成" -echo "=========================================" diff --git a/third-party/xinference/deploy_models.py b/third-party/xinference/deploy_models.py deleted file mode 100644 index 3b3651b..0000000 --- a/third-party/xinference/deploy_models.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python3 -""" -Qwen3 模型部署脚本 -自动部署 Qwen3-Embedding 和 Qwen3-Reranker 模型 -""" - -import time -import sys -from xinference_client import RESTfulClient as Client - - -def print_section(title): - """打印分节标题""" - print("\n" + "="*60) - print(f" {title}") - print("="*60 + "\n") - - -def deploy_qwen3_models(host="http://localhost:9997", gpu_idx=[0]): - """ - 部署 Qwen3 模型 - - Args: - host: Xinference 服务地址 - gpu_idx: GPU 索引 - """ - print_section("Qwen3 模型自动部署") - - # 连接到 Xinference 服务 - print(f"🔗 连接到 Xinference 服务: {host}") - try: - client = Client(host) - print("✅ 连接成功!\n") - except Exception as e: - print(f"❌ 连接失败: {e}") - print("\n💡 请确保 Xinference 服务已启动:") - print(" ./start.sh") - sys.exit(1) - - # 部署 Qwen3-Embedding 模型 - print_section("部署 Qwen3-Embedding 模型 (4B)") - print("⏳ 正在部署,首次运行需要下载模型,请耐心等待...") - print(" 模型大小: ~8GB") - print(" 上下文长度: 
8192 tokens") - print(" 向量维度: 1024\n") - - try: - embedding_uid = client.launch_model( - model_name="qwen3-embedding", - model_size_in_billions=4, - model_type="embedding", - engine="vllm", - gpu_idx=gpu_idx, - ) - print(f"✅ Qwen3-Embedding 部署成功!") - print(f" 模型 UID: {embedding_uid}\n") - - # 等待模型加载完成 - print("⏳ 等待模型完全加载...") - time.sleep(5) - - # 测试模型 - embedding_model = client.get_model(embedding_uid) - test_result = embedding_model.create_embedding("测试文本") - if test_result and "data" in test_result: - vector_dim = len(test_result["data"][0]["embedding"]) - print(f"✅ 模型测试成功!向量维度: {vector_dim}\n") - else: - print("⚠️ 模型部署成功但测试失败\n") - - except Exception as e: - print(f"❌ Qwen3-Embedding 部署失败: {e}\n") - return None - - # 部署 Qwen3-Reranker 模型 - print_section("部署 Qwen3-Reranker 模型 (4B)") - print("⏳ 正在部署,首次运行需要下载模型,请耐心等待...") - print(" 模型大小: ~8GB") - print(" 架构: Cross-Encoder\n") - - try: - reranker_uid = client.launch_model( - model_name="qwen3-reranker", - model_size_in_billions=4, - model_type="rerank", - engine="vllm", - gpu_idx=gpu_idx, - ) - print(f"✅ Qwen3-Reranker 部署成功!") - print(f" 模型 UID: {reranker_uid}\n") - - # 等待模型加载完成 - print("⏳ 等待模型完全加载...") - time.sleep(5) - - # 测试模型 - reranker_model = client.get_model(reranker_uid) - test_result = reranker_model.rerank( - [("测试查询", "测试文档")] - ) - if test_result and len(test_result) > 0: - print(f"✅ 模型测试成功!\n") - else: - print("⚠️ 模型部署成功但测试失败\n") - - except Exception as e: - print(f"❌ Qwen3-Reranker 部署失败: {e}") - print("💡 可能的原因: GPU 显存不足,请尝试:") - print(" 1. 使用不同的 GPU 索引: python deploy_models.py --gpu 1") - print(" 2. 只部署 embedding 模型: python deploy_models.py --embedding-only") - return None - - # 显示部署摘要 - print_section("🎉 模型部署完成!") - print(f"✅ Qwen3-Embedding UID: {embedding_uid}") - print(f"✅ Qwen3-Reranker UID: {reranker_uid}") - print("\n📝 下一步:") - print(" 1. 运行电商搜索示例: python ecommerce_demo.py") - print(" 2. 查看 API 调用示例: cat api_examples.sh") - print(" 3. 查看 Dashboard: http://localhost:9998") - print(" 4. 
查看所有模型: curl http://localhost:9997/v1/models") - print("") - - return { - "embedding_uid": embedding_uid, - "reranker_uid": reranker_uid - } - - -def list_models(host="http://localhost:9997"): - """列出所有已部署的模型""" - print_section("已部署模型列表") - try: - client = Client(host) - models = client.list_models() - - if not models: - print("📭 当前没有已部署的模型") - else: - for model in models: - model_type = model.get("model_type", "unknown") - model_uid = model.get("model_uid", "unknown") - print(f"📦 {model_type.upper()}: {model_uid}") - print() - except Exception as e: - print(f"❌ 获取模型列表失败: {e}\n") - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="部署 Qwen3 模型到 Xinference") - parser.add_argument("--host", default="http://localhost:9997", help="Xinference 服务地址") - parser.add_argument("--gpu", default="0", help="GPU 索引(逗号分隔,如: 0 或 0,1)") - parser.add_argument("--embedding-only", action="store_true", help="仅部署 embedding 模型") - parser.add_argument("--reranker-only", action="store_true", help="仅部署 reranker 模型") - parser.add_argument("--list", action="store_true", help="列出已部署的模型") - - args = parser.parse_args() - - # 将 GPU 字符串转换为列表 - gpu_idx = [int(x.strip()) for x in args.gpu.split(",")] - - if args.list: - list_models(args.host) - elif args.embedding_only: - # 仅部署 embedding - print_section("部署 Qwen3-Embedding 模型 (4B)") - client = Client(args.host) - embedding_uid = client.launch_model( - model_name="qwen3-embedding", - model_size_in_billions=4, - model_type="embedding", - engine="vllm", - gpu_idx=gpu_idx, - ) - print(f"✅ Embedding 模型部署成功: {embedding_uid}") - elif args.reranker_only: - # 仅部署 reranker - print_section("部署 Qwen3-Reranker 模型 (4B)") - client = Client(args.host) - reranker_uid = client.launch_model( - model_name="qwen3-reranker", - model_size_in_billions=4, - model_type="rerank", - engine="vllm", - gpu_idx=gpu_idx, - ) - print(f"✅ Reranker 模型部署成功: {reranker_uid}") - else: - # 部署所有模型 - deploy_qwen3_models(args.host, gpu_idx) diff 
--git a/third-party/xinference/docker-compose.yml b/third-party/xinference/docker-compose.yml deleted file mode 100644 index 7dfc7d7..0000000 --- a/third-party/xinference/docker-compose.yml +++ /dev/null @@ -1,28 +0,0 @@ -version: '3.8' - -services: - xinference: - image: xprobe/xinference:latest - container_name: xinference - ports: - - "9997:9997" - - "9998:9998" - environment: - - XINFERENCE_HOME=/data - volumes: - - ./models:/data - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - restart: unless-stopped - command: xinference-local -H 0.0.0.0 - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9997/v1/models"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s diff --git a/third-party/xinference/ecommerce_demo.py b/third-party/xinference/ecommerce_demo.py deleted file mode 100644 index efa36c2..0000000 --- a/third-party/xinference/ecommerce_demo.py +++ /dev/null @@ -1,318 +0,0 @@ -#!/usr/bin/env python3 -""" -电商搜索实战示例 -演示如何使用 Qwen3-Embedding 和 Qwen3-Reranker 构建两阶段搜索系统 -""" - -import time -from typing import List, Tuple -from xinference_client import RESTfulClient as Client - - -class EcommerceSearchEngine: - """电商搜索引擎""" - - def __init__(self, host="http://localhost:9997"): - """ - 初始化搜索引擎 - - Args: - host: Xinference 服务地址 - """ - print("🔗 连接到 Xinference 服务...") - self.client = Client(host) - self.embedding_model = None - self.reranker_model = None - print("✅ 连接成功!\n") - - def load_models(self, embedding_uid=None, reranker_uid=None): - """ - 加载模型 - - Args: - embedding_uid: Embedding 模型 UID - reranker_uid: Reranker 模型 UID - """ - # 列出所有模型 - models = self.client.list_models() - model_dict = {m.get("model_type"): m.get("model_uid") for m in models} - - # 使用提供的 UID 或自动查找 - self.embedding_uid = embedding_uid or model_dict.get("embedding") - self.reranker_uid = reranker_uid or model_dict.get("rerank") - - if not self.embedding_uid: - raise ValueError("❌ 未找到 Embedding 模型,请先运行: python 
deploy_models.py") - if not self.reranker_uid: - raise ValueError("❌ 未找到 Reranker 模型,请先运行: python deploy_models.py") - - print(f"📦 加载 Embedding 模型: {self.embedding_uid}") - self.embedding_model = self.client.get_model(self.embedding_uid) - print("✅ Embedding 模型加载完成\n") - - print(f"📦 加载 Reranker 模型: {self.reranker_uid}") - self.reranker_model = self.client.get_model(self.reranker_uid) - print("✅ Reranker 模型加载完成\n") - - def dense_retrieval(self, query: str, candidates: List[str], top_k: int = 200) -> List[Tuple[str, float]]: - """ - 密集检索阶段(第一阶:粗筛) - - 在实际生产环境中,这里会使用 Faiss 或向量数据库进行 ANN 搜索 - 从百万级商品中快速召回 Top-K 候选 - - Args: - query: 用户查询 - candidates: 候选商品列表 - top_k: 返回的数量 - - Returns: - [(商品, 相似度分数), ...] - """ - start_time = time.time() - - # 生成 query 向量 - query_embedding = self.embedding_model.create_embedding(query)["data"][0]["embedding"] - - # 为所有候选商品生成向量 - # 注意:生产环境中这些向量应该预计算并存储在向量数据库中 - candidate_embeddings = [] - for product in candidates: - emb = self.embedding_model.create_embedding(product)["data"][0]["embedding"] - candidate_embeddings.append((product, emb)) - - # 计算余弦相似度(简化版,生产环境使用 Faiss) - import numpy as np - query_vec = np.array(query_embedding) - query_vec = query_vec / np.linalg.norm(query_vec) # 归一化 - - similarities = [] - for product, emb in candidate_embeddings: - emb_vec = np.array(emb) - emb_vec = emb_vec / np.linalg.norm(emb_vec) - similarity = float(np.dot(query_vec, emb_vec)) - similarities.append((product, similarity)) - - # 按 similarity 排序,取 Top-K - similarities.sort(key=lambda x: x[1], reverse=True) - top_results = similarities[:top_k] - - elapsed = time.time() - start_time - print(f"⏱️ 密集检索耗时: {elapsed:.2f}秒") - - return top_results - - def cross_encoder_rerank(self, query: str, candidates: List[str]) -> List[Tuple[str, float]]: - """ - 精排阶段(第二阶:细排) - - 使用 Cross-Encoder 对密集检索的结果进行精确打分 - - Args: - query: 用户查询 - candidates: 候选商品列表 - - Returns: - [(商品, 相关性分数), ...] 
- """ - start_time = time.time() - - # 构建 query-document 对 - pairs = [(query, product) for product in candidates] - - # 批量打分 - rerank_results = self.reranker_model.rerank(pairs) - - # 组合结果 - results = list(zip(candidates, rerank_results)) - - # 按相关性分数排序 - results.sort(key=lambda x: x[1]["relevance_score"], reverse=True) - - elapsed = time.time() - start_time - print(f"⏱️ 精排耗时: {elapsed:.2f}秒") - - return results - - def search(self, query: str, product_catalog: List[str], top_k: int = 10) -> List[Tuple[str, float]]: - """ - 完整的两阶段搜索流程 - - Args: - query: 用户查询 - product_catalog: 商品目录(假设有数万到数百万商品) - top_k: 最终返回的结果数 - - Returns: - [(商品, 相关性分数), ...] - """ - print(f"\n{'='*70}") - print(f"🔍 搜索查询: {query}") - print(f"{'='*70}\n") - - # 阶段1:密集检索召回 Top-200 - print("📊 阶段1: 密集检索(召回 Top-200)") - print("-" * 70) - recall_top_k = min(200, len(product_catalog)) - retrieved = self.dense_retrieval(query, product_catalog, top_k=recall_top_k) - retrieved_products = [p for p, s in retrieved] - - print(f"✅ 召回 {len(retrieved)} 个候选商品\n") - - # 阶段2:Cross-Encoder 精排 - print("🎯 阶段2: 精排(Cross-Encoder 打分)") - print("-" * 70) - reranked = self.cross_encoder_rerank(query, retrieved_products) - - # 取最终 Top-K - final_results = reranked[:top_k] - - return final_results - - -def demo_ecommerce_search(): - """电商搜索演示""" - - print("\n" + "="*70) - print(" 🛒 电商搜索实战演示 - Qwen3 双塔架构") - print("="*70 + "\n") - - # 初始化搜索引擎 - engine = EcommerceSearchEngine(host="http://localhost:9997") - - # 加载模型 - print("⏳ 加载模型...") - engine.load_models() - - # 模拟商品数据库(实际应用中可能有数百万商品) - product_catalog = [ - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", - "iPhone 15 Pro Max 专业摄影旗舰 A17芯片", - "华为畅享60 6000mAh超长续航 护眼大屏 鸿蒙系统", - "OPPO A1 5000mAh电池 简易模式适合长辈", - "小米手环8 智能运动监测 血氧心率", - "vivo Y78 5000mAh大电池 120Hz高刷屏", - "三星Galaxy A54 5000mAh 防水防尘", - "荣耀Play7T 6000mAh巨量电池 双卡双待", - "真我11 Pro 2亿像素 100W快充", - "诺基亚C31 5050mAh电池 耐用三防", - "联想拯救者Y70 8GB+256GB 骁龙8+", - "摩托罗拉edge S30 骁龙888+ 144Hz", - "一加Ace 2V 天玑9000 80W快充", - "iQOO Neo8 独显芯片双芯 
120W闪充", - "Redmi K60 2K高光屏 骁龙8+", - "华为Mate 60 Pro 卫星通信 鸿蒙4.0", - "iPhone 14 128GB A15芯片", - "OPPO Reno10 Pro 人像镜头 100W", - "vivo X90s 天玑9200+ 蔡司影像", - "小米13 Pro 徕卡光学镜头", - ] - - # 测试查询 - test_queries = [ - "适合老人用的智能手机大屏幕长续航", - "拍照效果好的手机推荐", - "性价比高的游戏手机", - ] - - # 执行搜索 - for query in test_queries: - results = engine.search(query, product_catalog, top_k=5) - - # 显示结果 - print(f"\n🎯 搜索结果 (Top 5):") - print("-" * 70) - for i, (product, score) in enumerate(results, 1): - print(f"{i}. [{score['relevance_score']:.4f}] {product}") - print() - - # 性能统计 - print("\n" + "="*70) - print("📊 生产环境部署建议") - print("="*70) - print(""" -1. 离线批量处理: - - 每天凌晨使用 Qwen3-Embedding 为全量商品生成向量 - - 存储到 Milvus/Pinecone 等向量数据库 - - 预计耗时: 2亿商品约 4-6 小时 - -2. 在线实时搜索: - - 用户 query 实时生成 embedding - - 向量数据库 ANN 检索召回 Top-1000 (耗时 < 50ms) - - Qwen3-Reranker 精排 Top-1000 → Top-50 (耗时 < 200ms) - - 总体延迟: < 300ms - -3. 缓存优化: - - Top 10000 热搜 query 的 embedding 和结果缓存到 Redis - - QPS 提升 10-20 倍 - -4. 混合检索: - - 结合 BM25 关键词召回(头部 Query 准确率更高) - - 向量召回 + 关键词召回 → 合并去重 → 精排 - -5. 
A/B 测试建议: - - 对照组: 纯 BM25 或传统 embedding 模型 - - 实验组: Qwen3-Embedding + Qwen3-Reranker - - 核心指标: CTR, CVR, GMV, 用户停留时间 - """) - - -def demo_simple_usage(): - """简单的使用示例""" - print("\n" + "="*70) - print(" 📝 快速使用示例") - print("="*70 + "\n") - - # 连接到服务 - client = Client("http://localhost:9997") - - # 列出可用模型 - models = client.list_models() - print("可用模型:") - for model in models: - print(f" - {model.get('model_type')}: {model.get('model_uid')}") - - # 假设模型已部署 - if models: - embedding_model = next((m for m in models if m.get("model_type") == "embedding"), None) - reranker_model = next((m for m in models if m.get("model_type") == "rerank"), None) - - if embedding_model: - print(f"\n使用 Embedding 模型: {embedding_model['model_uid']}") - model = client.get_model(embedding_model['model_uid']) - result = model.create_embedding("测试文本") - print(f"向量维度: {len(result['data'][0]['embedding'])}") - - if reranker_model: - print(f"\n使用 Reranker 模型: {reranker_model['model_uid']}") - model = client.get_model(reranker_model['model_uid']) - query = "适合老人用的智能手机" - docs = ["华为畅享60 6000mAh", "小米手环8"] - result = model.rerank([(query, d) for d in docs]) - for doc, score in zip(docs, result): - print(f" [{score['relevance_score']:.4f}] {doc}") - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="电商搜索实战演示") - parser.add_argument("--host", default="http://localhost:9997", help="Xinference 服务地址") - parser.add_argument("--simple", action="store_true", help="运行简单示例") - parser.add_argument("--embedding", help="指定 Embedding 模型 UID") - parser.add_argument("--reranker", help="指定 Reranker 模型 UID") - - args = parser.parse_args() - - try: - if args.simple: - demo_simple_usage() - else: - demo_ecommerce_search() - except Exception as e: - print(f"\n❌ 错误: {e}") - print("\n💡 请确保:") - print(" 1. Xinference 服务正在运行: ./start.sh") - print(" 2. 
模型已部署: python deploy_models.py") - import sys - sys.exit(1) diff --git a/third-party/xinference/install_nvidia_container_toolkit.sh b/third-party/xinference/install_nvidia_container_toolkit.sh deleted file mode 100755 index 4cf361c..0000000 --- a/third-party/xinference/install_nvidia_container_toolkit.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash - -# NVIDIA Container Toolkit 自动安装脚本 -# 适用于 Ubuntu/Debian 系统 - -set -e - -echo "=========================================" -echo " NVIDIA Container Toolkit 安装脚本" -echo "=========================================" -echo "" - -# 检测系统发行版 -if [ -f /etc/os-release ]; then - . /etc/os-release - OS=$ID - OS_VERSION=$VERSION_ID -else - echo "❌ 无法检测系统类型" - exit 1 -fi - -echo "检测到系统: $OS $OS_VERSION" -echo "" - -# 检查 NVIDIA 驱动 -echo "🔍 检查 NVIDIA 驱动..." -if ! command -v nvidia-smi &> /dev/null; then - echo "❌ 未找到 NVIDIA 驱动,请先安装 NVIDIA 驱动" - echo " 访问: https://www.nvidia.com/Download/index.aspx" - exit 1 -fi - -echo "✅ NVIDIA 驱动已安装:" -nvidia-smi --query-gpu=name,driver_version --format=csv,noheader -echo "" - -# 检查 Docker -echo "🔍 检查 Docker..." -if ! command -v docker &> /dev/null; then - echo "❌ 未找到 Docker,请先安装 Docker" - exit 1 -fi - -echo "✅ Docker 已安装" -docker --version -echo "" - -# 添加 NVIDIA 仓库 -echo "📦 添加 NVIDIA Container Toolkit 仓库..." - -if [ "$OS" = "ubuntu" ] || [ "$OS" = "debian" ]; then - # Ubuntu/Debian - distribution=$OS$(. /etc/os-release; echo $VERSION_ID | sed 's/\.//') - - curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \ - sudo tee /etc/apt/sources.list.d/nvidia-docker.list - - echo "✅ 仓库配置完成" -elif [ "$OS" = "centos" ] || [ "$OS" = "rhel" ]; then - # CentOS/RHEL - distribution=$OS$(. 
/etc/os-release; echo $VERSION_ID | sed 's/\.//') - - curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | \ - sudo tee /etc/yum.repos.d/nvidia-docker.repo - - echo "✅ 仓库配置完成" -else - echo "⚠️ 不支持的系统: $OS" - echo " 请手动安装,参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html" - exit 1 -fi - -echo "" - -# 更新包列表 -echo "🔄 更新包列表..." -if [ "$OS" = "ubuntu" ] || [ "$OS" = "debian" ]; then - sudo apt-get update -elif [ "$OS" = "centos" ] || [ "$OS" = "rhel" ]; then - sudo yum makecache -fi -echo "" - -# 安装 NVIDIA Container Toolkit -echo "🔨 安装 NVIDIA Container Toolkit..." -if [ "$OS" = "ubuntu" ] || [ "$OS" = "debian" ]; then - sudo apt-get install -y nvidia-container-toolkit -elif [ "$OS" = "centos" ] || [ "$OS" = "rhel" ]; then - sudo yum install -y nvidia-container-toolkit -fi -echo "" - -# 配置 Docker -echo "⚙️ 配置 Docker 使用 NVIDIA 运行时..." -sudo nvidia-ctk runtime configure --runtime=docker -echo "✅ Docker 配置完成" -echo "" - -# 重启 Docker -echo "🔄 重启 Docker 服务..." -sudo systemctl restart docker -echo "✅ Docker 重启完成" -echo "" - -# 验证安装 -echo "🧪 验证安装..." -echo "测试 Docker GPU 访问..." -if sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi &> /dev/null; then - echo "✅ NVIDIA Container Toolkit 安装成功!" - echo "" - echo "GPU 可用于 Docker 容器!" -else - echo "❌ 验证失败,请检查日志" - exit 1 -fi - -echo "" -echo "=========================================" -echo " 🎉 安装完成!" -echo "=========================================" -echo "" -echo "📝 下一步:" -echo " 1. 停止当前的 Xinference 容器(如果在运行):" -echo " sudo docker stop xinference" -echo " sudo docker rm xinference" -echo "" -echo " 2. 重新启动 Xinference 服务(会自动使用 GPU):" -echo " cd $(pwd)" -echo " bash start.sh" -echo "" -echo " 3. 部署模型:" -echo " python deploy_models.py" -echo "" -echo "✅ 现在可以使用 GPU 加速了!" 
-echo "" diff --git a/third-party/xinference/perfermance_test.py b/third-party/xinference/perfermance_test.py new file mode 100644 index 0000000..a5b492a --- /dev/null +++ b/third-party/xinference/perfermance_test.py @@ -0,0 +1,249 @@ +import openai +import time +import statistics +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import List, Dict, Tuple +import json + + +class EmbeddingPerformanceTester: + def __init__(self, base_url: str = "http://127.0.0.1:9997/v1"): + """初始化性能测试器""" + self.client = openai.Client( + api_key="cannot be empty", # 实际使用时请替换为真实API Key + base_url=base_url + ) + + def test_single_request(self, model: str, input_text: List[str]) -> Tuple[bool, float]: + """测试单个请求,返回成功状态和耗时""" + try: + start_time = time.perf_counter() + response = self.client.embeddings.create( + model=model, + input=input_text + ) + end_time = time.perf_counter() + + # 验证响应格式 + if response and hasattr(response, 'data'): + return True, end_time - start_time + else: + return False, end_time - start_time + + except Exception as e: + print(f"请求失败 - 模型 {model}: {str(e)}") + return False, 0.0 + + def test_model_sequential(self, model: str, input_text: List[str], + iterations: int = 1000) -> Dict: + """顺序执行性能测试""" + print(f"\n开始顺序测试模型: {model}") + print(f"测试次数: {iterations}") + + successes = 0 + failures = 0 + latencies = [] + + for i in range(iterations): + if i % 100 == 0 and i > 0: + print(f" 已完成 {i}/{iterations} 次请求...") + + success, latency = self.test_single_request(model, input_text) + + if success: + successes += 1 + latencies.append(latency) + else: + failures += 1 + + return self._calculate_stats(model, successes, failures, latencies) + + def test_model_concurrent(self, model: str, input_text: List[str], + iterations: int = 1000, max_workers: int = 10) -> Dict: + """并发执行性能测试""" + print(f"\n开始并发测试模型: {model}") + print(f"测试次数: {iterations}, 并发数: {max_workers}") + + successes = 0 + failures = 0 + latencies = [] + + with 
ThreadPoolExecutor(max_workers=max_workers) as executor: + # 提交所有任务 + future_to_request = { + executor.submit(self.test_single_request, model, input_text): i + for i in range(iterations) + } + + # 收集结果 + completed = 0 + for future in as_completed(future_to_request): + completed += 1 + if completed % 100 == 0: + print(f" 已完成 {completed}/{iterations} 次请求...") + + try: + success, latency = future.result() + if success: + successes += 1 + latencies.append(latency) + else: + failures += 1 + except Exception as e: + print(f"请求异常: {str(e)}") + failures += 1 + + return self._calculate_stats(model, successes, failures, latencies) + + def _calculate_stats(self, model: str, successes: int, + failures: int, latencies: List[float]) -> Dict: + """计算性能统计信息""" + if not latencies: + return { + "model": model, + "total_requests": successes + failures, + "successful_requests": successes, + "failed_requests": failures, + "success_rate": 0.0, + "error": "无成功请求" + } + + stats = { + "model": model, + "total_requests": successes + failures, + "successful_requests": successes, + "failed_requests": failures, + "success_rate": successes / (successes + failures) * 100, + "total_time": sum(latencies), + "avg_latency": statistics.mean(latencies), + "min_latency": min(latencies), + "max_latency": max(latencies), + "p50_latency": statistics.median(latencies), + "p95_latency": sorted(latencies)[int(len(latencies) * 0.95)], + "p99_latency": sorted(latencies)[int(len(latencies) * 0.99)], + "requests_per_second": len(latencies) / sum(latencies) if sum(latencies) > 0 else 0 + } + + # 添加标准差(如果有多于一个样本) + if len(latencies) > 1: + stats["std_dev"] = statistics.stdev(latencies) + + return stats + + def print_results(self, results: Dict): + """打印测试结果""" + print("\n" + "="*60) + print(f"性能测试结果 - {results['model']}") + print("="*60) + + if "error" in results: + print(f"错误: {results['error']}") + return + + print(f"总请求数: {results['total_requests']}") + print(f"成功请求: {results['successful_requests']}") + 
print(f"失败请求: {results['failed_requests']}") + print(f"成功率: {results['success_rate']:.2f}%") + print(f"总耗时: {results['total_time']:.4f}秒") + print(f"平均延迟: {results['avg_latency']:.4f}秒") + print(f"最小延迟: {results['min_latency']:.4f}秒") + print(f"最大延迟: {results['max_latency']:.4f}秒") + print(f"P50延迟: {results['p50_latency']:.4f}秒") + print(f"P95延迟: {results['p95_latency']:.4f}秒") + print(f"P99延迟: {results['p99_latency']:.4f}秒") + + if "std_dev" in results: + print(f"标准差: {results['std_dev']:.4f}秒") + + print(f"QPS: {results['requests_per_second']:.2f} 请求/秒") + print("="*60) + + def save_results(self, results_list: List[Dict], filename: str = "performance_results.json"): + """保存测试结果到JSON文件""" + with open(filename, 'w', encoding='utf-8') as f: + json.dump(results_list, f, indent=2, ensure_ascii=False) + print(f"\n结果已保存到: {filename}") + + +def main(): + """主函数""" + # 初始化测试器 + tester = EmbeddingPerformanceTester() + + # 测试配置 + test_input = ["What is the capital of China?"] + iterations = 1000 + test_models = ['bge-m3', 'Qwen3-Embedding-0.6B'] + + print("="*60) + print("Embedding API 性能测试") + print("="*60) + + all_results = [] + + # 测试模式选择 + print("\n选择测试模式:") + print("1. 顺序测试 (Sequential)") + print("2. 并发测试 (Concurrent)") + print("3. 
两种模式都测试") + + mode = input("请输入选择 (1/2/3, 默认1): ").strip() + + for model in test_models: + print(f"\n{'='*60}") + print(f"测试模型: {model}") + print(f"{'='*60}") + + if mode in ['2', '3']: + # 并发测试 + concurrent_results = tester.test_model_concurrent( + model=model, + input_text=test_input, + iterations=iterations, + max_workers=10 # 可根据需要调整并发数 + ) + tester.print_results(concurrent_results) + concurrent_results["test_mode"] = "concurrent" + all_results.append(concurrent_results) + + if mode in ['1', '3'] or not mode: + # 顺序测试 + sequential_results = tester.test_model_sequential( + model=model, + input_text=test_input, + iterations=iterations + ) + tester.print_results(sequential_results) + sequential_results["test_mode"] = "sequential" + all_results.append(sequential_results) + + # 保存结果 + tester.save_results(all_results) + + # 汇总对比 + print("\n" + "="*60) + print("性能测试汇总对比") + print("="*60) + + for result in all_results: + if "error" not in result: + print(f"\n模型: {result['model']} ({result['test_mode']})") + print(f" QPS: {result['requests_per_second']:.2f}") + print(f" 平均延迟: {result['avg_latency']:.4f}秒") + print(f" 成功率: {result['success_rate']:.2f}%") + + +if __name__ == "__main__": + # 添加一个简单的健康检查 + try: + tester = EmbeddingPerformanceTester() + # 快速测试连接 + test_result = tester.test_single_request('bge-m3', ["test"]) + if test_result[0]: + print("API连接正常,开始性能测试...") + main() + else: + print("API连接失败,请检查服务是否正常运行") + except Exception as e: + print(f"初始化失败: {str(e)}") + print("请确保OpenAI客户端已安装: pip install openai") diff --git a/third-party/xinference/perfermance_test_http.py b/third-party/xinference/perfermance_test_http.py new file mode 100644 index 0000000..590035a --- /dev/null +++ b/third-party/xinference/perfermance_test_http.py @@ -0,0 +1,265 @@ +import requests +import time +import statistics +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import List, Dict, Tuple +import json + + +class EmbeddingPerformanceTester: + def __init__(self, 
base_url: str = "http://127.0.0.1:9997/v1"): + """初始化性能测试器""" + self.base_url = base_url + self.embeddings_url = f"{base_url}/embeddings" + + def test_single_request(self, model: str, input_text: List[str]) -> Tuple[bool, float]: + """测试单个请求,返回成功状态和耗时""" + try: + start_time = time.perf_counter() + + # 构建请求体 + # 如果 input_text 是列表,取第一个元素;如果是字符串,直接使用 + input_value = input_text[0] if isinstance(input_text, list) and len(input_text) > 0 else input_text + + response = requests.post( + self.embeddings_url, + headers={ + 'accept': 'application/json', + 'Content-Type': 'application/json' + }, + json={ + "model": model, + "input": input_value + }, + timeout=60 # 设置超时时间 + ) + + end_time = time.perf_counter() + + # 验证响应格式 + if response.status_code == 200: + result = response.json() + if result and 'data' in result and len(result['data']) > 0: + return True, end_time - start_time + else: + return False, end_time - start_time + else: + return False, end_time - start_time + + except Exception as e: + print(f"请求失败 - 模型 {model}: {str(e)}") + return False, 0.0 + + def test_model_sequential(self, model: str, input_text: List[str], + iterations: int = 1000) -> Dict: + """顺序执行性能测试""" + print(f"\n开始顺序测试模型: {model}") + print(f"测试次数: {iterations}") + + successes = 0 + failures = 0 + latencies = [] + + for i in range(iterations): + if i % 100 == 0 and i > 0: + print(f" 已完成 {i}/{iterations} 次请求...") + + success, latency = self.test_single_request(model, input_text) + + if success: + successes += 1 + latencies.append(latency) + else: + failures += 1 + + return self._calculate_stats(model, successes, failures, latencies) + + def test_model_concurrent(self, model: str, input_text: List[str], + iterations: int = 1000, max_workers: int = 10) -> Dict: + """并发执行性能测试""" + print(f"\n开始并发测试模型: {model}") + print(f"测试次数: {iterations}, 并发数: {max_workers}") + + successes = 0 + failures = 0 + latencies = [] + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # 提交所有任务 + future_to_request = 
{ + executor.submit(self.test_single_request, model, input_text): i + for i in range(iterations) + } + + # 收集结果 + completed = 0 + for future in as_completed(future_to_request): + completed += 1 + if completed % 100 == 0: + print(f" 已完成 {completed}/{iterations} 次请求...") + + try: + success, latency = future.result() + if success: + successes += 1 + latencies.append(latency) + else: + failures += 1 + except Exception as e: + print(f"请求异常: {str(e)}") + failures += 1 + + return self._calculate_stats(model, successes, failures, latencies) + + def _calculate_stats(self, model: str, successes: int, + failures: int, latencies: List[float]) -> Dict: + """计算性能统计信息""" + if not latencies: + return { + "model": model, + "total_requests": successes + failures, + "successful_requests": successes, + "failed_requests": failures, + "success_rate": 0.0, + "error": "无成功请求" + } + + stats = { + "model": model, + "total_requests": successes + failures, + "successful_requests": successes, + "failed_requests": failures, + "success_rate": successes / (successes + failures) * 100, + "total_time": sum(latencies), + "avg_latency": statistics.mean(latencies), + "min_latency": min(latencies), + "max_latency": max(latencies), + "p50_latency": statistics.median(latencies), + "p95_latency": sorted(latencies)[int(len(latencies) * 0.95)], + "p99_latency": sorted(latencies)[int(len(latencies) * 0.99)], + "requests_per_second": len(latencies) / sum(latencies) if sum(latencies) > 0 else 0 + } + + # 添加标准差(如果有多于一个样本) + if len(latencies) > 1: + stats["std_dev"] = statistics.stdev(latencies) + + return stats + + def print_results(self, results: Dict): + """打印测试结果""" + print("\n" + "="*60) + print(f"性能测试结果 - {results['model']}") + print("="*60) + + if "error" in results: + print(f"错误: {results['error']}") + return + + print(f"总请求数: {results['total_requests']}") + print(f"成功请求: {results['successful_requests']}") + print(f"失败请求: {results['failed_requests']}") + print(f"成功率: {results['success_rate']:.2f}%") + 
def print_results(self, results: Dict):
    """Print one benchmark summary to stdout.

    Args:
        results: A statistics dict. If it has an ``error`` key only the
            header and that message are printed; otherwise the counts,
            success rate, latency figures, optional ``std_dev`` and QPS
            are printed in a fixed order.
    """
    rule = "=" * 60
    print("\n" + rule)
    print(f"性能测试结果 - {results['model']}")
    print(rule)

    if "error" in results:
        print(f"错误: {results['error']}")
        return

    count_rows = (
        ("总请求数", "total_requests"),
        ("成功请求", "successful_requests"),
        ("失败请求", "failed_requests"),
    )
    for label, key in count_rows:
        print(f"{label}: {results[key]}")

    print(f"成功率: {results['success_rate']:.2f}%")

    latency_rows = [
        ("总耗时", "total_time"),
        ("平均延迟", "avg_latency"),
        ("最小延迟", "min_latency"),
        ("最大延迟", "max_latency"),
        ("P50延迟", "p50_latency"),
        ("P95延迟", "p95_latency"),
        ("P99延迟", "p99_latency"),
    ]
    # The standard deviation is only present for runs with 2+ samples.
    if "std_dev" in results:
        latency_rows.append(("标准差", "std_dev"))
    for label, key in latency_rows:
        print(f"{label}: {results[key]:.4f}秒")

    print(f"QPS: {results['requests_per_second']:.2f} 请求/秒")
    print(rule)


def save_results(self, results_list: List[Dict], filename: str = "performance_results.json"):
    """Write all benchmark summaries to a UTF-8 JSON file.

    Args:
        results_list: Statistics dicts to serialise.
        filename: Destination path; an existing file is overwritten.
    """
    # ensure_ascii=False keeps the non-ASCII text readable in the file.
    serialized = json.dumps(results_list, indent=2, ensure_ascii=False)
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(serialized)
    print(f"\n结果已保存到: {filename}")
def main():
    """Run the interactive embedding benchmark for every configured model.

    Asks on stdin which mode to run (1 = sequential, 2 = concurrent,
    3 = both). Empty or unrecognised input falls back to mode 1. For each
    model the selected benchmarks are executed, printed and collected; the
    collected results are then saved through ``save_results`` and a short
    comparison is printed.
    """
    tester = EmbeddingPerformanceTester()

    # Benchmark configuration.
    test_input = ["What is the capital of China?"]
    iterations = 1000
    test_models = ['bge-m3', 'Qwen3-Embedding-0.6B']
    banner = "=" * 60

    print(banner)
    print("Embedding API 性能测试 (HTTP)")
    print(banner)

    all_results = []

    print("\n选择测试模式:")
    print("1. 顺序测试 (Sequential)")
    print("2. 并发测试 (Concurrent)")
    print("3. 两种模式都测试")

    mode = input("请输入选择 (1/2/3, 默认1): ").strip()
    if mode not in ('1', '2', '3'):
        # Without this fallback a typo such as "4" would skip every
        # benchmark and still write an empty result file.
        if mode:
            print(f"无效的选择 '{mode}', 使用默认模式 1")
        mode = '1'

    for model in test_models:
        print(f"\n{banner}")
        print(f"测试模型: {model}")
        print(banner)

        if mode in ('2', '3'):
            concurrent_results = tester.test_model_concurrent(
                model=model,
                input_text=test_input,
                iterations=iterations,
                max_workers=10  # adjust the concurrency level as needed
            )
            tester.print_results(concurrent_results)
            concurrent_results["test_mode"] = "concurrent"
            all_results.append(concurrent_results)

        if mode in ('1', '3'):
            sequential_results = tester.test_model_sequential(
                model=model,
                input_text=test_input,
                iterations=iterations
            )
            tester.print_results(sequential_results)
            sequential_results["test_mode"] = "sequential"
            all_results.append(sequential_results)

    tester.save_results(all_results)

    # Side-by-side summary of every run that produced statistics.
    print("\n" + banner)
    print("性能测试汇总对比")
    print(banner)

    for result in all_results:
        if "error" not in result:
            print(f"\n模型: {result['model']} ({result['test_mode']})")
            print(f" QPS: {result['requests_per_second']:.2f}")
            print(f" 平均延迟: {result['avg_latency']:.4f}秒")
            print(f" 成功率: {result['success_rate']:.2f}%")


if __name__ == "__main__":
    # Quick health check before starting the long benchmark.
    # test_single_request reports request errors itself and returns
    # (False, 0.0), so no try/except is needed here. main() is deliberately
    # not wrapped in one either: a failure inside the benchmark should show
    # its real traceback instead of being reported as an installation
    # problem.
    api_ok, _ = EmbeddingPerformanceTester().test_single_request('bge-m3', ["test"])
    if api_ok:
        print("API连接正常,开始性能测试...")
        main()
    else:
        print("API连接失败,请检查服务是否正常运行")
"""Single-shot latency check of the embedding models.

Each model is called once through the OpenAI-compatible SDK and once through
the raw HTTP endpoint, and the elapsed time of every call is printed.
"""
import openai
import time
import requests
import json


MODELS = ('bge-m3', 'Qwen3-Embedding-0.6B')
PROMPT = "What is the capital of China?"
REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled server hangs the script

client = openai.Client(
    api_key="cannot be empty",
    base_url="http://127.0.0.1:9997/v1"
)


def time_sdk_embedding(model):
    """Return the seconds taken by one SDK embedding call for ``model``."""
    started = time.perf_counter()
    client.embeddings.create(
        model=model,
        input=[PROMPT]
    )
    return time.perf_counter() - started


def report_http_embedding(url, model):
    """POST one embedding request for ``model`` to ``url`` and print the outcome."""
    print(f"\n测试模型: {model}")
    started = time.perf_counter()

    response = requests.post(
        url,
        headers={
            'accept': 'application/json',
            'Content-Type': 'application/json'
        },
        json={
            "model": model,
            "input": PROMPT
        },
        timeout=REQUEST_TIMEOUT
    )

    elapsed = time.perf_counter() - started

    if response.status_code != 200:
        print(f"请求失败,状态码: {response.status_code}")
        print(f"错误信息: {response.text}")
        return

    result = response.json()
    # Fall back to one empty record so a missing or empty "data" list
    # reports dimension 0 instead of raising IndexError.
    records = result.get('data') or [{}]
    print(f"状态码: {response.status_code}")
    print(f"模型: {result.get('model', 'N/A')}")
    print(f"使用token数: {result.get('usage', {}).get('total_tokens', 'N/A')}")
    print(f"嵌入向量维度: {len(records[0].get('embedding', []))}")
    print(f"耗时: {elapsed:.4f} 秒")


# The checks run at module level because the file is executed directly
# as a script.
for model_name in MODELS:
    print(f"\n耗时: {time_sdk_embedding(model_name):.4f} 秒")

# ========== HTTP API test ==========
print("\n" + "="*50)
print("HTTP API 测试")
print("="*50)

XINFERENCE_HOST = "127.0.0.1"
XINFERENCE_PORT = "9997"
base_url = f"http://{XINFERENCE_HOST}:{XINFERENCE_PORT}/v1/embeddings"

for model_name in MODELS:
    report_http_embedding(base_url, model_name)
$REPLY =~ ^[Yy]$ ]]; then - echo "跳过设置" - exit 0 - fi - # 删除旧的别名 - sed -i '/# Xinference 环境快捷命令/,/xinference-cd/d' "$SHELL_CONFIG" -fi - -# 添加别名 -echo "" >> "$SHELL_CONFIG" -echo "$ALIAS_CONTENT" >> "$SHELL_CONFIG" - -echo "✅ 别名已添加到 $SHELL_CONFIG" -echo "" -echo "=========================================" -echo " 可用快捷命令" -echo "=========================================" -echo "" -echo " 1. xinference-env - 激活环境并切换到目录" -echo " 2. xinference-activate - 仅激活环境" -echo " 3. xinference-cd - 切换到 Xinference 目录" -echo "" -echo "=========================================" -echo " 使配置生效" -echo "=========================================" -echo "" -echo "运行以下命令使配置生效:" -echo "" -echo " source $SHELL_CONFIG" -echo "" -echo "或者重新打开终端" -echo "" -echo "然后就可以直接使用 xinference-env 命令了!" -echo "" diff --git a/third-party/xinference/setup_env.sh b/third-party/xinference/setup_env.sh deleted file mode 100755 index 95ba5b7..0000000 --- a/third-party/xinference/setup_env.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -# Xinference Conda 环境创建脚本 -# 用于创建独立的 Python 环境来运行 Xinference 客户端 - -set -e - -ENV_NAME="xinference" -PYTHON_VERSION="3.10" - -echo "=========================================" -echo " 创建 Xinference Conda 环境" -echo "=========================================" -echo "" - -# 检查 conda 是否可用 -if ! command -v conda &> /dev/null; then - echo "❌ 错误: conda 未安装或未在 PATH 中" - echo " 请先安装 Miniconda 或 Anaconda" - exit 1 -fi - -# 检查环境是否已存在 -if conda env list | grep -q "^${ENV_NAME} "; then - echo "⚠️ 环境 '${ENV_NAME}' 已存在" - read -p "是否删除并重新创建? (y/N): " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - echo "🗑️ 删除旧环境..." - conda env remove -n $ENV_NAME -y - else - echo "✅ 使用现有环境" - echo " 激活命令: conda activate $ENV_NAME" - exit 0 - fi -fi - -echo "🔨 创建 conda 环境: ${ENV_NAME} (Python ${PYTHON_VERSION})" -conda create -n $ENV_NAME python=$PYTHON_VERSION -y - -echo "" -echo "📦 安装依赖包..." 
- -# 激活环境并安装依赖 -eval "$(conda shell.bash hook)" -conda activate $ENV_NAME - -# 基础依赖 -pip install -U pip - -# Xinference 客户端 -pip install xinference-client - -# 其他可能需要的依赖 -pip install numpy -pip install requests - -echo "" -echo "✅ 环境创建完成!" -echo "" -echo "=========================================" -echo " 环境信息" -echo "=========================================" -echo "环境名称: ${ENV_NAME}" -echo "Python 版本: ${PYTHON_VERSION}" -echo "" -echo "🚀 使用方法:" -echo "" -echo " 1. 激活环境:" -echo " conda activate ${ENV_NAME}" -echo "" -echo " 2. 运行演示:" -echo " python ecommerce_demo.py" -echo "" -echo " 3. 退出环境:" -echo " conda deactivate" -echo "" -echo "💡 提示: 将以下别名添加到 ~/.bashrc 或 ~/.zshrc 以快速切换:" -echo " alias xinference='conda activate ${ENV_NAME}'" -echo "" diff --git a/third-party/xinference/start.sh b/third-party/xinference/start.sh deleted file mode 100755 index b4ecf89..0000000 --- a/third-party/xinference/start.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash - -# Xinference Docker 部署脚本 -# 用于启动 Xinference 服务 - -set -e - -# 检查是否需要 sudo -if sudo -n docker info &> /dev/null 2>&1; then - DOCKER="sudo docker" - # 检测 Docker Compose 的形式(V2: docker compose 或 V1: docker-compose) - if sudo docker compose version &> /dev/null 2>&1; then - DOCKER_COMPOSE="sudo docker compose" - echo "✅ 检测到 Docker Compose V2" - elif command -v docker-compose &> /dev/null; then - DOCKER_COMPOSE="sudo docker-compose" - echo "✅ 检测到 Docker Compose V1" - else - echo "❌ 错误: 未找到 Docker Compose" - exit 1 - fi -else - DOCKER="docker" - # 检测 Docker Compose 的形式 - if docker compose version &> /dev/null 2>&1; then - DOCKER_COMPOSE="docker compose" - echo "✅ 检测到 Docker Compose V2" - elif command -v docker-compose &> /dev/null; then - DOCKER_COMPOSE="docker-compose" - echo "✅ 检测到 Docker Compose V1" - else - echo "❌ 错误: 未找到 Docker Compose" - exit 1 - fi -fi - -echo "=========================================" -echo " Xinference Docker 部署脚本" -echo "=========================================" -echo "" - -# 检查 GPU -echo "🔍 检查 
GPU 可用性..." -if command -v nvidia-smi &> /dev/null; then - echo "✅ 检测到 NVIDIA GPU:" - nvidia-smi --query-gpu=name,memory.total --format=csv,noheader | head -n 1 - - # 检查 NVIDIA Container Toolkit - if $DOCKER run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi &> /dev/null; then - GPU_AVAILABLE=true - echo "✅ NVIDIA Container Toolkit 已安装,GPU 可用" - else - GPU_AVAILABLE=false - echo "⚠️ 检测到 GPU 但 NVIDIA Container Toolkit 未安装" - echo "⚠️ 将使用 CPU 模式启动" - echo "💡 如需 GPU 加速,请安装 NVIDIA Container Toolkit(见下方说明)" - fi -else - echo "⚠️ 未检测到 NVIDIA GPU,将使用 CPU 模式" - GPU_AVAILABLE=false -fi - -# 创建模型目录 -echo "" -echo "📁 创建模型存储目录..." -mkdir -p models - -# 拉取镜像 -echo "" -echo "🐳 拉取 Xinference Docker 镜像..." -if [ "$GPU_AVAILABLE" = true ]; then - $DOCKER pull xprobe/xinference:latest -else - echo "⚠️ CPU 模式:如需 GPU 支持,请配置好 NVIDIA Docker 运行时" - $DOCKER pull xprobe/xinference:latest -fi - -# 停止并删除旧容器 -echo "" -echo "🛑 停止并删除旧容器..." -$DOCKER stop xinference 2>/dev/null || true -$DOCKER rm xinference 2>/dev/null || true -$DOCKER_COMPOSE down 2>/dev/null || true - -# 启动服务 -echo "" -echo "🚀 启动 Xinference 服务..." -if [ "$GPU_AVAILABLE" = true ]; then - echo "🔥 使用 GPU 模式启动..." - $DOCKER_COMPOSE up -d -else - # CPU 模式:直接使用 docker run - echo "💻 使用 CPU 模式启动..." - $DOCKER run -d \ - --name xinference \ - -p 9997:9997 -p 9998:9998 \ - -v "$(pwd)/models:/data" \ - -e XINFERENCE_HOME=/data \ - --restart unless-stopped \ - xprobe/xinference:latest \ - xinference-local -H 0.0.0.0 -fi - -# 等待服务启动 -echo "" -echo "⏳ 等待服务启动..." -for i in {1..30}; do - if curl -s http://localhost:9997/v1/models > /dev/null 2>&1; then - echo "✅ Xinference 服务启动成功!" - break - fi - echo " 等待中... ($i/30)" - sleep 2 -done - -# 检查服务状态 -echo "" -echo "📊 服务状态检查..." -if curl -s http://localhost:9997/v1/models > /dev/null 2>&1; then - echo "✅ 服务健康检查通过" - echo "" - echo "=========================================" - echo " 🎉 部署成功!" 
- echo "=========================================" - echo "" - echo "📍 服务地址:" - echo " - API: http://localhost:9997" - echo " - Dashboard: http://localhost:9998" - echo "" - echo "📝 下一步操作:" - echo " 1. 查看日志: $DOCKER logs -f xinference" - echo " 2. 部署模型: python deploy_models.py" - echo " 3. 测试搜索: python ecommerce_demo.py" - echo "" - - if [ "$GPU_AVAILABLE" = false ]; then - echo "💡 启用 GPU 加速(可选):" - echo " 如果你想使用 GPU 加速,请安装 NVIDIA Container Toolkit:" - echo "" - echo " # 1. 添加 NVIDIA 仓库" - echo " curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -" - echo " distribution=\$(. /etc/os-release;echo \$ID\$VERSION_ID)" - echo " curl -s -L https://nvidia.github.io/nvidia-docker/\$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list" - echo "" - echo " # 2. 安装 NVIDIA Container Toolkit" - echo " sudo apt-get update" - echo " sudo apt-get install -y nvidia-container-toolkit" - echo "" - echo " # 3. 重启 Docker" - echo " sudo systemctl restart docker" - echo "" - echo " # 4. 重新运行此脚本" - echo " bash start.sh" - echo "" - fi - - echo "📚 查看所有已部署模型:" - echo " curl http://localhost:9997/v1/models" - echo "" -else - echo "❌ 服务启动失败,请查看日志:" - echo " $DOCKER logs xinference" - exit 1 -fi diff --git a/third-party/xinference/start_gpu_manual.sh b/third-party/xinference/start_gpu_manual.sh deleted file mode 100755 index b852ab0..0000000 --- a/third-party/xinference/start_gpu_manual.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/bash - -# Xinference GPU 模式启动脚本(手动设备映射) -# 不依赖 nvidia-container-toolkit - -set -e - -echo "=========================================" -echo " Xinference GPU 模式启动(手动配置)" -echo "=========================================" -echo "" - -# 检查 GPU 设备 -if [ ! -e /dev/nvidia0 ]; then - echo "❌ 错误: 未找到 NVIDIA 设备文件" - echo " 请确保已安装 NVIDIA 驱动" - exit 1 -fi - -echo "✅ 检测到 NVIDIA 设备文件:" -ls -la /dev/nvidia* | head -5 -echo "" - -# 停止旧容器 -echo "🛑 停止旧容器..." 
-sudo docker stop xinference 2>/dev/null || true -sudo docker rm xinference 2>/dev/null || true -echo "" - -# 创建模型目录 -mkdir -p models - -# 拉取镜像(如果需要) -echo "🐳 检查镜像..." -if ! sudo docker image inspect xprobe/xinference:latest &> /dev/null; then - echo "拉取 Xinference 镜像..." - sudo docker pull xprobe/xinference:latest -fi -echo "" - -# 启动容器(手动映射 GPU 设备和库) -echo "🚀 启动 Xinference 容器(GPU 模式)..." -echo "映射设备: /dev/nvidia0, /dev/nvidiactl, /dev/nvidia-uvm" -echo "映射库: /usr/lib/libcuda.so*" -echo "" - -sudo docker run -d \ - --name xinference \ - --restart unless-stopped \ - -p 9997:9997 \ - -p 9998:9998 \ - -v "$(pwd)/models:/data" \ - -v /usr/lib/libcuda.so.1:/usr/lib/x86_64-linux-gnu/libcuda.so.1 \ - -v /usr/lib/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 \ - -e XINFERENCE_HOME=/data \ - -e LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64 \ - --device /dev/nvidia0 \ - --device /dev/nvidiactl \ - --device /dev/nvidia-uvm \ - --device /dev/nvidia-uvm-tools \ - --security-opt "label=disable" \ - --privileged \ - xprobe/xinference:latest \ - xinference-local -H 0.0.0.0 - -echo "" -echo "⏳ 等待服务启动..." -sleep 3 - -# 检查容器状态 -if sudo docker ps | grep -q xinference; then - echo "✅ 容器启动成功!" - echo "" - echo "=========================================" - echo " 🎉 启动成功!" - echo "=========================================" - echo "" - echo "📍 服务地址:" - echo " - API: http://localhost:9997" - echo " - Dashboard: http://localhost:9998" - echo "" - echo "📝 查看日志:" - echo " sudo docker logs -f xinference" - echo "" - echo "📝 测试 GPU:" - echo " sudo docker exec xinference nvidia-smi" - echo "" -else - echo "❌ 容器启动失败,查看日志:" - sudo docker logs xinference - exit 1 -fi diff --git a/third-party/xinference/stop.sh b/third-party/xinference/stop.sh old mode 100755 new mode 100644 index 47d0023..e9b4c2e --- a/third-party/xinference/stop.sh +++ b/third-party/xinference/stop.sh @@ -1,14 +1,2 @@ -#!/bin/bash -# Xinference 停止脚本 - -echo "🛑 停止 Xinference 服务..." 
#!/bin/bash
# Terminate the three benchmark models.
#
# `xinference terminate` declares --model-uid as a single-value option, so
# repeating the flag on one command line keeps only the last value. Each
# model therefore gets its own invocation.

for model_uid in Qwen3-Reranker-0.6B bge-m3 Qwen3-Embedding-0.6B; do
    # Keep going when one model is not running so the others still stop.
    xinference terminate --model-uid "$model_uid" \
        || echo "failed to terminate model: $model_uid" >&2
done
或启动集群模式 -xinference-supervisor -H ${SUPERVISOR_HOST} -xinference-worker -e "http://${SUPERVISOR_HOST}:9997" -``` - ---- - -## **三、模型部署与使用** - -### **步骤1:部署Qwen3-Embedding模型** - -```python -from xinference.client import Client - -# 连接Xinference服务 -client = Client("http://localhost:9997") - -# 启动Qwen3-Embedding模型 -model_uid = client.launch_model( - model_name="qwen3-embedding", - model_size_in_billions=0, # 0表示自动选择可用版本 - model_type="embedding", - engine="vllm", # 推荐vLLM引擎 - gpu_idx="0", # 指定GPU -) -print(f"Embedding模型UID: {model_uid}") -``` - -### **步骤2:部署Qwen3-Reranker模型** - -```python -# 启动Qwen3-Reranker模型 -reranker_uid = client.launch_model( - model_name="qwen3-reranker", - model_size_in_billions=0, - model_type="rerank", # 明确指定为reranker - engine="vllm", - gpu_idx="0", # 可与embedding同卡,注意显存 -) -print(f"Reranker模型UID: {reranker_uid}") -``` - -### **步骤3:电商搜索实战使用** - -```python -# 获取模型实例 -embedding_model = client.get_model(model_uid) -reranker_model = client.get_model(reranker_uid) - -# 示例:电商搜索query和商品标题 -query = "适合老人用的智能手机大屏幕长续航" -candidate_products = [ - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", - "iPhone 15 Pro Max 专业摄影旗舰", - "华为畅享60 6000mAh超长续航 护眼大屏", - "OPPO A1 5000mAh电池 简易模式适合长辈", - "小米手环8 智能运动监测", -] - -# 阶段1:密集检索召回Top-200 -query_embedding = embedding_model.create_embedding(query)["data"][0]["embedding"] -# 使用Faiss或向量数据库批量检索(伪代码) -# candidate_embeddings = [embedding_model.create_embedding(p)["data"][0]["embedding"] for p in candidate_products] -# top200_ids = faiss_search(query_embedding, candidate_embeddings, k=200) - -# 阶段2:精排Top-200为Top-50(实际场景) -# 为演示简化为精排全部5个 -pairs = [(query, product) for product in candidate_products] -rerank_scores = reranker_model.rerank(pairs) - -# 按重排序分数排序 -sorted_results = sorted( - zip(candidate_products, rerank_scores), - key=lambda x: x[1]["relevance_score"], - reverse=True -) - -# 输出结果 -for product, score in sorted_results: - print(f"相似度: {score['relevance_score']:.4f} | 商品: {product}") - -# 预期输出: -# 相似度: 0.9234 | 商品: 华为畅享60 
6000mAh超长续航 护眼大屏 -# 相似度: 0.8912 | 商品: OPPO A1 5000mAh电池 简易模式适合长辈 -# 相似度: 0.8567 | 商品: 红米Note12 5000mAh大电量 6.67英寸大屏 老人模式 -# 相似度: 0.2345 | 商品: iPhone 15 Pro Max 专业摄影旗舰 -# 相似度: 0.1234 | 商品: 小米手环8 智能运动监测 -``` - ---- - -## **四、REST API调用方式** - -### **Embedding API(兼容OpenAI格式)** - -```bash -# 实时生成query向量 -curl -X POST http://localhost:9997/v1/embeddings \ - -H "Content-Type: application/json" \ - -d '{ - "model": "qwen3-embedding", - "input": ["适合老人用的智能手机大屏幕长续航"] - }' - -# 批量生成商品向量(离线任务) -curl -X POST http://localhost:9997/v1/embeddings \ - -H "Content-Type: application/json" \ - -d '{ - "model": "qwen3-embedding", - "input": ["商品标题1", "商品标题2", ..., "商品标题10000"] - }' -``` - -### **Reranker API** - -```bash -# 精排候选商品 -curl -X POST http://localhost:9997/v1/rerank \ - -H "Content-Type: application/json" \ - -d '{ - "model": "qwen3-reranker", - "query": "适合老人用的智能手机大屏幕长续航", - "documents": [ - "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式", - "华为畅享60 6000mAh超长续航 护眼大屏" - ], - "top_n": 10 - }' -``` - ---- - -## **五、关键参数与优化** - -### **1. 显存配置建议** -```python -# 对于显存有限的场景,使用CPU卸载 -reranker_uid = client.launch_model( - model_name="qwen3-reranker", - model_type="rerank", - engine="vllm", - cpu_offload_gb=16, # 将部分计算卸载到CPU - gpu_memory_utilization=0.6, # 限制显存使用 -) -``` - -### **2. 性能调优** -```python -# Embedding批量处理优化 -embedding_model.create_embedding( - input_texts, - batch_size=100, # 增大批量提升吞吐 - normalize=True # 归一化向量用于余弦相似度 -) - -# Reranker并发控制(避免OOM) -reranker_model.rerank( - pairs, - batch_size=8, # 根据显存调整,A10G建议8-16 - max_length=512 # 限制输入长度 -) -``` - -### **3. 
电商搜索最佳实践** -- **离线批量**:每天凌晨全量重新生成2亿商品embedding,存入向量数据库(Milvus/Pinecone) -- **在线实时**:用户query实时embedding,召回Top-1000 -- **精排阶段**:对Top-1000使用Qwen3-Reranker打分,取Top-50返回 -- **缓存策略**:TOP 10000热搜query的embedding和rerank结果缓存Redis,QPS提升10倍 -- **混合检索**:结合BM25关键词召回,提升头部Query准确率 - ---- - -## **六、版本兼容性矩阵** - -| Xinference版本 | Qwen3-Embedding | Qwen3-Reranker | 推荐度 | -|----------------|-----------------|----------------|--------| -| < v1.7.0 | ❌ 不支持 | ❌ 不支持 | 必须升级| -| v1.7.0 | ✅ 支持 | ⚠️ 有batch bug | 慎用 | -| v1.7.0.post1 | ✅ 支持 | ✅ 基本支持 | 可用 | -| **≥ v1.7.1** | **✅ 完美支持** | **✅ 完美支持** | **强烈推荐** | - -**建议**:生产环境务必使用 **v1.7.1或更高版本** - ---- - -## **七、监控与运维** - -```bash -# 查看模型运行状态 -curl http://localhost:9997/v1/models - -# 监控GPU显存(推荐部署Prometheus+Grafana) -nvidia-smi -l 1 - -# 日志排查 -docker logs -f xinference --tail 100 -``` - ---- - -## **总结** - -Xinference + Qwen3-Embedding + Qwen3-Reranker是**完全可行且已生产可用**的组合,特别适合电商搜索场景。只需注意: -1. **版本必须≥1.7.1** -2. **Reranker建议单独部署以避免显存争抢** -3. **两阶段检索(海选+精排)是最佳实践** - -如需具体压测数据或故障排查,可进一步咨询! \ No newline at end of file diff --git a/third-party/xinference/测试结果-perfermance_test.txt b/third-party/xinference/测试结果-perfermance_test.txt new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/third-party/xinference/测试结果-perfermance_test.txt diff --git a/third-party/xinference/测试结果-perfermance_test_http.txt b/third-party/xinference/测试结果-perfermance_test_http.txt new file mode 100644 index 0000000..3f63bb4 --- /dev/null +++ b/third-party/xinference/测试结果-perfermance_test_http.txt @@ -0,0 +1,167 @@ +$ p perfermance_test_http.py +API连接正常,开始性能测试... +============================================================ +Embedding API 性能测试 (HTTP) +============================================================ + +选择测试模式: +1. 顺序测试 (Sequential) +2. 并发测试 (Concurrent) +3. 
两种模式都测试 +请输入选择 (1/2/3, 默认1): 3 + +============================================================ +测试模型: bge-m3 +============================================================ + +开始并发测试模型: bge-m3 +测试次数: 1000, 并发数: 10 + 已完成 100/1000 次请求... + 已完成 200/1000 次请求... + 已完成 300/1000 次请求... + 已完成 400/1000 次请求... + 已完成 500/1000 次请求... + 已完成 600/1000 次请求... + 已完成 700/1000 次请求... + 已完成 800/1000 次请求... + 已完成 900/1000 次请求... + 已完成 1000/1000 次请求... + +============================================================ +性能测试结果 - bge-m3 +============================================================ +总请求数: 1000 +成功请求: 1000 +失败请求: 0 +成功率: 100.00% +总耗时: 145.1439秒 +平均延迟: 0.1451秒 +最小延迟: 0.0311秒 +最大延迟: 0.5770秒 +P50延迟: 0.0599秒 +P95延迟: 0.5151秒 +P99延迟: 0.5704秒 +标准差: 0.1789秒 +QPS: 6.89 请求/秒 +============================================================ + +开始顺序测试模型: bge-m3 +测试次数: 1000 + 已完成 100/1000 次请求... + 已完成 200/1000 次请求... + 已完成 300/1000 次请求... + 已完成 400/1000 次请求... + 已完成 500/1000 次请求... + 已完成 600/1000 次请求... + 已完成 700/1000 次请求... + 已完成 800/1000 次请求... + 已完成 900/1000 次请求... + +============================================================ +性能测试结果 - bge-m3 +============================================================ +总请求数: 1000 +成功请求: 1000 +失败请求: 0 +成功率: 100.00% +总耗时: 74.5284秒 +平均延迟: 0.0745秒 +最小延迟: 0.0271秒 +最大延迟: 0.5767秒 +P50延迟: 0.0286秒 +P95延迟: 0.4797秒 +P99延迟: 0.5037秒 +标准差: 0.1364秒 +QPS: 13.42 请求/秒 +============================================================ + +============================================================ +测试模型: Qwen3-Embedding-0.6B +============================================================ + +开始并发测试模型: Qwen3-Embedding-0.6B +测试次数: 1000, 并发数: 10 + 已完成 100/1000 次请求... + 已完成 200/1000 次请求... + 已完成 300/1000 次请求... + 已完成 400/1000 次请求... + 已完成 500/1000 次请求... + 已完成 600/1000 次请求... + 已完成 700/1000 次请求... + 已完成 800/1000 次请求... + 已完成 900/1000 次请求... + 已完成 1000/1000 次请求... 
+ +============================================================ +性能测试结果 - Qwen3-Embedding-0.6B +============================================================ +总请求数: 1000 +成功请求: 1000 +失败请求: 0 +成功率: 100.00% +总耗时: 195.7997秒 +平均延迟: 0.1958秒 +最小延迟: 0.0564秒 +最大延迟: 0.6201秒 +P50延迟: 0.1050秒 +P95延迟: 0.5674秒 +P99延迟: 0.5994秒 +标准差: 0.1829秒 +QPS: 5.11 请求/秒 +============================================================ + +开始顺序测试模型: Qwen3-Embedding-0.6B +测试次数: 1000 + 已完成 100/1000 次请求... + 已完成 200/1000 次请求... + 已完成 300/1000 次请求... + 已完成 400/1000 次请求... + 已完成 500/1000 次请求... + 已完成 600/1000 次请求... + 已完成 700/1000 次请求... + 已完成 800/1000 次请求... + 已完成 900/1000 次请求... + +============================================================ +性能测试结果 - Qwen3-Embedding-0.6B +============================================================ +总请求数: 1000 +成功请求: 1000 +失败请求: 0 +成功率: 100.00% +总耗时: 100.2533秒 +平均延迟: 0.1003秒 +最小延迟: 0.0513秒 +最大延迟: 0.6249秒 +P50延迟: 0.0539秒 +P95延迟: 0.4993秒 +P99延迟: 0.5180秒 +标准差: 0.1354秒 +QPS: 9.97 请求/秒 +============================================================ + +结果已保存到: performance_results.json + +============================================================ +性能测试汇总对比 +============================================================ + +模型: bge-m3 (concurrent) + QPS: 6.89 + 平均延迟: 0.1451秒 + 成功率: 100.00% + +模型: bge-m3 (sequential) + QPS: 13.42 + 平均延迟: 0.0745秒 + 成功率: 100.00% + +模型: Qwen3-Embedding-0.6B (concurrent) + QPS: 5.11 + 平均延迟: 0.1958秒 + 成功率: 100.00% + +模型: Qwen3-Embedding-0.6B (sequential) + QPS: 9.97 + 平均延迟: 0.1003秒 + 成功率: 100.00% -- libgit2 0.21.2