Commit 3d508beb9fae5ebcd83a244ece6949be47296fed
1 parent 87cacb1b
reranker-4b-gguf
Showing 17 changed files with 706 additions and 47 deletions
config/config.yaml
| ... | ... | @@ -381,7 +381,7 @@ services: |
| 381 | 381 | max_docs: 1000 |
| 382 | 382 | normalize: true |
| 383 | 383 | # In-service backend (read when the reranker process starts) |
| 384 | - backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank | |
| 384 | + backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank | |
| 385 | 385 | backends: |
| 386 | 386 | bge: |
| 387 | 387 | model_name: "BAAI/bge-reranker-v2-m3" |
| ... | ... | @@ -420,6 +420,29 @@ services: |
| 420 | 420 | use_fp16: true |
| 421 | 421 | # sdpa: no flash-attn needed by default; switch to flash_attention_2 if flash_attn is installed |
| 422 | 422 | attn_implementation: "sdpa" |
| 423 | + qwen3_gguf: | |
| 424 | + repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" | |
| 425 | + filename: "*Q8_0.gguf" | |
| 426 | + cache_dir: "./model_cache" | |
| 427 | + local_dir: "./models/reranker/qwen3-reranker-4b-gguf" | |
| 428 | + instruction: "Rank products by query with category & style match prioritized" | |
| 429 | + # Conservative settings for a T4 16GB with roughly 5~6GB of free VRAM | |
| 430 | + n_ctx: 384 | |
| 431 | + n_batch: 384 | |
| 432 | + n_ubatch: 128 | |
| 433 | + n_gpu_layers: 24 | |
| 434 | + main_gpu: 0 | |
| 435 | + n_threads: 2 | |
| 436 | + n_threads_batch: 4 | |
| 437 | + flash_attn: true | |
| 438 | + offload_kqv: true | |
| 439 | + use_mmap: true | |
| 440 | + use_mlock: false | |
| 441 | + infer_batch_size: 8 | |
| 442 | + sort_by_doc_length: true | |
| 443 | + length_sort_mode: "char" | |
| 444 | + enable_warmup: true | |
| 445 | + verbose: false | |
| 423 | 446 | dashscope_rerank: |
| 424 | 447 | model_name: "qwen3-rerank" |
| 425 | 448 | # Choose the endpoint by region: | ... | ... |
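After switching `services.rerank.backend`, a quick way to confirm which backend the service will actually load is to query the existing config helper. This is a minimal sketch; it only assumes that `config.services_config.get_rerank_backend_config()` returns a tuple whose first element is the backend name, which is how `scripts/start_reranker.sh` previously resolved it:

```python
# Minimal sanity check: print the rerank backend resolved from config/config.yaml.
# Assumes get_rerank_backend_config()[0] is the backend name (see scripts/start_reranker.sh).
from config.services_config import get_rerank_backend_config

backend_name = get_rerank_backend_config()[0]
print(f"services.rerank.backend resolves to: {backend_name}")
```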
requirements_reranker_service.txt
| 1 | -# Isolated dependencies for reranker service (.venv-reranker) | |
| 1 | +# Legacy alias: qwen3_vllm reranker service env (.venv-reranker). | |
| 2 | 2 | # |
| 3 | -# Default backend is qwen3_vllm (Qwen3-Reranker-0.6B). | |
| 3 | +# Prefer backend-specific requirements files: | |
| 4 | +# - requirements_reranker_qwen3_vllm.txt | |
| 5 | +# - requirements_reranker_qwen3_gguf.txt | |
| 6 | +# - requirements_reranker_qwen3_transformers.txt | |
| 7 | +# - requirements_reranker_bge.txt | |
| 8 | +# - requirements_reranker_dashscope.txt | |
| 4 | 9 | |
| 5 | -fastapi>=0.100.0 | |
| 6 | -uvicorn[standard]>=0.23.0 | |
| 7 | -pydantic>=2.0.0 | |
| 8 | -numpy>=1.24.0 | |
| 9 | -pyyaml>=6.0 | |
| 10 | -transformers>=4.30.0 | |
| 11 | -vllm>=0.8.5 | |
| 10 | +-r requirements_reranker_qwen3_vllm.txt | ... | ... |
reranker/DEPLOYMENT_AND_TUNING.md
| 1 | -# Reranker Deployment and Performance Tuning Guide (Qwen3-vLLM) | |
| 1 | +# Reranker Deployment and Performance Tuning Guide (Qwen3-vLLM / Qwen3-GGUF) | |
| 2 | 2 | |
| 3 | 3 | This document captures reusable practices from this project's e-commerce search reranking work, covering: |
| 4 | 4 | |
| 5 | 5 | - Environment preparation, installation, and deployment |
| 6 | -- `qwen3_vllm` configuration options and optimization approach | |
| 6 | +- `qwen3_vllm` / `qwen3_gguf` configuration options and optimization approach | |
| 7 | 7 | - Benchmark workflow for the 1000-doc scenario |
| 8 | 8 | - Key findings and recommended default parameters |
| 9 | 9 | - Common troubleshooting |
| 10 | 10 | |
| 11 | 11 | Scope: |
| 12 | 12 | |
| 13 | -- Rerank backend: `services.rerank.backend: qwen3_vllm` | |
| 14 | -- Model: `Qwen/Qwen3-Reranker-0.6B` | |
| 13 | +- Rerank backend: `services.rerank.backend: qwen3_vllm` or `qwen3_gguf` | |
| 14 | +- Model: `Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` | |
| 15 | 15 | - Scenario: short queries (usually < 100 tokens), docs are product titles or title + short description, about 1000 docs per request |
| 16 | 16 | |
| 17 | 17 | ## 1. Environment Baseline |
| 18 | 18 | |
| 19 | -Verified environment as of 2026-03-11: | |
| 19 | +Verified environment as of 2026-03-25: | |
| 20 | 20 | |
| 21 | 21 | - GPU: `Tesla T4 16GB` |
| 22 | 22 | - Driver / CUDA: `570.158.01 / 12.8` |
| 23 | 23 | - Python: `3.12.3` |
| 24 | -- Key dependencies: `vllm==0.17.0`, `torch==2.10.0+cu128`, `transformers==4.57.6`, `fastapi==0.135.1`, `uvicorn==0.41.0` | |
| 24 | +- Key dependencies: `vllm==0.17.0`, `torch==2.10.0+cu128`, `transformers==4.57.6`, `llama-cpp-python>=0.3.16`, `fastapi==0.135.1`, `uvicorn==0.41.0` | |
| 25 | 25 | |
| 26 | 26 | ## 2. Environment Preparation and Installation |
| 27 | 27 | |
| 28 | 28 | ### 2.1 Set up the isolated reranker virtualenv |
| 29 | 29 | |
| 30 | 30 | ```bash |
| 31 | -./scripts/setup_reranker_venv.sh | |
| 31 | +./scripts/setup_reranker_venv.sh qwen3_vllm | |
| 32 | +``` | |
| 33 | + | |
| 34 | +If you use GGUF and need CUDA: | |
| 35 | + | |
| 36 | +```bash | |
| 37 | +./scripts/setup_reranker_venv.sh qwen3_gguf | |
| 38 | +PATH=/usr/local/cuda/bin:$PATH \ | |
| 39 | +CUDACXX=/usr/local/cuda/bin/nvcc \ | |
| 40 | +CMAKE_ARGS="-DGGML_CUDA=on" \ | |
| 41 | +FORCE_CMAKE=1 \ | |
| 42 | +./.venv-reranker-gguf/bin/pip install --no-cache-dir --force-reinstall --no-build-isolation llama-cpp-python==0.3.18 | |
| 32 | 43 | ``` |
| 33 | 44 | |
| 34 | 45 | ### 2.2 Basic checks |
| ... | ... | @@ -37,6 +48,7 @@ |
| 37 | 48 | nvidia-smi |
| 38 | 49 | ./.venv-reranker/bin/python -c "import torch; print(torch.cuda.is_available())" |
| 39 | 50 | ./.venv-reranker/bin/python -c "import vllm, transformers; print(vllm.__version__, transformers.__version__)" |
| 51 | +./.venv-reranker-gguf/bin/python -c "import llama_cpp; print(llama_cpp.__version__)" | |
| 40 | 52 | ``` |
| 41 | 53 | |
| 42 | 54 | ## 3. Deployment and Operation |
| ... | ... | @@ -64,6 +76,29 @@ services: |
| 64 | 76 | length_sort_mode: "char" # char | token |
| 65 | 77 | ``` |
| 66 | 78 | |
| 79 | +Recommended baseline for GGUF on a T4 with roughly `4.8~6GB` of free VRAM: | |
| 80 | + | |
| 81 | +```yaml | |
| 82 | +services: | |
| 83 | + rerank: | |
| 84 | + backend: "qwen3_gguf" | |
| 85 | + backends: | |
| 86 | + qwen3_gguf: | |
| 87 | + repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" | |
| 88 | + filename: "*Q8_0.gguf" | |
| 89 | + local_dir: "./models/reranker/qwen3-reranker-4b-gguf" | |
| 90 | + cache_dir: "./model_cache" | |
| 91 | + n_ctx: 384 | |
| 92 | + n_batch: 384 | |
| 93 | + n_ubatch: 128 | |
| 94 | + n_gpu_layers: 24 | |
| 95 | + flash_attn: true | |
| 96 | + offload_kqv: true | |
| 97 | + infer_batch_size: 8 | |
| 98 | + sort_by_doc_length: true | |
| 99 | + length_sort_mode: "char" | |
| 100 | +``` | |
| 101 | + | |
| 67 | 102 | ### 3.2 Start/stop commands |
| 68 | 103 | |
| 69 | 104 | The recommended unified entry point: |
| ... | ... | @@ -105,6 +140,13 @@ curl -sS http://127.0.0.1:6007/health |
| 105 | 140 | - `service_ctl.sh` uses a dedicated startup path for the reranker |
| 106 | 141 | - Added a "stable health check" (consecutive health probes) to avoid the false positive of the service exiting right after first reporting healthy |
| 107 | 142 | |
| 143 | +### 4.4 GGUF / T4 low-VRAM optimization principles | |
| 144 | + | |
| 145 | +- The `Q8_0` weights are about `4.28GB`, but you also need headroom for the KV cache, the CUDA workspace, and runtime fragmentation, so you cannot judge feasibility simply from "model size < free VRAM" (see the budgeting sketch after this list). | |
| 146 | +- The current workload is short queries plus product titles, so shrink `n_ctx` first; `384` is usually a better trade-off than the default long context. | |
| 147 | +- On a low-VRAM T4, sweep `n_gpu_layers` first, then try raising `n_ctx`; in the current GGUF integration `infer_batch_size` is mainly a service-side work chunk, not the real llama.cpp operator batch. | |
| 148 | +- Keep `flash_attn: true` and `offload_kqv: true` enabled by default; on OOM, lower `n_gpu_layers` first. | |
| 149 | + | |
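To make the first point concrete, here is a rough budgeting sketch for a partial offload. The layer count, KV-head count, and head dimension below are placeholders rather than verified Qwen3-4B architecture values, and the overhead term is a guess; treat the result as an order-of-magnitude sanity check only.

```python
# Rough VRAM budget for a partial GGUF offload (all architecture numbers are assumptions).
def estimate_vram_gib(
    gguf_file_gib: float = 4.28,  # Q8_0 file size reported on the Hugging Face repo
    total_layers: int = 36,       # placeholder layer count, not verified
    n_gpu_layers: int = 24,
    n_ctx: int = 384,
    n_kv_heads: int = 8,          # placeholder, not verified
    head_dim: int = 128,          # placeholder, not verified
    kv_bytes: int = 2,            # fp16 K/V entries
    overhead_gib: float = 0.7,    # CUDA context + scratch buffers, rough guess
) -> float:
    weights = gguf_file_gib * (n_gpu_layers / total_layers)
    # K and V caches for every offloaded layer, per context position.
    kv_cache = 2 * n_ctx * n_kv_heads * head_dim * kv_bytes * n_gpu_layers / (1024 ** 3)
    return weights + kv_cache + overhead_gib


print(f"~{estimate_vram_gib():.2f} GiB estimated for n_gpu_layers=24, n_ctx=384")
```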
| 108 | 150 | ## 5. Performance Tuning Workflow (standard procedure) |
| 109 | 151 | |
| 110 | 152 | ### 5.1 Use the one-shot benchmark script |
| ... | ... | @@ -125,6 +167,13 @@ curl -sS http://127.0.0.1:6007/health |
| 125 | 167 | - `infer_batch_size`: `24 32 48 64` |
| 126 | 168 | - Concurrency groups: `c=1` (single-request latency), `c=4` (concurrent throughput and tail latency) |
| 127 | 169 | |
| 170 | +Suggested GGUF sweep: | |
| 171 | + | |
| 172 | +- `n_gpu_layers`: `20 24 28` | |
| 173 | +- `n_ctx`: `320 384 448` | |
| 174 | +- `infer_batch_size`: `4 8 12` (secondary; only affects the service-side work chunk) | |
| 175 | +- Sweep order: first fix `n_ctx=384` and find the largest `n_gpu_layers` that starts reliably; then try `n_ctx=448` if VRAM allows; fine-tune `infer_batch_size` last (see the enumeration sketch below) | |
| 176 | + | |
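Spelled out as code, the sweep order above could be enumerated like this. This is only a sketch of the candidate ordering, using the values from the list; launching each candidate and collecting numbers is still done with the existing benchmark script.

```python
# Sketch: enumerate GGUF sweep candidates in the recommended priority order.
base = {"n_ctx": 384, "n_gpu_layers": 24, "infer_batch_size": 8}

candidates = []
# Phase 1: fix n_ctx=384, find the largest n_gpu_layers that starts reliably.
for layers in (28, 24, 20):
    candidates.append({**base, "n_gpu_layers": layers})
# Phase 2: with the chosen layer count, probe a larger context if VRAM allows.
for ctx in (448, 384, 320):
    candidates.append({**base, "n_ctx": ctx})
# Phase 3: only then fine-tune the service-side chunk size.
for ibs in (4, 8, 12):
    candidates.append({**base, "infer_batch_size": ibs})

for cfg in candidates:
    print(cfg)
```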
| 128 | 177 | These can be overridden via environment variables: |
| 129 | 178 | |
| 130 | 179 | - `BATCH_SIZES` |
| ... | ... | @@ -140,23 +189,28 @@ curl -sS http://127.0.0.1:6007/health |
| 140 | 189 | - `RERANK_VLLM_INFER_BATCH_SIZE` |
| 141 | 190 | - `RERANK_VLLM_SORT_BY_DOC_LENGTH` |
| 142 | 191 | |
| 143 | -## 6. Key Findings from This Round (2026-03-11) | |
| 144 | - | |
| 145 | -Based on the report: | |
| 146 | - | |
| 147 | -- `perf_reports/20260311/reranker_1000docs/report.md` | |
| 192 | +## 6. Key Findings from This Round | |
| 148 | 193 | |
| 149 | -Conclusions: | |
| 194 | +vLLM (2026-03-11, see `perf_reports/20260311/reranker_1000docs/report.md`): | |
| 150 | 195 | |
| 151 | 196 | - For the single-request latency metric (`c=1`) that matters most for online reranking, `infer_batch_size=64` is best |
| 152 | 197 | - `infer_batch_size=96` gives slightly higher throughput at higher concurrency, but sacrifices single-request latency stability |
| 153 | 198 | - The current default is `infer_batch_size=64` as the balance point |
| 154 | 199 | |
| 200 | +GGUF (2026-03-25, this integration): | |
| 201 | + | |
| 202 | +- The `Q8_0` file of `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` is about `4.28GB`; with roughly `4823 MiB` of free VRAM measured on the current machine, the default does not use an aggressive full GPU offload. | |
| 203 | +- Current recommended defaults: `n_ctx=384`, `n_batch=384`, `n_ubatch=128`, `n_gpu_layers=24`, `infer_batch_size=8`. | |
| 204 | +- If the free VRAM on site is closer to `6GB` with little fragmentation, try `n_gpu_layers=28` first; if startup fails, fall back to `24` or `20`. | |
| 205 | +- Because the GGUF weights are not yet cached in this workspace, no real throughput benchmark was run this round; rerun a parameter sweep on the deployment machine and archive the report before going live. | |
| 206 | + | |
| 155 | 207 | ## 7. Production Recommendations |
| 156 | 208 | |
| 157 | 209 | - Keep the defaults: `infer_batch_size: 64`, `sort_by_doc_length: true` |
| 158 | 210 | - Consider raising it to `96` only when all of the following hold: throughput takes priority, higher single-request latency is acceptable, and the gain has been verified by a benchmark on the same machine with the same data |
| 159 | 211 | - After every change, rerun `benchmark_reranker_1000docs.sh` and archive the results |
| 212 | +- GGUF defaults to keep: `n_ctx: 384`, `n_gpu_layers: 24`, `infer_batch_size: 8`, `flash_attn: true`, `offload_kqv: true` | |
| 213 | +- On GGUF OOM: lower `n_gpu_layers` first, then `n_ctx`, and only last `infer_batch_size` | |
| 160 | 214 | |
| 161 | 215 | ## 8. Troubleshooting |
| 162 | 216 | |
| ... | ... | @@ -194,6 +248,13 @@ lsof -i :6007 -P -n |
| 194 | 248 | - Lower `infer_batch_size` |
| 195 | 249 | - Check whether other processes are occupying the same GPU |
| 196 | 250 | |
| 251 | +For GGUF, adjust in this order: | |
| 252 | + | |
| 253 | +- Lower `n_gpu_layers` | |
| 254 | +- Lower `n_ctx` | |
| 255 | +- Lower `infer_batch_size` | |
| 256 | +- Check whether other processes are occupying the same GPU | |
| 257 | + | |
| 197 | 258 | ## 9. Change and Verification Checklist |
| 198 | 259 | |
| 199 | 260 | After every reranker tuning change, complete at least the following: | ... | ... |
reranker/README.md
| ... | ... | @@ -4,10 +4,10 @@ |
| 4 | 4 | |
| 5 | 5 | --- |
| 6 | 6 | |
| 7 | -The Reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, DashScope cloud rerank). Callers access it over HTTP and do not need to know which backend is in use. | |
| 7 | +The Reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, Qwen3-GGUF, DashScope cloud rerank). Callers access it over HTTP and do not need to know which backend is in use. | |
| 8 | 8 | |
| 9 | 9 | **Features** |
| 10 | -- Multiple backends: `qwen3_vllm` (default, Qwen3-Reranker-0.6B + vLLM), `qwen3_transformers` (pure Transformers, no vLLM needed), `bge` (kept for compatibility) | |
| 10 | +- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `bge` (kept for compatibility) | |
| 11 | 11 | - Cloud backend: `dashscope_rerank` (calls DashScope `/compatible-api/v1/reranks`, supports switching the endpoint by region) |
| 12 | 12 | - Unified configuration: `config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<name>` |
| 13 | 13 | - Document deduplication, scores aligned with input order, FP16/GPU support (backend-dependent) |
| ... | ... | @@ -19,6 +19,7 @@ The Reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwe |
| 19 | 19 | - `backends/bge.py`: BGE backend |
| 20 | 20 | - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM backend |
| 21 | 21 | - `backends/qwen3_transformers.py`: Qwen3-Reranker-0.6B pure Transformers backend (official Usage approach) |
| 22 | + - `backends/qwen3_gguf.py`: Qwen3-Reranker-4B GGUF + llama.cpp backend | |
| 22 | 23 | - `backends/dashscope_rerank.py`: DashScope cloud rerank backend (HTTP calls) |
| 23 | 24 | - `reranker/bge_reranker.py`: BGE core inference (wrapped by the bge backend) |
| 24 | 25 | - `reranker/config.py`: service port, MAX_DOCS, NORMALIZE, etc. (backend parameters live in config.yaml) |
| ... | ... | @@ -27,18 +28,33 @@ The Reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwe |
| 27 | 28 | - Common: `torch`, `transformers`, `fastapi`, `uvicorn` (isolated env: see `requirements_reranker_service.txt`; full ML env: see `requirements_ml.txt`) |
| 28 | 29 | - **Qwen3-vLLM backend**: `vllm>=0.8.5`, `transformers>=4.51.0` (vLLM is only needed when `backend: qwen3_vllm`) |
| 29 | 30 | - **Qwen3-Transformers backend**: `transformers>=4.51.0`, `torch` (no vLLM; suitable for CPU or small VRAM) |
| 31 | +- **Qwen3-GGUF backend**: `llama-cpp-python>=0.3.16` | |
| 32 | +- Each backend now uses its own isolated venv: | |
| 33 | + - `qwen3_vllm` -> `.venv-reranker` | |
| 34 | + - `qwen3_gguf` -> `.venv-reranker-gguf` | |
| 35 | + - `qwen3_transformers` -> `.venv-reranker-transformers` | |
| 36 | + - `bge` -> `.venv-reranker-bge` | |
| 37 | + - `dashscope_rerank` -> `.venv-reranker-dashscope` | |
| 30 | 38 | ```bash |
| 31 | - ./scripts/setup_reranker_venv.sh | |
| 39 | + ./scripts/setup_reranker_venv.sh qwen3_gguf | |
| 40 | + ``` | |
| 41 | + Recommended CUDA build: | |
| 42 | + ```bash | |
| 43 | + PATH=/usr/local/cuda/bin:$PATH \ | |
| 44 | + CUDACXX=/usr/local/cuda/bin/nvcc \ | |
| 45 | + CMAKE_ARGS="-DGGML_CUDA=on" \ | |
| 46 | + FORCE_CMAKE=1 \ | |
| 47 | + ./.venv-reranker-gguf/bin/pip install --no-cache-dir --force-reinstall --no-build-isolation llama-cpp-python==0.3.18 | |
| 32 | 48 | ``` |
| 33 | 49 | |
| 34 | 50 | ## Configuration |
| 35 | -- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `bge` | `dashscope_rerank`), or the env var `RERANK_BACKEND`. | |
| 51 | +- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `bge` | `dashscope_rerank`), or the env var `RERANK_BACKEND`. | |
| 36 | 52 | - **Backend parameters**: `services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`, for example: |
| 37 | 53 | |
| 38 | 54 | ```yaml |
| 39 | 55 | services: |
| 40 | 56 | rerank: |
| 41 | - backend: "qwen3_vllm" # or bge | |
| 57 | + backend: "qwen3_gguf" # or qwen3_vllm / bge | |
| 42 | 58 | backends: |
| 43 | 59 | bge: |
| 44 | 60 | model_name: "BAAI/bge-reranker-v2-m3" |
| ... | ... | @@ -65,6 +81,21 @@ services: |
| 65 | 81 | tensor_parallel_size: 1 |
| 66 | 82 | gpu_memory_utilization: 0.8 |
| 67 | 83 | instruction: "Given a shopping query, rank product titles by relevance" |
| 84 | + qwen3_gguf: | |
| 85 | + repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" | |
| 86 | + filename: "*Q8_0.gguf" | |
| 87 | + local_dir: "./models/reranker/qwen3-reranker-4b-gguf" | |
| 88 | + cache_dir: "./model_cache" | |
| 89 | + instruction: "Rank products by query with category & style match prioritized" | |
| 90 | + n_ctx: 384 | |
| 91 | + n_batch: 384 | |
| 92 | + n_ubatch: 128 | |
| 93 | + n_gpu_layers: 24 | |
| 94 | + flash_attn: true | |
| 95 | + offload_kqv: true | |
| 96 | + infer_batch_size: 8 | |
| 97 | + sort_by_doc_length: true | |
| 98 | + length_sort_mode: "char" | |
| 68 | 99 | dashscope_rerank: |
| 69 | 100 | model_name: "qwen3-rerank" |
| 70 | 101 | endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks" |
| ... | ... | @@ -94,7 +125,7 @@ DashScope authentication: |
| 94 | 125 | ```bash |
| 95 | 126 | ./scripts/start_reranker.sh |
| 96 | 127 | ``` |
| 97 | -This script uses the isolated env `.venv-reranker`; on first use, run `./scripts/setup_reranker_venv.sh` first. | |
| 128 | +This script automatically picks the isolated venv matching the current `services.rerank.backend`; on first use, run `./scripts/setup_reranker_venv.sh <backend>` first. | |
| 98 | 129 | |
| 99 | 130 | ## Performance Benchmark (1000 docs) |
| 100 | 131 | ```bash |
| ... | ... | @@ -122,7 +153,7 @@ Content-Type: application/json |
| 122 | 153 | ``` |
| 123 | 154 | |
| 124 | 155 | `top_n` is an optional field: |
| 125 | -- For local backends (`qwen3_vllm` / `qwen3_transformers` / `bge`) it is usually ignored and the full set of scores is still returned. | |
| 156 | +- For local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `bge`) it is usually ignored and the full set of scores is still returned. | |
| 126 | 157 | - For `dashscope_rerank` it can limit how many candidates the cloud service returns; set it to `page+size` (e.g. pass `30` for pagination with `from=20,size=10`). |
| 127 | 158 | |
| 128 | 159 | Response: |
| ... | ... | @@ -160,3 +191,4 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info |
| 160 | 191 | - At runtime, batch parameters can be temporarily overridden via env vars: `RERANK_VLLM_INFER_BATCH_SIZE`, `RERANK_VLLM_SORT_BY_DOC_LENGTH`. |
| 161 | 192 | - **Qwen3-vLLM**: see [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B); needs a GPU and fairly large VRAM; compared with BGE, better suited to long texts and high-throughput scenarios (vLLM prefix caching). |
| 162 | 193 | - **Qwen3-Transformers**: the official Transformers Usage approach, no vLLM needed; suitable for CPU or small VRAM. Defaults to `attn_implementation: "sdpa"`; if `flash_attn` is installed you can set `flash_attention_2` (the service automatically falls back to sdpa when it is missing). |
| 194 | +- **Qwen3-GGUF**: see [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF). On a single T4 with only about `4.8~6GB` of free VRAM, start from `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true`; if startup OOMs, lower `n_gpu_layers` to `20` first, then lower `n_ctx` to `320`. In the GGUF backend, `infer_batch_size` is a service-side work chunk and usually matters less than `n_gpu_layers` / `n_ctx` (a minimal HTTP client sketch follows). | ... | ... |
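A minimal client sketch against the `/rerank` endpoint described above, assuming the service is already running on the default local port `6007`; the query and docs are made-up examples:

```python
# Minimal /rerank client sketch (assumes the service listens on localhost:6007).
import requests

payload = {
    "query": "wireless mouse",                       # made-up example query
    "docs": ["2.4G wireless mouse", "USB-C cable"],  # made-up example docs
    "normalize": True,
}
resp = requests.post("http://127.0.0.1:6007/rerank", json=payload, timeout=60)
resp.raise_for_status()
body = resp.json()
print(body["scores"])           # one score per input doc, in input order
print(body["meta"]["backend"])  # e.g. "qwen3_gguf" when that backend is active
```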
reranker/backends/__init__.py
| ... | ... | @@ -46,11 +46,14 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc |
| 46 | 46 | if name == "qwen3_transformers": |
| 47 | 47 | from reranker.backends.qwen3_transformers import Qwen3TransformersRerankerBackend |
| 48 | 48 | return Qwen3TransformersRerankerBackend(config) |
| 49 | + if name == "qwen3_gguf": | |
| 50 | + from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend | |
| 51 | + return Qwen3GGUFRerankerBackend(config) | |
| 49 | 52 | if name == "dashscope_rerank": |
| 50 | 53 | from reranker.backends.dashscope_rerank import DashScopeRerankBackend |
| 51 | 54 | return DashScopeRerankBackend(config) |
| 52 | 55 | raise ValueError( |
| 53 | - f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, dashscope_rerank" | |
| 56 | + f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, dashscope_rerank" | |
| 54 | 57 | ) |
| 55 | 58 | |
| 56 | 59 | ... | ... |
reranker/backends/qwen3_gguf.py
| ... | ... | @@ -0,0 +1,327 @@ |
| 1 | +""" | |
| 2 | +Qwen3-Reranker-4B GGUF backend using llama-cpp-python. | |
| 3 | + | |
| 4 | +Reference: | |
| 5 | +- https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF | |
| 6 | +- https://huggingface.co/Qwen/Qwen3-Reranker-4B | |
| 7 | +""" | |
| 8 | + | |
| 9 | +from __future__ import annotations | |
| 10 | + | |
| 11 | +import logging | |
| 12 | +import math | |
| 13 | +import os | |
| 14 | +import threading | |
| 15 | +import time | |
| 16 | +from typing import Any, Dict, List, Tuple | |
| 17 | + | |
| 18 | + | |
| 19 | +logger = logging.getLogger("reranker.backends.qwen3_gguf") | |
| 20 | + | |
| 21 | + | |
| 22 | +def deduplicate_with_positions(texts: List[str]) -> Tuple[List[str], List[int]]: | |
| 23 | + """Deduplicate texts globally while preserving first-seen order.""" | |
| 24 | + unique_texts: List[str] = [] | |
| 25 | + position_to_unique: List[int] = [] | |
| 26 | + seen: Dict[str, int] = {} | |
| 27 | + | |
| 28 | + for text in texts: | |
| 29 | + idx = seen.get(text) | |
| 30 | + if idx is None: | |
| 31 | + idx = len(unique_texts) | |
| 32 | + seen[text] = idx | |
| 33 | + unique_texts.append(text) | |
| 34 | + position_to_unique.append(idx) | |
| 35 | + | |
| 36 | + return unique_texts, position_to_unique | |
| 37 | + | |
| 38 | + | |
| 39 | +def _format_instruction(instruction: str, query: str, doc: str) -> str: | |
| 40 | + return "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format( | |
| 41 | + instruction=instruction, | |
| 42 | + query=query, | |
| 43 | + doc=doc, | |
| 44 | + ) | |
| 45 | + | |
| 46 | + | |
| 47 | +class Qwen3GGUFRerankerBackend: | |
| 48 | + """ | |
| 49 | + Qwen3-Reranker-4B GGUF backend using llama.cpp through llama-cpp-python. | |
| 50 | + | |
| 51 | + Tuned for short-query / short-doc reranking on a memory-constrained single T4. | |
| 52 | + Config from services.rerank.backends.qwen3_gguf. | |
| 53 | + """ | |
| 54 | + | |
| 55 | + def __init__(self, config: Dict[str, Any]) -> None: | |
| 56 | + self._config = config or {} | |
| 57 | + self._repo_id = str( | |
| 58 | + self._config.get("repo_id") or "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" | |
| 59 | + ).strip() | |
| 60 | + self._filename = str(self._config.get("filename") or "*Q8_0.gguf").strip() | |
| 61 | + self._model_path = str(self._config.get("model_path") or "").strip() | |
| 62 | + self._cache_dir = str(self._config.get("cache_dir") or "").strip() or None | |
| 63 | + self._local_dir = str(self._config.get("local_dir") or "").strip() or None | |
| 64 | + self._instruction = str( | |
| 65 | + self._config.get("instruction") | |
| 66 | + or "Rank products by query with category & style match prioritized" | |
| 67 | + ) | |
| 68 | + self._infer_batch_size = int( | |
| 69 | + os.getenv("RERANK_GGUF_INFER_BATCH_SIZE") or self._config.get("infer_batch_size", 8) | |
| 70 | + ) | |
| 71 | + sort_by_doc_length = os.getenv("RERANK_GGUF_SORT_BY_DOC_LENGTH") | |
| 72 | + if sort_by_doc_length is None: | |
| 73 | + sort_by_doc_length = self._config.get("sort_by_doc_length", True) | |
| 74 | + self._sort_by_doc_length = str(sort_by_doc_length).strip().lower() in { | |
| 75 | + "1", | |
| 76 | + "true", | |
| 77 | + "yes", | |
| 78 | + "y", | |
| 79 | + "on", | |
| 80 | + } | |
| 81 | + self._length_sort_mode = str(self._config.get("length_sort_mode") or "char").strip().lower() | |
| 82 | + | |
| 83 | + n_ctx = int(self._config.get("n_ctx", self._config.get("max_model_len", 384))) | |
| 84 | + n_batch = int(self._config.get("n_batch", min(n_ctx, 384))) | |
| 85 | + n_ubatch = int(self._config.get("n_ubatch", min(n_batch, 128))) | |
| 86 | + n_gpu_layers = int(self._config.get("n_gpu_layers", 24)) | |
| 87 | + main_gpu = int(self._config.get("main_gpu", 0)) | |
| 88 | + n_threads = int(self._config.get("n_threads", 2)) | |
| 89 | + n_threads_batch = int(self._config.get("n_threads_batch", 4)) | |
| 90 | + flash_attn = bool(self._config.get("flash_attn", True)) | |
| 91 | + offload_kqv = bool(self._config.get("offload_kqv", True)) | |
| 92 | + use_mmap = bool(self._config.get("use_mmap", True)) | |
| 93 | + use_mlock = bool(self._config.get("use_mlock", False)) | |
| 94 | + verbose = bool(self._config.get("verbose", False)) | |
| 95 | + enable_warmup = bool(self._config.get("enable_warmup", True)) | |
| 96 | + | |
| 97 | + if self._infer_batch_size <= 0: | |
| 98 | + raise ValueError(f"infer_batch_size must be > 0, got {self._infer_batch_size}") | |
| 99 | + if n_ctx <= 0: | |
| 100 | + raise ValueError(f"n_ctx must be > 0, got {n_ctx}") | |
| 101 | + if n_batch <= 0 or n_ubatch <= 0: | |
| 102 | + raise ValueError(f"n_batch/n_ubatch must be > 0, got {n_batch}/{n_ubatch}") | |
| 103 | + | |
| 104 | + try: | |
| 105 | + from llama_cpp import Llama | |
| 106 | + except Exception as exc: # pragma: no cover - depends on optional dependency | |
| 107 | + raise RuntimeError( | |
| 108 | + "qwen3_gguf backend requires llama-cpp-python. " | |
| 109 | + "Install the qwen3_gguf backend venv first via scripts/setup_reranker_venv.sh qwen3_gguf." | |
| 110 | + ) from exc | |
| 111 | + | |
| 112 | + self._llama_class = Llama | |
| 113 | + self._n_ctx = n_ctx | |
| 114 | + self._n_batch = n_batch | |
| 115 | + self._n_ubatch = n_ubatch | |
| 116 | + self._n_gpu_layers = n_gpu_layers | |
| 117 | + self._enable_warmup = enable_warmup | |
| 118 | + self._infer_lock = threading.Lock() | |
| 119 | + | |
| 120 | + logger.info( | |
| 121 | + "[Qwen3_GGUF] Loading model repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s", | |
| 122 | + self._repo_id, | |
| 123 | + self._filename, | |
| 124 | + self._model_path or None, | |
| 125 | + n_ctx, | |
| 126 | + n_batch, | |
| 127 | + n_ubatch, | |
| 128 | + n_gpu_layers, | |
| 129 | + flash_attn, | |
| 130 | + offload_kqv, | |
| 131 | + ) | |
| 132 | + | |
| 133 | + llm_kwargs = { | |
| 134 | + "n_ctx": n_ctx, | |
| 135 | + "n_batch": n_batch, | |
| 136 | + "n_ubatch": n_ubatch, | |
| 137 | + "n_gpu_layers": n_gpu_layers, | |
| 138 | + "main_gpu": main_gpu, | |
| 139 | + "n_threads": n_threads, | |
| 140 | + "n_threads_batch": n_threads_batch, | |
| 141 | + "logits_all": True, | |
| 142 | + "offload_kqv": offload_kqv, | |
| 143 | + "flash_attn": flash_attn, | |
| 144 | + "use_mmap": use_mmap, | |
| 145 | + "use_mlock": use_mlock, | |
| 146 | + "verbose": verbose, | |
| 147 | + } | |
| 148 | + llm_kwargs = {key: value for key, value in llm_kwargs.items() if value is not None} | |
| 149 | + self._llm = self._load_model(llm_kwargs) | |
| 150 | + self._model_name = self._model_path or f"{self._repo_id}:{self._filename}" | |
| 151 | + | |
| 152 | + self._prefix = ( | |
| 153 | + "<|im_start|>system\n" | |
| 154 | + "Judge whether the Document meets the requirements based on the Query and the Instruct provided. " | |
| 155 | + 'Note that the answer can only be "yes" or "no".' | |
| 156 | + "<|im_end|>\n<|im_start|>user\n" | |
| 157 | + ) | |
| 158 | + self._suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n" | |
| 159 | + self._prefix_tokens = self._tokenize(self._prefix, special=True) | |
| 160 | + self._suffix_tokens = self._tokenize(self._suffix, special=True) | |
| 161 | + self._effective_max_len = self._n_ctx - len(self._prefix_tokens) - len(self._suffix_tokens) | |
| 162 | + if self._effective_max_len <= 16: | |
| 163 | + raise RuntimeError( | |
| 164 | + f"n_ctx={self._n_ctx} is too small after prompt overhead; effective={self._effective_max_len}" | |
| 165 | + ) | |
| 166 | + | |
| 167 | + self._true_token = self._single_token_id("yes") | |
| 168 | + self._false_token = self._single_token_id("no") | |
| 169 | + | |
| 170 | + if self._enable_warmup: | |
| 171 | + self._warmup() | |
| 172 | + | |
| 173 | + logger.info( | |
| 174 | + "[Qwen3_GGUF] Model ready | model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s", | |
| 175 | + self._model_name, | |
| 176 | + self._effective_max_len, | |
| 177 | + self._infer_batch_size, | |
| 178 | + self._sort_by_doc_length, | |
| 179 | + ) | |
| 180 | + | |
| 181 | + def _load_model(self, llm_kwargs: Dict[str, Any]): | |
| 182 | + if self._model_path: | |
| 183 | + return self._llama_class(model_path=self._model_path, **llm_kwargs) | |
| 184 | + return self._llama_class.from_pretrained( | |
| 185 | + repo_id=self._repo_id, | |
| 186 | + filename=self._filename, | |
| 187 | + local_dir=self._local_dir, | |
| 188 | + cache_dir=self._cache_dir, | |
| 189 | + **llm_kwargs, | |
| 190 | + ) | |
| 191 | + | |
| 192 | + def _tokenize(self, text: str, *, special: bool) -> List[int]: | |
| 193 | + return list( | |
| 194 | + self._llm.tokenize( | |
| 195 | + text.encode("utf-8"), | |
| 196 | + add_bos=False, | |
| 197 | + special=special, | |
| 198 | + ) | |
| 199 | + ) | |
| 200 | + | |
| 201 | + def _single_token_id(self, text: str) -> int: | |
| 202 | + token_ids = self._tokenize(text, special=False) | |
| 203 | + if len(token_ids) != 1: | |
| 204 | + raise RuntimeError(f"Expected {text!r} to be one token, got {token_ids}") | |
| 205 | + return int(token_ids[0]) | |
| 206 | + | |
| 207 | + def _warmup(self) -> None: | |
| 208 | + try: | |
| 209 | + prompt = self._build_prompt_tokens("warmup query", "warmup document") | |
| 210 | + with self._infer_lock: | |
| 211 | + self._eval_logits(prompt) | |
| 212 | + except Exception as exc: # pragma: no cover - defensive | |
| 213 | + logger.warning("[Qwen3_GGUF] Warmup failed: %s", exc) | |
| 214 | + | |
| 215 | + def _build_prompt_tokens(self, query: str, doc: str) -> List[int]: | |
| 216 | + pair = _format_instruction(self._instruction, query, doc) | |
| 217 | + pair_tokens = self._tokenize(pair, special=False) | |
| 218 | + pair_tokens = pair_tokens[: self._effective_max_len] | |
| 219 | + return self._prefix_tokens + pair_tokens + self._suffix_tokens | |
| 220 | + | |
| 221 | + def _eval_logits(self, prompt_tokens: List[int]) -> List[float]: | |
| 222 | + self._llm.reset() | |
| 223 | + self._llm.eval(prompt_tokens) | |
| 224 | + logits = self._llm.eval_logits | |
| 225 | + if not logits: | |
| 226 | + raise RuntimeError("llama.cpp returned empty logits") | |
| 227 | + return list(logits[-1]) | |
| 228 | + | |
| 229 | + def _score_prompt(self, prompt_tokens: List[int]) -> float: | |
| 230 | + logits = self._eval_logits(prompt_tokens) | |
| 231 | + true_logit = float(logits[self._true_token]) | |
| 232 | + false_logit = float(logits[self._false_token]) | |
| 233 | + max_logit = max(true_logit, false_logit) | |
| 234 | + true_exp = math.exp(true_logit - max_logit) | |
| 235 | + false_exp = math.exp(false_logit - max_logit) | |
| 236 | + return float(true_exp / (true_exp + false_exp)) | |
| 237 | + | |
| 238 | + def _estimate_doc_lengths(self, docs: List[str]) -> List[int]: | |
| 239 | + if self._length_sort_mode == "token": | |
| 240 | + return [len(self._tokenize(text, special=False)) for text in docs] | |
| 241 | + return [len(text) for text in docs] | |
| 242 | + | |
| 243 | + def score_with_meta( | |
| 244 | + self, | |
| 245 | + query: str, | |
| 246 | + docs: List[str], | |
| 247 | + normalize: bool = True, | |
| 248 | + ) -> Tuple[List[float], Dict[str, Any]]: | |
| 249 | + start_ts = time.time() | |
| 250 | + total_docs = len(docs) if docs else 0 | |
| 251 | + output_scores: List[float] = [0.0] * total_docs | |
| 252 | + | |
| 253 | + query = "" if query is None else str(query).strip() | |
| 254 | + indexed: List[Tuple[int, str]] = [] | |
| 255 | + for i, doc in enumerate(docs or []): | |
| 256 | + if doc is None: | |
| 257 | + continue | |
| 258 | + text = str(doc).strip() | |
| 259 | + if not text: | |
| 260 | + continue | |
| 261 | + indexed.append((i, text)) | |
| 262 | + | |
| 263 | + if not query or not indexed: | |
| 264 | + elapsed_ms = (time.time() - start_ts) * 1000.0 | |
| 265 | + return output_scores, { | |
| 266 | + "input_docs": total_docs, | |
| 267 | + "usable_docs": len(indexed), | |
| 268 | + "unique_docs": 0, | |
| 269 | + "dedup_ratio": 0.0, | |
| 270 | + "elapsed_ms": round(elapsed_ms, 3), | |
| 271 | + "model": self._model_name, | |
| 272 | + "backend": "qwen3_gguf", | |
| 273 | + "normalize": normalize, | |
| 274 | + "infer_batch_size": self._infer_batch_size, | |
| 275 | + "inference_batches": 0, | |
| 276 | + "sort_by_doc_length": self._sort_by_doc_length, | |
| 277 | + "n_ctx": self._n_ctx, | |
| 278 | + "n_batch": self._n_batch, | |
| 279 | + "n_ubatch": self._n_ubatch, | |
| 280 | + "n_gpu_layers": self._n_gpu_layers, | |
| 281 | + } | |
| 282 | + | |
| 283 | + indexed_texts = [text for _, text in indexed] | |
| 284 | + unique_texts, position_to_unique = deduplicate_with_positions(indexed_texts) | |
| 285 | + | |
| 286 | + lengths = self._estimate_doc_lengths(unique_texts) | |
| 287 | + order = list(range(len(unique_texts))) | |
| 288 | + if self._sort_by_doc_length and len(unique_texts) > 1: | |
| 289 | + order = sorted(order, key=lambda i: lengths[i]) | |
| 290 | + | |
| 291 | + unique_scores: List[float] = [0.0] * len(unique_texts) | |
| 292 | + inference_batches = 0 | |
| 293 | + for start in range(0, len(order), self._infer_batch_size): | |
| 294 | + batch_indices = order[start : start + self._infer_batch_size] | |
| 295 | + inference_batches += 1 | |
| 296 | + for idx in batch_indices: | |
| 297 | + prompt = self._build_prompt_tokens(query, unique_texts[idx]) | |
| 298 | + with self._infer_lock: | |
| 299 | + unique_scores[idx] = self._score_prompt(prompt) | |
| 300 | + | |
| 301 | + for (orig_idx, _), unique_idx in zip(indexed, position_to_unique): | |
| 302 | + output_scores[orig_idx] = float(unique_scores[unique_idx]) | |
| 303 | + | |
| 304 | + elapsed_ms = (time.time() - start_ts) * 1000.0 | |
| 305 | + dedup_ratio = 0.0 | |
| 306 | + if indexed: | |
| 307 | + dedup_ratio = 1.0 - (len(unique_texts) / float(len(indexed))) | |
| 308 | + | |
| 309 | + meta = { | |
| 310 | + "input_docs": total_docs, | |
| 311 | + "usable_docs": len(indexed), | |
| 312 | + "unique_docs": len(unique_texts), | |
| 313 | + "dedup_ratio": round(dedup_ratio, 4), | |
| 314 | + "elapsed_ms": round(elapsed_ms, 3), | |
| 315 | + "model": self._model_name, | |
| 316 | + "backend": "qwen3_gguf", | |
| 317 | + "normalize": normalize, | |
| 318 | + "infer_batch_size": self._infer_batch_size, | |
| 319 | + "inference_batches": inference_batches, | |
| 320 | + "sort_by_doc_length": self._sort_by_doc_length, | |
| 321 | + "length_sort_mode": self._length_sort_mode, | |
| 322 | + "n_ctx": self._n_ctx, | |
| 323 | + "n_batch": self._n_batch, | |
| 324 | + "n_ubatch": self._n_ubatch, | |
| 325 | + "n_gpu_layers": self._n_gpu_layers, | |
| 326 | + } | |
| 327 | + return output_scores, meta | ... | ... |
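For reference, a minimal in-process sketch of exercising the new backend without the HTTP service. It assumes `llama-cpp-python` and the GGUF weights are available locally; the config values mirror the `qwen3_gguf` block in `config/config.yaml`, and the query/docs are made-up examples:

```python
# Sketch: drive the GGUF backend directly, bypassing the HTTP service.
from reranker.backends import get_rerank_backend

backend = get_rerank_backend(
    "qwen3_gguf",
    {
        "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF",
        "filename": "*Q8_0.gguf",
        "n_ctx": 384,
        "n_gpu_layers": 24,
        "infer_batch_size": 8,
    },
)
scores, meta = backend.score_with_meta(
    query="wireless mouse",                                 # made-up example query
    docs=["2.4G wireless mouse", "USB-C charging cable"],   # made-up example docs
    normalize=True,
)
print(scores)
print(meta["unique_docs"], meta["elapsed_ms"])
```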
reranker/server.py
| ... | ... | @@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional |
| 7 | 7 | Response: { "scores": [float], "meta": {...} } |
| 8 | 8 | |
| 9 | 9 | Backend selected via config: services.rerank.backend |
| 10 | -(bge | qwen3_vllm | qwen3_transformers | dashscope_rerank), env RERANK_BACKEND. | |
| 10 | +(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank), env RERANK_BACKEND. | |
| 11 | 11 | """ |
| 12 | 12 | |
| 13 | 13 | import logging | ... | ... |
scripts/lib/reranker_backend_env.sh
| ... | ... | @@ -0,0 +1,62 @@ |
| 1 | +#!/bin/bash | |
| 2 | +# | |
| 3 | +# Shared helpers for mapping reranker backends to isolated virtualenvs. | |
| 4 | +# | |
| 5 | + | |
| 6 | +set -euo pipefail | |
| 7 | + | |
| 8 | +detect_rerank_backend() { | |
| 9 | + local project_root="$1" | |
| 10 | + local backend="${RERANK_BACKEND:-}" | |
| 11 | + | |
| 12 | + if [[ -n "${backend}" ]]; then | |
| 13 | + printf '%s\n' "${backend}" | |
| 14 | + return 0 | |
| 15 | + fi | |
| 16 | + | |
| 17 | + backend="$( | |
| 18 | + awk ' | |
| 19 | + /^ rerank:$/ { in_rerank=1; next } | |
| 20 | + in_rerank && /^ [^ ]/ { in_rerank=0 } | |
| 21 | + in_rerank && /^ backend:/ { | |
| 22 | + gsub(/"/, "", $2) | |
| 23 | + print $2 | |
| 24 | + exit | |
| 25 | + } | |
| 26 | + ' "${project_root}/config/config.yaml" | |
| 27 | + )" | |
| 28 | + | |
| 29 | + if [[ -z "${backend}" ]]; then | |
| 30 | + backend="qwen3_vllm" | |
| 31 | + fi | |
| 32 | + | |
| 33 | + printf '%s\n' "${backend}" | |
| 34 | +} | |
| 35 | + | |
| 36 | +reranker_backend_venv_dir() { | |
| 37 | + local project_root="$1" | |
| 38 | + local backend="$2" | |
| 39 | + | |
| 40 | + case "${backend}" in | |
| 41 | + qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;; | |
| 42 | + qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;; | |
| 43 | + qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;; | |
| 44 | + bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;; | |
| 45 | + dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;; | |
| 46 | + *) printf '%s/.venv-reranker-%s\n' "${project_root}" "${backend}" ;; | |
| 47 | + esac | |
| 48 | +} | |
| 49 | + | |
| 50 | +reranker_backend_requirements_file() { | |
| 51 | + local project_root="$1" | |
| 52 | + local backend="$2" | |
| 53 | + | |
| 54 | + case "${backend}" in | |
| 55 | + qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;; | |
| 56 | + qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;; | |
| 57 | + qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;; | |
| 58 | + bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;; | |
| 59 | + dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;; | |
| 60 | + *) return 1 ;; | |
| 61 | + esac | |
| 62 | +} | ... | ... |
scripts/setup_reranker_venv.sh
| 1 | 1 | #!/bin/bash |
| 2 | 2 | # |
| 3 | -# Create isolated venv for reranker service (.venv-reranker). | |
| 3 | +# Create isolated venv for one reranker backend. | |
| 4 | 4 | # |
| 5 | 5 | set -euo pipefail |
| 6 | 6 | |
| 7 | 7 | PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)" |
| 8 | 8 | cd "${PROJECT_ROOT}" |
| 9 | 9 | |
| 10 | -VENV_DIR="${PROJECT_ROOT}/.venv-reranker" | |
| 11 | 10 | PYTHON_BIN="${PYTHON_BIN:-python3}" |
| 12 | 11 | TMP_DIR="${RERANKER_PIP_TMPDIR:-${PROJECT_ROOT}/.tmp/reranker-pip}" |
| 13 | 12 | |
| 13 | +# shellcheck source=scripts/lib/load_env.sh | |
| 14 | +source "${PROJECT_ROOT}/scripts/lib/load_env.sh" | |
| 15 | +load_env_file "${PROJECT_ROOT}/.env" | |
| 16 | +# shellcheck source=scripts/lib/reranker_backend_env.sh | |
| 17 | +source "${PROJECT_ROOT}/scripts/lib/reranker_backend_env.sh" | |
| 18 | + | |
| 19 | +BACKEND="${1:-$(detect_rerank_backend "${PROJECT_ROOT}")}" | |
| 20 | +VENV_DIR="${RERANKER_VENV:-$(reranker_backend_venv_dir "${PROJECT_ROOT}" "${BACKEND}")}" | |
| 21 | +REQ_FILE="$(reranker_backend_requirements_file "${PROJECT_ROOT}" "${BACKEND}")" | |
| 22 | + | |
| 23 | +if [[ ! -f "${REQ_FILE}" ]]; then | |
| 24 | + echo "ERROR: requirements file not found for reranker backend ${BACKEND}: ${REQ_FILE}" >&2 | |
| 25 | + exit 1 | |
| 26 | +fi | |
| 27 | + | |
| 14 | 28 | if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then |
| 15 | 29 | echo "ERROR: python not found: ${PYTHON_BIN}" >&2 |
| 16 | 30 | exit 1 |
| ... | ... | @@ -34,9 +48,11 @@ PIP_ARGS=(--no-cache-dir) |
| 34 | 48 | |
| 35 | 49 | echo "Using TMPDIR=${TMPDIR}" |
| 36 | 50 | "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" --upgrade pip wheel |
| 37 | -"${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r requirements_reranker_service.txt | |
| 51 | +"${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r "${REQ_FILE}" | |
| 38 | 52 | |
| 39 | 53 | echo |
| 40 | 54 | echo "Done." |
| 55 | +echo "Backend: ${BACKEND}" | |
| 41 | 56 | echo "Reranker venv: ${VENV_DIR}" |
| 57 | +echo "Requirements: ${REQ_FILE}" | |
| 42 | 58 | echo "Start service: ./scripts/start_reranker.sh" | ... | ... |
scripts/start_reranker.sh
| 1 | 1 | #!/bin/bash |
| 2 | 2 | # |
| 3 | -# Start reranker service from isolated venv (.venv-reranker). | |
| 3 | +# Start reranker service from its backend-specific isolated venv. | |
| 4 | 4 | # |
| 5 | 5 | set -euo pipefail |
| 6 | 6 | |
| 7 | 7 | PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)" |
| 8 | 8 | cd "${PROJECT_ROOT}" |
| 9 | 9 | |
| 10 | -RERANKER_VENV="${RERANKER_VENV:-${PROJECT_ROOT}/.venv-reranker}" | |
| 11 | -PYTHON_BIN="${RERANKER_VENV}/bin/python" | |
| 12 | - | |
| 13 | -if [[ ! -x "${PYTHON_BIN}" ]]; then | |
| 14 | - echo "ERROR: reranker venv not found: ${RERANKER_VENV}" >&2 | |
| 15 | - echo "Please run: ./scripts/setup_reranker_venv.sh" >&2 | |
| 16 | - exit 1 | |
| 17 | -fi | |
| 18 | - | |
| 19 | 10 | # Load .env without activating main venv. |
| 20 | 11 | # shellcheck source=scripts/lib/load_env.sh |
| 21 | 12 | source "${PROJECT_ROOT}/scripts/lib/load_env.sh" |
| 22 | 13 | load_env_file "${PROJECT_ROOT}/.env" |
| 14 | +# shellcheck source=scripts/lib/reranker_backend_env.sh | |
| 15 | +source "${PROJECT_ROOT}/scripts/lib/reranker_backend_env.sh" | |
| 23 | 16 | |
| 24 | 17 | RERANKER_HOST="${RERANKER_HOST:-0.0.0.0}" |
| 25 | 18 | RERANKER_PORT="${RERANKER_PORT:-6007}" |
| 26 | -RERANK_BACKEND=$("${PYTHON_BIN}" -c "from config.services_config import get_rerank_backend_config; print(get_rerank_backend_config()[0])") | |
| 19 | +RERANK_BACKEND="${RERANK_BACKEND:-$(detect_rerank_backend "${PROJECT_ROOT}")}" | |
| 20 | +RERANKER_VENV="${RERANKER_VENV:-$(reranker_backend_venv_dir "${PROJECT_ROOT}" "${RERANK_BACKEND}")}" | |
| 21 | +PYTHON_BIN="${RERANKER_VENV}/bin/python" | |
| 22 | + | |
| 23 | +if [[ ! -x "${PYTHON_BIN}" ]]; then | |
| 24 | + echo "ERROR: reranker venv not found for backend ${RERANK_BACKEND}: ${RERANKER_VENV}" >&2 | |
| 25 | + echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 | |
| 26 | + exit 1 | |
| 27 | +fi | |
| 27 | 28 | |
| 28 | 29 | # Keep vLLM/triton/torch caches out of system disk. |
| 29 | 30 | RERANKER_RUNTIME_DIR="${RERANKER_RUNTIME_DIR:-${PROJECT_ROOT}/.runtime/reranker}" |
| ... | ... | @@ -58,7 +59,21 @@ except Exception: |
| 58 | 59 | PY |
| 59 | 60 | then |
| 60 | 61 | echo "ERROR: qwen3_vllm backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2 |
| 61 | - echo "Please run: ./scripts/setup_reranker_venv.sh and verify CUDA is available." >&2 | |
| 62 | + echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND} and verify CUDA is available." >&2 | |
| 63 | + exit 1 | |
| 64 | + fi | |
| 65 | +fi | |
| 66 | + | |
| 67 | +if [[ "${RERANK_BACKEND}" == "qwen3_gguf" ]]; then | |
| 68 | + if ! "${PYTHON_BIN}" - <<'PY' | |
| 69 | +try: | |
| 70 | + import llama_cpp # noqa: F401 | |
| 71 | +except Exception: | |
| 72 | + raise SystemExit(1) | |
| 73 | +PY | |
| 74 | + then | |
| 75 | + echo "ERROR: qwen3_gguf backend requires llama-cpp-python in ${RERANKER_VENV}." >&2 | |
| 76 | + echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 | |
| 62 | 77 | exit 1 |
| 63 | 78 | fi |
| 64 | 79 | fi | ... | ... |
| ... | ... | @@ -0,0 +1,90 @@ |
| 1 | +from __future__ import annotations | |
| 2 | + | |
| 3 | +import sys | |
| 4 | +import types | |
| 5 | + | |
| 6 | +from reranker.backends import get_rerank_backend | |
| 7 | +from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend | |
| 8 | + | |
| 9 | + | |
| 10 | +class _FakeLlama: | |
| 11 | + def __init__(self, model_path: str | None = None, **kwargs): | |
| 12 | + self.model_path = model_path | |
| 13 | + self.kwargs = kwargs | |
| 14 | + self.eval_logits = [] | |
| 15 | + | |
| 16 | + @classmethod | |
| 17 | + def from_pretrained(cls, repo_id: str, filename: str, local_dir=None, cache_dir=None, **kwargs): | |
| 18 | + inst = cls(model_path=f"{repo_id}/{filename}", **kwargs) | |
| 19 | + inst.repo_id = repo_id | |
| 20 | + inst.filename = filename | |
| 21 | + inst.local_dir = local_dir | |
| 22 | + inst.cache_dir = cache_dir | |
| 23 | + return inst | |
| 24 | + | |
| 25 | + def tokenize(self, text: bytes, add_bos: bool = False, special: bool = False): | |
| 26 | + raw = text.decode("utf-8") | |
| 27 | + if raw == "yes": | |
| 28 | + return [1] | |
| 29 | + if raw == "no": | |
| 30 | + return [2] | |
| 31 | + return [10 + (ord(ch) % 17) for ch in raw] | |
| 32 | + | |
| 33 | + def reset(self): | |
| 34 | + return None | |
| 35 | + | |
| 36 | + def eval(self, prompt_tokens): | |
| 37 | + pos = float(sum(prompt_tokens) % 11) + 3.0 | |
| 38 | + neg = 1.0 | |
| 39 | + logits = [0.0] * 64 | |
| 40 | + logits[1] = pos | |
| 41 | + logits[2] = neg | |
| 42 | + self.eval_logits = [logits] | |
| 43 | + | |
| 44 | + | |
| 45 | +def _install_fake_llama_cpp(monkeypatch): | |
| 46 | + fake_module = types.SimpleNamespace(Llama=_FakeLlama) | |
| 47 | + monkeypatch.setitem(sys.modules, "llama_cpp", fake_module) | |
| 48 | + | |
| 49 | + | |
| 50 | +def test_qwen3_gguf_backend_factory_loads(monkeypatch): | |
| 51 | + _install_fake_llama_cpp(monkeypatch) | |
| 52 | + backend = get_rerank_backend( | |
| 53 | + "qwen3_gguf", | |
| 54 | + { | |
| 55 | + "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF", | |
| 56 | + "filename": "*Q8_0.gguf", | |
| 57 | + "enable_warmup": False, | |
| 58 | + }, | |
| 59 | + ) | |
| 60 | + assert isinstance(backend, Qwen3GGUFRerankerBackend) | |
| 61 | + | |
| 62 | + | |
| 63 | +def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): | |
| 64 | + _install_fake_llama_cpp(monkeypatch) | |
| 65 | + backend = Qwen3GGUFRerankerBackend( | |
| 66 | + { | |
| 67 | + "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF", | |
| 68 | + "filename": "*Q8_0.gguf", | |
| 69 | + "enable_warmup": False, | |
| 70 | + "infer_batch_size": 2, | |
| 71 | + "sort_by_doc_length": True, | |
| 72 | + } | |
| 73 | + ) | |
| 74 | + | |
| 75 | + scores, meta = backend.score_with_meta( | |
| 76 | + query="wireless mouse", | |
| 77 | + docs=["doc-a", "doc-b", "doc-a", "", " ", None], | |
| 78 | + normalize=True, | |
| 79 | + ) | |
| 80 | + | |
| 81 | + assert len(scores) == 6 | |
| 82 | + assert scores[0] == scores[2] | |
| 83 | + assert scores[0] > 0.5 | |
| 84 | + assert scores[1] > 0.5 | |
| 85 | + assert scores[3:] == [0.0, 0.0, 0.0] | |
| 86 | + assert meta["input_docs"] == 6 | |
| 87 | + assert meta["usable_docs"] == 3 | |
| 88 | + assert meta["unique_docs"] == 2 | |
| 89 | + assert meta["backend"] == "qwen3_gguf" | |
| 90 | + assert meta["inference_batches"] == 1 | ... | ... |