From 5c21a485bbd6bd2f6876a1d2ddee6a6afbeeffa9 Mon Sep 17 00:00:00 2001
From: tangwang
Date: Wed, 25 Mar 2026 15:04:48 +0800
Subject: [PATCH] qwen3-reranker-0.6b-gguf

---
 config/config.yaml                        |  37 +++++-
 config/services_config.py                 |   7 +
 requirements_reranker_qwen3_gguf.txt      |   1 +
 requirements_reranker_qwen3_gguf_06b.txt  |   3 +
 reranker/DEPLOYMENT_AND_TUNING.md         |   6 +-
 reranker/GGUF_0_6B_INSTALL_AND_TUNING.md  | 154 ++++++++++++
 reranker/GGUF_INSTALL_AND_TUNING.md       | 280 ++++++++++++++++++++++
 reranker/README.md                        |  28 ++-
 reranker/backends/__init__.py             |  11 +-
 reranker/backends/qwen3_gguf.py           | 125 +++++++---
 reranker/server.py                        |   2 +-
 scripts/benchmark_reranker_gguf_local.py  | 198 ++++++++++++++++
 scripts/lib/reranker_backend_env.sh       |   2 +
 scripts/setup_reranker_venv.sh            |  24 ++
 scripts/start_reranker.sh                 |  24 ++-
 tests/test_reranker_qwen3_gguf_backend.py |  31 ++-
 16 files changed, 886 insertions(+), 47 deletions(-)
 create mode 100644 requirements_reranker_qwen3_gguf_06b.txt
 create mode 100644 reranker/GGUF_0_6B_INSTALL_AND_TUNING.md
 create mode 100644 reranker/GGUF_INSTALL_AND_TUNING.md
 create mode 100644 scripts/benchmark_reranker_gguf_local.py

diff --git a/config/config.yaml b/config/config.yaml
index 6dd76a8..79f7a79 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -381,7 +381,7 @@ services:
       max_docs: 1000
       normalize: true
       # In-service backend (read when the reranker process starts)
-      backend: "qwen3_vllm"  # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank
+      backend: "qwen3_vllm"  # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank
       backends:
         bge:
           model_name: "BAAI/bge-reranker-v2-m3"
@@ -426,11 +426,11 @@ services:
           cache_dir: "./model_cache"
           local_dir: "./models/reranker/qwen3-reranker-4b-gguf"
           instruction: "Rank products by query with category & style match prioritized"
-          # Conservative config for T4 16GB / ~5-6 GB of VRAM
-          n_ctx: 384
-          n_batch: 384
-          n_ubatch: 128
-          n_gpu_layers: 24
+          # T4 16GB / performance-first config: offload all layers; measurably faster than the conservative config
+          n_ctx: 512
+          n_batch: 512
+          n_ubatch: 512
+          n_gpu_layers: 999
           main_gpu: 0
           n_threads: 2
           n_threads_batch: 4
@@ -443,6 +443,31 @@ services:
           length_sort_mode: "char"
           enable_warmup: true
           verbose: false
+        qwen3_gguf_06b:
+          repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF"
+          filename: "qwen3-reranker-0.6b-q8_0.gguf"
+          cache_dir: "./model_cache"
+          local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf"
+          instruction: "Rank products by query with category & style match prioritized"
+          # 0.6B GGUF / online rerank baseline:
+          # measured ~265 s for a single 400-title request, so it fits low-VRAM fallback use, not the low-latency online main route.
+          n_ctx: 256
+          n_batch: 256
+          n_ubatch: 256
+          n_gpu_layers: 999
+          main_gpu: 0
+          n_threads: 2
+          n_threads_batch: 4
+          flash_attn: true
+          offload_kqv: true
+          use_mmap: true
+          use_mlock: false
+          infer_batch_size: 32
+          sort_by_doc_length: true
+          length_sort_mode: "char"
+          reuse_query_state: false
+          enable_warmup: true
+          verbose: false
         dashscope_rerank:
           model_name: "qwen3-rerank"  # choose the endpoint by region
           endpoint:
diff --git a/config/services_config.py b/config/services_config.py
index 0fa30a5..092e04a 100644
--- a/config/services_config.py
+++ b/config/services_config.py
@@ -7,6 +7,7 @@ contains no independent parsing or precedence logic.
 
 from __future__ import annotations
 
+import os
 from typing import Any, Dict, Tuple
 
 from config.loader import get_app_config
@@ -61,6 +62,12 @@ def get_embedding_image_backend_config() -> Tuple[str, Dict[str, Any]]:
 
 def get_rerank_backend_config() -> Tuple[str, Dict[str, Any]]:
     cfg = get_app_config().services.rerank
+    backend = str(os.getenv("RERANK_BACKEND") or cfg.backend).strip()
+    if backend != cfg.backend:
+        backend_cfg = cfg.backends.get(backend)
+        if backend_cfg is None:
+            raise ValueError(f"Unknown rerank backend override from RERANK_BACKEND: {backend!r}")
+        return backend, dict(backend_cfg)
     return cfg.backend, cfg.get_backend_config()
 
 
diff --git a/requirements_reranker_qwen3_gguf.txt b/requirements_reranker_qwen3_gguf.txt
index c97f222..be724ef 100644
--- a/requirements_reranker_qwen3_gguf.txt
+++ b/requirements_reranker_qwen3_gguf.txt
@@ -1,4 +1,5 @@
 # Isolated dependencies for qwen3_gguf reranker backend (.venv-reranker-gguf).
 
 -r requirements_reranker_base.txt
+huggingface-hub>=0.32.0
 llama-cpp-python>=0.3.16
diff --git a/requirements_reranker_qwen3_gguf_06b.txt b/requirements_reranker_qwen3_gguf_06b.txt
new file mode 100644
index 0000000..c1ffd61
--- /dev/null
+++ b/requirements_reranker_qwen3_gguf_06b.txt
@@ -0,0 +1,3 @@
+# Isolated dependencies for qwen3_gguf_06b reranker backend (.venv-reranker-gguf-06b).
+
+-r requirements_reranker_qwen3_gguf.txt
diff --git a/reranker/DEPLOYMENT_AND_TUNING.md b/reranker/DEPLOYMENT_AND_TUNING.md
index c9a18eb..922c16d 100644
--- a/reranker/DEPLOYMENT_AND_TUNING.md
+++ b/reranker/DEPLOYMENT_AND_TUNING.md
@@ -3,15 +3,15 @@
 This document captures reusable practices from this project's e-commerce search reranking scenario, covering:
 
 - environment preparation, installation, and deployment
-- `qwen3_vllm` / `qwen3_gguf` configuration options and tuning ideas
+- `qwen3_vllm` / `qwen3_gguf` / `qwen3_gguf_06b` configuration options and tuning ideas
 - the 1000-doc benchmark workflow
 - key conclusions and recommended default parameters
 - common troubleshooting
 
 Scope:
 
-- Rerank backends: `services.rerank.backend: qwen3_vllm` or `qwen3_gguf`
-- Models: `Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF`
+- Rerank backends: `services.rerank.backend: qwen3_vllm` / `qwen3_gguf` / `qwen3_gguf_06b`
+- Models: `Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` / `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF`
 - Scenario: short queries (usually < 100 tokens); docs are product titles or title + short description; roughly 1000 docs per request
 
 ## 1. Environment baseline
diff --git a/reranker/GGUF_0_6B_INSTALL_AND_TUNING.md b/reranker/GGUF_0_6B_INSTALL_AND_TUNING.md
new file mode 100644
index 0000000..68e1ddd
--- /dev/null
+++ b/reranker/GGUF_0_6B_INSTALL_AND_TUNING.md
@@ -0,0 +1,154 @@
+# Qwen3-Reranker-0.6B GGUF installation and tuning
+
+This document covers the `qwen3_gguf_06b` backend, which maps to:
+
+- Hugging Face: `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF`
+- File: `qwen3-reranker-0.6b-q8_0.gguf`
+- Local directory: `./models/reranker/qwen3-reranker-0.6b-q8_0-gguf`
+
+## Conclusions first
+
+The backend is fully wired up and GPU offload works, but it does not fit this project's low-latency online path.
+
+The target scenario is:
+
+- 1 query
+- 400 product titles
+- shortest possible response time
+
+Measured with the best configuration found:
+
+- GPU memory usage: about `894 MiB`
+- single-request latency for 400 titles: about `265318 ms`
+
+It is therefore a better fit as:
+
+- a low-VRAM fallback
+- functional verification
+- local offline experiments
+
+It is not recommended as the main low-latency online reranker backend.
+
+## Isolated environment
+
+`qwen3_gguf_06b` uses its own venv:
+
+- backend: `qwen3_gguf_06b`
+- venv: `.venv-reranker-gguf-06b`
+- requirements: `requirements_reranker_qwen3_gguf_06b.txt`
+
+Install:
+
+```bash
+./scripts/setup_reranker_venv.sh qwen3_gguf_06b
+```
+
+To confirm you have the CUDA build of `llama-cpp-python`:
+
+```bash
+./.venv-reranker-gguf-06b/bin/python - <<'PY'
+import llama_cpp
+print(llama_cpp.llama_supports_gpu_offload())
+PY
+```
+
+Expected output:
+
+```text
+True
+```
+
+## Model download
+
+Download the model locally ahead of time to avoid an online pull on first service start:
+
+```bash
+mkdir -p models/reranker/qwen3-reranker-0.6b-q8_0-gguf
+curl -L --fail -C - \
+  -o models/reranker/qwen3-reranker-0.6b-q8_0-gguf/qwen3-reranker-0.6b-q8_0.gguf \
+  'https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/resolve/main/qwen3-reranker-0.6b-q8_0.gguf?download=true'
+```
+
+Measured file size:
+
+- `639153184` bytes
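+
+If you prefer `huggingface-hub` over `curl` (it is already pulled in via `requirements_reranker_qwen3_gguf.txt`), a minimal download sketch; the paths mirror the config above:
+
+```python
+import os
+
+# Same workaround start_reranker.sh applies when Xet downloads fail with
+# "416 Range Not Satisfiable" on this host.
+os.environ.setdefault("HF_HUB_DISABLE_XET", "1")
+
+from huggingface_hub import hf_hub_download
+
+# Downloads into local_dir and returns the resolved file path.
+path = hf_hub_download(
+    repo_id="ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF",
+    filename="qwen3-reranker-0.6b-q8_0.gguf",
+    local_dir="./models/reranker/qwen3-reranker-0.6b-q8_0-gguf",
+)
+print(path)
+```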
+
+## Recommended configuration
+
+Keep this in `config/config.yaml`:
+
+```yaml
+qwen3_gguf_06b:
+  repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF"
+  filename: "qwen3-reranker-0.6b-q8_0.gguf"
+  local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf"
+  cache_dir: "./model_cache"
+  instruction: "Rank products by query with category & style match prioritized"
+  n_ctx: 256
+  n_batch: 256
+  n_ubatch: 256
+  n_gpu_layers: 999
+  main_gpu: 0
+  n_threads: 2
+  n_threads_batch: 4
+  flash_attn: true
+  offload_kqv: true
+  use_mmap: true
+  use_mlock: false
+  infer_batch_size: 32
+  sort_by_doc_length: true
+  length_sort_mode: "char"
+  reuse_query_state: false
+  enable_warmup: true
+  verbose: false
+```
+
+## Tuning results
+
+All runs were on the same machine. Titles come from `/home/ubuntu/rerank_test/titles.1.8w`; the query is `白色oversized T-shirt`.
+
+80 titles:
+
+- `n_ctx=256, reuse_query_state=true` -> `60108 ms`
+- `n_ctx=256, reuse_query_state=false` -> `53383~56893 ms`
+- `n_ctx=320, reuse_query_state=true` -> `60961 ms`
+- `n_ctx=384, reuse_query_state=true` -> `56578 ms`
+- `n_ctx=384, reuse_query_state=false` -> `57272 ms`
+- `n_ctx=512, reuse_query_state=false` -> `60542 ms`
+- `n_ctx=256, reuse_query_state=false, n_threads=4, n_threads_batch=8` -> `61228 ms`
+
+400 titles:
+
+- `n_ctx=256, n_batch=256, n_ubatch=256, n_gpu_layers=999, reuse_query_state=false`
+  -> `265318 ms`
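+
+These numbers fit a simple serial cost model: the backend makes one llama.cpp scoring pass per doc, so latency grows roughly linearly with doc count. A quick plausibility check in plain Python, with constants taken from the measurements above:
+
+```python
+# ~663 ms per title, derived from the 400-title run.
+per_doc_ms = 265318 / 400
+# Predicts ~53064 ms for 80 titles; the measured 80-title runs
+# landed between 53383 and 61228 ms.
+print(round(80 * per_doc_ms))
+```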
+
+## Lessons learned
+
+The most important takeaway from this integration is not "which small parameter is faster", but:
+
+1. The 0.6B GGUF weights are small, yet the current backend implementation still scores docs one by one, in order.
+2. For an online 400-title request, that serial scoring is itself the main bottleneck.
+3. `reuse_query_state` brought no gain on this model; it was actually slower.
+4. Raising `n_ctx` to `384/512` brought no real gain either; it was slower or at best on par.
+5. This backend's advantage is low VRAM usage, not low latency.
+
+If the goal is the shortest online response time, the suggested priority is:
+
+1. `qwen3_vllm`
+2. other backends with real high-throughput batching
+3. `qwen3_gguf_06b` only as a low-VRAM fallback
+
+## Verification commands
+
+Tune against the backend directly:
+
+```bash
+PYTHONPATH=/data/saas-search ./.venv-reranker-gguf-06b/bin/python \
+  scripts/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400
+```
+
+Start it as a service:
+
+```bash
+RERANK_BACKEND=qwen3_gguf_06b ./scripts/start_reranker.sh
+```
diff --git a/reranker/GGUF_INSTALL_AND_TUNING.md b/reranker/GGUF_INSTALL_AND_TUNING.md
new file mode 100644
index 0000000..773b249
--- /dev/null
+++ b/reranker/GGUF_INSTALL_AND_TUNING.md
@@ -0,0 +1,280 @@
+# Qwen3 GGUF installation and tuning manual
+
+This document covers only the `qwen3_gguf` backend; the target machine is this project's measured environment:
+
+- GPU: `Tesla T4 16GB`
+- CUDA: `12.8`
+- Model: `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF`
+- Quantization: `Q8_0`
+
+---
+
+## 1. Conclusions first
+
+In the current code, the GGUF backend's main bottleneck is not "VRAM left idle" but the fact that **llama.cpp scores docs one at a time, in order**. The most effective optimizations are therefore:
+
+- offload as many model layers as possible to the GPU
+- enable `flash_attn` / `offload_kqv`
+- move `n_ctx / n_batch / n_ubatch` to a sweet spot for short-title reranking
+
+The recommended configuration for this machine from this round:
+
+```yaml
+qwen3_gguf:
+  n_ctx: 512
+  n_batch: 512
+  n_ubatch: 512
+  n_gpu_layers: 999
+  n_threads: 2
+  n_threads_batch: 4
+  flash_attn: true
+  offload_kqv: true
+  infer_batch_size: 8
+  sort_by_doc_length: true
+  length_sort_mode: "char"
+```
+
+Notes:
+
+- `n_gpu_layers: 999` is llama.cpp shorthand for "offload every layer possible"
+- on this T4, **even with full offload, this model only uses about `4.5 GiB` of GPU memory**
+- so "being allowed 8 GB of VRAM" does not automatically buy more speed; for this model/backend and workload, everything worth putting on the GPU is already there
+
+---
+
+## 2. Isolated environment
+
+`qwen3_gguf` must use its own venv:
+
+- `qwen3_vllm` -> `.venv-reranker`
+- `qwen3_gguf` -> `.venv-reranker-gguf`
+
+Install command:
+
+```bash
+./scripts/setup_reranker_venv.sh qwen3_gguf
+```
+
+The script now does two things automatically:
+
+1. installs the Python dependencies the GGUF backend needs
+2. when `/usr/local/cuda/bin/nvcc` is detected, **rebuilds `llama-cpp-python` as the CUDA variant**
+
+---
+
+## 3. Verifying the GPU build
+
+You must verify that the build is not CPU-only:
+
+```bash
+./.venv-reranker-gguf/bin/python - <<'PY'
+import llama_cpp
+print("supports_gpu_offload =", llama_cpp.llama_supports_gpu_offload())
+PY
+```
+
+The correct result is:
+
+```text
+supports_gpu_offload = True
+```
+
+You can also inspect the shared libraries:
+
+```bash
+ldd .venv-reranker-gguf/lib/python3.12/site-packages/llama_cpp/lib/libllama.so | rg 'cuda|cublas|ggml-cuda'
+```
+
+You should see:
+
+- `libggml-cuda.so`
+- `libcudart.so`
+- `libcublas.so`
+
+---
+
+## 4. Model download
+
+Local files take priority; the model lives at:
+
+```text
+models/reranker/qwen3-reranker-4b-gguf/Qwen.Qwen3-Reranker-4B.Q8_0.gguf
+```
+
+When the local file exists, the backend loads the local GGUF directly and no longer depends on an online download at startup.
+
+To avoid the Hugging Face Xet `416 Range Not Satisfiable` download failures seen on this machine, `start_reranker.sh` now sets the following by default for `qwen3_gguf`:
+
+```bash
+HF_HUB_DISABLE_XET=1
+```
+
+---
+
+## 5. Local tuning script
+
+A new local benchmark script:
+
+```bash
+PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \
+  scripts/benchmark_reranker_gguf_local.py --docs 64 --repeat 1
+```
+
+It instantiates the GGUF backend directly and reports:
+
+- model load time
+- GPU memory used by the current process
+- single-request rerank latency
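+
+For a one-off probe without the script's scan loop, the backend can be instantiated directly. A minimal sketch using the same API the benchmark script calls; the config values are illustrative and assume the local model file and `PYTHONPATH` are set up as above:
+
+```python
+import time
+
+from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend
+
+backend = Qwen3GGUFRerankerBackend({
+    "_backend_name": "qwen3_gguf",
+    "local_dir": "./models/reranker/qwen3-reranker-4b-gguf",
+    "filename": "*Q8_0.gguf",
+    "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999,
+    "flash_attn": True, "offload_kqv": True,
+    "enable_warmup": False,
+})
+t0 = time.perf_counter()
+scores, meta = backend.score_with_meta("白色oversized T-shirt", ["oversized 白色宽松T恤 纯棉"], normalize=True)
+print(f"{(time.perf_counter() - t0) * 1000:.1f} ms", meta["backend"], scores)
+```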
+
+---
+
+## 6. Measured results from this round
+
+Test conditions:
+
+- Query: `白色oversized T-shirt`
+- Docs: `64` product titles
+- Local script: `scripts/benchmark_reranker_gguf_local.py`
+- 1 run per config; the focus is relative trends
+
+Results:
+
+### 6.1 Conservative config
+
+```text
+n_ctx=384
+n_batch=384
+n_ubatch=128
+n_gpu_layers=24
+```
+
+- GPU memory: `2984 MiB`
+- 64-doc latency: `74347.91 ms`
+
+### 6.2 Full offload
+
+```text
+n_ctx=384
+n_batch=384
+n_ubatch=128
+n_gpu_layers=999
+```
+
+- GPU memory: `4338 MiB`
+- 64-doc latency: `51401.77 ms`
+
+### 6.3 Best config
+
+```text
+n_ctx=512
+n_batch=512
+n_ubatch=512
+n_gpu_layers=999
+```
+
+- GPU memory: `4564 MiB`
+- 64-doc latency: `49116.10 ms`
+
+### 6.4 Other attempts
+
+`n_threads=4 / n_threads_batch=8`:
+
+- GPU memory: `4564 MiB`
+- 64-doc latency: `49895.88 ms`
+- slightly slower than the recommended values
+
+`infer_batch_size=64`:
+
+- GPU memory: `4564 MiB`
+- 64-doc latency: `50723.36 ms`
+- also slightly slower
+
+### 6.5 API-level verification
+
+After writing the recommended config into `config/config.yaml` and restarting the service, run:
+
+```bash
+RERANK_BASE=http://127.0.0.1:6007 \
+  ./.venv/bin/python scripts/benchmark_reranker_random_titles.py 64 --repeat 1 --query '白色oversized T-shirt'
+```
+
+which gives:
+
+- `64 docs`: `50177.22 ms`
+
+Then:
+
+```bash
+RERANK_BASE=http://127.0.0.1:6007 \
+  ./.venv/bin/python scripts/benchmark_reranker_random_titles.py 153 --repeat 1 --query '白色oversized T-shirt'
+```
+
+which gives:
+
+- `153 docs`: `115328.60 ms`
+
+Against the old logs for the conservative config:
+
+- old config, `153 docs`: `153435.37 ms`
+- new config, `153 docs`: `115328.60 ms`
+
+An improvement of about:
+
+- `24.8%`
+
+---
+
+## 7. Why it does not fill the 8 GB
+
+This conclusion matters:
+
+- the best config already offloads every layer it can
+- in this llama.cpp / T4 / short-text reranking setup, the `Q8_0` model **measures only about `4.5 GiB` of GPU memory**
+- growing `n_ctx` just to "fill 8 GB" does not meaningfully raise throughput and can add overhead
+
+So this round was not about "VRAM being too conservative"; rather:
+
+- the offloadable weights are essentially all offloaded already
+- what actually slows responses down is the **doc-by-doc serial inference** path in this backend
+
+---
+
+## 8. Production advice
+
+### 8.1 Current recommendation
+
+Keep these parameters:
+
+```yaml
+n_ctx: 512
+n_batch: 512
+n_ubatch: 512
+n_gpu_layers: 999
+n_threads: 2
+n_threads_batch: 4
+flash_attn: true
+offload_kqv: true
+```
+
+### 8.2 If it is still too slow
+
+Suggested priority:
+
+1. shrink `rerank_window`
+2. pass in fewer docs
+3. if the business allows, switch to a backend built for high throughput
+
+Why:
+
+- the current GGUF backend is a single local process scoring docs serially
+- for long-list reranking it is inherently worse at throughput than vLLM / cloud rerank APIs
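+
+As a cross-check that the serial path, not VRAM, sets the latency floor, the §6 numbers fit a per-doc cost model almost exactly. A quick check in plain Python, constants taken from §6.3 and §6.5:
+
+```python
+# Per-doc serial cost from the best 64-doc run (§6.3), applied to the 153-doc run (§6.5).
+per_doc_ms = 49116.10 / 64      # ~767 ms per doc
+print(round(153 * per_doc_ms))  # ~117418 ms predicted vs 115328.60 ms measured
+```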
+
+---
+
+## 9. Files changed in this round
+
+- `config/config.yaml`
+- `scripts/setup_reranker_venv.sh`
+- `scripts/start_reranker.sh`
+- `scripts/benchmark_reranker_gguf_local.py`
+- `reranker/GGUF_INSTALL_AND_TUNING.md`
diff --git a/reranker/README.md b/reranker/README.md
index 0460b3c..d77ef7f 100644
--- a/reranker/README.md
+++ b/reranker/README.md
@@ -1,13 +1,13 @@
 # Reranker module
 
-**Request examples**: see `docs/QUICKSTART.md` §3.5. Extension spec: `docs/DEVELOPER_GUIDE.md` §7. Deployment and tuning practice: `reranker/DEPLOYMENT_AND_TUNING.md`.
+**Request examples**: see `docs/QUICKSTART.md` §3.5. Extension spec: `docs/DEVELOPER_GUIDE.md` §7. Deployment and tuning practice: `reranker/DEPLOYMENT_AND_TUNING.md`. For the dedicated integration and tuning notes on `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF`, see `reranker/GGUF_0_6B_INSTALL_AND_TUNING.md`.
 
 ---
 
 The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, Qwen3-GGUF, DashScope cloud rerank). Callers talk plain HTTP and never care which backend serves them.
 
 **Features**
-- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `bge` (kept for compatibility)
+- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `qwen3_gguf_06b` (Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp), `bge` (kept for compatibility)
 - Cloud backend: `dashscope_rerank` (calls DashScope `/compatible-api/v1/reranks`, endpoint switchable by region)
 - Unified config: `config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<backend>`
 - Doc dedup, scores aligned with input order, FP16/GPU support (backend-dependent)
@@ -19,7 +19,7 @@ The reranker service exposes a unified `/rerank` API with pluggable backends (B
   - `backends/bge.py`: BGE backend
   - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM backend
   - `backends/qwen3_transformers.py`: Qwen3-Reranker-0.6B plain Transformers backend (official usage)
-  - `backends/qwen3_gguf.py`: Qwen3-Reranker-4B GGUF + llama.cpp backend
+  - `backends/qwen3_gguf.py`: Qwen3-Reranker GGUF + llama.cpp backend (serves both `qwen3_gguf` and `qwen3_gguf_06b`)
   - `backends/dashscope_rerank.py`: DashScope cloud rerank backend (HTTP)
 - `reranker/bge_reranker.py`: BGE core inference (wrapped by the bge backend)
 - `reranker/config.py`: service port, MAX_DOCS, NORMALIZE, etc. (backend params live in config.yaml)
@@ -32,11 +32,12 @@ The reranker service exposes a unified `/rerank` API with pluggable backends (B
 - Each backend now uses its own venv:
   - `qwen3_vllm` -> `.venv-reranker`
   - `qwen3_gguf` -> `.venv-reranker-gguf`
+  - `qwen3_gguf_06b` -> `.venv-reranker-gguf-06b`
   - `qwen3_transformers` -> `.venv-reranker-transformers`
   - `bge` -> `.venv-reranker-bge`
   - `dashscope_rerank` -> `.venv-reranker-dashscope`
   ```bash
-  ./scripts/setup_reranker_venv.sh qwen3_gguf
+  ./scripts/setup_reranker_venv.sh qwen3_gguf_06b
   ```
   CUDA build tip:
   ```bash
@@ -48,7 +49,7 @@ The reranker service exposes a unified `/rerank` API with pluggable backends (B
 ```
 
 ## Configuration
-- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` env var.
+- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` env var.
 - **Backend params**: `services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`, e.g.:
 
 ```yaml
@@ -96,6 +97,20 @@ services:
       infer_batch_size: 8
       sort_by_doc_length: true
       length_sort_mode: "char"
+    qwen3_gguf_06b:
+      repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF"
+      filename: "qwen3-reranker-0.6b-q8_0.gguf"
+      local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf"
+      cache_dir: "./model_cache"
+      instruction: "Rank products by query with category & style match prioritized"
+      n_ctx: 256
+      n_batch: 256
+      n_ubatch: 256
+      n_gpu_layers: 999
+      infer_batch_size: 32
+      sort_by_doc_length: true
+      length_sort_mode: "char"
+      reuse_query_state: false
     dashscope_rerank:
       model_name: "qwen3-rerank"
      endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks"
@@ -153,7 +168,7 @@ Content-Type: application/json
 ```
 
 `top_n` is an optional field:
-- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `bge`) usually ignore it and still return scores for all docs.
+- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`) usually ignore it and still return scores for all docs.
 - For `dashscope_rerank` it can cap how many candidates the cloud returns; set it to `page+size` (e.g. pass `30` for pagination `from=20,size=10`).
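+
+A hedged Python sketch of the same call (service address/port as deployed in this repo; `requests` assumed available):
+
+```python
+import requests
+
+resp = requests.post(
+    "http://127.0.0.1:6007/rerank",
+    json={
+        "query": "白色oversized T-shirt",
+        "docs": ["oversized white cotton tee", "slim-fit black jeans"],
+        "normalize": True,
+        "top_n": 30,  # optional; local backends usually ignore it and score everything
+    },
+    timeout=600,
+)
+resp.raise_for_status()
+print(resp.json()["scores"])
+```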
 
 Response:
@@ -192,3 +207,4 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info
 - **Qwen3-vLLM**: see [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B); needs a GPU and a fair amount of VRAM; compared with BGE it suits long-text, high-throughput scenarios (vLLM prefix caching).
 - **Qwen3-Transformers**: the official Transformers usage, no vLLM needed; suits CPU or small-VRAM setups. Defaults to `attn_implementation: "sdpa"`; if `flash_attn` is installed you can set `flash_attention_2` (the service falls back to sdpa automatically when it is missing).
 - **Qwen3-GGUF**: see [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF). On a single T4 with only about `4.8~6GB` of VRAM left, start from `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true`; on startup OOM, first drop `n_gpu_layers` to `20`, then drop `n_ctx` to `320`. In the GGUF backend, `infer_batch_size` is a service-side work chunk and usually matters less than `n_gpu_layers` / `n_ctx`.
+- **Qwen3-GGUF-0.6B**: see [ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF](https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF). Its advantages are small weights and low VRAM use, measured at about `0.9~1.1 GiB` per process; but with the current serial llama.cpp scoring path, `1 query + 400 titles` still measures around `265s`. It fits as a low-VRAM functional fallback, not as the online low-latency main reranker.
diff --git a/reranker/backends/__init__.py b/reranker/backends/__init__.py
index 293eb1c..2daf354 100644
--- a/reranker/backends/__init__.py
+++ b/reranker/backends/__init__.py
@@ -48,12 +48,19 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc
         return Qwen3TransformersRerankerBackend(config)
     if name == "qwen3_gguf":
         from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend
-        return Qwen3GGUFRerankerBackend(config)
+        gguf_config = dict(config or {})
+        gguf_config.setdefault("_backend_name", "qwen3_gguf")
+        return Qwen3GGUFRerankerBackend(gguf_config)
+    if name == "qwen3_gguf_06b":
+        from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend
+        gguf_config = dict(config or {})
+        gguf_config.setdefault("_backend_name", "qwen3_gguf_06b")
+        return Qwen3GGUFRerankerBackend(gguf_config)
     if name == "dashscope_rerank":
         from reranker.backends.dashscope_rerank import DashScopeRerankBackend
         return DashScopeRerankBackend(config)
     raise ValueError(
-        f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, dashscope_rerank"
+        f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank"
     )
diff --git a/reranker/backends/qwen3_gguf.py b/reranker/backends/qwen3_gguf.py
index c980506..8dd8b7c 100644
--- a/reranker/backends/qwen3_gguf.py
+++ b/reranker/backends/qwen3_gguf.py
@@ -1,9 +1,11 @@
 """
-Qwen3-Reranker-4B GGUF backend using llama-cpp-python.
+Qwen3-Reranker GGUF backend using llama-cpp-python.
 
 Reference:
 - https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF
 - https://huggingface.co/Qwen/Qwen3-Reranker-4B
+- https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF
+- https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
 """
 
 from __future__ import annotations
@@ -13,12 +15,27 @@
 import math
 import os
 import threading
 import time
+from pathlib import Path
 from typing import Any, Dict, List, Tuple
 
 logger = logging.getLogger("reranker.backends.qwen3_gguf")
 
 
+_BACKEND_DEFAULTS: Dict[str, Dict[str, str]] = {
+    "qwen3_gguf": {
+        "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF",
+        "filename": "*Q8_0.gguf",
+        "local_dir": "./models/reranker/qwen3-reranker-4b-gguf",
+    },
+    "qwen3_gguf_06b": {
+        "repo_id": "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF",
+        "filename": "qwen3-reranker-0.6b-q8_0.gguf",
+        "local_dir": "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf",
+    },
+}
+
+
 def deduplicate_with_positions(texts: List[str]) -> Tuple[List[str], List[int]]:
     """Deduplicate texts globally while preserving first-seen order."""
     unique_texts: List[str] = []
@@ -46,21 +63,21 @@ def _format_instruction(instruction: str, query: str, doc: str) -> str:
 
 class Qwen3GGUFRerankerBackend:
     """
-    Qwen3-Reranker-4B GGUF backend using llama.cpp through llama-cpp-python.
+    Qwen3-Reranker GGUF backend using llama.cpp through llama-cpp-python.
 
-    Tuned for short-query / short-doc reranking on a memory-constrained single T4.
-    Config from services.rerank.backends.qwen3_gguf.
+    Tuned for short-query / short-doc reranking on a single GPU.
+    Config from services.rerank.backends.<backend_name>.
     """
 
     def __init__(self, config: Dict[str, Any]) -> None:
         self._config = config or {}
-        self._repo_id = str(
-            self._config.get("repo_id") or "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
-        ).strip()
-        self._filename = str(self._config.get("filename") or "*Q8_0.gguf").strip()
+        self._backend_name = str(self._config.get("_backend_name") or "qwen3_gguf").strip()
+        defaults = _BACKEND_DEFAULTS.get(self._backend_name, _BACKEND_DEFAULTS["qwen3_gguf"])
+        self._repo_id = str(self._config.get("repo_id") or defaults["repo_id"]).strip()
+        self._filename = str(self._config.get("filename") or defaults["filename"]).strip()
         self._model_path = str(self._config.get("model_path") or "").strip()
         self._cache_dir = str(self._config.get("cache_dir") or "").strip() or None
-        self._local_dir = str(self._config.get("local_dir") or "").strip() or None
+        self._local_dir = str(self._config.get("local_dir") or defaults["local_dir"]).strip() or None
         self._instruction = str(
             self._config.get("instruction")
             or "Rank products by query with category & style match prioritized"
@@ -79,6 +96,7 @@
             "on",
         }
         self._length_sort_mode = str(self._config.get("length_sort_mode") or "char").strip().lower()
+        self._reuse_query_state = bool(self._config.get("reuse_query_state", False))
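+        # reuse_query_state caches the KV state of prefix+instruction+query once per
+        # request (llama.cpp save_state/load_state) and restores it before each doc,
+        # so only doc+suffix tokens are re-evaluated per candidate. Off by default:
+        # on the 0.6B GGUF model it measured slower than plain per-doc prompts
+        # (see reranker/GGUF_0_6B_INSTALL_AND_TUNING.md).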
 
         n_ctx = int(self._config.get("n_ctx", self._config.get("max_model_len", 384)))
         n_batch = int(self._config.get("n_batch", min(n_ctx, 384)))
@@ -105,8 +123,9 @@
             from llama_cpp import Llama
         except Exception as exc:  # pragma: no cover - depends on optional dependency
             raise RuntimeError(
-                "qwen3_gguf backend requires llama-cpp-python. "
-                "Install the qwen3_gguf backend venv first via scripts/setup_reranker_venv.sh qwen3_gguf."
+                f"{self._backend_name} backend requires llama-cpp-python. "
+                f"Install the {self._backend_name} backend venv first via "
+                f"scripts/setup_reranker_venv.sh {self._backend_name}."
             ) from exc
         self._llama_class = Llama
@@ -118,7 +137,8 @@
         self._infer_lock = threading.Lock()
 
         logger.info(
-            "[Qwen3_GGUF] Loading model repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s",
+            "[Qwen3_GGUF] Loading backend=%s repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s reuse_query_state=%s",
+            self._backend_name,
             self._repo_id,
             self._filename,
             self._model_path or None,
@@ -128,6 +148,7 @@
             n_gpu_layers,
             flash_attn,
             offload_kqv,
+            self._reuse_query_state,
         )
 
         llm_kwargs = {
@@ -158,6 +179,7 @@
         self._suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
         self._prefix_tokens = self._tokenize(self._prefix, special=True)
         self._suffix_tokens = self._tokenize(self._suffix, special=True)
+        self._request_prefix_template = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: "
         self._effective_max_len = self._n_ctx - len(self._prefix_tokens) - len(self._suffix_tokens)
         if self._effective_max_len <= 16:
             raise RuntimeError(
@@ -171,7 +193,8 @@
             self._warmup()
 
         logger.info(
-            "[Qwen3_GGUF] Model ready | model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s",
+            "[Qwen3_GGUF] Model ready | backend=%s model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s",
+            self._backend_name,
             self._model_name,
             self._effective_max_len,
             self._infer_batch_size,
@@ -181,6 +204,14 @@
     def _load_model(self, llm_kwargs: Dict[str, Any]):
         if self._model_path:
             return self._llama_class(model_path=self._model_path, **llm_kwargs)
+        if self._local_dir:
+            matches = sorted(
+                path for path in Path(self._local_dir).glob(self._filename) if path.is_file()
+            )
+            if matches:
+                local_model_path = str(matches[0].resolve())
+                logger.info("[Qwen3_GGUF] Using local GGUF file: %s", local_model_path)
+                return self._llama_class(model_path=local_model_path, **llm_kwargs)
         return self._llama_class.from_pretrained(
             repo_id=self._repo_id,
             filename=self._filename,
@@ -212,6 +243,13 @@
         except Exception as exc:  # pragma: no cover - defensive
             logger.warning("[Qwen3_GGUF] Warmup failed: %s", exc)
 
+    def _build_request_prefix_tokens(self, query: str) -> List[int]:
+        request_prefix = self._request_prefix_template.format(
+            instruction=self._instruction,
+            query=query,
+        )
+        return self._tokenize(request_prefix, special=False)
+
     def _build_prompt_tokens(self, query: str, doc: str) -> List[int]:
         pair = _format_instruction(self._instruction, query, doc)
         pair_tokens = self._tokenize(pair, special=False)
@@ -235,6 +273,36 @@
         false_exp = math.exp(false_logit - max_logit)
         return float(true_exp / (true_exp + false_exp))
 
+    def _supports_query_state_reuse(self) -> bool:
+        return (
+            self._reuse_query_state
+            and hasattr(self._llm, "save_state")
+            and hasattr(self._llm, "load_state")
+        )
+
+    def _build_query_state_locked(self, query: str):
+        request_prefix_tokens = self._build_request_prefix_tokens(query)
+        max_doc_tokens = self._effective_max_len - len(request_prefix_tokens)
+        if max_doc_tokens <= 0:
+            return None, 0
+        self._llm.reset()
+        self._llm.eval(self._prefix_tokens + request_prefix_tokens)
+        return self._llm.save_state(), max_doc_tokens
+
+    def _score_doc_with_state_locked(self, state, doc_tokens: List[int], max_doc_tokens: int) -> float:
+        self._llm.load_state(state)
+        self._llm.eval(doc_tokens[:max_doc_tokens] +
self._suffix_tokens) + logits = self._llm.eval_logits + if not logits: + raise RuntimeError("llama.cpp returned empty logits") + final_logits = list(logits[-1]) + true_logit = float(final_logits[self._true_token]) + false_logit = float(final_logits[self._false_token]) + max_logit = max(true_logit, false_logit) + true_exp = math.exp(true_logit - max_logit) + false_exp = math.exp(false_logit - max_logit) + return float(true_exp / (true_exp + false_exp)) + def _estimate_doc_lengths(self, docs: List[str]) -> List[int]: if self._length_sort_mode == "token": return [len(self._tokenize(text, special=False)) for text in docs] @@ -269,7 +337,7 @@ class Qwen3GGUFRerankerBackend: "dedup_ratio": 0.0, "elapsed_ms": round(elapsed_ms, 3), "model": self._model_name, - "backend": "qwen3_gguf", + "backend": self._backend_name, "normalize": normalize, "infer_batch_size": self._infer_batch_size, "inference_batches": 0, @@ -289,14 +357,26 @@ class Qwen3GGUFRerankerBackend: order = sorted(order, key=lambda i: lengths[i]) unique_scores: List[float] = [0.0] * len(unique_texts) + unique_doc_tokens = [self._tokenize(text, special=False) for text in unique_texts] inference_batches = 0 - for start in range(0, len(order), self._infer_batch_size): - batch_indices = order[start : start + self._infer_batch_size] - inference_batches += 1 - for idx in batch_indices: - prompt = self._build_prompt_tokens(query, unique_texts[idx]) - with self._infer_lock: - unique_scores[idx] = self._score_prompt(prompt) + with self._infer_lock: + query_state = None + max_doc_tokens = self._effective_max_len + if self._supports_query_state_reuse(): + query_state, max_doc_tokens = self._build_query_state_locked(query) + for start in range(0, len(order), self._infer_batch_size): + batch_indices = order[start : start + self._infer_batch_size] + inference_batches += 1 + for idx in batch_indices: + if query_state is not None: + unique_scores[idx] = self._score_doc_with_state_locked( + query_state, + unique_doc_tokens[idx], + max_doc_tokens, + ) + else: + prompt = self._build_prompt_tokens(query, unique_texts[idx]) + unique_scores[idx] = self._score_prompt(prompt) for (orig_idx, _), unique_idx in zip(indexed, position_to_unique): output_scores[orig_idx] = float(unique_scores[unique_idx]) @@ -313,7 +393,7 @@ class Qwen3GGUFRerankerBackend: "dedup_ratio": round(dedup_ratio, 4), "elapsed_ms": round(elapsed_ms, 3), "model": self._model_name, - "backend": "qwen3_gguf", + "backend": self._backend_name, "normalize": normalize, "infer_batch_size": self._infer_batch_size, "inference_batches": inference_batches, @@ -323,5 +403,6 @@ class Qwen3GGUFRerankerBackend: "n_batch": self._n_batch, "n_ubatch": self._n_ubatch, "n_gpu_layers": self._n_gpu_layers, + "reuse_query_state": query_state is not None, } return output_scores, meta diff --git a/reranker/server.py b/reranker/server.py index 42d3ec0..5faccfc 100644 --- a/reranker/server.py +++ b/reranker/server.py @@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional Response: { "scores": [float], "meta": {...} } Backend selected via config: services.rerank.backend -(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank), env RERANK_BACKEND. +(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND. 
""" import logging diff --git a/scripts/benchmark_reranker_gguf_local.py b/scripts/benchmark_reranker_gguf_local.py new file mode 100644 index 0000000..2d12b33 --- /dev/null +++ b/scripts/benchmark_reranker_gguf_local.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +""" +Local tuning probe for GGUF reranker backends. + +Runs the backend directly in a fresh process per config to measure: +- load time +- GPU memory used by this process +- single-request rerank latency + +Example: + ./.venv-reranker-gguf/bin/python scripts/benchmark_reranker_gguf_local.py + ./.venv-reranker-gguf-06b/bin/python scripts/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400 +""" + +from __future__ import annotations + +import argparse +import json +import os +import random +import statistics +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + + +DEFAULT_TITLES = Path("/home/ubuntu/rerank_test/titles.1.8w") + + +def load_titles(path: Path) -> list[str]: + items: list[str] = [] + with path.open(encoding="utf-8", errors="replace") as fh: + for line in fh: + text = line.strip() + if text: + items.append(text) + return items + + +def gpu_mem_for_pid(pid: int) -> int: + try: + out = subprocess.check_output( + [ + "nvidia-smi", + "--query-compute-apps=pid,used_gpu_memory", + "--format=csv,noheader,nounits", + ], + text=True, + ) + except Exception: + return -1 + for raw in out.splitlines(): + parts = [p.strip() for p in raw.split(",")] + if len(parts) != 2: + continue + try: + row_pid = int(parts[0]) + row_mem = int(parts[1]) + except ValueError: + continue + if row_pid == pid: + return row_mem + return -1 + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--backend-name", type=str, default="qwen3_gguf") + parser.add_argument("--titles-file", type=Path, default=DEFAULT_TITLES) + parser.add_argument("--query", type=str, default="白色oversized T-shirt") + parser.add_argument("--docs", type=int, default=160) + parser.add_argument("--repeat", type=int, default=1) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument( + "--configs-json", + type=str, + default="", + help="JSON array of config objects; when omitted, uses built-in scan set.", + ) + args = parser.parse_args() + + if not args.titles_file.is_file(): + print(f"missing titles file: {args.titles_file}", file=sys.stderr) + return 2 + + titles = load_titles(args.titles_file) + if len(titles) < args.docs: + print(f"not enough titles: need {args.docs}, got {len(titles)}", file=sys.stderr) + return 2 + + random.seed(args.seed) + docs = random.sample(titles, args.docs) + + if args.configs_json: + configs = json.loads(args.configs_json) + elif args.backend_name == "qwen3_gguf_06b": + configs = [ + {"name": "gguf_06b_full_256", "n_ctx": 256, "n_batch": 256, "n_ubatch": 256, "n_gpu_layers": 999}, + {"name": "gguf_06b_full_320", "n_ctx": 320, "n_batch": 320, "n_ubatch": 320, "n_gpu_layers": 999}, + {"name": "gguf_06b_full_384", "n_ctx": 384, "n_batch": 384, "n_ubatch": 384, "n_gpu_layers": 999}, + {"name": "gguf_06b_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999}, + ] + else: + configs = [ + {"name": "gguf_t4_24g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 24}, + {"name": "gguf_t4_40g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 40}, + {"name": "gguf_t4_full", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 999}, + {"name": "gguf_t4_full_512", "n_ctx": 512, "n_batch": 512, 
"n_ubatch": 256, "n_gpu_layers": 999}, + {"name": "gguf_t4_full_512_u512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999}, + {"name": "gguf_t4_full_768", "n_ctx": 768, "n_batch": 768, "n_ubatch": 256, "n_gpu_layers": 999}, + ] + + from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend + + default_cfg_by_backend: dict[str, dict[str, Any]] = { + "qwen3_gguf": { + "_backend_name": "qwen3_gguf", + "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF", + "filename": "*Q8_0.gguf", + "local_dir": "./models/reranker/qwen3-reranker-4b-gguf", + "infer_batch_size": 8, + }, + "qwen3_gguf_06b": { + "_backend_name": "qwen3_gguf_06b", + "repo_id": "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF", + "filename": "qwen3-reranker-0.6b-q8_0.gguf", + "local_dir": "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf", + "infer_batch_size": 32, + }, + } + if args.backend_name not in default_cfg_by_backend: + print(f"unsupported backend: {args.backend_name}", file=sys.stderr) + return 2 + + base_cfg: dict[str, Any] = { + **default_cfg_by_backend[args.backend_name], + "instruction": "Rank products by query with category & style match prioritized", + "cache_dir": "./model_cache", + "main_gpu": 0, + "n_threads": 2, + "n_threads_batch": 4, + "flash_attn": True, + "offload_kqv": True, + "use_mmap": True, + "use_mlock": False, + "sort_by_doc_length": True, + "length_sort_mode": "char", + "enable_warmup": True, + "verbose": False, + "reuse_query_state": True, + } + + all_results: list[dict[str, Any]] = [] + for cfg in configs: + merged = dict(base_cfg) + merged.update(cfg) + name = str(merged.pop("name")) + + t0 = time.perf_counter() + backend = Qwen3GGUFRerankerBackend(merged) + load_ms = (time.perf_counter() - t0) * 1000.0 + gpu_mem_mib = gpu_mem_for_pid(os.getpid()) + + runs: list[float] = [] + last_meta: dict[str, Any] = {} + for _ in range(args.repeat): + t1 = time.perf_counter() + _scores, meta = backend.score_with_meta(args.query, docs, normalize=True) + runs.append((time.perf_counter() - t1) * 1000.0) + last_meta = dict(meta) + + result = { + "name": name, + "config": merged, + "load_ms": round(load_ms, 2), + "gpu_mem_mib": gpu_mem_mib, + "latency_ms_min": round(min(runs), 2), + "latency_ms_avg": round(statistics.mean(runs), 2), + "latency_ms_max": round(max(runs), 2), + "meta": last_meta, + } + all_results.append(result) + print(json.dumps(result, ensure_ascii=False)) + del backend + + print("SUMMARY") + for item in sorted(all_results, key=lambda x: x["latency_ms_avg"]): + print( + f'{item["name"]}: avg={item["latency_ms_avg"]}ms ' + f'gpu={item["gpu_mem_mib"]}MiB load={item["load_ms"]}ms' + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/lib/reranker_backend_env.sh b/scripts/lib/reranker_backend_env.sh index 6464f8b..4aec319 100644 --- a/scripts/lib/reranker_backend_env.sh +++ b/scripts/lib/reranker_backend_env.sh @@ -40,6 +40,7 @@ reranker_backend_venv_dir() { case "${backend}" in qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;; qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;; + qwen3_gguf_06b) printf '%s/.venv-reranker-gguf-06b\n' "${project_root}" ;; qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;; bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;; dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;; @@ -54,6 +55,7 @@ reranker_backend_requirements_file() { case "${backend}" in qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' 
"${project_root}" ;; qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;; + qwen3_gguf_06b) printf '%s/requirements_reranker_qwen3_gguf_06b.txt\n' "${project_root}" ;; qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;; bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;; dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;; diff --git a/scripts/setup_reranker_venv.sh b/scripts/setup_reranker_venv.sh index 1b338f8..f20d533 100755 --- a/scripts/setup_reranker_venv.sh +++ b/scripts/setup_reranker_venv.sh @@ -50,6 +50,30 @@ echo "Using TMPDIR=${TMPDIR}" "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" --upgrade pip wheel "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r "${REQ_FILE}" +if [[ "${BACKEND}" == qwen3_gguf* ]]; then + if [[ -x "/usr/local/cuda/bin/nvcc" ]]; then + "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" \ + cmake \ + ninja \ + scikit-build-core \ + flit_core \ + setuptools-scm + echo "Rebuilding llama-cpp-python with CUDA support for ${BACKEND}" + PATH="/usr/local/cuda/bin:/usr/bin:/bin" \ + CC="/usr/bin/x86_64-linux-gnu-gcc" \ + CXX="/usr/bin/x86_64-linux-gnu-g++" \ + CUDACXX="/usr/local/cuda/bin/nvcc" \ + CMAKE_ARGS="-DGGML_CUDA=on" \ + FORCE_CMAKE=1 \ + "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" \ + --force-reinstall \ + --no-build-isolation \ + "llama-cpp-python==0.3.18" + else + echo "WARNING: /usr/local/cuda/bin/nvcc not found; ${BACKEND} will be installed without CUDA support." >&2 + fi +fi + echo echo "Done." echo "Backend: ${BACKEND}" diff --git a/scripts/start_reranker.sh b/scripts/start_reranker.sh index 2fd6649..9c4acd4 100755 --- a/scripts/start_reranker.sh +++ b/scripts/start_reranker.sh @@ -43,6 +43,10 @@ export TMPDIR="${RERANKER_RUNTIME_DIR}/tmp" export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}" export PATH="${RERANKER_VENV}/bin:${PATH}" +if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then + export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}" +fi + if [[ "${RERANK_BACKEND}" == "qwen3_vllm" ]]; then if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then echo "ERROR: qwen3_vllm backend requires NVIDIA GPU, but nvidia-smi is unavailable." >&2 @@ -64,16 +68,24 @@ PY fi fi -if [[ "${RERANK_BACKEND}" == "qwen3_gguf" ]]; then - if ! "${PYTHON_BIN}" - <<'PY' +if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then + gguf_check_status=0 + "${PYTHON_BIN}" - <<'PY' || gguf_check_status=$? try: - import llama_cpp # noqa: F401 + import llama_cpp + if hasattr(llama_cpp, "llama_supports_gpu_offload") and not llama_cpp.llama_supports_gpu_offload(): + raise SystemExit(2) except Exception: raise SystemExit(1) PY - then - echo "ERROR: qwen3_gguf backend requires llama-cpp-python in ${RERANKER_VENV}." >&2 - echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 + if [[ "${gguf_check_status}" != "0" ]]; then + if [[ "${gguf_check_status}" == "2" ]]; then + echo "ERROR: ${RERANK_BACKEND} backend detected a CPU-only llama-cpp-python build in ${RERANKER_VENV}." >&2 + echo "Please rerun: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 + else + echo "ERROR: ${RERANK_BACKEND} backend requires llama-cpp-python in ${RERANKER_VENV}." 
>&2 + echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 + fi exit 1 fi fi diff --git a/tests/test_reranker_qwen3_gguf_backend.py b/tests/test_reranker_qwen3_gguf_backend.py index 3886d1d..e4e5c89 100644 --- a/tests/test_reranker_qwen3_gguf_backend.py +++ b/tests/test_reranker_qwen3_gguf_backend.py @@ -12,6 +12,8 @@ class _FakeLlama: self.model_path = model_path self.kwargs = kwargs self.eval_logits = [] + self._tokens = [] + self.eval_call_count = 0 @classmethod def from_pretrained(cls, repo_id: str, filename: str, local_dir=None, cache_dir=None, **kwargs): @@ -31,16 +33,25 @@ class _FakeLlama: return [10 + (ord(ch) % 17) for ch in raw] def reset(self): + self._tokens = [] return None def eval(self, prompt_tokens): - pos = float(sum(prompt_tokens) % 11) + 3.0 + self.eval_call_count += 1 + self._tokens.extend(prompt_tokens) + pos = float(sum(self._tokens) % 11) + 3.0 neg = 1.0 logits = [0.0] * 64 logits[1] = pos logits[2] = neg self.eval_logits = [logits] + def save_state(self): + return list(self._tokens) + + def load_state(self, state): + self._tokens = list(state) + def _install_fake_llama_cpp(monkeypatch): fake_module = types.SimpleNamespace(Llama=_FakeLlama) @@ -58,6 +69,21 @@ def test_qwen3_gguf_backend_factory_loads(monkeypatch): }, ) assert isinstance(backend, Qwen3GGUFRerankerBackend) + assert backend._backend_name == "qwen3_gguf" + + +def test_qwen3_gguf_06b_backend_factory_loads(monkeypatch): + _install_fake_llama_cpp(monkeypatch) + backend = get_rerank_backend( + "qwen3_gguf_06b", + { + "enable_warmup": False, + }, + ) + assert isinstance(backend, Qwen3GGUFRerankerBackend) + assert backend._backend_name == "qwen3_gguf_06b" + assert backend._repo_id == "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" + assert backend._filename == "qwen3-reranker-0.6b-q8_0.gguf" def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): @@ -69,6 +95,7 @@ def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): "enable_warmup": False, "infer_batch_size": 2, "sort_by_doc_length": True, + "reuse_query_state": True, } ) @@ -88,3 +115,5 @@ def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): assert meta["unique_docs"] == 2 assert meta["backend"] == "qwen3_gguf" assert meta["inference_batches"] == 1 + assert meta["reuse_query_state"] is True + assert backend._llm.eval_call_count == 3 -- libgit2 0.21.2