Commit 5c21a485bbd6bd2f6876a1d2ddee6a6afbeeffa9
1 parent 3d508beb
qwen3-reranker-0.6b-gguf
Showing 16 changed files with 886 additions and 47 deletions
config/config.yaml
| ... | ... | @@ -381,7 +381,7 @@ services: |
| 381 | 381 | max_docs: 1000 |
| 382 | 382 | normalize: true |
| 383 | 383 | # In-service backend (read when the reranker process starts) |
| 384 | - backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank | |
| 384 | + backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank | |
| 385 | 385 | backends: |
| 386 | 386 | bge: |
| 387 | 387 | model_name: "BAAI/bge-reranker-v2-m3" |
| ... | ... | @@ -426,11 +426,11 @@ services: |
| 426 | 426 | cache_dir: "./model_cache" |
| 427 | 427 | local_dir: "./models/reranker/qwen3-reranker-4b-gguf" |
| 428 | 428 | instruction: "Rank products by query with category & style match prioritized" |
| 429 | - # Conservative profile for T4 16GB / roughly 5~6GB of VRAM | |
| 430 | - n_ctx: 384 | |
| 431 | - n_batch: 384 | |
| 432 | - n_ubatch: 128 | |
| 433 | - n_gpu_layers: 24 | |
| 429 | + # T4 16GB / performance-first profile: offload all layers; measured clearly faster than the conservative profile | |
| 430 | + n_ctx: 512 | |
| 431 | + n_batch: 512 | |
| 432 | + n_ubatch: 512 | |
| 433 | + n_gpu_layers: 999 | |
| 434 | 434 | main_gpu: 0 |
| 435 | 435 | n_threads: 2 |
| 436 | 436 | n_threads_batch: 4 |
| ... | ... | @@ -443,6 +443,31 @@ services: |
| 443 | 443 | length_sort_mode: "char" |
| 444 | 444 | enable_warmup: true |
| 445 | 445 | verbose: false |
| 446 | + qwen3_gguf_06b: | |
| 447 | + repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" | |
| 448 | + filename: "qwen3-reranker-0.6b-q8_0.gguf" | |
| 449 | + cache_dir: "./model_cache" | |
| 450 | + local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf" | |
| 451 | + instruction: "Rank products by query with category & style match prioritized" | |
| 452 | + # 0.6B GGUF / online rerank baseline: | |
| 453 | + # Measured ~265 s for a single 400-title request, so it suits a low-VRAM functional fallback, not the online low-latency main route. | |
| 454 | + n_ctx: 256 | |
| 455 | + n_batch: 256 | |
| 456 | + n_ubatch: 256 | |
| 457 | + n_gpu_layers: 999 | |
| 458 | + main_gpu: 0 | |
| 459 | + n_threads: 2 | |
| 460 | + n_threads_batch: 4 | |
| 461 | + flash_attn: true | |
| 462 | + offload_kqv: true | |
| 463 | + use_mmap: true | |
| 464 | + use_mlock: false | |
| 465 | + infer_batch_size: 32 | |
| 466 | + sort_by_doc_length: true | |
| 467 | + length_sort_mode: "char" | |
| 468 | + reuse_query_state: false | |
| 469 | + enable_warmup: true | |
| 470 | + verbose: false | |
| 446 | 471 | dashscope_rerank: |
| 447 | 472 | model_name: "qwen3-rerank" |
| 448 | 473 | # Choose the endpoint by region: | ... | ... |
config/services_config.py
| ... | ... | @@ -7,6 +7,7 @@ contains no independent parsing or precedence logic. |
| 7 | 7 | |
| 8 | 8 | from __future__ import annotations |
| 9 | 9 | |
| 10 | +import os | |
| 10 | 11 | from typing import Any, Dict, Tuple |
| 11 | 12 | |
| 12 | 13 | from config.loader import get_app_config |
| ... | ... | @@ -61,6 +62,12 @@ def get_embedding_image_backend_config() -> Tuple[str, Dict[str, Any]]: |
| 61 | 62 | |
| 62 | 63 | def get_rerank_backend_config() -> Tuple[str, Dict[str, Any]]: |
| 63 | 64 | cfg = get_app_config().services.rerank |
| 65 | + backend = str(os.getenv("RERANK_BACKEND") or cfg.backend).strip() | |
| 66 | + if backend != cfg.backend: | |
| 67 | + backend_cfg = cfg.backends.get(backend) | |
| 68 | + if backend_cfg is None: | |
| 69 | + raise ValueError(f"Unknown rerank backend override from RERANK_BACKEND: {backend!r}") | |
| 70 | + return backend, dict(backend_cfg) | |
| 64 | 71 | return cfg.backend, cfg.get_backend_config() |
| 65 | 72 | |
| 66 | 73 | ... | ... |
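The override added here lets `RERANK_BACKEND` take precedence over `config.yaml` while still validating the name against the configured backends. A minimal standalone sketch of that precedence, re-implemented for illustration (the real function reads `get_app_config().services.rerank` and delegates the default path to `cfg.get_backend_config()`):

```python
import os
from typing import Any, Dict, Tuple

def resolve_backend(default: str, backends: Dict[str, Dict[str, Any]]) -> Tuple[str, Dict[str, Any]]:
    # Env var wins; an unknown override fails fast instead of starting the wrong backend.
    backend = str(os.getenv("RERANK_BACKEND") or default).strip()
    if backend != default:
        backend_cfg = backends.get(backend)
        if backend_cfg is None:
            raise ValueError(f"Unknown rerank backend override from RERANK_BACKEND: {backend!r}")
        return backend, dict(backend_cfg)
    return default, dict(backends[default])

os.environ["RERANK_BACKEND"] = "qwen3_gguf_06b"
name, cfg = resolve_backend("qwen3_vllm", {"qwen3_vllm": {}, "qwen3_gguf_06b": {"n_ctx": 256}})
assert (name, cfg) == ("qwen3_gguf_06b", {"n_ctx": 256})
```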
requirements_reranker_qwen3_gguf.txt
reranker/DEPLOYMENT_AND_TUNING.md
| ... | ... | @@ -3,15 +3,15 @@ |
| 3 | 3 | This document captures reusable practices from this project's e-commerce search reranking scenario, covering: |
| 4 | 4 | |
| 5 | 5 | - Environment preparation, installation, and deployment |
| 6 | -- `qwen3_vllm` / `qwen3_gguf` configuration options and optimization approach | |
| 6 | +- `qwen3_vllm` / `qwen3_gguf` / `qwen3_gguf_06b` configuration options and optimization approach | |
| 7 | 7 | - Load-testing procedure for the 1000-doc scenario |
| 8 | 8 | - Key conclusions and recommended default parameters |
| 9 | 9 | - Common troubleshooting |
| 10 | 10 | |
| 11 | 11 | Scope: |
| 12 | 12 | |
| 13 | -- Rerank backend: `services.rerank.backend: qwen3_vllm` or `qwen3_gguf` | |
| 14 | -- Models: `Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` | |
| 13 | +- Rerank backend: `services.rerank.backend: qwen3_vllm` / `qwen3_gguf` / `qwen3_gguf_06b` | |
| 14 | +- Models: `Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` / `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF` | |
| 15 | 15 | - Scenario: short queries (usually < 100 tokens); docs are product titles or title plus a short description; ~1000 docs per request |
| 16 | 16 | |
| 17 | 17 | ## 1. Environment baseline | ... | ... |
reranker/GGUF_0_6B_INSTALL_AND_TUNING.md
| ... | ... | @@ -0,0 +1,154 @@ |
| 1 | +# Qwen3-Reranker-0.6B GGUF Installation and Tuning | |
| 2 | + | |
| 3 | +This document covers the `qwen3_gguf_06b` backend, which maps to: | |
| 4 | + | |
| 5 | +- Hugging Face: `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF` | |
| 6 | +- File: `qwen3-reranker-0.6b-q8_0.gguf` | |
| 7 | +- Local directory: `./models/reranker/qwen3-reranker-0.6b-q8_0-gguf` | |
| 8 | + | |
| 9 | +## Conclusions first | |
| 10 | + | |
| 11 | +This backend is fully wired up and GPU offload works correctly, but it does not fit this project's online main-path scenario. | |
| 12 | + | |
| 13 | +The target scenario is: | |
| 14 | + | |
| 15 | +- 1 query | |
| 16 | +- 400 product titles | |
| 17 | +- the shortest possible response time | |
| 18 | + | |
| 19 | +Measured under the best configuration: | |
| 20 | + | |
| 21 | +- GPU memory usage: about `894 MiB` | |
| 22 | +- single-request latency for 400 titles: about `265318 ms` | |
| 23 | + | |
| 24 | +It is therefore better suited as: | |
| 25 | + | |
| 26 | +- a low-VRAM fallback | |
| 27 | +- functional verification | |
| 28 | +- local offline experiments | |
| 29 | + | |
| 30 | +It is not recommended as the main backend for an online low-latency reranker. | |
| 31 | + | |
| 32 | +## Dedicated environment | |
| 33 | + | |
| 34 | +`qwen3_gguf_06b` uses its own venv: | |
| 35 | + | |
| 36 | +- backend: `qwen3_gguf_06b` | |
| 37 | +- venv: `.venv-reranker-gguf-06b` | |
| 38 | +- requirements: `requirements_reranker_qwen3_gguf_06b.txt` | |
| 39 | + | |
| 40 | +Install: | |
| 41 | + | |
| 42 | +```bash | |
| 43 | +./scripts/setup_reranker_venv.sh qwen3_gguf_06b | |
| 44 | +``` | |
| 45 | + | |
| 46 | +To confirm you have the CUDA build of `llama-cpp-python`: | |
| 47 | + | |
| 48 | +```bash | |
| 49 | +./.venv-reranker-gguf-06b/bin/python - <<'PY' | |
| 50 | +import llama_cpp | |
| 51 | +print(llama_cpp.llama_supports_gpu_offload()) | |
| 52 | +PY | |
| 53 | +``` | |
| 54 | + | |
| 55 | +Expected output: | |
| 56 | + | |
| 57 | +```text | |
| 58 | +True | |
| 59 | +``` | |
| 60 | + | |
| 61 | +## Model download | |
| 62 | + | |
| 63 | +Pre-download to a local path so the first service start does not have to pull it online: | |
| 64 | + | |
| 65 | +```bash | |
| 66 | +mkdir -p models/reranker/qwen3-reranker-0.6b-q8_0-gguf | |
| 67 | +curl -L --fail -C - \ | |
| 68 | + -o models/reranker/qwen3-reranker-0.6b-q8_0-gguf/qwen3-reranker-0.6b-q8_0.gguf \ | |
| 69 | + 'https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/resolve/main/qwen3-reranker-0.6b-q8_0.gguf?download=true' | |
| 70 | +``` | |
| 71 | + | |
| 72 | +Measured file size (checked by the sketch below): | |
| 73 | + | |
| 74 | +- `639153184` bytes | |
| 75 | + | |
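As a quick download sanity check, a minimal sketch that only assumes the byte count measured above:

```python
# Download sanity check: the on-disk size should match the measured value above.
from pathlib import Path

gguf = Path("models/reranker/qwen3-reranker-0.6b-q8_0-gguf/qwen3-reranker-0.6b-q8_0.gguf")
size = gguf.stat().st_size
assert size == 639153184, f"unexpected GGUF size: {size}"
```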
| 76 | +## Recommended configuration | |
| 77 | + | |
| 78 | +Suggested settings to keep in `config/config.yaml`: | |
| 79 | + | |
| 80 | +```yaml | |
| 81 | +qwen3_gguf_06b: | |
| 82 | + repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" | |
| 83 | + filename: "qwen3-reranker-0.6b-q8_0.gguf" | |
| 84 | + local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf" | |
| 85 | + cache_dir: "./model_cache" | |
| 86 | + instruction: "Rank products by query with category & style match prioritized" | |
| 87 | + n_ctx: 256 | |
| 88 | + n_batch: 256 | |
| 89 | + n_ubatch: 256 | |
| 90 | + n_gpu_layers: 999 | |
| 91 | + main_gpu: 0 | |
| 92 | + n_threads: 2 | |
| 93 | + n_threads_batch: 4 | |
| 94 | + flash_attn: true | |
| 95 | + offload_kqv: true | |
| 96 | + use_mmap: true | |
| 97 | + use_mlock: false | |
| 98 | + infer_batch_size: 32 | |
| 99 | + sort_by_doc_length: true | |
| 100 | + length_sort_mode: "char" | |
| 101 | + reuse_query_state: false | |
| 102 | + enable_warmup: true | |
| 103 | + verbose: false | |
| 104 | +``` | |
| 105 | + | |
| 106 | +## Tuning results | |
| 107 | + | |
| 108 | +All runs were on the same machine. Titles come from `/home/ubuntu/rerank_test/titles.1.8w`; the query was `白色oversized T-shirt`. | |
| 109 | + | |
| 110 | +80 titles: | |
| 111 | + | |
| 112 | +- `n_ctx=256, reuse_query_state=true` -> `60108 ms` | |
| 113 | +- `n_ctx=256, reuse_query_state=false` -> `53383~56893 ms` | |
| 114 | +- `n_ctx=320, reuse_query_state=true` -> `60961 ms` | |
| 115 | +- `n_ctx=384, reuse_query_state=true` -> `56578 ms` | |
| 116 | +- `n_ctx=384, reuse_query_state=false` -> `57272 ms` | |
| 117 | +- `n_ctx=512, reuse_query_state=false` -> `60542 ms` | |
| 118 | +- `n_ctx=256, reuse_query_state=false, n_threads=4, n_threads_batch=8` -> `61228 ms` | |
| 119 | + | |
| 120 | +400 titles: | |
| 121 | + | |
| 122 | +- `n_ctx=256, n_batch=256, n_ubatch=256, n_gpu_layers=999, reuse_query_state=false` | |
| 123 | + -> `265318 ms` | |
| 124 | + | |
| 125 | +## Lessons learned | |
| 126 | + | |
| 127 | +The most important takeaway from this integration is not "which small parameter is faster", but: | |
| 128 | + | |
| 129 | +1. The 0.6B GGUF weights are small, but the current backend implementation still scores docs sequentially, one by one. | |
| 130 | +2. For an online 400-title request, that serial scoring is itself the main bottleneck (see the quick arithmetic after the next list). | |
| 131 | +3. `reuse_query_state` brought no gain on this model; it was actually slower. | |
| 132 | +4. Raising `n_ctx` to `384/512` brought no real gain either; runs were slower or flat. | |
| 133 | +5. This backend's advantage is low VRAM usage, not low latency. | |
| 134 | + | |
| 135 | +If the goal is the shortest online response time, the suggested priority is: | |
| 136 | + | |
| 137 | +1. `qwen3_vllm` | |
| 138 | +2. other backends with genuine high-throughput batching | |
| 139 | +3. `qwen3_gguf_06b` only as a low-VRAM fallback | |
| 140 | + | |
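To make the serial-scoring bottleneck concrete, quick arithmetic on the measured numbers above (a sketch, nothing more):

```python
# 400 titles at 265318 ms total implies ~663 ms per title when docs are
# scored strictly one after another, so latency grows linearly with doc count.
total_ms, docs = 265318, 400
print(round(total_ms / docs))  # -> 663
```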
| 141 | +## Verification commands | |
| 142 | + | |
| 143 | +Tune against the backend directly, in-process: | |
| 144 | + | |
| 145 | +```bash | |
| 146 | +PYTHONPATH=/data/saas-search ./.venv-reranker-gguf-06b/bin/python \ | |
| 147 | + scripts/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400 | |
| 148 | +``` | |
| 149 | + | |
| 150 | +Start it as a service: | |
| 151 | + | |
| 152 | +```bash | |
| 153 | +RERANK_BACKEND=qwen3_gguf_06b ./scripts/start_reranker.sh | |
| 154 | +``` | ... | ... |
reranker/GGUF_INSTALL_AND_TUNING.md
| ... | ... | @@ -0,0 +1,280 @@ |
| 1 | +# Qwen3 GGUF Installation and Tuning Handbook | |
| 2 | + | |
| 3 | +This document covers only the `qwen3_gguf` backend; the target machine is this project's measured environment: | |
| 4 | + | |
| 5 | +- GPU: `Tesla T4 16GB` | |
| 6 | +- CUDA: `12.8` | |
| 7 | +- Model: `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` | |
| 8 | +- Quantization: `Q8_0` | |
| 9 | + | |
| 10 | +--- | |
| 11 | + | |
| 12 | +## 1. Conclusions first | |
| 13 | + | |
| 14 | +In the current codebase, the GGUF backend's main bottleneck is not "VRAM left unused" but **llama.cpp scoring docs one at a time, in order**. The most effective optimization strategy is therefore: | |
| 15 | + | |
| 16 | +- offload as many model layers to the GPU as possible | |
| 17 | +- enable `flash_attn` / `offload_kqv` | |
| 18 | +- tune `n_ctx / n_batch / n_ubatch` to an efficient point better suited to short-title reranking | |
| 19 | + | |
| 20 | +The recommended configuration from this round, on the current machine: | |
| 21 | + | |
| 22 | +```yaml | |
| 23 | +qwen3_gguf: | |
| 24 | + n_ctx: 512 | |
| 25 | + n_batch: 512 | |
| 26 | + n_ubatch: 512 | |
| 27 | + n_gpu_layers: 999 | |
| 28 | + n_threads: 2 | |
| 29 | + n_threads_batch: 4 | |
| 30 | + flash_attn: true | |
| 31 | + offload_kqv: true | |
| 32 | + infer_batch_size: 8 | |
| 33 | + sort_by_doc_length: true | |
| 34 | + length_sort_mode: "char" | |
| 35 | +``` | |
| 36 | + | |
| 37 | +Notes: | |
| 38 | + | |
| 39 | +- `n_gpu_layers: 999` is, in llama.cpp terms, equivalent to "offload as many layers as possible" (see the sketch below) | |
| 40 | +- on this T4, **even with full offload, the current model only occupies about `4.5 GiB` of GPU memory** | |
| 41 | +- so "allowing 8G of VRAM" does not automatically buy more speed; under the current workload, this model/backend already has every weight that should be on the GPU on the GPU | |
| 42 | + | |
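A minimal llama-cpp-python sketch of what the full-offload setting means, assuming a build where these constructor flags are available (the backend above passes the same ones; the model path is illustrative). Any `n_gpu_layers` at or above the model's layer count, e.g. `999`, offloads everything, and the library also accepts `-1` with the same meaning:

```python
from llama_cpp import Llama

llm = Llama(
    model_path="models/reranker/qwen3-reranker-4b-gguf/Qwen.Qwen3-Reranker-4B.Q8_0.gguf",
    n_ctx=512,
    n_batch=512,
    n_ubatch=512,
    n_gpu_layers=999,  # >= the layer count: every layer lands on the GPU
    flash_attn=True,
    offload_kqv=True,  # keep the KV cache on the GPU as well
)
```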
| 43 | +--- | |
| 44 | + | |
| 45 | +## 2. Dedicated environments | |
| 46 | + | |
| 47 | +`qwen3_gguf` must use its own dedicated venv: | |
| 48 | + | |
| 49 | +- `qwen3_vllm` -> `.venv-reranker` | |
| 50 | +- `qwen3_gguf` -> `.venv-reranker-gguf` | |
| 51 | + | |
| 52 | +Install command: | |
| 53 | + | |
| 54 | +```bash | |
| 55 | +./scripts/setup_reranker_venv.sh qwen3_gguf | |
| 56 | +``` | |
| 57 | + | |
| 58 | +The script now does two things automatically: | |
| 59 | + | |
| 60 | +1. install the Python dependencies the GGUF backend needs | |
| 61 | +2. when `/usr/local/cuda/bin/nvcc` is detected, **rebuild `llama-cpp-python` as a CUDA build** | |
| 62 | + | |
| 63 | +--- | |
| 64 | + | |
| 65 | +## 3. Verifying the GPU build | |
| 66 | + | |
| 67 | +You must verify the build is not CPU-only: | |
| 68 | + | |
| 69 | +```bash | |
| 70 | +./.venv-reranker-gguf/bin/python - <<'PY' | |
| 71 | +import llama_cpp | |
| 72 | +print("supports_gpu_offload =", llama_cpp.llama_supports_gpu_offload()) | |
| 73 | +PY | |
| 74 | +``` | |
| 75 | + | |
| 76 | +The correct result is: | |
| 77 | + | |
| 78 | +```text | |
| 79 | +supports_gpu_offload = True | |
| 80 | +``` | |
| 81 | + | |
| 82 | +You can also inspect the shared libraries: | |
| 83 | + | |
| 84 | +```bash | |
| 85 | +ldd .venv-reranker-gguf/lib/python3.12/site-packages/llama_cpp/lib/libllama.so | rg 'cuda|cublas|ggml-cuda' | |
| 86 | +``` | |
| 87 | + | |
| 88 | +You should see: | |
| 89 | + | |
| 90 | +- `libggml-cuda.so` | |
| 91 | +- `libcudart.so` | |
| 92 | +- `libcublas.so` | |
| 93 | + | |
| 94 | +--- | |
| 95 | + | |
| 96 | +## 4. Model download | |
| 97 | + | |
| 98 | +A local-file-first strategy is in use; the model lives at: | |
| 99 | + | |
| 100 | +```text | |
| 101 | +models/reranker/qwen3-reranker-4b-gguf/Qwen.Qwen3-Reranker-4B.Q8_0.gguf | |
| 102 | +``` | |
| 103 | + | |
| 104 | +If the local file exists, the backend loads the local GGUF directly and no longer depends on an online download at startup. | |
| 105 | + | |
| 106 | +To avoid the `416 Range Not Satisfiable` issue seen with Hugging Face Xet downloads on this machine, `start_reranker.sh` now sets the following by default for `qwen3_gguf`: | |
| 107 | + | |
| 108 | +```bash | |
| 109 | +HF_HUB_DISABLE_XET=1 | |
| 110 | +``` | |
| 111 | + | |
| 112 | +--- | |
| 113 | + | |
| 114 | +## 5. Local tuning script | |
| 115 | + | |
| 116 | +A new local benchmark script: | |
| 117 | + | |
| 118 | +```bash | |
| 119 | +PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \ | |
| 120 | + scripts/benchmark_reranker_gguf_local.py --docs 64 --repeat 1 | |
| 121 | +``` | |
| 122 | + | |
| 123 | +It instantiates the GGUF backend directly and reports: | |
| 124 | + | |
| 125 | +- model load time | |
| 126 | +- GPU memory used by the current process | |
| 127 | +- single-shot rerank latency | |
| 128 | + | |
| 129 | +--- | |
| 130 | + | |
| 131 | +## 6. Measured results this round | |
| 132 | + | |
| 133 | +Test conditions: | |
| 134 | + | |
| 135 | +- Query: `白色oversized T-shirt` | |
| 136 | +- Docs: `64` product titles | |
| 137 | +- Local script: `scripts/benchmark_reranker_gguf_local.py` | |
| 138 | +- 1 run per configuration; the focus is relative trends | |
| 139 | + | |
| 140 | +Results: | |
| 141 | + | |
| 142 | +### 6.1 Conservative configuration | |
| 143 | + | |
| 144 | +```text | |
| 145 | +n_ctx=384 | |
| 146 | +n_batch=384 | |
| 147 | +n_ubatch=128 | |
| 148 | +n_gpu_layers=24 | |
| 149 | +``` | |
| 150 | + | |
| 151 | +- GPU memory: `2984 MiB` | |
| 152 | +- 64-doc latency: `74347.91 ms` | |
| 153 | + | |
| 154 | +### 6.2 Full offload | |
| 155 | + | |
| 156 | +```text | |
| 157 | +n_ctx=384 | |
| 158 | +n_batch=384 | |
| 159 | +n_ubatch=128 | |
| 160 | +n_gpu_layers=999 | |
| 161 | +``` | |
| 162 | + | |
| 163 | +- GPU memory: `4338 MiB` | |
| 164 | +- 64-doc latency: `51401.77 ms` | |
| 165 | + | |
| 166 | +### 6.3 Best configuration | |
| 167 | + | |
| 168 | +```text | |
| 169 | +n_ctx=512 | |
| 170 | +n_batch=512 | |
| 171 | +n_ubatch=512 | |
| 172 | +n_gpu_layers=999 | |
| 173 | +``` | |
| 174 | + | |
| 175 | +- GPU memory: `4564 MiB` | |
| 176 | +- 64-doc latency: `49116.10 ms` | |
| 177 | + | |
| 178 | +### 6.4 Other attempts | |
| 179 | + | |
| 180 | +`n_threads=4 / n_threads_batch=8`: | |
| 181 | + | |
| 182 | +- GPU memory: `4564 MiB` | |
| 183 | +- 64-doc latency: `49895.88 ms` | |
| 184 | +- slightly slower than the recommended values | |
| 185 | + | |
| 186 | +`infer_batch_size=64`: | |
| 187 | + | |
| 188 | +- GPU memory: `4564 MiB` | |
| 189 | +- 64-doc latency: `50723.36 ms` | |
| 190 | +- also slightly slower | |
| 191 | + | |
| 192 | +### 6.5 API-level verification | |
| 193 | + | |
| 194 | +After writing the recommended configuration into `config/config.yaml` and restarting the service, run: | |
| 195 | + | |
| 196 | +```bash | |
| 197 | +RERANK_BASE=http://127.0.0.1:6007 \ | |
| 198 | + ./.venv/bin/python scripts/benchmark_reranker_random_titles.py 64 --repeat 1 --query '白色oversized T-shirt' | |
| 199 | +``` | |
| 200 | + | |
| 201 | +This yields: | |
| 202 | + | |
| 203 | +- `64 docs`: `50177.22 ms` | |
| 204 | + | |
| 205 | +Then run: | |
| 206 | + | |
| 207 | +```bash | |
| 208 | +RERANK_BASE=http://127.0.0.1:6007 \ | |
| 209 | + ./.venv/bin/python scripts/benchmark_reranker_random_titles.py 153 --repeat 1 --query '白色oversized T-shirt' | |
| 210 | +``` | |
| 211 | + | |
| 212 | +This yields: | |
| 213 | + | |
| 214 | +- `153 docs`: `115328.60 ms` | |
| 215 | + | |
| 216 | +Compared with the conservative configuration in older logs: | |
| 217 | + | |
| 218 | +- old configuration, `153 docs`: `153435.37 ms` | |
| 219 | +- new configuration, `153 docs`: `115328.60 ms` | |
| 220 | + | |
| 221 | +An improvement of roughly: | |
| 222 | + | |
| 223 | +- `24.8%` ((153435.37 - 115328.60) / 153435.37 ≈ 0.248) | |
| 224 | + | |
| 225 | +--- | |
| 226 | + | |
| 227 | +## 7. Why it does not fill 8G | |
| 228 | + | |
| 229 | +This conclusion matters: | |
| 230 | + | |
| 231 | +- the current best configuration already offloads "as many layers as possible" | |
| 232 | +- in this llama.cpp / T4 / short-text reranking setup, the `Q8_0` model **measures at only about `4.5 GiB` of GPU memory** | |
| 233 | +- enlarging `n_ctx` further just to "fill 8G" will not noticeably raise throughput, and may add overhead | |
| 234 | + | |
| 235 | +So this round was not about "VRAM settings being too conservative"; rather: | |
| 236 | + | |
| 237 | +- the offloadable weights are essentially all offloaded already | |
| 238 | +- what actually slows responses is the backend's **per-doc sequential inference** path | |
| 239 | + | |
| 240 | +--- | |
| 241 | + | |
| 242 | +## 8. Production recommendations | |
| 243 | + | |
| 244 | +### 8.1 Current recommendation | |
| 245 | + | |
| 246 | +Keep the following parameters: | |
| 247 | + | |
| 248 | +```yaml | |
| 249 | +n_ctx: 512 | |
| 250 | +n_batch: 512 | |
| 251 | +n_ubatch: 512 | |
| 252 | +n_gpu_layers: 999 | |
| 253 | +n_threads: 2 | |
| 254 | +n_threads_batch: 4 | |
| 255 | +flash_attn: true | |
| 256 | +offload_kqv: true | |
| 257 | +``` | |
| 258 | + | |
| 259 | +### 8.2 If it is still too slow | |
| 260 | + | |
| 261 | +Suggested priority: | |
| 262 | + | |
| 263 | +1. shrink `rerank_window` | |
| 264 | +2. reduce the number of docs passed in | |
| 265 | +3. if the business allows it, switch to a backend better suited to high throughput | |
| 266 | + | |
| 267 | +Rationale: | |
| 268 | + | |
| 269 | +- the current GGUF backend is a single local process scoring docs one by one | |
| 270 | +- for long-list reranking it is inherently weaker at throughput than vLLM / cloud rerank APIs | |
| 271 | + | |
| 272 | +--- | |
| 273 | + | |
| 274 | +## 9. Files landed this round | |
| 275 | + | |
| 276 | +- `config/config.yaml` | |
| 277 | +- `scripts/setup_reranker_venv.sh` | |
| 278 | +- `scripts/start_reranker.sh` | |
| 279 | +- `scripts/benchmark_reranker_gguf_local.py` | |
| 280 | +- `reranker/GGUF_INSTALL_AND_TUNING.md` | ... | ... |
reranker/README.md
| 1 | 1 | # Reranker 模块 |
| 2 | 2 | |
| 3 | -**Request examples**: see `docs/QUICKSTART.md` §3.5. Extension spec: see `docs/DEVELOPER_GUIDE.md` §7. Deployment and tuning practice: see `reranker/DEPLOYMENT_AND_TUNING.md`. | |
| 3 | +**Request examples**: see `docs/QUICKSTART.md` §3.5. Extension spec: see `docs/DEVELOPER_GUIDE.md` §7. Deployment and tuning practice: see `reranker/DEPLOYMENT_AND_TUNING.md`. Dedicated integration and tuning conclusions for `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF`: see `reranker/GGUF_0_6B_INSTALL_AND_TUNING.md`. | |
| 4 | 4 | |
| 5 | 5 | --- |
| 6 | 6 | |
| 7 | 7 | The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, Qwen3-GGUF, DashScope cloud rerank). Callers access it over HTTP and do not care which backend serves the request. |
| 8 | 8 | |
| 9 | 9 | **Features** |
| 10 | -- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `bge` (kept for compatibility) | |
| 10 | +- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `qwen3_gguf_06b` (Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp), `bge` (kept for compatibility) | |
| 11 | 11 | - Cloud backend: `dashscope_rerank` (calls DashScope `/compatible-api/v1/reranks`; the endpoint is switchable by region) |
| 12 | 12 | - Unified configuration: `config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<name>` |
| 13 | 13 | - Doc deduplication, scores aligned with input order, FP16/GPU support (backend-dependent) |
| ... | ... | @@ -19,7 +19,7 @@ The reranker service exposes a unified /rerank API with pluggable backends (BGE, Qwe |
| 19 | 19 | - `backends/bge.py`: BGE backend |
| 20 | 20 | - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM backend |
| 21 | 21 | - `backends/qwen3_transformers.py`: Qwen3-Reranker-0.6B pure-Transformers backend (official Usage style) |
| 22 | - - `backends/qwen3_gguf.py`: Qwen3-Reranker-4B GGUF + llama.cpp backend | |
| 22 | + - `backends/qwen3_gguf.py`: Qwen3-Reranker GGUF + llama.cpp backend (supports `qwen3_gguf` / `qwen3_gguf_06b`) | |
| 23 | 23 | - `backends/dashscope_rerank.py`: DashScope cloud rerank backend (HTTP) |
| 24 | 24 | - `reranker/bge_reranker.py`: core BGE inference (wrapped by the bge backend) |
| 25 | 25 | - `reranker/config.py`: service port, MAX_DOCS, NORMALIZE, etc. (backend parameters live in config.yaml) |
| ... | ... | @@ -32,11 +32,12 @@ The reranker service exposes a unified /rerank API with pluggable backends (BGE, Qwe |
| 32 | 32 | - Each backend now uses its own venv: |
| 33 | 33 | - `qwen3_vllm` -> `.venv-reranker` |
| 34 | 34 | - `qwen3_gguf` -> `.venv-reranker-gguf` |
| 35 | + - `qwen3_gguf_06b` -> `.venv-reranker-gguf-06b` | |
| 35 | 36 | - `qwen3_transformers` -> `.venv-reranker-transformers` |
| 36 | 37 | - `bge` -> `.venv-reranker-bge` |
| 37 | 38 | - `dashscope_rerank` -> `.venv-reranker-dashscope` |
| 38 | 39 | ```bash |
| 39 | - ./scripts/setup_reranker_venv.sh qwen3_gguf | |
| 40 | + ./scripts/setup_reranker_venv.sh qwen3_gguf_06b | |
| 40 | 41 | ``` |
| 41 | 42 | CUDA build advice: |
| 42 | 43 | ```bash |
| ... | ... | @@ -48,7 +49,7 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe |
| 48 | 49 | ``` |
| 49 | 50 | |
| 50 | 51 | ## Configuration |
| 51 | -- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` environment variable. | |
| 52 | +- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` environment variable. | |
| 52 | 53 | - **Backend parameters**: `services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`, for example: |
| 53 | 54 | |
| 54 | 55 | ```yaml |
| ... | ... | @@ -96,6 +97,20 @@ services: |
| 96 | 97 | infer_batch_size: 8 |
| 97 | 98 | sort_by_doc_length: true |
| 98 | 99 | length_sort_mode: "char" |
| 100 | + qwen3_gguf_06b: | |
| 101 | + repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" | |
| 102 | + filename: "qwen3-reranker-0.6b-q8_0.gguf" | |
| 103 | + local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf" | |
| 104 | + cache_dir: "./model_cache" | |
| 105 | + instruction: "Rank products by query with category & style match prioritized" | |
| 106 | + n_ctx: 256 | |
| 107 | + n_batch: 256 | |
| 108 | + n_ubatch: 256 | |
| 109 | + n_gpu_layers: 999 | |
| 110 | + infer_batch_size: 32 | |
| 111 | + sort_by_doc_length: true | |
| 112 | + length_sort_mode: "char" | |
| 113 | + reuse_query_state: false | |
| 99 | 114 | dashscope_rerank: |
| 100 | 115 | model_name: "qwen3-rerank" |
| 101 | 116 | endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks" |
| ... | ... | @@ -153,7 +168,7 @@ Content-Type: application/json |
| 153 | 168 | ``` |
| 154 | 169 | |
| 155 | 170 | `top_n` is an optional field: |
| 156 | -- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `bge`) usually ignore it and still return scores for all docs. | |
| 171 | +- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`) usually ignore it and still return scores for all docs. | |
| 157 | 172 | - For `dashscope_rerank` it caps how many candidates the cloud returns; set it to `page+size` (e.g. pass `30` for pagination `from=20,size=10`). |
| 158 | 173 | |
| 159 | 174 | Response: |
| ... | ... | @@ -192,3 +207,4 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info |
| 192 | 207 | - **Qwen3-vLLM**: see [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B). Needs a GPU and plenty of VRAM; compared with BGE it suits long-text, high-throughput scenarios (vLLM prefix caching). |
| 193 | 208 | - **Qwen3-Transformers**: the official Transformers Usage approach, no vLLM needed; suits CPU or small VRAM. Defaults to `attn_implementation: "sdpa"`; if `flash_attn` is installed you can set `flash_attention_2` (the service falls back to sdpa automatically when it is missing). |
| 194 | 209 | - **Qwen3-GGUF**: see [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF). On a single T4 with only about `4.8~6GB` VRAM free, start from `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true`; on startup OOM, lower `n_gpu_layers` to `20` first, then `n_ctx` to `320`. In the GGUF backend `infer_batch_size` is a service-side work chunk and usually matters less than `n_gpu_layers` / `n_ctx`. |
| 210 | +- **Qwen3-GGUF-0.6B**: see [ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF](https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF). Its strengths are small weights and low VRAM use, measured at about `0.9~1.1 GiB` per process; but with the current serial llama.cpp scoring wiring, measured latency for `1 query + 400 titles` is still about `265s`. It therefore suits a low-VRAM functional fallback, not an online low-latency main reranker (see the request sketch below). | ... | ... |
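A minimal end-to-end check against whichever backend is active; the port `6007` and the request/response shape follow this README, while the doc strings and the use of `requests` are illustrative:

```python
# POST /rerank and read back which backend served the request.
import requests

resp = requests.post(
    "http://127.0.0.1:6007/rerank",
    json={"query": "白色oversized T-shirt",
          "docs": ["oversized white tee", "black denim jacket"],
          "normalize": True},
    timeout=600,  # GGUF backends can take minutes on long doc lists
)
payload = resp.json()
print(payload["meta"]["backend"], payload["scores"])
```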
reranker/backends/__init__.py
| ... | ... | @@ -48,12 +48,19 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc |
| 48 | 48 | return Qwen3TransformersRerankerBackend(config) |
| 49 | 49 | if name == "qwen3_gguf": |
| 50 | 50 | from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend |
| 51 | - return Qwen3GGUFRerankerBackend(config) | |
| 51 | + gguf_config = dict(config or {}) | |
| 52 | + gguf_config.setdefault("_backend_name", "qwen3_gguf") | |
| 53 | + return Qwen3GGUFRerankerBackend(gguf_config) | |
| 54 | + if name == "qwen3_gguf_06b": | |
| 55 | + from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend | |
| 56 | + gguf_config = dict(config or {}) | |
| 57 | + gguf_config.setdefault("_backend_name", "qwen3_gguf_06b") | |
| 58 | + return Qwen3GGUFRerankerBackend(gguf_config) | |
| 52 | 59 | if name == "dashscope_rerank": |
| 53 | 60 | from reranker.backends.dashscope_rerank import DashScopeRerankBackend |
| 54 | 61 | return DashScopeRerankBackend(config) |
| 55 | 62 | raise ValueError( |
| 56 | - f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, dashscope_rerank" | |
| 63 | + f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank" | |
| 57 | 64 | ) |
| 58 | 65 | |
| 59 | 66 | ... | ... |
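Both GGUF names dispatch to the same class; the injected `_backend_name` only switches the default `repo_id` / `filename` / `local_dir`. A minimal sketch of the intended call, assuming it runs inside the matching venv with the weights available (the constructor loads the model):

```python
from reranker.backends import get_rerank_backend

# Same class as qwen3_gguf, but the 0.6B defaults are picked via _backend_name.
backend = get_rerank_backend("qwen3_gguf_06b", {"enable_warmup": False})
scores, meta = backend.score_with_meta("white tee", ["oversized white tee"], normalize=True)
print(meta["backend"])  # -> "qwen3_gguf_06b"
```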
reranker/backends/qwen3_gguf.py
| 1 | 1 | """ |
| 2 | -Qwen3-Reranker-4B GGUF backend using llama-cpp-python. | |
| 2 | +Qwen3-Reranker GGUF backend using llama-cpp-python. | |
| 3 | 3 | |
| 4 | 4 | Reference: |
| 5 | 5 | - https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF |
| 6 | 6 | - https://huggingface.co/Qwen/Qwen3-Reranker-4B |
| 7 | +- https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF | |
| 8 | +- https://huggingface.co/Qwen/Qwen3-Reranker-0.6B | |
| 7 | 9 | """ |
| 8 | 10 | |
| 9 | 11 | from __future__ import annotations |
| ... | ... | @@ -13,12 +15,27 @@ import math |
| 13 | 15 | import os |
| 14 | 16 | import threading |
| 15 | 17 | import time |
| 18 | +from pathlib import Path | |
| 16 | 19 | from typing import Any, Dict, List, Tuple |
| 17 | 20 | |
| 18 | 21 | |
| 19 | 22 | logger = logging.getLogger("reranker.backends.qwen3_gguf") |
| 20 | 23 | |
| 21 | 24 | |
| 25 | +_BACKEND_DEFAULTS: Dict[str, Dict[str, str]] = { | |
| 26 | + "qwen3_gguf": { | |
| 27 | + "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF", | |
| 28 | + "filename": "*Q8_0.gguf", | |
| 29 | + "local_dir": "./models/reranker/qwen3-reranker-4b-gguf", | |
| 30 | + }, | |
| 31 | + "qwen3_gguf_06b": { | |
| 32 | + "repo_id": "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF", | |
| 33 | + "filename": "qwen3-reranker-0.6b-q8_0.gguf", | |
| 34 | + "local_dir": "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf", | |
| 35 | + }, | |
| 36 | +} | |
| 37 | + | |
| 38 | + | |
| 22 | 39 | def deduplicate_with_positions(texts: List[str]) -> Tuple[List[str], List[int]]: |
| 23 | 40 | """Deduplicate texts globally while preserving first-seen order.""" |
| 24 | 41 | unique_texts: List[str] = [] |
| ... | ... | @@ -46,21 +63,21 @@ def _format_instruction(instruction: str, query: str, doc: str) -> str: |
| 46 | 63 | |
| 47 | 64 | class Qwen3GGUFRerankerBackend: |
| 48 | 65 | """ |
| 49 | - Qwen3-Reranker-4B GGUF backend using llama.cpp through llama-cpp-python. | |
| 66 | + Qwen3-Reranker GGUF backend using llama.cpp through llama-cpp-python. | |
| 50 | 67 | |
| 51 | - Tuned for short-query / short-doc reranking on a memory-constrained single T4. | |
| 52 | - Config from services.rerank.backends.qwen3_gguf. | |
| 68 | + Tuned for short-query / short-doc reranking on a single GPU. | |
| 69 | + Config from services.rerank.backends.<backend_name>. | |
| 53 | 70 | """ |
| 54 | 71 | |
| 55 | 72 | def __init__(self, config: Dict[str, Any]) -> None: |
| 56 | 73 | self._config = config or {} |
| 57 | - self._repo_id = str( | |
| 58 | - self._config.get("repo_id") or "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" | |
| 59 | - ).strip() | |
| 60 | - self._filename = str(self._config.get("filename") or "*Q8_0.gguf").strip() | |
| 74 | + self._backend_name = str(self._config.get("_backend_name") or "qwen3_gguf").strip() | |
| 75 | + defaults = _BACKEND_DEFAULTS.get(self._backend_name, _BACKEND_DEFAULTS["qwen3_gguf"]) | |
| 76 | + self._repo_id = str(self._config.get("repo_id") or defaults["repo_id"]).strip() | |
| 77 | + self._filename = str(self._config.get("filename") or defaults["filename"]).strip() | |
| 61 | 78 | self._model_path = str(self._config.get("model_path") or "").strip() |
| 62 | 79 | self._cache_dir = str(self._config.get("cache_dir") or "").strip() or None |
| 63 | - self._local_dir = str(self._config.get("local_dir") or "").strip() or None | |
| 80 | + self._local_dir = str(self._config.get("local_dir") or defaults["local_dir"]).strip() or None | |
| 64 | 81 | self._instruction = str( |
| 65 | 82 | self._config.get("instruction") |
| 66 | 83 | or "Rank products by query with category & style match prioritized" |
| ... | ... | @@ -79,6 +96,7 @@ class Qwen3GGUFRerankerBackend: |
| 79 | 96 | "on", |
| 80 | 97 | } |
| 81 | 98 | self._length_sort_mode = str(self._config.get("length_sort_mode") or "char").strip().lower() |
| 99 | + self._reuse_query_state = bool(self._config.get("reuse_query_state", False)) | |
| 82 | 100 | |
| 83 | 101 | n_ctx = int(self._config.get("n_ctx", self._config.get("max_model_len", 384))) |
| 84 | 102 | n_batch = int(self._config.get("n_batch", min(n_ctx, 384))) |
| ... | ... | @@ -105,8 +123,9 @@ class Qwen3GGUFRerankerBackend: |
| 105 | 123 | from llama_cpp import Llama |
| 106 | 124 | except Exception as exc: # pragma: no cover - depends on optional dependency |
| 107 | 125 | raise RuntimeError( |
| 108 | - "qwen3_gguf backend requires llama-cpp-python. " | |
| 109 | - "Install the qwen3_gguf backend venv first via scripts/setup_reranker_venv.sh qwen3_gguf." | |
| 126 | + f"{self._backend_name} backend requires llama-cpp-python. " | |
| 127 | + f"Install the {self._backend_name} backend venv first via " | |
| 128 | + f"scripts/setup_reranker_venv.sh {self._backend_name}." | |
| 110 | 129 | ) from exc |
| 111 | 130 | |
| 112 | 131 | self._llama_class = Llama |
| ... | ... | @@ -118,7 +137,8 @@ class Qwen3GGUFRerankerBackend: |
| 118 | 137 | self._infer_lock = threading.Lock() |
| 119 | 138 | |
| 120 | 139 | logger.info( |
| 121 | - "[Qwen3_GGUF] Loading model repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s", | |
| 140 | + "[Qwen3_GGUF] Loading backend=%s repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s reuse_query_state=%s", | |
| 141 | + self._backend_name, | |
| 122 | 142 | self._repo_id, |
| 123 | 143 | self._filename, |
| 124 | 144 | self._model_path or None, |
| ... | ... | @@ -128,6 +148,7 @@ class Qwen3GGUFRerankerBackend: |
| 128 | 148 | n_gpu_layers, |
| 129 | 149 | flash_attn, |
| 130 | 150 | offload_kqv, |
| 151 | + self._reuse_query_state, | |
| 131 | 152 | ) |
| 132 | 153 | |
| 133 | 154 | llm_kwargs = { |
| ... | ... | @@ -158,6 +179,7 @@ class Qwen3GGUFRerankerBackend: |
| 158 | 179 | self._suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n" |
| 159 | 180 | self._prefix_tokens = self._tokenize(self._prefix, special=True) |
| 160 | 181 | self._suffix_tokens = self._tokenize(self._suffix, special=True) |
| 182 | + self._request_prefix_template = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: " | |
| 161 | 183 | self._effective_max_len = self._n_ctx - len(self._prefix_tokens) - len(self._suffix_tokens) |
| 162 | 184 | if self._effective_max_len <= 16: |
| 163 | 185 | raise RuntimeError( |
| ... | ... | @@ -171,7 +193,8 @@ class Qwen3GGUFRerankerBackend: |
| 171 | 193 | self._warmup() |
| 172 | 194 | |
| 173 | 195 | logger.info( |
| 174 | - "[Qwen3_GGUF] Model ready | model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s", | |
| 196 | + "[Qwen3_GGUF] Model ready | backend=%s model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s", | |
| 197 | + self._backend_name, | |
| 175 | 198 | self._model_name, |
| 176 | 199 | self._effective_max_len, |
| 177 | 200 | self._infer_batch_size, |
| ... | ... | @@ -181,6 +204,14 @@ class Qwen3GGUFRerankerBackend: |
| 181 | 204 | def _load_model(self, llm_kwargs: Dict[str, Any]): |
| 182 | 205 | if self._model_path: |
| 183 | 206 | return self._llama_class(model_path=self._model_path, **llm_kwargs) |
| 207 | + if self._local_dir: | |
| 208 | + matches = sorted( | |
| 209 | + path for path in Path(self._local_dir).glob(self._filename) if path.is_file() | |
| 210 | + ) | |
| 211 | + if matches: | |
| 212 | + local_model_path = str(matches[0].resolve()) | |
| 213 | + logger.info("[Qwen3_GGUF] Using local GGUF file: %s", local_model_path) | |
| 214 | + return self._llama_class(model_path=local_model_path, **llm_kwargs) | |
| 184 | 215 | return self._llama_class.from_pretrained( |
| 185 | 216 | repo_id=self._repo_id, |
| 186 | 217 | filename=self._filename, |
| ... | ... | @@ -212,6 +243,13 @@ class Qwen3GGUFRerankerBackend: |
| 212 | 243 | except Exception as exc: # pragma: no cover - defensive |
| 213 | 244 | logger.warning("[Qwen3_GGUF] Warmup failed: %s", exc) |
| 214 | 245 | |
| 246 | + def _build_request_prefix_tokens(self, query: str) -> List[int]: | |
| 247 | + request_prefix = self._request_prefix_template.format( | |
| 248 | + instruction=self._instruction, | |
| 249 | + query=query, | |
| 250 | + ) | |
| 251 | + return self._tokenize(request_prefix, special=False) | |
| 252 | + | |
| 215 | 253 | def _build_prompt_tokens(self, query: str, doc: str) -> List[int]: |
| 216 | 254 | pair = _format_instruction(self._instruction, query, doc) |
| 217 | 255 | pair_tokens = self._tokenize(pair, special=False) |
| ... | ... | @@ -235,6 +273,36 @@ class Qwen3GGUFRerankerBackend: |
| 235 | 273 | false_exp = math.exp(false_logit - max_logit) |
| 236 | 274 | return float(true_exp / (true_exp + false_exp)) |
| 237 | 275 | |
| 276 | + def _supports_query_state_reuse(self) -> bool: | |
| 277 | + return ( | |
| 278 | + self._reuse_query_state | |
| 279 | + and hasattr(self._llm, "save_state") | |
| 280 | + and hasattr(self._llm, "load_state") | |
| 281 | + ) | |
| 282 | + | |
| 283 | + def _build_query_state_locked(self, query: str): | |
| 284 | + request_prefix_tokens = self._build_request_prefix_tokens(query) | |
| 285 | + max_doc_tokens = self._effective_max_len - len(request_prefix_tokens) | |
| 286 | + if max_doc_tokens <= 0: | |
| 287 | + return None, 0 | |
| 288 | + self._llm.reset() | |
| 289 | + self._llm.eval(self._prefix_tokens + request_prefix_tokens) | |
| 290 | + return self._llm.save_state(), max_doc_tokens | |
| 291 | + | |
| 292 | + def _score_doc_with_state_locked(self, state, doc_tokens: List[int], max_doc_tokens: int) -> float: | |
| 293 | + self._llm.load_state(state) | |
| 294 | + self._llm.eval(doc_tokens[:max_doc_tokens] + self._suffix_tokens) | |
| 295 | + logits = self._llm.eval_logits | |
| 296 | + if not logits: | |
| 297 | + raise RuntimeError("llama.cpp returned empty logits") | |
| 298 | + final_logits = list(logits[-1]) | |
| 299 | + true_logit = float(final_logits[self._true_token]) | |
| 300 | + false_logit = float(final_logits[self._false_token]) | |
| 301 | + max_logit = max(true_logit, false_logit) | |
| 302 | + true_exp = math.exp(true_logit - max_logit) | |
| 303 | + false_exp = math.exp(false_logit - max_logit) | |
| 304 | + return float(true_exp / (true_exp + false_exp)) | |
| 305 | + | |
| 238 | 306 | def _estimate_doc_lengths(self, docs: List[str]) -> List[int]: |
| 239 | 307 | if self._length_sort_mode == "token": |
| 240 | 308 | return [len(self._tokenize(text, special=False)) for text in docs] |
| ... | ... | @@ -269,7 +337,7 @@ class Qwen3GGUFRerankerBackend: |
| 269 | 337 | "dedup_ratio": 0.0, |
| 270 | 338 | "elapsed_ms": round(elapsed_ms, 3), |
| 271 | 339 | "model": self._model_name, |
| 272 | - "backend": "qwen3_gguf", | |
| 340 | + "backend": self._backend_name, | |
| 273 | 341 | "normalize": normalize, |
| 274 | 342 | "infer_batch_size": self._infer_batch_size, |
| 275 | 343 | "inference_batches": 0, |
| ... | ... | @@ -289,14 +357,26 @@ class Qwen3GGUFRerankerBackend: |
| 289 | 357 | order = sorted(order, key=lambda i: lengths[i]) |
| 290 | 358 | |
| 291 | 359 | unique_scores: List[float] = [0.0] * len(unique_texts) |
| 360 | + unique_doc_tokens = [self._tokenize(text, special=False) for text in unique_texts] | |
| 292 | 361 | inference_batches = 0 |
| 293 | - for start in range(0, len(order), self._infer_batch_size): | |
| 294 | - batch_indices = order[start : start + self._infer_batch_size] | |
| 295 | - inference_batches += 1 | |
| 296 | - for idx in batch_indices: | |
| 297 | - prompt = self._build_prompt_tokens(query, unique_texts[idx]) | |
| 298 | - with self._infer_lock: | |
| 299 | - unique_scores[idx] = self._score_prompt(prompt) | |
| 362 | + with self._infer_lock: | |
| 363 | + query_state = None | |
| 364 | + max_doc_tokens = self._effective_max_len | |
| 365 | + if self._supports_query_state_reuse(): | |
| 366 | + query_state, max_doc_tokens = self._build_query_state_locked(query) | |
| 367 | + for start in range(0, len(order), self._infer_batch_size): | |
| 368 | + batch_indices = order[start : start + self._infer_batch_size] | |
| 369 | + inference_batches += 1 | |
| 370 | + for idx in batch_indices: | |
| 371 | + if query_state is not None: | |
| 372 | + unique_scores[idx] = self._score_doc_with_state_locked( | |
| 373 | + query_state, | |
| 374 | + unique_doc_tokens[idx], | |
| 375 | + max_doc_tokens, | |
| 376 | + ) | |
| 377 | + else: | |
| 378 | + prompt = self._build_prompt_tokens(query, unique_texts[idx]) | |
| 379 | + unique_scores[idx] = self._score_prompt(prompt) | |
| 300 | 380 | |
| 301 | 381 | for (orig_idx, _), unique_idx in zip(indexed, position_to_unique): |
| 302 | 382 | output_scores[orig_idx] = float(unique_scores[unique_idx]) |
| ... | ... | @@ -313,7 +393,7 @@ class Qwen3GGUFRerankerBackend: |
| 313 | 393 | "dedup_ratio": round(dedup_ratio, 4), |
| 314 | 394 | "elapsed_ms": round(elapsed_ms, 3), |
| 315 | 395 | "model": self._model_name, |
| 316 | - "backend": "qwen3_gguf", | |
| 396 | + "backend": self._backend_name, | |
| 317 | 397 | "normalize": normalize, |
| 318 | 398 | "infer_batch_size": self._infer_batch_size, |
| 319 | 399 | "inference_batches": inference_batches, |
| ... | ... | @@ -323,5 +403,6 @@ class Qwen3GGUFRerankerBackend: |
| 323 | 403 | "n_batch": self._n_batch, |
| 324 | 404 | "n_ubatch": self._n_ubatch, |
| 325 | 405 | "n_gpu_layers": self._n_gpu_layers, |
| 406 | + "reuse_query_state": query_state is not None, | |
| 326 | 407 | } |
| 327 | 408 | return output_scores, meta | ... | ... |
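The `reuse_query_state` path above evaluates the shared `system prefix + instruction + query` tokens once, snapshots the llama.cpp state, and restores it before each document so that only the doc and suffix tokens are re-evaluated. A minimal standalone sketch of that save/load pattern (model path and prompt text are illustrative; the real backend adds the Qwen3 chat template, dedup, length sorting, and a lock):

```python
from llama_cpp import Llama

llm = Llama(model_path="qwen3-reranker-0.6b-q8_0.gguf", n_ctx=256, verbose=False)

def tok(text: str, special: bool) -> list[int]:
    return llm.tokenize(text.encode("utf-8"), add_bos=False, special=special)

# Evaluate the shared query prefix once and snapshot the KV cache.
llm.reset()
llm.eval(tok("<Instruct>: rank products\n<Query>: white tee\n<Document>: ", special=True))
state = llm.save_state()

for doc in ["oversized white tee", "black denim jacket"]:
    llm.load_state(state)               # rewind to the end of the shared prefix
    llm.eval(tok(doc, special=False))   # pay only for the doc tokens
    final_logits = llm.eval_logits[-1]  # yes/no logits live at the last position
```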
reranker/server.py
| ... | ... | @@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional |
| 7 | 7 | Response: { "scores": [float], "meta": {...} } |
| 8 | 8 | |
| 9 | 9 | Backend selected via config: services.rerank.backend |
| 10 | -(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank), env RERANK_BACKEND. | |
| 10 | +(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND. | |
| 11 | 11 | """ |
| 12 | 12 | |
| 13 | 13 | import logging | ... | ... |
scripts/benchmark_reranker_gguf_local.py
| ... | ... | @@ -0,0 +1,198 @@ |
| 1 | +#!/usr/bin/env python3 | |
| 2 | +""" | |
| 3 | +Local tuning probe for GGUF reranker backends. | |
| 4 | + | |
| 5 | +Instantiates the backend directly, one config at a time, to measure: | |
| 6 | +- load time | |
| 7 | +- GPU memory used by this process | |
| 8 | +- single-request rerank latency | |
| 9 | + | |
| 10 | +Example: | |
| 11 | + ./.venv-reranker-gguf/bin/python scripts/benchmark_reranker_gguf_local.py | |
| 12 | + ./.venv-reranker-gguf-06b/bin/python scripts/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400 | |
| 13 | +""" | |
| 14 | + | |
| 15 | +from __future__ import annotations | |
| 16 | + | |
| 17 | +import argparse | |
| 18 | +import json | |
| 19 | +import os | |
| 20 | +import random | |
| 21 | +import statistics | |
| 22 | +import subprocess | |
| 23 | +import sys | |
| 24 | +import time | |
| 25 | +from pathlib import Path | |
| 26 | +from typing import Any | |
| 27 | + | |
| 28 | + | |
| 29 | +DEFAULT_TITLES = Path("/home/ubuntu/rerank_test/titles.1.8w") | |
| 30 | + | |
| 31 | + | |
| 32 | +def load_titles(path: Path) -> list[str]: | |
| 33 | + items: list[str] = [] | |
| 34 | + with path.open(encoding="utf-8", errors="replace") as fh: | |
| 35 | + for line in fh: | |
| 36 | + text = line.strip() | |
| 37 | + if text: | |
| 38 | + items.append(text) | |
| 39 | + return items | |
| 40 | + | |
| 41 | + | |
| 42 | +def gpu_mem_for_pid(pid: int) -> int: | |
| 43 | + try: | |
| 44 | + out = subprocess.check_output( | |
| 45 | + [ | |
| 46 | + "nvidia-smi", | |
| 47 | + "--query-compute-apps=pid,used_gpu_memory", | |
| 48 | + "--format=csv,noheader,nounits", | |
| 49 | + ], | |
| 50 | + text=True, | |
| 51 | + ) | |
| 52 | + except Exception: | |
| 53 | + return -1 | |
| 54 | + for raw in out.splitlines(): | |
| 55 | + parts = [p.strip() for p in raw.split(",")] | |
| 56 | + if len(parts) != 2: | |
| 57 | + continue | |
| 58 | + try: | |
| 59 | + row_pid = int(parts[0]) | |
| 60 | + row_mem = int(parts[1]) | |
| 61 | + except ValueError: | |
| 62 | + continue | |
| 63 | + if row_pid == pid: | |
| 64 | + return row_mem | |
| 65 | + return -1 | |
| 66 | + | |
| 67 | + | |
| 68 | +def main() -> int: | |
| 69 | + parser = argparse.ArgumentParser() | |
| 70 | + parser.add_argument("--backend-name", type=str, default="qwen3_gguf") | |
| 71 | + parser.add_argument("--titles-file", type=Path, default=DEFAULT_TITLES) | |
| 72 | + parser.add_argument("--query", type=str, default="白色oversized T-shirt") | |
| 73 | + parser.add_argument("--docs", type=int, default=160) | |
| 74 | + parser.add_argument("--repeat", type=int, default=1) | |
| 75 | + parser.add_argument("--seed", type=int, default=42) | |
| 76 | + parser.add_argument( | |
| 77 | + "--configs-json", | |
| 78 | + type=str, | |
| 79 | + default="", | |
| 80 | + help="JSON array of config objects; when omitted, uses built-in scan set.", | |
| 81 | + ) | |
| 82 | + args = parser.parse_args() | |
| 83 | + | |
| 84 | + if not args.titles_file.is_file(): | |
| 85 | + print(f"missing titles file: {args.titles_file}", file=sys.stderr) | |
| 86 | + return 2 | |
| 87 | + | |
| 88 | + titles = load_titles(args.titles_file) | |
| 89 | + if len(titles) < args.docs: | |
| 90 | + print(f"not enough titles: need {args.docs}, got {len(titles)}", file=sys.stderr) | |
| 91 | + return 2 | |
| 92 | + | |
| 93 | + random.seed(args.seed) | |
| 94 | + docs = random.sample(titles, args.docs) | |
| 95 | + | |
| 96 | + if args.configs_json: | |
| 97 | + configs = json.loads(args.configs_json) | |
| 98 | + elif args.backend_name == "qwen3_gguf_06b": | |
| 99 | + configs = [ | |
| 100 | + {"name": "gguf_06b_full_256", "n_ctx": 256, "n_batch": 256, "n_ubatch": 256, "n_gpu_layers": 999}, | |
| 101 | + {"name": "gguf_06b_full_320", "n_ctx": 320, "n_batch": 320, "n_ubatch": 320, "n_gpu_layers": 999}, | |
| 102 | + {"name": "gguf_06b_full_384", "n_ctx": 384, "n_batch": 384, "n_ubatch": 384, "n_gpu_layers": 999}, | |
| 103 | + {"name": "gguf_06b_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999}, | |
| 104 | + ] | |
| 105 | + else: | |
| 106 | + configs = [ | |
| 107 | + {"name": "gguf_t4_24g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 24}, | |
| 108 | + {"name": "gguf_t4_40g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 40}, | |
| 109 | + {"name": "gguf_t4_full", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 999}, | |
| 110 | + {"name": "gguf_t4_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 256, "n_gpu_layers": 999}, | |
| 111 | + {"name": "gguf_t4_full_512_u512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999}, | |
| 112 | + {"name": "gguf_t4_full_768", "n_ctx": 768, "n_batch": 768, "n_ubatch": 256, "n_gpu_layers": 999}, | |
| 113 | + ] | |
| 114 | + | |
| 115 | + from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend | |
| 116 | + | |
| 117 | + default_cfg_by_backend: dict[str, dict[str, Any]] = { | |
| 118 | + "qwen3_gguf": { | |
| 119 | + "_backend_name": "qwen3_gguf", | |
| 120 | + "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF", | |
| 121 | + "filename": "*Q8_0.gguf", | |
| 122 | + "local_dir": "./models/reranker/qwen3-reranker-4b-gguf", | |
| 123 | + "infer_batch_size": 8, | |
| 124 | + }, | |
| 125 | + "qwen3_gguf_06b": { | |
| 126 | + "_backend_name": "qwen3_gguf_06b", | |
| 127 | + "repo_id": "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF", | |
| 128 | + "filename": "qwen3-reranker-0.6b-q8_0.gguf", | |
| 129 | + "local_dir": "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf", | |
| 130 | + "infer_batch_size": 32, | |
| 131 | + }, | |
| 132 | + } | |
| 133 | + if args.backend_name not in default_cfg_by_backend: | |
| 134 | + print(f"unsupported backend: {args.backend_name}", file=sys.stderr) | |
| 135 | + return 2 | |
| 136 | + | |
| 137 | + base_cfg: dict[str, Any] = { | |
| 138 | + **default_cfg_by_backend[args.backend_name], | |
| 139 | + "instruction": "Rank products by query with category & style match prioritized", | |
| 140 | + "cache_dir": "./model_cache", | |
| 141 | + "main_gpu": 0, | |
| 142 | + "n_threads": 2, | |
| 143 | + "n_threads_batch": 4, | |
| 144 | + "flash_attn": True, | |
| 145 | + "offload_kqv": True, | |
| 146 | + "use_mmap": True, | |
| 147 | + "use_mlock": False, | |
| 148 | + "sort_by_doc_length": True, | |
| 149 | + "length_sort_mode": "char", | |
| 150 | + "enable_warmup": True, | |
| 151 | + "verbose": False, | |
| 152 | + "reuse_query_state": True, | |
| 153 | + } | |
| 154 | + | |
| 155 | + all_results: list[dict[str, Any]] = [] | |
| 156 | + for cfg in configs: | |
| 157 | + merged = dict(base_cfg) | |
| 158 | + merged.update(cfg) | |
| 159 | + name = str(merged.pop("name")) | |
| 160 | + | |
| 161 | + t0 = time.perf_counter() | |
| 162 | + backend = Qwen3GGUFRerankerBackend(merged) | |
| 163 | + load_ms = (time.perf_counter() - t0) * 1000.0 | |
| 164 | + gpu_mem_mib = gpu_mem_for_pid(os.getpid()) | |
| 165 | + | |
| 166 | + runs: list[float] = [] | |
| 167 | + last_meta: dict[str, Any] = {} | |
| 168 | + for _ in range(args.repeat): | |
| 169 | + t1 = time.perf_counter() | |
| 170 | + _scores, meta = backend.score_with_meta(args.query, docs, normalize=True) | |
| 171 | + runs.append((time.perf_counter() - t1) * 1000.0) | |
| 172 | + last_meta = dict(meta) | |
| 173 | + | |
| 174 | + result = { | |
| 175 | + "name": name, | |
| 176 | + "config": merged, | |
| 177 | + "load_ms": round(load_ms, 2), | |
| 178 | + "gpu_mem_mib": gpu_mem_mib, | |
| 179 | + "latency_ms_min": round(min(runs), 2), | |
| 180 | + "latency_ms_avg": round(statistics.mean(runs), 2), | |
| 181 | + "latency_ms_max": round(max(runs), 2), | |
| 182 | + "meta": last_meta, | |
| 183 | + } | |
| 184 | + all_results.append(result) | |
| 185 | + print(json.dumps(result, ensure_ascii=False)) | |
| 186 | + del backend | |
| 187 | + | |
| 188 | + print("SUMMARY") | |
| 189 | + for item in sorted(all_results, key=lambda x: x["latency_ms_avg"]): | |
| 190 | + print( | |
| 191 | + f'{item["name"]}: avg={item["latency_ms_avg"]}ms ' | |
| 192 | + f'gpu={item["gpu_mem_mib"]}MiB load={item["load_ms"]}ms' | |
| 193 | + ) | |
| 194 | + return 0 | |
| 195 | + | |
| 196 | + | |
| 197 | +if __name__ == "__main__": | |
| 198 | + raise SystemExit(main()) | ... | ... |
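The `--configs-json` flag above accepts a JSON array of config objects; each needs a `name`, and the remaining keys override the built-in base config. A sketch of driving a custom one-off scan from Python (the venv path and values are illustrative, and putting the repo root on `PYTHONPATH` is an assumption about how the script is usually run):

```python
# Build a custom scan set and hand it to the benchmark script as JSON.
import json, os, subprocess

configs = [{"name": "gguf_06b_probe", "n_ctx": 256, "n_batch": 256,
            "n_ubatch": 256, "n_gpu_layers": 999, "reuse_query_state": False}]
subprocess.run(
    ["./.venv-reranker-gguf-06b/bin/python",
     "scripts/benchmark_reranker_gguf_local.py",
     "--backend-name", "qwen3_gguf_06b",
     "--docs", "80",
     "--configs-json", json.dumps(configs)],
    check=True,
    env={**os.environ, "PYTHONPATH": os.getcwd()},  # repo root importable
)
```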
scripts/lib/reranker_backend_env.sh
| ... | ... | @@ -40,6 +40,7 @@ reranker_backend_venv_dir() { |
| 40 | 40 | case "${backend}" in |
| 41 | 41 | qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;; |
| 42 | 42 | qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;; |
| 43 | + qwen3_gguf_06b) printf '%s/.venv-reranker-gguf-06b\n' "${project_root}" ;; | |
| 43 | 44 | qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;; |
| 44 | 45 | bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;; |
| 45 | 46 | dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;; |
| ... | ... | @@ -54,6 +55,7 @@ reranker_backend_requirements_file() { |
| 54 | 55 | case "${backend}" in |
| 55 | 56 | qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;; |
| 56 | 57 | qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;; |
| 58 | + qwen3_gguf_06b) printf '%s/requirements_reranker_qwen3_gguf_06b.txt\n' "${project_root}" ;; | |
| 57 | 59 | qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;; |
| 58 | 60 | bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;; |
| 59 | 61 | dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;; | ... | ... |
scripts/setup_reranker_venv.sh
| ... | ... | @@ -50,6 +50,30 @@ echo "Using TMPDIR=${TMPDIR}" |
| 50 | 50 | "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" --upgrade pip wheel |
| 51 | 51 | "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r "${REQ_FILE}" |
| 52 | 52 | |
| 53 | +if [[ "${BACKEND}" == qwen3_gguf* ]]; then | |
| 54 | + if [[ -x "/usr/local/cuda/bin/nvcc" ]]; then | |
| 55 | + "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" \ | |
| 56 | + cmake \ | |
| 57 | + ninja \ | |
| 58 | + scikit-build-core \ | |
| 59 | + flit_core \ | |
| 60 | + setuptools-scm | |
| 61 | + echo "Rebuilding llama-cpp-python with CUDA support for ${BACKEND}" | |
| 62 | + PATH="/usr/local/cuda/bin:/usr/bin:/bin" \ | |
| 63 | + CC="/usr/bin/x86_64-linux-gnu-gcc" \ | |
| 64 | + CXX="/usr/bin/x86_64-linux-gnu-g++" \ | |
| 65 | + CUDACXX="/usr/local/cuda/bin/nvcc" \ | |
| 66 | + CMAKE_ARGS="-DGGML_CUDA=on" \ | |
| 67 | + FORCE_CMAKE=1 \ | |
| 68 | + "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" \ | |
| 69 | + --force-reinstall \ | |
| 70 | + --no-build-isolation \ | |
| 71 | + "llama-cpp-python==0.3.18" | |
| 72 | + else | |
| 73 | + echo "WARNING: /usr/local/cuda/bin/nvcc not found; ${BACKEND} will be installed without CUDA support." >&2 | |
| 74 | + fi | |
| 75 | +fi | |
| 76 | + | |
| 53 | 77 | echo |
| 54 | 78 | echo "Done." |
| 55 | 79 | echo "Backend: ${BACKEND}" | ... | ... |
scripts/start_reranker.sh
| ... | ... | @@ -43,6 +43,10 @@ export TMPDIR="${RERANKER_RUNTIME_DIR}/tmp" |
| 43 | 43 | export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}" |
| 44 | 44 | export PATH="${RERANKER_VENV}/bin:${PATH}" |
| 45 | 45 | |
| 46 | +if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then | |
| 47 | + export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}" | |
| 48 | +fi | |
| 49 | + | |
| 46 | 50 | if [[ "${RERANK_BACKEND}" == "qwen3_vllm" ]]; then |
| 47 | 51 | if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then |
| 48 | 52 | echo "ERROR: qwen3_vllm backend requires NVIDIA GPU, but nvidia-smi is unavailable." >&2 |
| ... | ... | @@ -64,16 +68,24 @@ PY |
| 64 | 68 | fi |
| 65 | 69 | fi |
| 66 | 70 | |
| 67 | -if [[ "${RERANK_BACKEND}" == "qwen3_gguf" ]]; then | |
| 68 | - if ! "${PYTHON_BIN}" - <<'PY' | |
| 71 | +if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then | |
| 72 | + gguf_check_status=0 | |
| 73 | + "${PYTHON_BIN}" - <<'PY' || gguf_check_status=$? | |
| 69 | 74 | try: |
| 70 | - import llama_cpp # noqa: F401 | |
| 75 | + import llama_cpp | |
| 76 | + if hasattr(llama_cpp, "llama_supports_gpu_offload") and not llama_cpp.llama_supports_gpu_offload(): | |
| 77 | + raise SystemExit(2) | |
| 71 | 78 | except Exception: |
| 72 | 79 | raise SystemExit(1) |
| 73 | 80 | PY |
| 74 | - then | |
| 75 | - echo "ERROR: qwen3_gguf backend requires llama-cpp-python in ${RERANKER_VENV}." >&2 | |
| 76 | - echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 | |
| 81 | + if [[ "${gguf_check_status}" != "0" ]]; then | |
| 82 | + if [[ "${gguf_check_status}" == "2" ]]; then | |
| 83 | + echo "ERROR: ${RERANK_BACKEND} backend detected a CPU-only llama-cpp-python build in ${RERANKER_VENV}." >&2 | |
| 84 | + echo "Please rerun: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 | |
| 85 | + else | |
| 86 | + echo "ERROR: ${RERANK_BACKEND} backend requires llama-cpp-python in ${RERANKER_VENV}." >&2 | |
| 87 | + echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 | |
| 88 | + fi | |
| 77 | 89 | exit 1 |
| 78 | 90 | fi |
| 79 | 91 | fi | ... | ... |
tests/test_reranker_qwen3_gguf_backend.py
| ... | ... | @@ -12,6 +12,8 @@ class _FakeLlama: |
| 12 | 12 | self.model_path = model_path |
| 13 | 13 | self.kwargs = kwargs |
| 14 | 14 | self.eval_logits = [] |
| 15 | + self._tokens = [] | |
| 16 | + self.eval_call_count = 0 | |
| 15 | 17 | |
| 16 | 18 | @classmethod |
| 17 | 19 | def from_pretrained(cls, repo_id: str, filename: str, local_dir=None, cache_dir=None, **kwargs): |
| ... | ... | @@ -31,16 +33,25 @@ class _FakeLlama: |
| 31 | 33 | return [10 + (ord(ch) % 17) for ch in raw] |
| 32 | 34 | |
| 33 | 35 | def reset(self): |
| 36 | + self._tokens = [] | |
| 34 | 37 | return None |
| 35 | 38 | |
| 36 | 39 | def eval(self, prompt_tokens): |
| 37 | - pos = float(sum(prompt_tokens) % 11) + 3.0 | |
| 40 | + self.eval_call_count += 1 | |
| 41 | + self._tokens.extend(prompt_tokens) | |
| 42 | + pos = float(sum(self._tokens) % 11) + 3.0 | |
| 38 | 43 | neg = 1.0 |
| 39 | 44 | logits = [0.0] * 64 |
| 40 | 45 | logits[1] = pos |
| 41 | 46 | logits[2] = neg |
| 42 | 47 | self.eval_logits = [logits] |
| 43 | 48 | |
| 49 | + def save_state(self): | |
| 50 | + return list(self._tokens) | |
| 51 | + | |
| 52 | + def load_state(self, state): | |
| 53 | + self._tokens = list(state) | |
| 54 | + | |
| 44 | 55 | |
| 45 | 56 | def _install_fake_llama_cpp(monkeypatch): |
| 46 | 57 | fake_module = types.SimpleNamespace(Llama=_FakeLlama) |
| ... | ... | @@ -58,6 +69,21 @@ def test_qwen3_gguf_backend_factory_loads(monkeypatch): |
| 58 | 69 | }, |
| 59 | 70 | ) |
| 60 | 71 | assert isinstance(backend, Qwen3GGUFRerankerBackend) |
| 72 | + assert backend._backend_name == "qwen3_gguf" | |
| 73 | + | |
| 74 | + | |
| 75 | +def test_qwen3_gguf_06b_backend_factory_loads(monkeypatch): | |
| 76 | + _install_fake_llama_cpp(monkeypatch) | |
| 77 | + backend = get_rerank_backend( | |
| 78 | + "qwen3_gguf_06b", | |
| 79 | + { | |
| 80 | + "enable_warmup": False, | |
| 81 | + }, | |
| 82 | + ) | |
| 83 | + assert isinstance(backend, Qwen3GGUFRerankerBackend) | |
| 84 | + assert backend._backend_name == "qwen3_gguf_06b" | |
| 85 | + assert backend._repo_id == "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" | |
| 86 | + assert backend._filename == "qwen3-reranker-0.6b-q8_0.gguf" | |
| 61 | 87 | |
| 62 | 88 | |
| 63 | 89 | def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): |
| ... | ... | @@ -69,6 +95,7 @@ def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): |
| 69 | 95 | "enable_warmup": False, |
| 70 | 96 | "infer_batch_size": 2, |
| 71 | 97 | "sort_by_doc_length": True, |
| 98 | + "reuse_query_state": True, | |
| 72 | 99 | } |
| 73 | 100 | ) |
| 74 | 101 | |
| ... | ... | @@ -88,3 +115,5 @@ def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): |
| 88 | 115 | assert meta["unique_docs"] == 2 |
| 89 | 116 | assert meta["backend"] == "qwen3_gguf" |
| 90 | 117 | assert meta["inference_batches"] == 1 |
| 118 | + assert meta["reuse_query_state"] is True | |
| 119 | + assert backend._llm.eval_call_count == 3 | ... | ... |