Commit 5c21a485bbd6bd2f6876a1d2ddee6a6afbeeffa9
1 parent
3d508beb
qwen3-reranker-0.6b-gguf
Showing
16 changed files
with
886 additions
and
47 deletions
Show diff stats
config/config.yaml
| @@ -381,7 +381,7 @@ services: | @@ -381,7 +381,7 @@ services: | ||
| 381 | max_docs: 1000 | 381 | max_docs: 1000 |
| 382 | normalize: true | 382 | normalize: true |
| 383 | # 服务内后端(reranker 进程启动时读取) | 383 | # 服务内后端(reranker 进程启动时读取) |
| 384 | - backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank | 384 | + backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank |
| 385 | backends: | 385 | backends: |
| 386 | bge: | 386 | bge: |
| 387 | model_name: "BAAI/bge-reranker-v2-m3" | 387 | model_name: "BAAI/bge-reranker-v2-m3" |
| @@ -426,11 +426,11 @@ services: | @@ -426,11 +426,11 @@ services: | ||
| 426 | cache_dir: "./model_cache" | 426 | cache_dir: "./model_cache" |
| 427 | local_dir: "./models/reranker/qwen3-reranker-4b-gguf" | 427 | local_dir: "./models/reranker/qwen3-reranker-4b-gguf" |
| 428 | instruction: "Rank products by query with category & style match prioritized" | 428 | instruction: "Rank products by query with category & style match prioritized" |
| 429 | - # T4 16GB / 显存约 5~6GB 的保守配置 | ||
| 430 | - n_ctx: 384 | ||
| 431 | - n_batch: 384 | ||
| 432 | - n_ubatch: 128 | ||
| 433 | - n_gpu_layers: 24 | 429 | + # T4 16GB / 性能优先配置:全量层 offload,实测比保守配置明显更快 |
| 430 | + n_ctx: 512 | ||
| 431 | + n_batch: 512 | ||
| 432 | + n_ubatch: 512 | ||
| 433 | + n_gpu_layers: 999 | ||
| 434 | main_gpu: 0 | 434 | main_gpu: 0 |
| 435 | n_threads: 2 | 435 | n_threads: 2 |
| 436 | n_threads_batch: 4 | 436 | n_threads_batch: 4 |
| @@ -443,6 +443,31 @@ services: | @@ -443,6 +443,31 @@ services: | ||
| 443 | length_sort_mode: "char" | 443 | length_sort_mode: "char" |
| 444 | enable_warmup: true | 444 | enable_warmup: true |
| 445 | verbose: false | 445 | verbose: false |
| 446 | + qwen3_gguf_06b: | ||
| 447 | + repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" | ||
| 448 | + filename: "qwen3-reranker-0.6b-q8_0.gguf" | ||
| 449 | + cache_dir: "./model_cache" | ||
| 450 | + local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf" | ||
| 451 | + instruction: "Rank products by query with category & style match prioritized" | ||
| 452 | + # 0.6B GGUF / online rerank baseline: | ||
| 453 | + # 实测 400 titles 单请求约 265s,因此它更适合作为低显存功能后备,不适合在线低延迟主路由。 | ||
| 454 | + n_ctx: 256 | ||
| 455 | + n_batch: 256 | ||
| 456 | + n_ubatch: 256 | ||
| 457 | + n_gpu_layers: 999 | ||
| 458 | + main_gpu: 0 | ||
| 459 | + n_threads: 2 | ||
| 460 | + n_threads_batch: 4 | ||
| 461 | + flash_attn: true | ||
| 462 | + offload_kqv: true | ||
| 463 | + use_mmap: true | ||
| 464 | + use_mlock: false | ||
| 465 | + infer_batch_size: 32 | ||
| 466 | + sort_by_doc_length: true | ||
| 467 | + length_sort_mode: "char" | ||
| 468 | + reuse_query_state: false | ||
| 469 | + enable_warmup: true | ||
| 470 | + verbose: false | ||
| 446 | dashscope_rerank: | 471 | dashscope_rerank: |
| 447 | model_name: "qwen3-rerank" | 472 | model_name: "qwen3-rerank" |
| 448 | # 按地域选择 endpoint: | 473 | # 按地域选择 endpoint: |
config/services_config.py
| @@ -7,6 +7,7 @@ contains no independent parsing or precedence logic. | @@ -7,6 +7,7 @@ contains no independent parsing or precedence logic. | ||
| 7 | 7 | ||
| 8 | from __future__ import annotations | 8 | from __future__ import annotations |
| 9 | 9 | ||
| 10 | +import os | ||
| 10 | from typing import Any, Dict, Tuple | 11 | from typing import Any, Dict, Tuple |
| 11 | 12 | ||
| 12 | from config.loader import get_app_config | 13 | from config.loader import get_app_config |
| @@ -61,6 +62,12 @@ def get_embedding_image_backend_config() -> Tuple[str, Dict[str, Any]]: | @@ -61,6 +62,12 @@ def get_embedding_image_backend_config() -> Tuple[str, Dict[str, Any]]: | ||
| 61 | 62 | ||
| 62 | def get_rerank_backend_config() -> Tuple[str, Dict[str, Any]]: | 63 | def get_rerank_backend_config() -> Tuple[str, Dict[str, Any]]: |
| 63 | cfg = get_app_config().services.rerank | 64 | cfg = get_app_config().services.rerank |
| 65 | + backend = str(os.getenv("RERANK_BACKEND") or cfg.backend).strip() | ||
| 66 | + if backend != cfg.backend: | ||
| 67 | + backend_cfg = cfg.backends.get(backend) | ||
| 68 | + if backend_cfg is None: | ||
| 69 | + raise ValueError(f"Unknown rerank backend override from RERANK_BACKEND: {backend!r}") | ||
| 70 | + return backend, dict(backend_cfg) | ||
| 64 | return cfg.backend, cfg.get_backend_config() | 71 | return cfg.backend, cfg.get_backend_config() |
| 65 | 72 | ||
| 66 | 73 |
requirements_reranker_qwen3_gguf.txt
| 1 | # Isolated dependencies for qwen3_gguf reranker backend (.venv-reranker-gguf). | 1 | # Isolated dependencies for qwen3_gguf reranker backend (.venv-reranker-gguf). |
| 2 | 2 | ||
| 3 | -r requirements_reranker_base.txt | 3 | -r requirements_reranker_base.txt |
| 4 | +huggingface-hub>=0.32.0 | ||
| 4 | llama-cpp-python>=0.3.16 | 5 | llama-cpp-python>=0.3.16 |
reranker/DEPLOYMENT_AND_TUNING.md
| @@ -3,15 +3,15 @@ | @@ -3,15 +3,15 @@ | ||
| 3 | 本文档沉淀当前项目在电商搜索重排场景下的可复用实践,覆盖: | 3 | 本文档沉淀当前项目在电商搜索重排场景下的可复用实践,覆盖: |
| 4 | 4 | ||
| 5 | - 环境准备与安装部署 | 5 | - 环境准备与安装部署 |
| 6 | -- `qwen3_vllm` / `qwen3_gguf` 配置项与优化思路 | 6 | +- `qwen3_vllm` / `qwen3_gguf` / `qwen3_gguf_06b` 配置项与优化思路 |
| 7 | - 1000-doc 场景压测流程 | 7 | - 1000-doc 场景压测流程 |
| 8 | - 关键结论与推荐默认参数 | 8 | - 关键结论与推荐默认参数 |
| 9 | - 常见故障排查 | 9 | - 常见故障排查 |
| 10 | 10 | ||
| 11 | 适用范围: | 11 | 适用范围: |
| 12 | 12 | ||
| 13 | -- 重排后端:`services.rerank.backend: qwen3_vllm` 或 `qwen3_gguf` | ||
| 14 | -- 模型:`Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` | 13 | +- 重排后端:`services.rerank.backend: qwen3_vllm` / `qwen3_gguf` / `qwen3_gguf_06b` |
| 14 | +- 模型:`Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` / `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF` | ||
| 15 | - 场景:query 较短(通常 < 100 tokens),doc 为商品标题或标题+简短描述,单请求 docs 约 1000 条 | 15 | - 场景:query 较短(通常 < 100 tokens),doc 为商品标题或标题+简短描述,单请求 docs 约 1000 条 |
| 16 | 16 | ||
| 17 | ## 1. 环境基线 | 17 | ## 1. 环境基线 |
| @@ -0,0 +1,154 @@ | @@ -0,0 +1,154 @@ | ||
| 1 | +# Qwen3-Reranker-0.6B GGUF 安装与调优 | ||
| 2 | + | ||
| 3 | +本文档覆盖 `qwen3_gguf_06b` 后端,对应模型: | ||
| 4 | + | ||
| 5 | +- Hugging Face: `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF` | ||
| 6 | +- 文件: `qwen3-reranker-0.6b-q8_0.gguf` | ||
| 7 | +- 本地目录: `./models/reranker/qwen3-reranker-0.6b-q8_0-gguf` | ||
| 8 | + | ||
| 9 | +## 结论先看 | ||
| 10 | + | ||
| 11 | +这个后端已经接入完成,也能正常使用 GPU offload,但不适合当前项目的在线主链路场景。 | ||
| 12 | + | ||
| 13 | +目标场景是: | ||
| 14 | + | ||
| 15 | +- 1 个 query | ||
| 16 | +- 400 个商品标题 | ||
| 17 | +- 追求最短响应时间 | ||
| 18 | + | ||
| 19 | +实测最优配置下: | ||
| 20 | + | ||
| 21 | +- GPU 显存占用约 `894 MiB` | ||
| 22 | +- 400 titles 单请求延迟约 `265318 ms` | ||
| 23 | + | ||
| 24 | +因此它更适合作为: | ||
| 25 | + | ||
| 26 | +- 低显存 fallback | ||
| 27 | +- 功能验证 | ||
| 28 | +- 本地离线实验 | ||
| 29 | + | ||
| 30 | +不建议作为在线低延迟 reranker 主 backend。 | ||
| 31 | + | ||
| 32 | +## 独立环境 | ||
| 33 | + | ||
| 34 | +`qwen3_gguf_06b` 使用独立 venv: | ||
| 35 | + | ||
| 36 | +- backend: `qwen3_gguf_06b` | ||
| 37 | +- venv: `.venv-reranker-gguf-06b` | ||
| 38 | +- requirements: `requirements_reranker_qwen3_gguf_06b.txt` | ||
| 39 | + | ||
| 40 | +安装: | ||
| 41 | + | ||
| 42 | +```bash | ||
| 43 | +./scripts/setup_reranker_venv.sh qwen3_gguf_06b | ||
| 44 | +``` | ||
| 45 | + | ||
| 46 | +如果需要确认是 CUDA 版 `llama-cpp-python`: | ||
| 47 | + | ||
| 48 | +```bash | ||
| 49 | +./.venv-reranker-gguf-06b/bin/python - <<'PY' | ||
| 50 | +import llama_cpp | ||
| 51 | +print(llama_cpp.llama_supports_gpu_offload()) | ||
| 52 | +PY | ||
| 53 | +``` | ||
| 54 | + | ||
| 55 | +预期输出: | ||
| 56 | + | ||
| 57 | +```python | ||
| 58 | +True | ||
| 59 | +``` | ||
| 60 | + | ||
| 61 | +## 模型下载 | ||
| 62 | + | ||
| 63 | +推荐预先下载到本地,避免首次服务启动时在线拉取: | ||
| 64 | + | ||
| 65 | +```bash | ||
| 66 | +mkdir -p models/reranker/qwen3-reranker-0.6b-q8_0-gguf | ||
| 67 | +curl -L --fail -C - \ | ||
| 68 | + -o models/reranker/qwen3-reranker-0.6b-q8_0-gguf/qwen3-reranker-0.6b-q8_0.gguf \ | ||
| 69 | + 'https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/resolve/main/qwen3-reranker-0.6b-q8_0.gguf?download=true' | ||
| 70 | +``` | ||
| 71 | + | ||
| 72 | +当前实测文件大小: | ||
| 73 | + | ||
| 74 | +- `639153184` bytes | ||
| 75 | + | ||
| 76 | +## 推荐配置 | ||
| 77 | + | ||
| 78 | +`config/config.yaml` 中建议保留: | ||
| 79 | + | ||
| 80 | +```yaml | ||
| 81 | +qwen3_gguf_06b: | ||
| 82 | + repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" | ||
| 83 | + filename: "qwen3-reranker-0.6b-q8_0.gguf" | ||
| 84 | + local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf" | ||
| 85 | + cache_dir: "./model_cache" | ||
| 86 | + instruction: "Rank products by query with category & style match prioritized" | ||
| 87 | + n_ctx: 256 | ||
| 88 | + n_batch: 256 | ||
| 89 | + n_ubatch: 256 | ||
| 90 | + n_gpu_layers: 999 | ||
| 91 | + main_gpu: 0 | ||
| 92 | + n_threads: 2 | ||
| 93 | + n_threads_batch: 4 | ||
| 94 | + flash_attn: true | ||
| 95 | + offload_kqv: true | ||
| 96 | + use_mmap: true | ||
| 97 | + use_mlock: false | ||
| 98 | + infer_batch_size: 32 | ||
| 99 | + sort_by_doc_length: true | ||
| 100 | + length_sort_mode: "char" | ||
| 101 | + reuse_query_state: false | ||
| 102 | + enable_warmup: true | ||
| 103 | + verbose: false | ||
| 104 | +``` | ||
| 105 | + | ||
| 106 | +## 调优结果 | ||
| 107 | + | ||
| 108 | +在当前机器上做了同机实测。标题文件来自 `/home/ubuntu/rerank_test/titles.1.8w`,查询为 `白色oversized T-shirt`。 | ||
| 109 | + | ||
| 110 | +80 titles: | ||
| 111 | + | ||
| 112 | +- `n_ctx=256, reuse_query_state=true` -> `60108 ms` | ||
| 113 | +- `n_ctx=256, reuse_query_state=false` -> `53383~56893 ms` | ||
| 114 | +- `n_ctx=320, reuse_query_state=true` -> `60961 ms` | ||
| 115 | +- `n_ctx=384, reuse_query_state=true` -> `56578 ms` | ||
| 116 | +- `n_ctx=384, reuse_query_state=false` -> `57272 ms` | ||
| 117 | +- `n_ctx=512, reuse_query_state=false` -> `60542 ms` | ||
| 118 | +- `n_ctx=256, reuse_query_state=false, n_threads=4, n_threads_batch=8` -> `61228 ms` | ||
| 119 | + | ||
| 120 | +400 titles: | ||
| 121 | + | ||
| 122 | +- `n_ctx=256, n_batch=256, n_ubatch=256, n_gpu_layers=999, reuse_query_state=false` | ||
| 123 | + -> `265318 ms` | ||
| 124 | + | ||
| 125 | +## 经验沉淀 | ||
| 126 | + | ||
| 127 | +这次接入最重要的结论不是“哪个小参数更快”,而是: | ||
| 128 | + | ||
| 129 | +1. 这个 0.6B GGUF 权重虽然小,但当前后端实现仍是逐 doc 顺序打分。 | ||
| 130 | +2. 对在线 400-title 请求来说,串行打分本身就是主瓶颈。 | ||
| 131 | +3. `reuse_query_state` 在这个模型上没有带来收益,反而更慢。 | ||
| 132 | +4. `n_ctx` 拉大到 `384/512` 也没有带来实质收益,反而更慢或持平。 | ||
| 133 | +5. 这个 backend 的优势是低显存,不是低延迟。 | ||
| 134 | + | ||
| 135 | +如果目标是在线最短响应时间,优先级建议是: | ||
| 136 | + | ||
| 137 | +1. `qwen3_vllm` | ||
| 138 | +2. 其他真正支持高吞吐批处理的后端 | ||
| 139 | +3. `qwen3_gguf_06b` 仅作为低显存 fallback | ||
| 140 | + | ||
| 141 | +## 验证命令 | ||
| 142 | + | ||
| 143 | +本地直连 backend 调优: | ||
| 144 | + | ||
| 145 | +```bash | ||
| 145 | +PYTHONPATH=/data/saas-search ./.venv-reranker-gguf-06b/bin/python \ | ||
| 147 | + scripts/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400 | ||
| 148 | +``` | ||
| 149 | + | ||
| 150 | +按服务方式启动: | ||
| 151 | + | ||
| 152 | +```bash | ||
| 153 | +RERANK_BACKEND=qwen3_gguf_06b ./scripts/start_reranker.sh | ||
| 154 | +``` |
| @@ -0,0 +1,280 @@ | @@ -0,0 +1,280 @@ | ||
| 1 | +# Qwen3 GGUF 安装与调优手册 | ||
| 2 | + | ||
| 3 | +本文档只覆盖 `qwen3_gguf` 后端,目标机器为当前项目实测环境: | ||
| 4 | + | ||
| 5 | +- GPU: `Tesla T4 16GB` | ||
| 6 | +- CUDA: `12.8` | ||
| 7 | +- 模型: `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` | ||
| 8 | +- 量化: `Q8_0` | ||
| 9 | + | ||
| 10 | +--- | ||
| 11 | + | ||
| 12 | +## 1. 结论先看 | ||
| 13 | + | ||
| 14 | +当前这套代码里,GGUF 后端的主要瓶颈不是“显存没吃满”,而是 **llama.cpp 按 doc 顺序逐条打分**。因此最有效的优化策略是: | ||
| 15 | + | ||
| 16 | +- 让模型层尽可能全部 offload 到 GPU | ||
| 17 | +- 打开 `flash_attn` / `offload_kqv` | ||
| 18 | +- 把 `n_ctx / n_batch / n_ubatch` 调到一个对短标题重排更合适的高效点 | ||
| 19 | + | ||
| 20 | +本轮在当前机器上的推荐配置是: | ||
| 21 | + | ||
| 22 | +```yaml | ||
| 23 | +qwen3_gguf: | ||
| 24 | + n_ctx: 512 | ||
| 25 | + n_batch: 512 | ||
| 26 | + n_ubatch: 512 | ||
| 27 | + n_gpu_layers: 999 | ||
| 28 | + n_threads: 2 | ||
| 29 | + n_threads_batch: 4 | ||
| 30 | + flash_attn: true | ||
| 31 | + offload_kqv: true | ||
| 32 | + infer_batch_size: 8 | ||
| 33 | + sort_by_doc_length: true | ||
| 34 | + length_sort_mode: "char" | ||
| 35 | +``` | ||
| 36 | + | ||
| 37 | +说明: | ||
| 38 | + | ||
| 39 | +- `n_gpu_layers: 999` 在 llama.cpp 中等价于“尽可能全部层都 offload” | ||
| 40 | +- 这台 T4 上,**即使全量 offload,当前模型也只占到约 `4.5 GiB` GPU 显存** | ||
| 41 | +- 所以“允许 8G 显存”并不会自动带来更高速度;这个模型/后端在当前工作负载下已经接近“该用到的权重都上 GPU 了” | ||
| 42 | + | ||
| 43 | +--- | ||
| 44 | + | ||
| 45 | +## 2. 独立环境 | ||
| 46 | + | ||
| 47 | +`qwen3_gguf` 必须使用自己的独立 venv: | ||
| 48 | + | ||
| 49 | +- `qwen3_vllm` -> `.venv-reranker` | ||
| 50 | +- `qwen3_gguf` -> `.venv-reranker-gguf` | ||
| 51 | + | ||
| 52 | +安装命令: | ||
| 53 | + | ||
| 54 | +```bash | ||
| 55 | +./scripts/setup_reranker_venv.sh qwen3_gguf | ||
| 56 | +``` | ||
| 57 | + | ||
| 58 | +脚本现在会自动做两件事: | ||
| 59 | + | ||
| 60 | +1. 安装 GGUF 后端所需 Python 依赖 | ||
| 61 | +2. 在检测到 `/usr/local/cuda/bin/nvcc` 时,把 `llama-cpp-python` **重编译成 CUDA 版** | ||
| 62 | + | ||
| 63 | +--- | ||
| 64 | + | ||
| 65 | +## 3. GPU 版验证 | ||
| 66 | + | ||
| 67 | +必须验证不是 CPU-only 版: | ||
| 68 | + | ||
| 69 | +```bash | ||
| 70 | +./.venv-reranker-gguf/bin/python - <<'PY' | ||
| 71 | +import llama_cpp | ||
| 72 | +print("supports_gpu_offload =", llama_cpp.llama_supports_gpu_offload()) | ||
| 73 | +PY | ||
| 74 | +``` | ||
| 75 | + | ||
| 76 | +正确结果应为: | ||
| 77 | + | ||
| 78 | +```text | ||
| 79 | +supports_gpu_offload = True | ||
| 80 | +``` | ||
| 81 | + | ||
| 82 | +还可以看动态库: | ||
| 83 | + | ||
| 84 | +```bash | ||
| 85 | +ldd .venv-reranker-gguf/lib/python3.12/site-packages/llama_cpp/lib/libllama.so | rg 'cuda|cublas|ggml-cuda' | ||
| 86 | +``` | ||
| 87 | + | ||
| 88 | +应能看到: | ||
| 89 | + | ||
| 90 | +- `libggml-cuda.so` | ||
| 91 | +- `libcudart.so` | ||
| 92 | +- `libcublas.so` | ||
| 93 | + | ||
| 94 | +--- | ||
| 95 | + | ||
| 96 | +## 4. 模型下载 | ||
| 97 | + | ||
| 98 | +当前使用本地文件优先策略,模型放在: | ||
| 99 | + | ||
| 100 | +```text | ||
| 101 | +models/reranker/qwen3-reranker-4b-gguf/Qwen.Qwen3-Reranker-4B.Q8_0.gguf | ||
| 102 | +``` | ||
| 103 | + | ||
| 104 | +若本地文件存在,后端会直接加载本地 GGUF,不再依赖启动时在线下载。 | ||
| 105 | + | ||
| 106 | +为了避免当前机器上 Hugging Face Xet 下载的 `416 Range Not Satisfiable` 问题,`start_reranker.sh` 已对 `qwen3_gguf` 默认设置: | ||
| 107 | + | ||
| 108 | +```bash | ||
| 109 | +HF_HUB_DISABLE_XET=1 | ||
| 110 | +``` | ||
| 111 | + | ||
| 112 | +--- | ||
| 113 | + | ||
| 114 | +## 5. 本地调优脚本 | ||
| 115 | + | ||
| 116 | +新增本地基准脚本: | ||
| 117 | + | ||
| 118 | +```bash | ||
| 119 | +PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \ | ||
| 120 | + scripts/benchmark_reranker_gguf_local.py --docs 64 --repeat 1 | ||
| 121 | +``` | ||
| 122 | + | ||
| 123 | +它会直接实例化 GGUF backend,输出: | ||
| 124 | + | ||
| 125 | +- 模型加载耗时 | ||
| 126 | +- 当前进程 GPU 显存占用 | ||
| 127 | +- 单次 rerank 延迟 | ||
| 128 | + | ||
| 129 | +--- | ||
| 130 | + | ||
| 131 | +## 6. 本轮实测结果 | ||
| 132 | + | ||
| 133 | +测试条件: | ||
| 134 | + | ||
| 135 | +- Query: `白色oversized T-shirt` | ||
| 136 | +- Docs: `64` 条商品标题 | ||
| 137 | +- 本地脚本:`scripts/benchmark_reranker_gguf_local.py` | ||
| 138 | +- 每组 1 次,重点比较相对趋势 | ||
| 139 | + | ||
| 140 | +结果: | ||
| 141 | + | ||
| 142 | +### 6.1 保守配置 | ||
| 143 | + | ||
| 144 | +```text | ||
| 145 | +n_ctx=384 | ||
| 146 | +n_batch=384 | ||
| 147 | +n_ubatch=128 | ||
| 148 | +n_gpu_layers=24 | ||
| 149 | +``` | ||
| 150 | + | ||
| 151 | +- GPU 显存:`2984 MiB` | ||
| 152 | +- 64 docs 延迟:`74347.91 ms` | ||
| 153 | + | ||
| 154 | +### 6.2 全量 offload | ||
| 155 | + | ||
| 156 | +```text | ||
| 157 | +n_ctx=384 | ||
| 158 | +n_batch=384 | ||
| 159 | +n_ubatch=128 | ||
| 160 | +n_gpu_layers=999 | ||
| 161 | +``` | ||
| 162 | + | ||
| 163 | +- GPU 显存:`4338 MiB` | ||
| 164 | +- 64 docs 延迟:`51401.77 ms` | ||
| 165 | + | ||
| 166 | +### 6.3 最优配置 | ||
| 167 | + | ||
| 168 | +```text | ||
| 169 | +n_ctx=512 | ||
| 170 | +n_batch=512 | ||
| 171 | +n_ubatch=512 | ||
| 172 | +n_gpu_layers=999 | ||
| 173 | +``` | ||
| 174 | + | ||
| 175 | +- GPU 显存:`4564 MiB` | ||
| 176 | +- 64 docs 延迟:`49116.10 ms` | ||
| 177 | + | ||
| 178 | +### 6.4 其它尝试 | ||
| 179 | + | ||
| 180 | +`n_threads=4 / n_threads_batch=8`: | ||
| 181 | + | ||
| 182 | +- GPU 显存:`4564 MiB` | ||
| 183 | +- 64 docs 延迟:`49895.88 ms` | ||
| 184 | +- 比推荐值略慢 | ||
| 185 | + | ||
| 186 | +`infer_batch_size=64`: | ||
| 187 | + | ||
| 188 | +- GPU 显存:`4564 MiB` | ||
| 189 | +- 64 docs 延迟:`50723.36 ms` | ||
| 190 | +- 也略慢 | ||
| 191 | + | ||
| 192 | +### 6.5 API 级验证 | ||
| 193 | + | ||
| 194 | +在把推荐配置写入 `config/config.yaml` 并重启服务后,使用: | ||
| 195 | + | ||
| 196 | +```bash | ||
| 197 | +RERANK_BASE=http://127.0.0.1:6007 \ | ||
| 198 | + ./.venv/bin/python scripts/benchmark_reranker_random_titles.py 64 --repeat 1 --query '白色oversized T-shirt' | ||
| 199 | +``` | ||
| 200 | + | ||
| 201 | +得到: | ||
| 202 | + | ||
| 203 | +- `64 docs`:`50177.22 ms` | ||
| 204 | + | ||
| 205 | +再用: | ||
| 206 | + | ||
| 207 | +```bash | ||
| 208 | +RERANK_BASE=http://127.0.0.1:6007 \ | ||
| 209 | + ./.venv/bin/python scripts/benchmark_reranker_random_titles.py 153 --repeat 1 --query '白色oversized T-shirt' | ||
| 210 | +``` | ||
| 211 | + | ||
| 212 | +得到: | ||
| 213 | + | ||
| 214 | +- `153 docs`:`115328.60 ms` | ||
| 215 | + | ||
| 216 | +对比旧日志中的保守配置: | ||
| 217 | + | ||
| 218 | +- 旧配置 `153 docs`:`153435.37 ms` | ||
| 219 | +- 新配置 `153 docs`:`115328.60 ms` | ||
| 220 | + | ||
| 221 | +改善幅度约: | ||
| 222 | + | ||
| 223 | +- `24.8%` | ||
| 224 | + | ||
| 225 | +--- | ||
| 226 | + | ||
| 227 | +## 7. 为什么没有吃到 8G | ||
| 228 | + | ||
| 229 | +结论很重要: | ||
| 230 | + | ||
| 231 | +- 当前最优配置已经是“尽可能全量层 offload” | ||
| 232 | +- 该 `Q8_0` 模型在这套 llama.cpp / T4 / 短文本重排场景下,**实测只需要约 `4.5 GiB` GPU 显存** | ||
| 233 | +- 继续为了“吃满 8G”去增大 `n_ctx`,不会明显提升吞吐,反而可能带来额外开销 | ||
| 234 | + | ||
| 235 | +所以本轮不是“显存太保守”,而是: | ||
| 236 | + | ||
| 237 | +- 可 offload 的权重已经基本 offload 完了 | ||
| 238 | +- 真正拖慢响应的是 **逐 doc 顺序推理** 这一后端实现路径 | ||
| 239 | + | ||
| 240 | +--- | ||
| 241 | + | ||
| 242 | +## 8. 生产建议 | ||
| 243 | + | ||
| 244 | +### 8.1 当前建议 | ||
| 245 | + | ||
| 246 | +保留以下参数: | ||
| 247 | + | ||
| 248 | +```yaml | ||
| 249 | +n_ctx: 512 | ||
| 250 | +n_batch: 512 | ||
| 251 | +n_ubatch: 512 | ||
| 252 | +n_gpu_layers: 999 | ||
| 253 | +n_threads: 2 | ||
| 254 | +n_threads_batch: 4 | ||
| 255 | +flash_attn: true | ||
| 256 | +offload_kqv: true | ||
| 257 | +``` | ||
| 258 | + | ||
| 259 | +### 8.2 如果还嫌慢 | ||
| 260 | + | ||
| 261 | +优先级建议: | ||
| 262 | + | ||
| 263 | +1. 缩小 `rerank_window` | ||
| 264 | +2. 减少传入 doc 数 | ||
| 265 | +3. 若业务允许,切换到更适合高吞吐的后端 | ||
| 266 | + | ||
| 267 | +原因: | ||
| 268 | + | ||
| 269 | +- 当前 GGUF 后端是本地单进程、逐 doc 打分 | ||
| 270 | +- 对长列表重排,它天然不如 vLLM / 云端 rerank API 擅长吞吐 | ||
| 271 | + | ||
| 272 | +--- | ||
| 273 | + | ||
| 274 | +## 9. 本轮落地文件 | ||
| 275 | + | ||
| 276 | +- `config/config.yaml` | ||
| 277 | +- `scripts/setup_reranker_venv.sh` | ||
| 278 | +- `scripts/start_reranker.sh` | ||
| 279 | +- `scripts/benchmark_reranker_gguf_local.py` | ||
| 280 | +- `reranker/GGUF_INSTALL_AND_TUNING.md` |
reranker/README.md
| 1 | # Reranker 模块 | 1 | # Reranker 模块 |
| 2 | 2 | ||
| 3 | -**请求示例**见 `docs/QUICKSTART.md` §3.5。扩展规范见 `docs/DEVELOPER_GUIDE.md` §7。部署与调优实战见 `reranker/DEPLOYMENT_AND_TUNING.md`。 | 3 | +**请求示例**见 `docs/QUICKSTART.md` §3.5。扩展规范见 `docs/DEVELOPER_GUIDE.md` §7。部署与调优实战见 `reranker/DEPLOYMENT_AND_TUNING.md`。`ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF` 的专项接入与调优结论见 `reranker/GGUF_0_6B_INSTALL_AND_TUNING.md`。 |
| 4 | 4 | ||
| 5 | --- | 5 | --- |
| 6 | 6 | ||
| 7 | Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwen3-vLLM、Qwen3-Transformers、Qwen3-GGUF、DashScope 云重排)。调用方通过 HTTP 访问,不关心具体后端。 | 7 | Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwen3-vLLM、Qwen3-Transformers、Qwen3-GGUF、DashScope 云重排)。调用方通过 HTTP 访问,不关心具体后端。 |
| 8 | 8 | ||
| 9 | **特性** | 9 | **特性** |
| 10 | -- 多后端:`qwen3_vllm`、`qwen3_transformers`、`qwen3_gguf`(Qwen3-Reranker-4B GGUF + llama.cpp)、`bge`(兼容保留) | 10 | +- 多后端:`qwen3_vllm`、`qwen3_transformers`、`qwen3_gguf`(Qwen3-Reranker-4B GGUF + llama.cpp)、`qwen3_gguf_06b`(Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp)、`bge`(兼容保留) |
| 11 | - 云后端:`dashscope_rerank`(调用 DashScope `/compatible-api/v1/reranks`,支持按地域切换 endpoint) | 11 | - 云后端:`dashscope_rerank`(调用 DashScope `/compatible-api/v1/reranks`,支持按地域切换 endpoint) |
| 12 | - 统一配置:`config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<name>` | 12 | - 统一配置:`config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<name>` |
| 13 | - 文档去重、分数与输入顺序一致、FP16/GPU 支持(视后端) | 13 | - 文档去重、分数与输入顺序一致、FP16/GPU 支持(视后端) |
| @@ -19,7 +19,7 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe | @@ -19,7 +19,7 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe | ||
| 19 | - `backends/bge.py`:BGE 后端 | 19 | - `backends/bge.py`:BGE 后端 |
| 20 | - `backends/qwen3_vllm.py`:Qwen3-Reranker-0.6B + vLLM 后端 | 20 | - `backends/qwen3_vllm.py`:Qwen3-Reranker-0.6B + vLLM 后端 |
| 21 | - `backends/qwen3_transformers.py`:Qwen3-Reranker-0.6B 纯 Transformers 后端(官方 Usage 方式) | 21 | - `backends/qwen3_transformers.py`:Qwen3-Reranker-0.6B 纯 Transformers 后端(官方 Usage 方式) |
| 22 | - - `backends/qwen3_gguf.py`:Qwen3-Reranker-4B GGUF + llama.cpp 后端 | 22 | + - `backends/qwen3_gguf.py`:Qwen3-Reranker GGUF + llama.cpp 后端(支持 `qwen3_gguf` / `qwen3_gguf_06b`) |
| 23 | - `backends/dashscope_rerank.py`:DashScope 云重排后端(HTTP 调用) | 23 | - `backends/dashscope_rerank.py`:DashScope 云重排后端(HTTP 调用) |
| 24 | - `reranker/bge_reranker.py`:BGE 核心推理(被 bge 后端封装) | 24 | - `reranker/bge_reranker.py`:BGE 核心推理(被 bge 后端封装) |
| 25 | - `reranker/config.py`:服务端口、MAX_DOCS、NORMALIZE 等(后端参数在 config.yaml) | 25 | - `reranker/config.py`:服务端口、MAX_DOCS、NORMALIZE 等(后端参数在 config.yaml) |
| @@ -32,11 +32,12 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe | @@ -32,11 +32,12 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe | ||
| 32 | - 现在按 backend 使用独立 venv: | 32 | - 现在按 backend 使用独立 venv: |
| 33 | - `qwen3_vllm` -> `.venv-reranker` | 33 | - `qwen3_vllm` -> `.venv-reranker` |
| 34 | - `qwen3_gguf` -> `.venv-reranker-gguf` | 34 | - `qwen3_gguf` -> `.venv-reranker-gguf` |
| 35 | + - `qwen3_gguf_06b` -> `.venv-reranker-gguf-06b` | ||
| 35 | - `qwen3_transformers` -> `.venv-reranker-transformers` | 36 | - `qwen3_transformers` -> `.venv-reranker-transformers` |
| 36 | - `bge` -> `.venv-reranker-bge` | 37 | - `bge` -> `.venv-reranker-bge` |
| 37 | - `dashscope_rerank` -> `.venv-reranker-dashscope` | 38 | - `dashscope_rerank` -> `.venv-reranker-dashscope` |
| 38 | ```bash | 39 | ```bash |
| 39 | - ./scripts/setup_reranker_venv.sh qwen3_gguf | 40 | + ./scripts/setup_reranker_venv.sh qwen3_gguf_06b |
| 40 | ``` | 41 | ``` |
| 41 | CUDA 构建建议: | 42 | CUDA 构建建议: |
| 42 | ```bash | 43 | ```bash |
| @@ -48,7 +49,7 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe | @@ -48,7 +49,7 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe | ||
| 48 | ``` | 49 | ``` |
| 49 | 50 | ||
| 50 | ## 配置 | 51 | ## 配置 |
| 51 | -- **后端选择**:`config/config.yaml` 中 `services.rerank.backend`(`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `bge` | `dashscope_rerank`),或环境变量 `RERANK_BACKEND`。 | 52 | +- **后端选择**:`config/config.yaml` 中 `services.rerank.backend`(`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`),或环境变量 `RERANK_BACKEND`。 |
| 52 | - **后端参数**:`services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`,例如: | 53 | - **后端参数**:`services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`,例如: |
| 53 | 54 | ||
| 54 | ```yaml | 55 | ```yaml |
| @@ -96,6 +97,20 @@ services: | @@ -96,6 +97,20 @@ services: | ||
| 96 | infer_batch_size: 8 | 97 | infer_batch_size: 8 |
| 97 | sort_by_doc_length: true | 98 | sort_by_doc_length: true |
| 98 | length_sort_mode: "char" | 99 | length_sort_mode: "char" |
| 100 | + qwen3_gguf_06b: | ||
| 101 | + repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" | ||
| 102 | + filename: "qwen3-reranker-0.6b-q8_0.gguf" | ||
| 103 | + local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf" | ||
| 104 | + cache_dir: "./model_cache" | ||
| 105 | + instruction: "Rank products by query with category & style match prioritized" | ||
| 106 | + n_ctx: 256 | ||
| 107 | + n_batch: 256 | ||
| 108 | + n_ubatch: 256 | ||
| 109 | + n_gpu_layers: 999 | ||
| 110 | + infer_batch_size: 32 | ||
| 111 | + sort_by_doc_length: true | ||
| 112 | + length_sort_mode: "char" | ||
| 113 | + reuse_query_state: false | ||
| 99 | dashscope_rerank: | 114 | dashscope_rerank: |
| 100 | model_name: "qwen3-rerank" | 115 | model_name: "qwen3-rerank" |
| 101 | endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks" | 116 | endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks" |
| @@ -153,7 +168,7 @@ Content-Type: application/json | @@ -153,7 +168,7 @@ Content-Type: application/json | ||
| 153 | ``` | 168 | ``` |
| 154 | 169 | ||
| 155 | `top_n` 为可选字段: | 170 | `top_n` 为可选字段: |
| 156 | -- 对本地后端(`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `bge`)通常会忽略,仍返回全量分数。 | 171 | +- 对本地后端(`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`)通常会忽略,仍返回全量分数。 |
| 157 | - 对 `dashscope_rerank` 可用于控制云端返回的候选量,建议设置为 `page+size`(例如分页 `from=20,size=10` 时传 `30`)。 | 172 | - 对 `dashscope_rerank` 可用于控制云端返回的候选量,建议设置为 `page+size`(例如分页 `from=20,size=10` 时传 `30`)。 |
| 158 | 173 | ||
| 159 | Response: | 174 | Response: |
| @@ -192,3 +207,4 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info | @@ -192,3 +207,4 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info | ||
| 192 | - **Qwen3-vLLM**:参考 [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B),需 GPU 与较多显存;与 BGE 相比适合长文本、高吞吐场景(vLLM 前缀缓存)。 | 207 | - **Qwen3-vLLM**:参考 [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B),需 GPU 与较多显存;与 BGE 相比适合长文本、高吞吐场景(vLLM 前缀缓存)。 |
| 193 | - **Qwen3-Transformers**:官方 Transformers Usage 方式,无需 vLLM;适合 CPU 或小显存。默认 `attn_implementation: "sdpa"`;若已安装 `flash_attn` 可设 `flash_attention_2`(未安装时服务会自动回退到 sdpa)。 | 208 | - **Qwen3-Transformers**:官方 Transformers Usage 方式,无需 vLLM;适合 CPU 或小显存。默认 `attn_implementation: "sdpa"`;若已安装 `flash_attn` 可设 `flash_attention_2`(未安装时服务会自动回退到 sdpa)。 |
| 194 | - **Qwen3-GGUF**:参考 [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF)。单卡 T4 且仅剩约 `4.8~6GB` 显存时,推荐 `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true` 起步;若启动 OOM,优先把 `n_gpu_layers` 下调到 `20`,再把 `n_ctx` 下调到 `320`。`infer_batch_size` 在 GGUF 后端是服务侧 work chunk,大多不如 `n_gpu_layers` / `n_ctx` 关键。 | 209 | - **Qwen3-GGUF**:参考 [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF)。单卡 T4 且仅剩约 `4.8~6GB` 显存时,推荐 `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true` 起步;若启动 OOM,优先把 `n_gpu_layers` 下调到 `20`,再把 `n_ctx` 下调到 `320`。`infer_batch_size` 在 GGUF 后端是服务侧 work chunk,大多不如 `n_gpu_layers` / `n_ctx` 关键。 |
| 210 | +- **Qwen3-GGUF-0.6B**:参考 [ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF](https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF)。它的优点是权重小、显存占用低,单进程实测约 `0.9~1.1 GiB`;但在当前 llama.cpp 串行打分接法下,`1 query + 400 titles` 的实测延迟仍约 `265s`。因此它更适合低显存功能后备,不适合作为在线低延迟主 reranker。 |
reranker/backends/__init__.py
| @@ -48,12 +48,19 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc | @@ -48,12 +48,19 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc | ||
| 48 | return Qwen3TransformersRerankerBackend(config) | 48 | return Qwen3TransformersRerankerBackend(config) |
| 49 | if name == "qwen3_gguf": | 49 | if name == "qwen3_gguf": |
| 50 | from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend | 50 | from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend |
| 51 | - return Qwen3GGUFRerankerBackend(config) | 51 | + gguf_config = dict(config or {}) |
| 52 | + gguf_config.setdefault("_backend_name", "qwen3_gguf") | ||
| 53 | + return Qwen3GGUFRerankerBackend(gguf_config) | ||
| 54 | + if name == "qwen3_gguf_06b": | ||
| 55 | + from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend | ||
| 56 | + gguf_config = dict(config or {}) | ||
| 57 | + gguf_config.setdefault("_backend_name", "qwen3_gguf_06b") | ||
| 58 | + return Qwen3GGUFRerankerBackend(gguf_config) | ||
| 52 | if name == "dashscope_rerank": | 59 | if name == "dashscope_rerank": |
| 53 | from reranker.backends.dashscope_rerank import DashScopeRerankBackend | 60 | from reranker.backends.dashscope_rerank import DashScopeRerankBackend |
| 54 | return DashScopeRerankBackend(config) | 61 | return DashScopeRerankBackend(config) |
| 55 | raise ValueError( | 62 | raise ValueError( |
| 56 | - f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, dashscope_rerank" | 63 | + f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank" |
| 57 | ) | 64 | ) |
| 58 | 65 | ||
| 59 | 66 |
reranker/backends/qwen3_gguf.py
| 1 | """ | 1 | """ |
| 2 | -Qwen3-Reranker-4B GGUF backend using llama-cpp-python. | 2 | +Qwen3-Reranker GGUF backend using llama-cpp-python. |
| 3 | 3 | ||
| 4 | Reference: | 4 | Reference: |
| 5 | - https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF | 5 | - https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF |
| 6 | - https://huggingface.co/Qwen/Qwen3-Reranker-4B | 6 | - https://huggingface.co/Qwen/Qwen3-Reranker-4B |
| 7 | +- https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF | ||
| 8 | +- https://huggingface.co/Qwen/Qwen3-Reranker-0.6B | ||
| 7 | """ | 9 | """ |
| 8 | 10 | ||
| 9 | from __future__ import annotations | 11 | from __future__ import annotations |
| @@ -13,12 +15,27 @@ import math | @@ -13,12 +15,27 @@ import math | ||
| 13 | import os | 15 | import os |
| 14 | import threading | 16 | import threading |
| 15 | import time | 17 | import time |
| 18 | +from pathlib import Path | ||
| 16 | from typing import Any, Dict, List, Tuple | 19 | from typing import Any, Dict, List, Tuple |
| 17 | 20 | ||
| 18 | 21 | ||
| 19 | logger = logging.getLogger("reranker.backends.qwen3_gguf") | 22 | logger = logging.getLogger("reranker.backends.qwen3_gguf") |
| 20 | 23 | ||
| 21 | 24 | ||
| 25 | +_BACKEND_DEFAULTS: Dict[str, Dict[str, str]] = { | ||
| 26 | + "qwen3_gguf": { | ||
| 27 | + "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF", | ||
| 28 | + "filename": "*Q8_0.gguf", | ||
| 29 | + "local_dir": "./models/reranker/qwen3-reranker-4b-gguf", | ||
| 30 | + }, | ||
| 31 | + "qwen3_gguf_06b": { | ||
| 32 | + "repo_id": "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF", | ||
| 33 | + "filename": "qwen3-reranker-0.6b-q8_0.gguf", | ||
| 34 | + "local_dir": "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf", | ||
| 35 | + }, | ||
| 36 | +} | ||
| 37 | + | ||
| 38 | + | ||
| 22 | def deduplicate_with_positions(texts: List[str]) -> Tuple[List[str], List[int]]: | 39 | def deduplicate_with_positions(texts: List[str]) -> Tuple[List[str], List[int]]: |
| 23 | """Deduplicate texts globally while preserving first-seen order.""" | 40 | """Deduplicate texts globally while preserving first-seen order.""" |
| 24 | unique_texts: List[str] = [] | 41 | unique_texts: List[str] = [] |
| @@ -46,21 +63,21 @@ def _format_instruction(instruction: str, query: str, doc: str) -> str: | @@ -46,21 +63,21 @@ def _format_instruction(instruction: str, query: str, doc: str) -> str: | ||
| 46 | 63 | ||
| 47 | class Qwen3GGUFRerankerBackend: | 64 | class Qwen3GGUFRerankerBackend: |
| 48 | """ | 65 | """ |
| 49 | - Qwen3-Reranker-4B GGUF backend using llama.cpp through llama-cpp-python. | 66 | + Qwen3-Reranker GGUF backend using llama.cpp through llama-cpp-python. |
| 50 | 67 | ||
| 51 | - Tuned for short-query / short-doc reranking on a memory-constrained single T4. | ||
| 52 | - Config from services.rerank.backends.qwen3_gguf. | 68 | + Tuned for short-query / short-doc reranking on a single GPU. |
| 69 | + Config from services.rerank.backends.<backend_name>. | ||
| 53 | """ | 70 | """ |
| 54 | 71 | ||
| 55 | def __init__(self, config: Dict[str, Any]) -> None: | 72 | def __init__(self, config: Dict[str, Any]) -> None: |
| 56 | self._config = config or {} | 73 | self._config = config or {} |
| 57 | - self._repo_id = str( | ||
| 58 | - self._config.get("repo_id") or "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" | ||
| 59 | - ).strip() | ||
| 60 | - self._filename = str(self._config.get("filename") or "*Q8_0.gguf").strip() | 74 | + self._backend_name = str(self._config.get("_backend_name") or "qwen3_gguf").strip() |
| 75 | + defaults = _BACKEND_DEFAULTS.get(self._backend_name, _BACKEND_DEFAULTS["qwen3_gguf"]) | ||
| 76 | + self._repo_id = str(self._config.get("repo_id") or defaults["repo_id"]).strip() | ||
| 77 | + self._filename = str(self._config.get("filename") or defaults["filename"]).strip() | ||
| 61 | self._model_path = str(self._config.get("model_path") or "").strip() | 78 | self._model_path = str(self._config.get("model_path") or "").strip() |
| 62 | self._cache_dir = str(self._config.get("cache_dir") or "").strip() or None | 79 | self._cache_dir = str(self._config.get("cache_dir") or "").strip() or None |
| 63 | - self._local_dir = str(self._config.get("local_dir") or "").strip() or None | 80 | + self._local_dir = str(self._config.get("local_dir") or defaults["local_dir"]).strip() or None |
| 64 | self._instruction = str( | 81 | self._instruction = str( |
| 65 | self._config.get("instruction") | 82 | self._config.get("instruction") |
| 66 | or "Rank products by query with category & style match prioritized" | 83 | or "Rank products by query with category & style match prioritized" |
| @@ -79,6 +96,7 @@ class Qwen3GGUFRerankerBackend: | @@ -79,6 +96,7 @@ class Qwen3GGUFRerankerBackend: | ||
| 79 | "on", | 96 | "on", |
| 80 | } | 97 | } |
| 81 | self._length_sort_mode = str(self._config.get("length_sort_mode") or "char").strip().lower() | 98 | self._length_sort_mode = str(self._config.get("length_sort_mode") or "char").strip().lower() |
| 99 | + self._reuse_query_state = bool(self._config.get("reuse_query_state", False)) | ||
| 82 | 100 | ||
| 83 | n_ctx = int(self._config.get("n_ctx", self._config.get("max_model_len", 384))) | 101 | n_ctx = int(self._config.get("n_ctx", self._config.get("max_model_len", 384))) |
| 84 | n_batch = int(self._config.get("n_batch", min(n_ctx, 384))) | 102 | n_batch = int(self._config.get("n_batch", min(n_ctx, 384))) |
| @@ -105,8 +123,9 @@ class Qwen3GGUFRerankerBackend: | @@ -105,8 +123,9 @@ class Qwen3GGUFRerankerBackend: | ||
| 105 | from llama_cpp import Llama | 123 | from llama_cpp import Llama |
| 106 | except Exception as exc: # pragma: no cover - depends on optional dependency | 124 | except Exception as exc: # pragma: no cover - depends on optional dependency |
| 107 | raise RuntimeError( | 125 | raise RuntimeError( |
| 108 | - "qwen3_gguf backend requires llama-cpp-python. " | ||
| 109 | - "Install the qwen3_gguf backend venv first via scripts/setup_reranker_venv.sh qwen3_gguf." | 126 | + f"{self._backend_name} backend requires llama-cpp-python. " |
| 127 | + f"Install the {self._backend_name} backend venv first via " | ||
| 128 | + f"scripts/setup_reranker_venv.sh {self._backend_name}." | ||
| 110 | ) from exc | 129 | ) from exc |
| 111 | 130 | ||
| 112 | self._llama_class = Llama | 131 | self._llama_class = Llama |
| @@ -118,7 +137,8 @@ class Qwen3GGUFRerankerBackend: | @@ -118,7 +137,8 @@ class Qwen3GGUFRerankerBackend: | ||
| 118 | self._infer_lock = threading.Lock() | 137 | self._infer_lock = threading.Lock() |
| 119 | 138 | ||
| 120 | logger.info( | 139 | logger.info( |
| 121 | - "[Qwen3_GGUF] Loading model repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s", | 140 | + "[Qwen3_GGUF] Loading backend=%s repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s reuse_query_state=%s", |
| 141 | + self._backend_name, | ||
| 122 | self._repo_id, | 142 | self._repo_id, |
| 123 | self._filename, | 143 | self._filename, |
| 124 | self._model_path or None, | 144 | self._model_path or None, |
| @@ -128,6 +148,7 @@ class Qwen3GGUFRerankerBackend: | @@ -128,6 +148,7 @@ class Qwen3GGUFRerankerBackend: | ||
| 128 | n_gpu_layers, | 148 | n_gpu_layers, |
| 129 | flash_attn, | 149 | flash_attn, |
| 130 | offload_kqv, | 150 | offload_kqv, |
| 151 | + self._reuse_query_state, | ||
| 131 | ) | 152 | ) |
| 132 | 153 | ||
| 133 | llm_kwargs = { | 154 | llm_kwargs = { |
| @@ -158,6 +179,7 @@ class Qwen3GGUFRerankerBackend: | @@ -158,6 +179,7 @@ class Qwen3GGUFRerankerBackend: | ||
| 158 | self._suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n" | 179 | self._suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n" |
| 159 | self._prefix_tokens = self._tokenize(self._prefix, special=True) | 180 | self._prefix_tokens = self._tokenize(self._prefix, special=True) |
| 160 | self._suffix_tokens = self._tokenize(self._suffix, special=True) | 181 | self._suffix_tokens = self._tokenize(self._suffix, special=True) |
| 182 | + self._request_prefix_template = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: " | ||
| 161 | self._effective_max_len = self._n_ctx - len(self._prefix_tokens) - len(self._suffix_tokens) | 183 | self._effective_max_len = self._n_ctx - len(self._prefix_tokens) - len(self._suffix_tokens) |
| 162 | if self._effective_max_len <= 16: | 184 | if self._effective_max_len <= 16: |
| 163 | raise RuntimeError( | 185 | raise RuntimeError( |
| @@ -171,7 +193,8 @@ class Qwen3GGUFRerankerBackend: | @@ -171,7 +193,8 @@ class Qwen3GGUFRerankerBackend: | ||
| 171 | self._warmup() | 193 | self._warmup() |
| 172 | 194 | ||
| 173 | logger.info( | 195 | logger.info( |
| 174 | - "[Qwen3_GGUF] Model ready | model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s", | 196 | + "[Qwen3_GGUF] Model ready | backend=%s model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s", |
| 197 | + self._backend_name, | ||
| 175 | self._model_name, | 198 | self._model_name, |
| 176 | self._effective_max_len, | 199 | self._effective_max_len, |
| 177 | self._infer_batch_size, | 200 | self._infer_batch_size, |
| @@ -181,6 +204,14 @@ class Qwen3GGUFRerankerBackend: | @@ -181,6 +204,14 @@ class Qwen3GGUFRerankerBackend: | ||
| 181 | def _load_model(self, llm_kwargs: Dict[str, Any]): | 204 | def _load_model(self, llm_kwargs: Dict[str, Any]): |
| 182 | if self._model_path: | 205 | if self._model_path: |
| 183 | return self._llama_class(model_path=self._model_path, **llm_kwargs) | 206 | return self._llama_class(model_path=self._model_path, **llm_kwargs) |
| 207 | + if self._local_dir: | ||
| 208 | + matches = sorted( | ||
| 209 | + path for path in Path(self._local_dir).glob(self._filename) if path.is_file() | ||
| 210 | + ) | ||
| 211 | + if matches: | ||
| 212 | + local_model_path = str(matches[0].resolve()) | ||
| 213 | + logger.info("[Qwen3_GGUF] Using local GGUF file: %s", local_model_path) | ||
| 214 | + return self._llama_class(model_path=local_model_path, **llm_kwargs) | ||
| 184 | return self._llama_class.from_pretrained( | 215 | return self._llama_class.from_pretrained( |
| 185 | repo_id=self._repo_id, | 216 | repo_id=self._repo_id, |
| 186 | filename=self._filename, | 217 | filename=self._filename, |
| @@ -212,6 +243,13 @@ class Qwen3GGUFRerankerBackend: | @@ -212,6 +243,13 @@ class Qwen3GGUFRerankerBackend: | ||
| 212 | except Exception as exc: # pragma: no cover - defensive | 243 | except Exception as exc: # pragma: no cover - defensive |
| 213 | logger.warning("[Qwen3_GGUF] Warmup failed: %s", exc) | 244 | logger.warning("[Qwen3_GGUF] Warmup failed: %s", exc) |
| 214 | 245 | ||
| 246 | + def _build_request_prefix_tokens(self, query: str) -> List[int]: | ||
| 247 | + request_prefix = self._request_prefix_template.format( | ||
| 248 | + instruction=self._instruction, | ||
| 249 | + query=query, | ||
| 250 | + ) | ||
| 251 | + return self._tokenize(request_prefix, special=False) | ||
| 252 | + | ||
| 215 | def _build_prompt_tokens(self, query: str, doc: str) -> List[int]: | 253 | def _build_prompt_tokens(self, query: str, doc: str) -> List[int]: |
| 216 | pair = _format_instruction(self._instruction, query, doc) | 254 | pair = _format_instruction(self._instruction, query, doc) |
| 217 | pair_tokens = self._tokenize(pair, special=False) | 255 | pair_tokens = self._tokenize(pair, special=False) |
| @@ -235,6 +273,36 @@ class Qwen3GGUFRerankerBackend: | @@ -235,6 +273,36 @@ class Qwen3GGUFRerankerBackend: | ||
| 235 | false_exp = math.exp(false_logit - max_logit) | 273 | false_exp = math.exp(false_logit - max_logit) |
| 236 | return float(true_exp / (true_exp + false_exp)) | 274 | return float(true_exp / (true_exp + false_exp)) |
| 237 | 275 | ||
| 276 | + def _supports_query_state_reuse(self) -> bool: | ||
| 277 | + return ( | ||
| 278 | + self._reuse_query_state | ||
| 279 | + and hasattr(self._llm, "save_state") | ||
| 280 | + and hasattr(self._llm, "load_state") | ||
| 281 | + ) | ||
| 282 | + | ||
| 283 | + def _build_query_state_locked(self, query: str): | ||
| 284 | + request_prefix_tokens = self._build_request_prefix_tokens(query) | ||
| 285 | + max_doc_tokens = self._effective_max_len - len(request_prefix_tokens) | ||
| 286 | + if max_doc_tokens <= 0: | ||
| 287 | + return None, 0 | ||
| 288 | + self._llm.reset() | ||
| 289 | + self._llm.eval(self._prefix_tokens + request_prefix_tokens) | ||
| 290 | + return self._llm.save_state(), max_doc_tokens | ||
| 291 | + | ||
| 292 | + def _score_doc_with_state_locked(self, state, doc_tokens: List[int], max_doc_tokens: int) -> float: | ||
| 293 | + self._llm.load_state(state) | ||
| 294 | + self._llm.eval(doc_tokens[:max_doc_tokens] + self._suffix_tokens) | ||
| 295 | + logits = self._llm.eval_logits | ||
| 296 | + if not logits: | ||
| 297 | + raise RuntimeError("llama.cpp returned empty logits") | ||
| 298 | + final_logits = list(logits[-1]) | ||
| 299 | + true_logit = float(final_logits[self._true_token]) | ||
| 300 | + false_logit = float(final_logits[self._false_token]) | ||
| 301 | + max_logit = max(true_logit, false_logit) | ||
| 302 | + true_exp = math.exp(true_logit - max_logit) | ||
| 303 | + false_exp = math.exp(false_logit - max_logit) | ||
| 304 | + return float(true_exp / (true_exp + false_exp)) | ||
| 305 | + | ||
| 238 | def _estimate_doc_lengths(self, docs: List[str]) -> List[int]: | 306 | def _estimate_doc_lengths(self, docs: List[str]) -> List[int]: |
| 239 | if self._length_sort_mode == "token": | 307 | if self._length_sort_mode == "token": |
| 240 | return [len(self._tokenize(text, special=False)) for text in docs] | 308 | return [len(self._tokenize(text, special=False)) for text in docs] |
| @@ -269,7 +337,7 @@ class Qwen3GGUFRerankerBackend: | @@ -269,7 +337,7 @@ class Qwen3GGUFRerankerBackend: | ||
| 269 | "dedup_ratio": 0.0, | 337 | "dedup_ratio": 0.0, |
| 270 | "elapsed_ms": round(elapsed_ms, 3), | 338 | "elapsed_ms": round(elapsed_ms, 3), |
| 271 | "model": self._model_name, | 339 | "model": self._model_name, |
| 272 | - "backend": "qwen3_gguf", | 340 | + "backend": self._backend_name, |
| 273 | "normalize": normalize, | 341 | "normalize": normalize, |
| 274 | "infer_batch_size": self._infer_batch_size, | 342 | "infer_batch_size": self._infer_batch_size, |
| 275 | "inference_batches": 0, | 343 | "inference_batches": 0, |
| @@ -289,14 +357,26 @@ class Qwen3GGUFRerankerBackend: | @@ -289,14 +357,26 @@ class Qwen3GGUFRerankerBackend: | ||
| 289 | order = sorted(order, key=lambda i: lengths[i]) | 357 | order = sorted(order, key=lambda i: lengths[i]) |
| 290 | 358 | ||
| 291 | unique_scores: List[float] = [0.0] * len(unique_texts) | 359 | unique_scores: List[float] = [0.0] * len(unique_texts) |
| 360 | + unique_doc_tokens = [self._tokenize(text, special=False) for text in unique_texts] | ||
| 292 | inference_batches = 0 | 361 | inference_batches = 0 |
| 293 | - for start in range(0, len(order), self._infer_batch_size): | ||
| 294 | - batch_indices = order[start : start + self._infer_batch_size] | ||
| 295 | - inference_batches += 1 | ||
| 296 | - for idx in batch_indices: | ||
| 297 | - prompt = self._build_prompt_tokens(query, unique_texts[idx]) | ||
| 298 | - with self._infer_lock: | ||
| 299 | - unique_scores[idx] = self._score_prompt(prompt) | 362 | + with self._infer_lock: |
| 363 | + query_state = None | ||
| 364 | + max_doc_tokens = self._effective_max_len | ||
| 365 | + if self._supports_query_state_reuse(): | ||
| 366 | + query_state, max_doc_tokens = self._build_query_state_locked(query) | ||
| 367 | + for start in range(0, len(order), self._infer_batch_size): | ||
| 368 | + batch_indices = order[start : start + self._infer_batch_size] | ||
| 369 | + inference_batches += 1 | ||
| 370 | + for idx in batch_indices: | ||
| 371 | + if query_state is not None: | ||
| 372 | + unique_scores[idx] = self._score_doc_with_state_locked( | ||
| 373 | + query_state, | ||
| 374 | + unique_doc_tokens[idx], | ||
| 375 | + max_doc_tokens, | ||
| 376 | + ) | ||
| 377 | + else: | ||
| 378 | + prompt = self._build_prompt_tokens(query, unique_texts[idx]) | ||
| 379 | + unique_scores[idx] = self._score_prompt(prompt) | ||
| 300 | 380 | ||
| 301 | for (orig_idx, _), unique_idx in zip(indexed, position_to_unique): | 381 | for (orig_idx, _), unique_idx in zip(indexed, position_to_unique): |
| 302 | output_scores[orig_idx] = float(unique_scores[unique_idx]) | 382 | output_scores[orig_idx] = float(unique_scores[unique_idx]) |
| @@ -313,7 +393,7 @@ class Qwen3GGUFRerankerBackend: | @@ -313,7 +393,7 @@ class Qwen3GGUFRerankerBackend: | ||
| 313 | "dedup_ratio": round(dedup_ratio, 4), | 393 | "dedup_ratio": round(dedup_ratio, 4), |
| 314 | "elapsed_ms": round(elapsed_ms, 3), | 394 | "elapsed_ms": round(elapsed_ms, 3), |
| 315 | "model": self._model_name, | 395 | "model": self._model_name, |
| 316 | - "backend": "qwen3_gguf", | 396 | + "backend": self._backend_name, |
| 317 | "normalize": normalize, | 397 | "normalize": normalize, |
| 318 | "infer_batch_size": self._infer_batch_size, | 398 | "infer_batch_size": self._infer_batch_size, |
| 319 | "inference_batches": inference_batches, | 399 | "inference_batches": inference_batches, |
| @@ -323,5 +403,6 @@ class Qwen3GGUFRerankerBackend: | @@ -323,5 +403,6 @@ class Qwen3GGUFRerankerBackend: | ||
| 323 | "n_batch": self._n_batch, | 403 | "n_batch": self._n_batch, |
| 324 | "n_ubatch": self._n_ubatch, | 404 | "n_ubatch": self._n_ubatch, |
| 325 | "n_gpu_layers": self._n_gpu_layers, | 405 | "n_gpu_layers": self._n_gpu_layers, |
| 406 | + "reuse_query_state": query_state is not None, | ||
| 326 | } | 407 | } |
| 327 | return output_scores, meta | 408 | return output_scores, meta |
reranker/server.py
| @@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional | @@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional | ||
| 7 | Response: { "scores": [float], "meta": {...} } | 7 | Response: { "scores": [float], "meta": {...} } |
| 8 | 8 | ||
| 9 | Backend selected via config: services.rerank.backend | 9 | Backend selected via config: services.rerank.backend |
| 10 | -(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank), env RERANK_BACKEND. | 10 | +(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND. |
| 11 | """ | 11 | """ |
| 12 | 12 | ||
| 13 | import logging | 13 | import logging |
| @@ -0,0 +1,198 @@ | @@ -0,0 +1,198 @@ | ||
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +""" | ||
| 3 | +Local tuning probe for GGUF reranker backends. | ||
| 4 | + | ||
| 5 | +Runs the backend directly in a fresh process per config to measure: | ||
| 6 | +- load time | ||
| 7 | +- GPU memory used by this process | ||
| 8 | +- single-request rerank latency | ||
| 9 | + | ||
| 10 | +Example: | ||
| 11 | + ./.venv-reranker-gguf/bin/python scripts/benchmark_reranker_gguf_local.py | ||
| 12 | + ./.venv-reranker-gguf-06b/bin/python scripts/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400 | ||
| 13 | +""" | ||
| 14 | + | ||
| 15 | +from __future__ import annotations | ||
| 16 | + | ||
| 17 | +import argparse | ||
| 18 | +import json | ||
| 19 | +import os | ||
| 20 | +import random | ||
| 21 | +import statistics | ||
| 22 | +import subprocess | ||
| 23 | +import sys | ||
| 24 | +import time | ||
| 25 | +from pathlib import Path | ||
| 26 | +from typing import Any | ||
| 27 | + | ||
| 28 | + | ||
| 29 | +DEFAULT_TITLES = Path("/home/ubuntu/rerank_test/titles.1.8w") | ||
| 30 | + | ||
| 31 | + | ||
| 32 | +def load_titles(path: Path) -> list[str]: | ||
| 33 | + items: list[str] = [] | ||
| 34 | + with path.open(encoding="utf-8", errors="replace") as fh: | ||
| 35 | + for line in fh: | ||
| 36 | + text = line.strip() | ||
| 37 | + if text: | ||
| 38 | + items.append(text) | ||
| 39 | + return items | ||
| 40 | + | ||
| 41 | + | ||
| 42 | +def gpu_mem_for_pid(pid: int) -> int: | ||
| 43 | + try: | ||
| 44 | + out = subprocess.check_output( | ||
| 45 | + [ | ||
| 46 | + "nvidia-smi", | ||
| 47 | + "--query-compute-apps=pid,used_gpu_memory", | ||
| 48 | + "--format=csv,noheader,nounits", | ||
| 49 | + ], | ||
| 50 | + text=True, | ||
| 51 | + ) | ||
| 52 | + except Exception: | ||
| 53 | + return -1 | ||
| 54 | + for raw in out.splitlines(): | ||
| 55 | + parts = [p.strip() for p in raw.split(",")] | ||
| 56 | + if len(parts) != 2: | ||
| 57 | + continue | ||
| 58 | + try: | ||
| 59 | + row_pid = int(parts[0]) | ||
| 60 | + row_mem = int(parts[1]) | ||
| 61 | + except ValueError: | ||
| 62 | + continue | ||
| 63 | + if row_pid == pid: | ||
| 64 | + return row_mem | ||
| 65 | + return -1 | ||
| 66 | + | ||
| 67 | + | ||
| 68 | +def main() -> int: | ||
| 69 | + parser = argparse.ArgumentParser() | ||
| 70 | + parser.add_argument("--backend-name", type=str, default="qwen3_gguf") | ||
| 71 | + parser.add_argument("--titles-file", type=Path, default=DEFAULT_TITLES) | ||
| 72 | + parser.add_argument("--query", type=str, default="白色oversized T-shirt") | ||
| 73 | + parser.add_argument("--docs", type=int, default=160) | ||
| 74 | + parser.add_argument("--repeat", type=int, default=1) | ||
| 75 | + parser.add_argument("--seed", type=int, default=42) | ||
| 76 | + parser.add_argument( | ||
| 77 | + "--configs-json", | ||
| 78 | + type=str, | ||
| 79 | + default="", | ||
| 80 | + help="JSON array of config objects; when omitted, uses built-in scan set.", | ||
| 81 | + ) | ||
| 82 | + args = parser.parse_args() | ||
| 83 | + | ||
| 84 | + if not args.titles_file.is_file(): | ||
| 85 | + print(f"missing titles file: {args.titles_file}", file=sys.stderr) | ||
| 86 | + return 2 | ||
| 87 | + | ||
| 88 | + titles = load_titles(args.titles_file) | ||
| 89 | + if len(titles) < args.docs: | ||
| 90 | + print(f"not enough titles: need {args.docs}, got {len(titles)}", file=sys.stderr) | ||
| 91 | + return 2 | ||
| 92 | + | ||
| 93 | + random.seed(args.seed) | ||
| 94 | + docs = random.sample(titles, args.docs) | ||
| 95 | + | ||
| 96 | + if args.configs_json: | ||
| 97 | + configs = json.loads(args.configs_json) | ||
| 98 | + elif args.backend_name == "qwen3_gguf_06b": | ||
| 99 | + configs = [ | ||
| 100 | + {"name": "gguf_06b_full_256", "n_ctx": 256, "n_batch": 256, "n_ubatch": 256, "n_gpu_layers": 999}, | ||
| 101 | + {"name": "gguf_06b_full_320", "n_ctx": 320, "n_batch": 320, "n_ubatch": 320, "n_gpu_layers": 999}, | ||
| 102 | + {"name": "gguf_06b_full_384", "n_ctx": 384, "n_batch": 384, "n_ubatch": 384, "n_gpu_layers": 999}, | ||
| 103 | + {"name": "gguf_06b_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999}, | ||
| 104 | + ] | ||
| 105 | + else: | ||
| 106 | + configs = [ | ||
| 107 | + {"name": "gguf_t4_24g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 24}, | ||
| 108 | + {"name": "gguf_t4_40g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 40}, | ||
| 109 | + {"name": "gguf_t4_full", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 999}, | ||
| 110 | + {"name": "gguf_t4_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 256, "n_gpu_layers": 999}, | ||
| 111 | + {"name": "gguf_t4_full_512_u512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999}, | ||
| 112 | + {"name": "gguf_t4_full_768", "n_ctx": 768, "n_batch": 768, "n_ubatch": 256, "n_gpu_layers": 999}, | ||
| 113 | + ] | ||
| 114 | + | ||
| 115 | + from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend | ||
| 116 | + | ||
| 117 | + default_cfg_by_backend: dict[str, dict[str, Any]] = { | ||
| 118 | + "qwen3_gguf": { | ||
| 119 | + "_backend_name": "qwen3_gguf", | ||
| 120 | + "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF", | ||
| 121 | + "filename": "*Q8_0.gguf", | ||
| 122 | + "local_dir": "./models/reranker/qwen3-reranker-4b-gguf", | ||
| 123 | + "infer_batch_size": 8, | ||
| 124 | + }, | ||
| 125 | + "qwen3_gguf_06b": { | ||
| 126 | + "_backend_name": "qwen3_gguf_06b", | ||
| 127 | + "repo_id": "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF", | ||
| 128 | + "filename": "qwen3-reranker-0.6b-q8_0.gguf", | ||
| 129 | + "local_dir": "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf", | ||
| 130 | + "infer_batch_size": 32, | ||
| 131 | + }, | ||
| 132 | + } | ||
| 133 | + if args.backend_name not in default_cfg_by_backend: | ||
| 134 | + print(f"unsupported backend: {args.backend_name}", file=sys.stderr) | ||
| 135 | + return 2 | ||
| 136 | + | ||
| 137 | + base_cfg: dict[str, Any] = { | ||
| 138 | + **default_cfg_by_backend[args.backend_name], | ||
| 139 | + "instruction": "Rank products by query with category & style match prioritized", | ||
| 140 | + "cache_dir": "./model_cache", | ||
| 141 | + "main_gpu": 0, | ||
| 142 | + "n_threads": 2, | ||
| 143 | + "n_threads_batch": 4, | ||
| 144 | + "flash_attn": True, | ||
| 145 | + "offload_kqv": True, | ||
| 146 | + "use_mmap": True, | ||
| 147 | + "use_mlock": False, | ||
| 148 | + "sort_by_doc_length": True, | ||
| 149 | + "length_sort_mode": "char", | ||
| 150 | + "enable_warmup": True, | ||
| 151 | + "verbose": False, | ||
| 152 | + "reuse_query_state": True, | ||
| 153 | + } | ||
| 154 | + | ||
| 155 | + all_results: list[dict[str, Any]] = [] | ||
| 156 | + for cfg in configs: | ||
| 157 | + merged = dict(base_cfg) | ||
| 158 | + merged.update(cfg) | ||
| 159 | + name = str(merged.pop("name")) | ||
| 160 | + | ||
| 161 | + t0 = time.perf_counter() | ||
| 162 | + backend = Qwen3GGUFRerankerBackend(merged) | ||
| 163 | + load_ms = (time.perf_counter() - t0) * 1000.0 | ||
| 164 | + gpu_mem_mib = gpu_mem_for_pid(os.getpid()) | ||
| 165 | + | ||
| 166 | + runs: list[float] = [] | ||
| 167 | + last_meta: dict[str, Any] = {} | ||
| 168 | + for _ in range(args.repeat): | ||
| 169 | + t1 = time.perf_counter() | ||
| 170 | + _scores, meta = backend.score_with_meta(args.query, docs, normalize=True) | ||
| 171 | + runs.append((time.perf_counter() - t1) * 1000.0) | ||
| 172 | + last_meta = dict(meta) | ||
| 173 | + | ||
| 174 | + result = { | ||
| 175 | + "name": name, | ||
| 176 | + "config": merged, | ||
| 177 | + "load_ms": round(load_ms, 2), | ||
| 178 | + "gpu_mem_mib": gpu_mem_mib, | ||
| 179 | + "latency_ms_min": round(min(runs), 2), | ||
| 180 | + "latency_ms_avg": round(statistics.mean(runs), 2), | ||
| 181 | + "latency_ms_max": round(max(runs), 2), | ||
| 182 | + "meta": last_meta, | ||
| 183 | + } | ||
| 184 | + all_results.append(result) | ||
| 185 | + print(json.dumps(result, ensure_ascii=False)) | ||
| 186 | + del backend | ||
| 187 | + | ||
| 188 | + print("SUMMARY") | ||
| 189 | + for item in sorted(all_results, key=lambda x: x["latency_ms_avg"]): | ||
| 190 | + print( | ||
| 191 | + f'{item["name"]}: avg={item["latency_ms_avg"]}ms ' | ||
| 192 | + f'gpu={item["gpu_mem_mib"]}MiB load={item["load_ms"]}ms' | ||
| 193 | + ) | ||
| 194 | + return 0 | ||
| 195 | + | ||
| 196 | + | ||
| 197 | +if __name__ == "__main__": | ||
| 198 | + raise SystemExit(main()) |
scripts/lib/reranker_backend_env.sh
| @@ -40,6 +40,7 @@ reranker_backend_venv_dir() { | @@ -40,6 +40,7 @@ reranker_backend_venv_dir() { | ||
| 40 | case "${backend}" in | 40 | case "${backend}" in |
| 41 | qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;; | 41 | qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;; |
| 42 | qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;; | 42 | qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;; |
| 43 | + qwen3_gguf_06b) printf '%s/.venv-reranker-gguf-06b\n' "${project_root}" ;; | ||
| 43 | qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;; | 44 | qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;; |
| 44 | bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;; | 45 | bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;; |
| 45 | dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;; | 46 | dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;; |
| @@ -54,6 +55,7 @@ reranker_backend_requirements_file() { | @@ -54,6 +55,7 @@ reranker_backend_requirements_file() { | ||
| 54 | case "${backend}" in | 55 | case "${backend}" in |
| 55 | qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;; | 56 | qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;; |
| 56 | qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;; | 57 | qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;; |
| 58 | + qwen3_gguf_06b) printf '%s/requirements_reranker_qwen3_gguf_06b.txt\n' "${project_root}" ;; | ||
| 57 | qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;; | 59 | qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;; |
| 58 | bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;; | 60 | bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;; |
| 59 | dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;; | 61 | dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;; |
scripts/setup_reranker_venv.sh
| @@ -50,6 +50,30 @@ echo "Using TMPDIR=${TMPDIR}" | @@ -50,6 +50,30 @@ echo "Using TMPDIR=${TMPDIR}" | ||
| 50 | "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" --upgrade pip wheel | 50 | "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" --upgrade pip wheel |
| 51 | "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r "${REQ_FILE}" | 51 | "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r "${REQ_FILE}" |
| 52 | 52 | ||
| 53 | +if [[ "${BACKEND}" == qwen3_gguf* ]]; then | ||
| 54 | + if [[ -x "/usr/local/cuda/bin/nvcc" ]]; then | ||
| 55 | + "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" \ | ||
| 56 | + cmake \ | ||
| 57 | + ninja \ | ||
| 58 | + scikit-build-core \ | ||
| 59 | + flit_core \ | ||
| 60 | + setuptools-scm | ||
| 61 | + echo "Rebuilding llama-cpp-python with CUDA support for ${BACKEND}" | ||
| 62 | + PATH="/usr/local/cuda/bin:/usr/bin:/bin" \ | ||
| 63 | + CC="/usr/bin/x86_64-linux-gnu-gcc" \ | ||
| 64 | + CXX="/usr/bin/x86_64-linux-gnu-g++" \ | ||
| 65 | + CUDACXX="/usr/local/cuda/bin/nvcc" \ | ||
| 66 | + CMAKE_ARGS="-DGGML_CUDA=on" \ | ||
| 67 | + FORCE_CMAKE=1 \ | ||
| 68 | + "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" \ | ||
| 69 | + --force-reinstall \ | ||
| 70 | + --no-build-isolation \ | ||
| 71 | + "llama-cpp-python==0.3.18" | ||
| 72 | + else | ||
| 73 | + echo "WARNING: /usr/local/cuda/bin/nvcc not found; ${BACKEND} will be installed without CUDA support." >&2 | ||
| 74 | + fi | ||
| 75 | +fi | ||
| 76 | + | ||
| 53 | echo | 77 | echo |
| 54 | echo "Done." | 78 | echo "Done." |
| 55 | echo "Backend: ${BACKEND}" | 79 | echo "Backend: ${BACKEND}" |
scripts/start_reranker.sh
| @@ -43,6 +43,10 @@ export TMPDIR="${RERANKER_RUNTIME_DIR}/tmp" | @@ -43,6 +43,10 @@ export TMPDIR="${RERANKER_RUNTIME_DIR}/tmp" | ||
| 43 | export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}" | 43 | export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}" |
| 44 | export PATH="${RERANKER_VENV}/bin:${PATH}" | 44 | export PATH="${RERANKER_VENV}/bin:${PATH}" |
| 45 | 45 | ||
| 46 | +if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then | ||
| 47 | + export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}" | ||
| 48 | +fi | ||
| 49 | + | ||
| 46 | if [[ "${RERANK_BACKEND}" == "qwen3_vllm" ]]; then | 50 | if [[ "${RERANK_BACKEND}" == "qwen3_vllm" ]]; then |
| 47 | if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then | 51 | if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then |
| 48 | echo "ERROR: qwen3_vllm backend requires NVIDIA GPU, but nvidia-smi is unavailable." >&2 | 52 | echo "ERROR: qwen3_vllm backend requires NVIDIA GPU, but nvidia-smi is unavailable." >&2 |
| @@ -64,16 +68,24 @@ PY | @@ -64,16 +68,24 @@ PY | ||
| 64 | fi | 68 | fi |
| 65 | fi | 69 | fi |
| 66 | 70 | ||
| 67 | -if [[ "${RERANK_BACKEND}" == "qwen3_gguf" ]]; then | ||
| 68 | - if ! "${PYTHON_BIN}" - <<'PY' | 71 | +if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then |
| 72 | + gguf_check_status=0 | ||
| 73 | + "${PYTHON_BIN}" - <<'PY' || gguf_check_status=$? | ||
| 69 | try: | 74 | try: |
| 70 | - import llama_cpp # noqa: F401 | 75 | + import llama_cpp |
| 76 | + if hasattr(llama_cpp, "llama_supports_gpu_offload") and not llama_cpp.llama_supports_gpu_offload(): | ||
| 77 | + raise SystemExit(2) | ||
| 71 | except Exception: | 78 | except Exception: |
| 72 | raise SystemExit(1) | 79 | raise SystemExit(1) |
| 73 | PY | 80 | PY |
| 74 | - then | ||
| 75 | - echo "ERROR: qwen3_gguf backend requires llama-cpp-python in ${RERANKER_VENV}." >&2 | ||
| 76 | - echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 | 81 | + if [[ "${gguf_check_status}" != "0" ]]; then |
| 82 | + if [[ "${gguf_check_status}" == "2" ]]; then | ||
| 83 | + echo "ERROR: ${RERANK_BACKEND} backend detected a CPU-only llama-cpp-python build in ${RERANKER_VENV}." >&2 | ||
| 84 | + echo "Please rerun: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 | ||
| 85 | + else | ||
| 86 | + echo "ERROR: ${RERANK_BACKEND} backend requires llama-cpp-python in ${RERANKER_VENV}." >&2 | ||
| 87 | + echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 | ||
| 88 | + fi | ||
| 77 | exit 1 | 89 | exit 1 |
| 78 | fi | 90 | fi |
| 79 | fi | 91 | fi |
tests/test_reranker_qwen3_gguf_backend.py
| @@ -12,6 +12,8 @@ class _FakeLlama: | @@ -12,6 +12,8 @@ class _FakeLlama: | ||
| 12 | self.model_path = model_path | 12 | self.model_path = model_path |
| 13 | self.kwargs = kwargs | 13 | self.kwargs = kwargs |
| 14 | self.eval_logits = [] | 14 | self.eval_logits = [] |
| 15 | + self._tokens = [] | ||
| 16 | + self.eval_call_count = 0 | ||
| 15 | 17 | ||
| 16 | @classmethod | 18 | @classmethod |
| 17 | def from_pretrained(cls, repo_id: str, filename: str, local_dir=None, cache_dir=None, **kwargs): | 19 | def from_pretrained(cls, repo_id: str, filename: str, local_dir=None, cache_dir=None, **kwargs): |
| @@ -31,16 +33,25 @@ class _FakeLlama: | @@ -31,16 +33,25 @@ class _FakeLlama: | ||
| 31 | return [10 + (ord(ch) % 17) for ch in raw] | 33 | return [10 + (ord(ch) % 17) for ch in raw] |
| 32 | 34 | ||
| 33 | def reset(self): | 35 | def reset(self): |
| 36 | + self._tokens = [] | ||
| 34 | return None | 37 | return None |
| 35 | 38 | ||
| 36 | def eval(self, prompt_tokens): | 39 | def eval(self, prompt_tokens): |
| 37 | - pos = float(sum(prompt_tokens) % 11) + 3.0 | 40 | + self.eval_call_count += 1 |
| 41 | + self._tokens.extend(prompt_tokens) | ||
| 42 | + pos = float(sum(self._tokens) % 11) + 3.0 | ||
| 38 | neg = 1.0 | 43 | neg = 1.0 |
| 39 | logits = [0.0] * 64 | 44 | logits = [0.0] * 64 |
| 40 | logits[1] = pos | 45 | logits[1] = pos |
| 41 | logits[2] = neg | 46 | logits[2] = neg |
| 42 | self.eval_logits = [logits] | 47 | self.eval_logits = [logits] |
| 43 | 48 | ||
| 49 | + def save_state(self): | ||
| 50 | + return list(self._tokens) | ||
| 51 | + | ||
| 52 | + def load_state(self, state): | ||
| 53 | + self._tokens = list(state) | ||
| 54 | + | ||
| 44 | 55 | ||
| 45 | def _install_fake_llama_cpp(monkeypatch): | 56 | def _install_fake_llama_cpp(monkeypatch): |
| 46 | fake_module = types.SimpleNamespace(Llama=_FakeLlama) | 57 | fake_module = types.SimpleNamespace(Llama=_FakeLlama) |
| @@ -58,6 +69,21 @@ def test_qwen3_gguf_backend_factory_loads(monkeypatch): | @@ -58,6 +69,21 @@ def test_qwen3_gguf_backend_factory_loads(monkeypatch): | ||
| 58 | }, | 69 | }, |
| 59 | ) | 70 | ) |
| 60 | assert isinstance(backend, Qwen3GGUFRerankerBackend) | 71 | assert isinstance(backend, Qwen3GGUFRerankerBackend) |
| 72 | + assert backend._backend_name == "qwen3_gguf" | ||
| 73 | + | ||
| 74 | + | ||
| 75 | +def test_qwen3_gguf_06b_backend_factory_loads(monkeypatch): | ||
| 76 | + _install_fake_llama_cpp(monkeypatch) | ||
| 77 | + backend = get_rerank_backend( | ||
| 78 | + "qwen3_gguf_06b", | ||
| 79 | + { | ||
| 80 | + "enable_warmup": False, | ||
| 81 | + }, | ||
| 82 | + ) | ||
| 83 | + assert isinstance(backend, Qwen3GGUFRerankerBackend) | ||
| 84 | + assert backend._backend_name == "qwen3_gguf_06b" | ||
| 85 | + assert backend._repo_id == "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" | ||
| 86 | + assert backend._filename == "qwen3-reranker-0.6b-q8_0.gguf" | ||
| 61 | 87 | ||
| 62 | 88 | ||
| 63 | def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): | 89 | def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): |
| @@ -69,6 +95,7 @@ def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): | @@ -69,6 +95,7 @@ def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): | ||
| 69 | "enable_warmup": False, | 95 | "enable_warmup": False, |
| 70 | "infer_batch_size": 2, | 96 | "infer_batch_size": 2, |
| 71 | "sort_by_doc_length": True, | 97 | "sort_by_doc_length": True, |
| 98 | + "reuse_query_state": True, | ||
| 72 | } | 99 | } |
| 73 | ) | 100 | ) |
| 74 | 101 | ||
| @@ -88,3 +115,5 @@ def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): | @@ -88,3 +115,5 @@ def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): | ||
| 88 | assert meta["unique_docs"] == 2 | 115 | assert meta["unique_docs"] == 2 |
| 89 | assert meta["backend"] == "qwen3_gguf" | 116 | assert meta["backend"] == "qwen3_gguf" |
| 90 | assert meta["inference_batches"] == 1 | 117 | assert meta["inference_batches"] == 1 |
| 118 | + assert meta["reuse_query_state"] is True | ||
| 119 | + assert backend._llm.eval_call_count == 3 |