Commit 5c21a485bbd6bd2f6876a1d2ddee6a6afbeeffa9

Authored by tangwang
1 parent 3d508beb

qwen3-reranker-0.6b-gguf

config/config.yaml
@@ -381,7 +381,7 @@ services:
      max_docs: 1000
      normalize: true
      # In-service backend (read by the reranker process at startup)
-     backend: "qwen3_vllm"  # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank
+     backend: "qwen3_vllm"  # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank
      backends:
        bge:
          model_name: "BAAI/bge-reranker-v2-m3"
@@ -426,11 +426,11 @@ services:
          cache_dir: "./model_cache"
          local_dir: "./models/reranker/qwen3-reranker-4b-gguf"
          instruction: "Rank products by query with category & style match prioritized"
-         # Conservative settings for a T4 16GB with roughly 5~6GB of free VRAM
-         n_ctx: 384
-         n_batch: 384
-         n_ubatch: 128
-         n_gpu_layers: 24
+         # T4 16GB / performance-first settings: offload all layers; measurably faster than the conservative config
+         n_ctx: 512
+         n_batch: 512
+         n_ubatch: 512
+         n_gpu_layers: 999
          main_gpu: 0
          n_threads: 2
          n_threads_batch: 4
@@ -443,6 +443,31 @@ services:
          length_sort_mode: "char"
          enable_warmup: true
          verbose: false
+       qwen3_gguf_06b:
+         repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF"
+         filename: "qwen3-reranker-0.6b-q8_0.gguf"
+         cache_dir: "./model_cache"
+         local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf"
+         instruction: "Rank products by query with category & style match prioritized"
+         # 0.6B GGUF / online rerank baseline:
+         # measured ~265 s for a single 400-title request, so it suits a low-VRAM functional fallback, not an online low-latency primary route.
+         n_ctx: 256
+         n_batch: 256
+         n_ubatch: 256
+         n_gpu_layers: 999
+         main_gpu: 0
+         n_threads: 2
+         n_threads_batch: 4
+         flash_attn: true
+         offload_kqv: true
+         use_mmap: true
+         use_mlock: false
+         infer_batch_size: 32
+         sort_by_doc_length: true
+         length_sort_mode: "char"
+         reuse_query_state: false
+         enable_warmup: true
+         verbose: false
        dashscope_rerank:
          model_name: "qwen3-rerank"
          # Pick the endpoint by region:
config/services_config.py
@@ -7,6 +7,7 @@ contains no independent parsing or precedence logic.

 from __future__ import annotations

+import os
 from typing import Any, Dict, Tuple

 from config.loader import get_app_config
@@ -61,6 +62,12 @@ def get_embedding_image_backend_config() -> Tuple[str, Dict[str, Any]]:

 def get_rerank_backend_config() -> Tuple[str, Dict[str, Any]]:
     cfg = get_app_config().services.rerank
+    backend = str(os.getenv("RERANK_BACKEND") or cfg.backend).strip()
+    if backend != cfg.backend:
+        backend_cfg = cfg.backends.get(backend)
+        if backend_cfg is None:
+            raise ValueError(f"Unknown rerank backend override from RERANK_BACKEND: {backend!r}")
+        return backend, dict(backend_cfg)
     return cfg.backend, cfg.get_backend_config()

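As a usage sketch (editorial note, not part of the commit): with the override above, exporting `RERANK_BACKEND` flips the resolved backend without touching `config/config.yaml`, and an unknown name fails fast. This assumes the config defines a `qwen3_gguf_06b` entry under `services.rerank.backends`:

```python
# Sketch of the RERANK_BACKEND override added in get_rerank_backend_config().
import os

os.environ["RERANK_BACKEND"] = "qwen3_gguf_06b"

from config.services_config import get_rerank_backend_config

backend, backend_cfg = get_rerank_backend_config()
assert backend == "qwen3_gguf_06b"
print(backend_cfg["repo_id"])  # ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF

os.environ["RERANK_BACKEND"] = "no_such_backend"
try:
    get_rerank_backend_config()
except ValueError as exc:
    # Unknown rerank backend override from RERANK_BACKEND: 'no_such_backend'
    print(exc)
```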
requirements_reranker_qwen3_gguf.txt
 # Isolated dependencies for qwen3_gguf reranker backend (.venv-reranker-gguf).

 -r requirements_reranker_base.txt
+huggingface-hub>=0.32.0
 llama-cpp-python>=0.3.16
requirements_reranker_qwen3_gguf_06b.txt 0 → 100644
@@ -0,0 +1,3 @@

# Isolated dependencies for qwen3_gguf_06b reranker backend (.venv-reranker-gguf-06b).

-r requirements_reranker_qwen3_gguf.txt
reranker/DEPLOYMENT_AND_TUNING.md
@@ -3,15 +3,15 @@
 This document captures reusable practices from this project for e-commerce search reranking, covering:

 - Environment setup and deployment
-- `qwen3_vllm` / `qwen3_gguf` configuration options and tuning ideas
+- `qwen3_vllm` / `qwen3_gguf` / `qwen3_gguf_06b` configuration options and tuning ideas
 - Load-testing workflow for the 1000-doc scenario
 - Key conclusions and recommended default parameters
 - Common troubleshooting

 Scope:

-- Rerank backends: `services.rerank.backend: qwen3_vllm` or `qwen3_gguf`
-- Models: `Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF`
+- Rerank backends: `services.rerank.backend: qwen3_vllm` / `qwen3_gguf` / `qwen3_gguf_06b`
+- Models: `Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` / `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF`
 - Scenario: short queries (usually < 100 tokens); docs are product titles or title plus a short description; about 1000 docs per request

 ## 1. Environment baseline
reranker/GGUF_0_6B_INSTALL_AND_TUNING.md 0 → 100644
@@ -0,0 +1,154 @@

# Qwen3-Reranker-0.6B GGUF Installation and Tuning

This document covers the `qwen3_gguf_06b` backend, for the model:

- Hugging Face: `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF`
- File: `qwen3-reranker-0.6b-q8_0.gguf`
- Local directory: `./models/reranker/qwen3-reranker-0.6b-q8_0-gguf`

## Conclusions first

This backend is fully integrated and GPU offload works, but it is not suitable for this project's online hot path.

The target scenario is:

- 1 query
- 400 product titles
- shortest possible response time

Measured with the best configuration found:

- GPU memory usage: about `894 MiB`
- single-request latency for 400 titles: about `265318 ms`

It is therefore better suited as:

- a low-VRAM fallback
- functional verification
- local offline experiments

It is not recommended as the primary online low-latency reranker backend.

## Isolated environment

`qwen3_gguf_06b` uses its own venv:

- backend: `qwen3_gguf_06b`
- venv: `.venv-reranker-gguf-06b`
- requirements: `requirements_reranker_qwen3_gguf_06b.txt`

Install:

```bash
./scripts/setup_reranker_venv.sh qwen3_gguf_06b
```

To confirm the CUDA build of `llama-cpp-python`:

```bash
./.venv-reranker-gguf-06b/bin/python - <<'PY'
import llama_cpp
print(llama_cpp.llama_supports_gpu_offload())
PY
```

Expected output:

```python
True
```

## Model download

Download the model ahead of time so the first service start does not have to fetch it online:

```bash
mkdir -p models/reranker/qwen3-reranker-0.6b-q8_0-gguf
curl -L --fail -C - \
  -o models/reranker/qwen3-reranker-0.6b-q8_0-gguf/qwen3-reranker-0.6b-q8_0.gguf \
  'https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/resolve/main/qwen3-reranker-0.6b-q8_0.gguf?download=true'
```

Measured file size:

- `639153184` bytes
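A small verification sketch (editorial addition, not in the document): check the downloaded file against the measured size above before pointing the service at it.

```python
# Verify the downloaded GGUF matches the measured size (639153184 bytes)
# before wiring it into config.yaml. Path mirrors the curl command above.
from pathlib import Path

EXPECTED_BYTES = 639_153_184
gguf = Path("models/reranker/qwen3-reranker-0.6b-q8_0-gguf/qwen3-reranker-0.6b-q8_0.gguf")

actual = gguf.stat().st_size
if actual != EXPECTED_BYTES:
    raise SystemExit(f"size mismatch: got {actual}, expected {EXPECTED_BYTES} (resume or re-download)")
print("GGUF file size OK")
```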
## Recommended configuration

Keep the following in `config/config.yaml`:

```yaml
qwen3_gguf_06b:
  repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF"
  filename: "qwen3-reranker-0.6b-q8_0.gguf"
  local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf"
  cache_dir: "./model_cache"
  instruction: "Rank products by query with category & style match prioritized"
  n_ctx: 256
  n_batch: 256
  n_ubatch: 256
  n_gpu_layers: 999
  main_gpu: 0
  n_threads: 2
  n_threads_batch: 4
  flash_attn: true
  offload_kqv: true
  use_mmap: true
  use_mlock: false
  infer_batch_size: 32
  sort_by_doc_length: true
  length_sort_mode: "char"
  reuse_query_state: false
  enable_warmup: true
  verbose: false
```

## Tuning results

All measurements were taken on the same machine. Titles come from `/home/ubuntu/rerank_test/titles.1.8w`; the query is `白色oversized T-shirt`.

80 titles:

- `n_ctx=256, reuse_query_state=true` -> `60108 ms`
- `n_ctx=256, reuse_query_state=false` -> `53383~56893 ms`
- `n_ctx=320, reuse_query_state=true` -> `60961 ms`
- `n_ctx=384, reuse_query_state=true` -> `56578 ms`
- `n_ctx=384, reuse_query_state=false` -> `57272 ms`
- `n_ctx=512, reuse_query_state=false` -> `60542 ms`
- `n_ctx=256, reuse_query_state=false, n_threads=4, n_threads_batch=8` -> `61228 ms`

400 titles:

- `n_ctx=256, n_batch=256, n_ubatch=256, n_gpu_layers=999, reuse_query_state=false`
  -> `265318 ms`

## Lessons learned

The most important conclusion from this integration is not "which small parameter is faster", but:

1. Although the 0.6B GGUF weights are small, the current backend implementation still scores docs sequentially, one at a time.
2. For an online 400-title request, that serial scoring is itself the main bottleneck.
3. `reuse_query_state` brought no benefit on this model; it was actually slower.
4. Raising `n_ctx` to `384/512` brought no real benefit either; it was slower or at best flat.
5. This backend's advantage is low VRAM usage, not low latency.

If the goal is the shortest online response time, the suggested priority is:

1. `qwen3_vllm`
2. other backends with real high-throughput batching
3. `qwen3_gguf_06b` only as a low-VRAM fallback

## Verification commands

Direct local backend tuning:

```bash
PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \
  scripts/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400
```

Start as a service:

```bash
RERANK_BACKEND=qwen3_gguf_06b ./scripts/start_reranker.sh
```
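After starting the service, a minimal end-to-end check (editorial sketch: the request/response shape follows `reranker/server.py`, and port 6007 is an assumption carried over from the benchmark examples in the other tuning docs):

```python
# Minimal /rerank smoke test against a locally started qwen3_gguf_06b service.
import json
import urllib.request

payload = {"query": "白色oversized T-shirt", "docs": ["oversized white tee", "red dress"], "normalize": True}
req = urllib.request.Request(
    "http://127.0.0.1:6007/rerank",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=600) as resp:  # generous timeout: this backend is slow
    body = json.load(resp)
print(body["meta"]["backend"], body["scores"])  # expect: qwen3_gguf_06b [...]
```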
reranker/GGUF_INSTALL_AND_TUNING.md 0 → 100644
@@ -0,0 +1,280 @@

# Qwen3 GGUF Installation and Tuning Manual

This document covers only the `qwen3_gguf` backend, targeting this project's measured environment:

- GPU: `Tesla T4 16GB`
- CUDA: `12.8`
- Model: `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF`
- Quantization: `Q8_0`

---

## 1. Conclusions first

In the current code, the GGUF backend's main bottleneck is not "unused VRAM" but the fact that **llama.cpp scores docs one at a time, in order**. The most effective optimizations are therefore:

- offload as many model layers as possible to the GPU
- enable `flash_attn` / `offload_kqv`
- set `n_ctx / n_batch / n_ubatch` to an efficient point for short-title reranking

The recommended configuration for this machine from this tuning round:

```yaml
qwen3_gguf:
  n_ctx: 512
  n_batch: 512
  n_ubatch: 512
  n_gpu_layers: 999
  n_threads: 2
  n_threads_batch: 4
  flash_attn: true
  offload_kqv: true
  infer_batch_size: 8
  sort_by_doc_length: true
  length_sort_mode: "char"
```

Notes:

- `n_gpu_layers: 999` in llama.cpp is equivalent to "offload every layer possible"
- on this T4, **even with full offload the model only uses about `4.5 GiB` of GPU memory**
- so "allowing 8 GB of VRAM" does not automatically buy speed; for this workload, everything worth offloading is already on the GPU

---

## 2. Isolated environment

`qwen3_gguf` must use its own dedicated venv:

- `qwen3_vllm` -> `.venv-reranker`
- `qwen3_gguf` -> `.venv-reranker-gguf`

Install command:

```bash
./scripts/setup_reranker_venv.sh qwen3_gguf
```

The script now does two things automatically:

1. installs the Python dependencies the GGUF backend needs
2. when `/usr/local/cuda/bin/nvcc` is detected, rebuilds `llama-cpp-python` **as a CUDA build**

---

## 3. Verifying the GPU build

Verify it is not a CPU-only build:

```bash
./.venv-reranker-gguf/bin/python - <<'PY'
import llama_cpp
print("supports_gpu_offload =", llama_cpp.llama_supports_gpu_offload())
PY
```

The correct output is:

```text
supports_gpu_offload = True
```

You can also inspect the shared libraries:

```bash
ldd .venv-reranker-gguf/lib/python3.12/site-packages/llama_cpp/lib/libllama.so | rg 'cuda|cublas|ggml-cuda'
```

You should see:

- `libggml-cuda.so`
- `libcudart.so`
- `libcublas.so`

---

## 4. Model download

Local files take precedence; the model lives at:

```text
models/reranker/qwen3-reranker-4b-gguf/Qwen.Qwen3-Reranker-4B.Q8_0.gguf
```

If the local file exists, the backend loads the local GGUF directly and no longer depends on downloading at startup.

To avoid the `416 Range Not Satisfiable` issue seen with Hugging Face Xet downloads on this machine, `start_reranker.sh` now sets the following by default for `qwen3_gguf`:

```bash
HF_HUB_DISABLE_XET=1
```

---

## 5. Local tuning script

A new local benchmark script:

```bash
PYTHONPATH=/data/saas-search ./.venv-reranker-gguf/bin/python \
  scripts/benchmark_reranker_gguf_local.py --docs 64 --repeat 1
```

It instantiates the GGUF backend directly and reports:

- model load time
- GPU memory used by the current process
- single-request rerank latency

---

## 6. Measured results from this round

Test conditions:

- Query: `白色oversized T-shirt`
- Docs: `64` product titles
- Local script: `scripts/benchmark_reranker_gguf_local.py`
- 1 run per configuration; the focus is on relative trends

Results:

### 6.1 Conservative configuration

```text
n_ctx=384
n_batch=384
n_ubatch=128
n_gpu_layers=24
```

- GPU memory: `2984 MiB`
- 64-doc latency: `74347.91 ms`

### 6.2 Full offload

```text
n_ctx=384
n_batch=384
n_ubatch=128
n_gpu_layers=999
```

- GPU memory: `4338 MiB`
- 64-doc latency: `51401.77 ms`

### 6.3 Best configuration

```text
n_ctx=512
n_batch=512
n_ubatch=512
n_gpu_layers=999
```

- GPU memory: `4564 MiB`
- 64-doc latency: `49116.10 ms`

### 6.4 Other attempts

`n_threads=4 / n_threads_batch=8`:

- GPU memory: `4564 MiB`
- 64-doc latency: `49895.88 ms`
- slightly slower than the recommended values

`infer_batch_size=64`:

- GPU memory: `4564 MiB`
- 64-doc latency: `50723.36 ms`
- also slightly slower

### 6.5 API-level verification

After writing the recommended configuration into `config/config.yaml` and restarting the service, run:

```bash
RERANK_BASE=http://127.0.0.1:6007 \
  ./.venv/bin/python scripts/benchmark_reranker_random_titles.py 64 --repeat 1 --query '白色oversized T-shirt'
```

Result:

- `64 docs`: `50177.22 ms`

Then run:

```bash
RERANK_BASE=http://127.0.0.1:6007 \
  ./.venv/bin/python scripts/benchmark_reranker_random_titles.py 153 --repeat 1 --query '白色oversized T-shirt'
```

Result:

- `153 docs`: `115328.60 ms`

Compared with the conservative configuration from older logs:

- old config, `153 docs`: `153435.37 ms`
- new config, `153 docs`: `115328.60 ms`

Approximate improvement:

- `24.8%`
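The improvement figure is just the relative delta of the two latencies; a quick check (editorial aside, not in the original doc):

```python
# (153435.37 - 115328.60) / 153435.37 ≈ 0.248, i.e. ~24.8% faster
old_ms, new_ms = 153435.37, 115328.60
print(f"{(old_ms - new_ms) / old_ms:.1%}")  # 24.8%
```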
---

## 7. Why it does not fill 8 GB

This conclusion matters:

- the current best configuration already offloads as many layers as possible
- this `Q8_0` model, on this llama.cpp / T4 / short-text rerank workload, **only needs about `4.5 GiB` of GPU memory in practice**
- growing `n_ctx` just to "fill 8 GB" will not noticeably improve throughput and may add overhead

So this round was not about "VRAM settings being too conservative"; rather:

- everything offloadable has essentially been offloaded
- what actually slows responses down is the **sequential per-doc inference** path in the backend

---

## 8. Production recommendations

### 8.1 Current recommendation

Keep these parameters:

```yaml
n_ctx: 512
n_batch: 512
n_ubatch: 512
n_gpu_layers: 999
n_threads: 2
n_threads_batch: 4
flash_attn: true
offload_kqv: true
```

### 8.2 If it is still too slow

Suggested priority:

1. shrink `rerank_window`
2. pass in fewer docs
3. if the business allows, switch to a backend better suited to high throughput

Reasons:

- the current GGUF backend is a single local process scoring docs one by one
- for long-list reranking it is inherently worse at throughput than vLLM / cloud rerank APIs

---

## 9. Files changed in this round

- `config/config.yaml`
- `scripts/setup_reranker_venv.sh`
- `scripts/start_reranker.sh`
- `scripts/benchmark_reranker_gguf_local.py`
- `reranker/GGUF_INSTALL_AND_TUNING.md`
reranker/README.md
 # Reranker module

-**Request examples** are in `docs/QUICKSTART.md` §3.5. Extension conventions are in `docs/DEVELOPER_GUIDE.md` §7. Deployment and tuning practice is in `reranker/DEPLOYMENT_AND_TUNING.md`.
+**Request examples** are in `docs/QUICKSTART.md` §3.5. Extension conventions are in `docs/DEVELOPER_GUIDE.md` §7. Deployment and tuning practice is in `reranker/DEPLOYMENT_AND_TUNING.md`. Dedicated integration and tuning notes for `ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF` are in `reranker/GGUF_0_6B_INSTALL_AND_TUNING.md`.

 ---

 The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, Qwen3-GGUF, DashScope cloud rerank). Callers use HTTP and do not depend on the concrete backend.

 **Features**
-- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `bge` (kept for compatibility)
+- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `qwen3_gguf_06b` (Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp), `bge` (kept for compatibility)
 - Cloud backend: `dashscope_rerank` (calls DashScope `/compatible-api/v1/reranks`, endpoint switchable by region)
 - Unified configuration: `config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<name>`
 - Doc deduplication, scores aligned with input order, FP16/GPU support (backend-dependent)
@@ -19,7 +19,7 @@
 - `backends/bge.py`: BGE backend
 - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM backend
 - `backends/qwen3_transformers.py`: Qwen3-Reranker-0.6B pure-Transformers backend (official Usage style)
-- `backends/qwen3_gguf.py`: Qwen3-Reranker-4B GGUF + llama.cpp backend
+- `backends/qwen3_gguf.py`: Qwen3-Reranker GGUF + llama.cpp backend (supports `qwen3_gguf` / `qwen3_gguf_06b`)
 - `backends/dashscope_rerank.py`: DashScope cloud rerank backend (HTTP)
 - `reranker/bge_reranker.py`: BGE core inference (wrapped by the bge backend)
 - `reranker/config.py`: service port, MAX_DOCS, NORMALIZE, etc. (backend parameters live in config.yaml)
@@ -32,11 +32,12 @@
 - Each backend now uses its own venv:
   - `qwen3_vllm` -> `.venv-reranker`
   - `qwen3_gguf` -> `.venv-reranker-gguf`
+  - `qwen3_gguf_06b` -> `.venv-reranker-gguf-06b`
   - `qwen3_transformers` -> `.venv-reranker-transformers`
   - `bge` -> `.venv-reranker-bge`
   - `dashscope_rerank` -> `.venv-reranker-dashscope`
   ```bash
-  ./scripts/setup_reranker_venv.sh qwen3_gguf
+  ./scripts/setup_reranker_venv.sh qwen3_gguf_06b
   ```
   CUDA build advice:
   ```bash
@@ -48,7 +49,7 @@
   ```

 ## Configuration
-- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` environment variable.
+- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` environment variable.
 - **Backend parameters**: `services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`, for example:

   ```yaml
@@ -96,6 +97,20 @@ services:
       infer_batch_size: 8
       sort_by_doc_length: true
       length_sort_mode: "char"
+    qwen3_gguf_06b:
+      repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF"
+      filename: "qwen3-reranker-0.6b-q8_0.gguf"
+      local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf"
+      cache_dir: "./model_cache"
+      instruction: "Rank products by query with category & style match prioritized"
+      n_ctx: 256
+      n_batch: 256
+      n_ubatch: 256
+      n_gpu_layers: 999
+      infer_batch_size: 32
+      sort_by_doc_length: true
+      length_sort_mode: "char"
+      reuse_query_state: false
     dashscope_rerank:
       model_name: "qwen3-rerank"
      endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks"
@@ -153,7 +168,7 @@ Content-Type: application/json
   ```

 `top_n` is optional:
-- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `bge`) usually ignore it and still return scores for all docs.
+- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`) usually ignore it and still return scores for all docs.
 - For `dashscope_rerank` it can cap how many candidates the cloud returns; it is recommended to set it to `page+size` (e.g. pass `30` for pagination `from=20,size=10`).

 Response:
@@ -192,3 +207,4 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info
 - **Qwen3-vLLM**: see [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B); needs a GPU and a fair amount of VRAM; compared with BGE it suits long-text, high-throughput scenarios (vLLM prefix caching).
 - **Qwen3-Transformers**: the official Transformers Usage approach, no vLLM required; suits CPU or small-VRAM setups. Default `attn_implementation: "sdpa"`; if `flash_attn` is installed you may set `flash_attention_2` (the service falls back to sdpa automatically when it is missing).
 - **Qwen3-GGUF**: see [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF). On a single T4 with only about `4.8~6GB` VRAM free, start from `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true`; if startup OOMs, first lower `n_gpu_layers` to `20`, then `n_ctx` to `320`. In the GGUF backend `infer_batch_size` is a service-side work chunk and usually matters less than `n_gpu_layers` / `n_ctx`.
+- **Qwen3-GGUF-0.6B**: see [ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF](https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF). Its advantage is small weights and low VRAM usage, measured at about `0.9~1.1 GiB` per process; but with the current serial llama.cpp scoring path, measured latency for `1 query + 400 titles` is still about `265s`. It therefore suits a low-VRAM functional fallback, not an online low-latency primary reranker.
reranker/backends/__init__.py
@@ -48,12 +48,19 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtocol:
         return Qwen3TransformersRerankerBackend(config)
     if name == "qwen3_gguf":
         from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend
-        return Qwen3GGUFRerankerBackend(config)
+        gguf_config = dict(config or {})
+        gguf_config.setdefault("_backend_name", "qwen3_gguf")
+        return Qwen3GGUFRerankerBackend(gguf_config)
+    if name == "qwen3_gguf_06b":
+        from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend
+        gguf_config = dict(config or {})
+        gguf_config.setdefault("_backend_name", "qwen3_gguf_06b")
+        return Qwen3GGUFRerankerBackend(gguf_config)
     if name == "dashscope_rerank":
         from reranker.backends.dashscope_rerank import DashScopeRerankBackend
         return DashScopeRerankBackend(config)
     raise ValueError(
-        f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, dashscope_rerank"
+        f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank"
     )

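For orientation, a minimal sketch (editorial, not part of the commit) of constructing the 0.6B variant through this factory. It assumes the venv is set up and the GGUF file already exists under `./models/reranker/qwen3-reranker-0.6b-q8_0-gguf`, so nothing is downloaded:

```python
# Sketch: the factory injects _backend_name, which selects the 0.6B defaults.
from reranker.backends import get_rerank_backend

backend = get_rerank_backend(
    "qwen3_gguf_06b",
    {
        "n_ctx": 256,
        "n_gpu_layers": 999,
        "enable_warmup": False,  # skip the warmup pass for a faster smoke test
    },
)
scores, meta = backend.score_with_meta("白色oversized T-shirt", ["oversized white tee"], normalize=True)
print(meta["backend"], scores)  # expected: qwen3_gguf_06b [...]
```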
reranker/backends/qwen3_gguf.py
1 """ 1 """
2 -Qwen3-Reranker-4B GGUF backend using llama-cpp-python. 2 +Qwen3-Reranker GGUF backend using llama-cpp-python.
3 3
4 Reference: 4 Reference:
5 - https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF 5 - https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF
6 - https://huggingface.co/Qwen/Qwen3-Reranker-4B 6 - https://huggingface.co/Qwen/Qwen3-Reranker-4B
  7 +- https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF
  8 +- https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
7 """ 9 """
8 10
9 from __future__ import annotations 11 from __future__ import annotations
@@ -13,12 +15,27 @@ import math @@ -13,12 +15,27 @@ import math
13 import os 15 import os
14 import threading 16 import threading
15 import time 17 import time
  18 +from pathlib import Path
16 from typing import Any, Dict, List, Tuple 19 from typing import Any, Dict, List, Tuple
17 20
18 21
19 logger = logging.getLogger("reranker.backends.qwen3_gguf") 22 logger = logging.getLogger("reranker.backends.qwen3_gguf")
20 23
21 24
  25 +_BACKEND_DEFAULTS: Dict[str, Dict[str, str]] = {
  26 + "qwen3_gguf": {
  27 + "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF",
  28 + "filename": "*Q8_0.gguf",
  29 + "local_dir": "./models/reranker/qwen3-reranker-4b-gguf",
  30 + },
  31 + "qwen3_gguf_06b": {
  32 + "repo_id": "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF",
  33 + "filename": "qwen3-reranker-0.6b-q8_0.gguf",
  34 + "local_dir": "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf",
  35 + },
  36 +}
  37 +
  38 +
22 def deduplicate_with_positions(texts: List[str]) -> Tuple[List[str], List[int]]: 39 def deduplicate_with_positions(texts: List[str]) -> Tuple[List[str], List[int]]:
23 """Deduplicate texts globally while preserving first-seen order.""" 40 """Deduplicate texts globally while preserving first-seen order."""
24 unique_texts: List[str] = [] 41 unique_texts: List[str] = []
@@ -46,21 +63,21 @@ def _format_instruction(instruction: str, query: str, doc: str) -&gt; str: @@ -46,21 +63,21 @@ def _format_instruction(instruction: str, query: str, doc: str) -&gt; str:
46 63
47 class Qwen3GGUFRerankerBackend: 64 class Qwen3GGUFRerankerBackend:
48 """ 65 """
49 - Qwen3-Reranker-4B GGUF backend using llama.cpp through llama-cpp-python. 66 + Qwen3-Reranker GGUF backend using llama.cpp through llama-cpp-python.
50 67
51 - Tuned for short-query / short-doc reranking on a memory-constrained single T4.  
52 - Config from services.rerank.backends.qwen3_gguf. 68 + Tuned for short-query / short-doc reranking on a single GPU.
  69 + Config from services.rerank.backends.<backend_name>.
53 """ 70 """
54 71
55 def __init__(self, config: Dict[str, Any]) -> None: 72 def __init__(self, config: Dict[str, Any]) -> None:
56 self._config = config or {} 73 self._config = config or {}
57 - self._repo_id = str(  
58 - self._config.get("repo_id") or "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"  
59 - ).strip()  
60 - self._filename = str(self._config.get("filename") or "*Q8_0.gguf").strip() 74 + self._backend_name = str(self._config.get("_backend_name") or "qwen3_gguf").strip()
  75 + defaults = _BACKEND_DEFAULTS.get(self._backend_name, _BACKEND_DEFAULTS["qwen3_gguf"])
  76 + self._repo_id = str(self._config.get("repo_id") or defaults["repo_id"]).strip()
  77 + self._filename = str(self._config.get("filename") or defaults["filename"]).strip()
61 self._model_path = str(self._config.get("model_path") or "").strip() 78 self._model_path = str(self._config.get("model_path") or "").strip()
62 self._cache_dir = str(self._config.get("cache_dir") or "").strip() or None 79 self._cache_dir = str(self._config.get("cache_dir") or "").strip() or None
63 - self._local_dir = str(self._config.get("local_dir") or "").strip() or None 80 + self._local_dir = str(self._config.get("local_dir") or defaults["local_dir"]).strip() or None
64 self._instruction = str( 81 self._instruction = str(
65 self._config.get("instruction") 82 self._config.get("instruction")
66 or "Rank products by query with category & style match prioritized" 83 or "Rank products by query with category & style match prioritized"
@@ -79,6 +96,7 @@ class Qwen3GGUFRerankerBackend: @@ -79,6 +96,7 @@ class Qwen3GGUFRerankerBackend:
79 "on", 96 "on",
80 } 97 }
81 self._length_sort_mode = str(self._config.get("length_sort_mode") or "char").strip().lower() 98 self._length_sort_mode = str(self._config.get("length_sort_mode") or "char").strip().lower()
  99 + self._reuse_query_state = bool(self._config.get("reuse_query_state", False))
82 100
83 n_ctx = int(self._config.get("n_ctx", self._config.get("max_model_len", 384))) 101 n_ctx = int(self._config.get("n_ctx", self._config.get("max_model_len", 384)))
84 n_batch = int(self._config.get("n_batch", min(n_ctx, 384))) 102 n_batch = int(self._config.get("n_batch", min(n_ctx, 384)))
@@ -105,8 +123,9 @@ class Qwen3GGUFRerankerBackend: @@ -105,8 +123,9 @@ class Qwen3GGUFRerankerBackend:
105 from llama_cpp import Llama 123 from llama_cpp import Llama
106 except Exception as exc: # pragma: no cover - depends on optional dependency 124 except Exception as exc: # pragma: no cover - depends on optional dependency
107 raise RuntimeError( 125 raise RuntimeError(
108 - "qwen3_gguf backend requires llama-cpp-python. "  
109 - "Install the qwen3_gguf backend venv first via scripts/setup_reranker_venv.sh qwen3_gguf." 126 + f"{self._backend_name} backend requires llama-cpp-python. "
  127 + f"Install the {self._backend_name} backend venv first via "
  128 + f"scripts/setup_reranker_venv.sh {self._backend_name}."
110 ) from exc 129 ) from exc
111 130
112 self._llama_class = Llama 131 self._llama_class = Llama
@@ -118,7 +137,8 @@ class Qwen3GGUFRerankerBackend: @@ -118,7 +137,8 @@ class Qwen3GGUFRerankerBackend:
118 self._infer_lock = threading.Lock() 137 self._infer_lock = threading.Lock()
119 138
120 logger.info( 139 logger.info(
121 - "[Qwen3_GGUF] Loading model repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s", 140 + "[Qwen3_GGUF] Loading backend=%s repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s reuse_query_state=%s",
  141 + self._backend_name,
122 self._repo_id, 142 self._repo_id,
123 self._filename, 143 self._filename,
124 self._model_path or None, 144 self._model_path or None,
@@ -128,6 +148,7 @@ class Qwen3GGUFRerankerBackend: @@ -128,6 +148,7 @@ class Qwen3GGUFRerankerBackend:
128 n_gpu_layers, 148 n_gpu_layers,
129 flash_attn, 149 flash_attn,
130 offload_kqv, 150 offload_kqv,
  151 + self._reuse_query_state,
131 ) 152 )
132 153
133 llm_kwargs = { 154 llm_kwargs = {
@@ -158,6 +179,7 @@ class Qwen3GGUFRerankerBackend: @@ -158,6 +179,7 @@ class Qwen3GGUFRerankerBackend:
158 self._suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n" 179 self._suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
159 self._prefix_tokens = self._tokenize(self._prefix, special=True) 180 self._prefix_tokens = self._tokenize(self._prefix, special=True)
160 self._suffix_tokens = self._tokenize(self._suffix, special=True) 181 self._suffix_tokens = self._tokenize(self._suffix, special=True)
  182 + self._request_prefix_template = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: "
161 self._effective_max_len = self._n_ctx - len(self._prefix_tokens) - len(self._suffix_tokens) 183 self._effective_max_len = self._n_ctx - len(self._prefix_tokens) - len(self._suffix_tokens)
162 if self._effective_max_len <= 16: 184 if self._effective_max_len <= 16:
163 raise RuntimeError( 185 raise RuntimeError(
@@ -171,7 +193,8 @@ class Qwen3GGUFRerankerBackend: @@ -171,7 +193,8 @@ class Qwen3GGUFRerankerBackend:
171 self._warmup() 193 self._warmup()
172 194
173 logger.info( 195 logger.info(
174 - "[Qwen3_GGUF] Model ready | model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s", 196 + "[Qwen3_GGUF] Model ready | backend=%s model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s",
  197 + self._backend_name,
175 self._model_name, 198 self._model_name,
176 self._effective_max_len, 199 self._effective_max_len,
177 self._infer_batch_size, 200 self._infer_batch_size,
@@ -181,6 +204,14 @@ class Qwen3GGUFRerankerBackend: @@ -181,6 +204,14 @@ class Qwen3GGUFRerankerBackend:
181 def _load_model(self, llm_kwargs: Dict[str, Any]): 204 def _load_model(self, llm_kwargs: Dict[str, Any]):
182 if self._model_path: 205 if self._model_path:
183 return self._llama_class(model_path=self._model_path, **llm_kwargs) 206 return self._llama_class(model_path=self._model_path, **llm_kwargs)
  207 + if self._local_dir:
  208 + matches = sorted(
  209 + path for path in Path(self._local_dir).glob(self._filename) if path.is_file()
  210 + )
  211 + if matches:
  212 + local_model_path = str(matches[0].resolve())
  213 + logger.info("[Qwen3_GGUF] Using local GGUF file: %s", local_model_path)
  214 + return self._llama_class(model_path=local_model_path, **llm_kwargs)
184 return self._llama_class.from_pretrained( 215 return self._llama_class.from_pretrained(
185 repo_id=self._repo_id, 216 repo_id=self._repo_id,
186 filename=self._filename, 217 filename=self._filename,
@@ -212,6 +243,13 @@ class Qwen3GGUFRerankerBackend: @@ -212,6 +243,13 @@ class Qwen3GGUFRerankerBackend:
212 except Exception as exc: # pragma: no cover - defensive 243 except Exception as exc: # pragma: no cover - defensive
213 logger.warning("[Qwen3_GGUF] Warmup failed: %s", exc) 244 logger.warning("[Qwen3_GGUF] Warmup failed: %s", exc)
214 245
  246 + def _build_request_prefix_tokens(self, query: str) -> List[int]:
  247 + request_prefix = self._request_prefix_template.format(
  248 + instruction=self._instruction,
  249 + query=query,
  250 + )
  251 + return self._tokenize(request_prefix, special=False)
  252 +
215 def _build_prompt_tokens(self, query: str, doc: str) -> List[int]: 253 def _build_prompt_tokens(self, query: str, doc: str) -> List[int]:
216 pair = _format_instruction(self._instruction, query, doc) 254 pair = _format_instruction(self._instruction, query, doc)
217 pair_tokens = self._tokenize(pair, special=False) 255 pair_tokens = self._tokenize(pair, special=False)
@@ -235,6 +273,36 @@ class Qwen3GGUFRerankerBackend: @@ -235,6 +273,36 @@ class Qwen3GGUFRerankerBackend:
235 false_exp = math.exp(false_logit - max_logit) 273 false_exp = math.exp(false_logit - max_logit)
236 return float(true_exp / (true_exp + false_exp)) 274 return float(true_exp / (true_exp + false_exp))
237 275
  276 + def _supports_query_state_reuse(self) -> bool:
  277 + return (
  278 + self._reuse_query_state
  279 + and hasattr(self._llm, "save_state")
  280 + and hasattr(self._llm, "load_state")
  281 + )
  282 +
  283 + def _build_query_state_locked(self, query: str):
  284 + request_prefix_tokens = self._build_request_prefix_tokens(query)
  285 + max_doc_tokens = self._effective_max_len - len(request_prefix_tokens)
  286 + if max_doc_tokens <= 0:
  287 + return None, 0
  288 + self._llm.reset()
  289 + self._llm.eval(self._prefix_tokens + request_prefix_tokens)
  290 + return self._llm.save_state(), max_doc_tokens
  291 +
  292 + def _score_doc_with_state_locked(self, state, doc_tokens: List[int], max_doc_tokens: int) -> float:
  293 + self._llm.load_state(state)
  294 + self._llm.eval(doc_tokens[:max_doc_tokens] + self._suffix_tokens)
  295 + logits = self._llm.eval_logits
  296 + if not logits:
  297 + raise RuntimeError("llama.cpp returned empty logits")
  298 + final_logits = list(logits[-1])
  299 + true_logit = float(final_logits[self._true_token])
  300 + false_logit = float(final_logits[self._false_token])
  301 + max_logit = max(true_logit, false_logit)
  302 + true_exp = math.exp(true_logit - max_logit)
  303 + false_exp = math.exp(false_logit - max_logit)
  304 + return float(true_exp / (true_exp + false_exp))
  305 +
238 def _estimate_doc_lengths(self, docs: List[str]) -> List[int]: 306 def _estimate_doc_lengths(self, docs: List[str]) -> List[int]:
239 if self._length_sort_mode == "token": 307 if self._length_sort_mode == "token":
240 return [len(self._tokenize(text, special=False)) for text in docs] 308 return [len(self._tokenize(text, special=False)) for text in docs]
@@ -269,7 +337,7 @@ class Qwen3GGUFRerankerBackend: @@ -269,7 +337,7 @@ class Qwen3GGUFRerankerBackend:
269 "dedup_ratio": 0.0, 337 "dedup_ratio": 0.0,
270 "elapsed_ms": round(elapsed_ms, 3), 338 "elapsed_ms": round(elapsed_ms, 3),
271 "model": self._model_name, 339 "model": self._model_name,
272 - "backend": "qwen3_gguf", 340 + "backend": self._backend_name,
273 "normalize": normalize, 341 "normalize": normalize,
274 "infer_batch_size": self._infer_batch_size, 342 "infer_batch_size": self._infer_batch_size,
275 "inference_batches": 0, 343 "inference_batches": 0,
@@ -289,14 +357,26 @@ class Qwen3GGUFRerankerBackend: @@ -289,14 +357,26 @@ class Qwen3GGUFRerankerBackend:
289 order = sorted(order, key=lambda i: lengths[i]) 357 order = sorted(order, key=lambda i: lengths[i])
290 358
291 unique_scores: List[float] = [0.0] * len(unique_texts) 359 unique_scores: List[float] = [0.0] * len(unique_texts)
  360 + unique_doc_tokens = [self._tokenize(text, special=False) for text in unique_texts]
292 inference_batches = 0 361 inference_batches = 0
293 - for start in range(0, len(order), self._infer_batch_size):  
294 - batch_indices = order[start : start + self._infer_batch_size]  
295 - inference_batches += 1  
296 - for idx in batch_indices:  
297 - prompt = self._build_prompt_tokens(query, unique_texts[idx])  
298 - with self._infer_lock:  
299 - unique_scores[idx] = self._score_prompt(prompt) 362 + with self._infer_lock:
  363 + query_state = None
  364 + max_doc_tokens = self._effective_max_len
  365 + if self._supports_query_state_reuse():
  366 + query_state, max_doc_tokens = self._build_query_state_locked(query)
  367 + for start in range(0, len(order), self._infer_batch_size):
  368 + batch_indices = order[start : start + self._infer_batch_size]
  369 + inference_batches += 1
  370 + for idx in batch_indices:
  371 + if query_state is not None:
  372 + unique_scores[idx] = self._score_doc_with_state_locked(
  373 + query_state,
  374 + unique_doc_tokens[idx],
  375 + max_doc_tokens,
  376 + )
  377 + else:
  378 + prompt = self._build_prompt_tokens(query, unique_texts[idx])
  379 + unique_scores[idx] = self._score_prompt(prompt)
300 380
301 for (orig_idx, _), unique_idx in zip(indexed, position_to_unique): 381 for (orig_idx, _), unique_idx in zip(indexed, position_to_unique):
302 output_scores[orig_idx] = float(unique_scores[unique_idx]) 382 output_scores[orig_idx] = float(unique_scores[unique_idx])
@@ -313,7 +393,7 @@ class Qwen3GGUFRerankerBackend: @@ -313,7 +393,7 @@ class Qwen3GGUFRerankerBackend:
313 "dedup_ratio": round(dedup_ratio, 4), 393 "dedup_ratio": round(dedup_ratio, 4),
314 "elapsed_ms": round(elapsed_ms, 3), 394 "elapsed_ms": round(elapsed_ms, 3),
315 "model": self._model_name, 395 "model": self._model_name,
316 - "backend": "qwen3_gguf", 396 + "backend": self._backend_name,
317 "normalize": normalize, 397 "normalize": normalize,
318 "infer_batch_size": self._infer_batch_size, 398 "infer_batch_size": self._infer_batch_size,
319 "inference_batches": inference_batches, 399 "inference_batches": inference_batches,
@@ -323,5 +403,6 @@ class Qwen3GGUFRerankerBackend: @@ -323,5 +403,6 @@ class Qwen3GGUFRerankerBackend:
323 "n_batch": self._n_batch, 403 "n_batch": self._n_batch,
324 "n_ubatch": self._n_ubatch, 404 "n_ubatch": self._n_ubatch,
325 "n_gpu_layers": self._n_gpu_layers, 405 "n_gpu_layers": self._n_gpu_layers,
  406 + "reuse_query_state": query_state is not None,
326 } 407 }
327 return output_scores, meta 408 return output_scores, meta
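An aside on the scoring math in this diff (editorial, not part of the commit): both `_score_prompt` and the new `_score_doc_with_state_locked` turn the final-position logits of the "true"/"false" tokens into a probability with a two-class, max-subtracted softmax. A self-contained sketch:

```python
import math

def yes_probability(true_logit: float, false_logit: float) -> float:
    """Numerically stable two-class softmax: P(yes) = e^t / (e^t + e^f),
    with the max logit subtracted first so exp() cannot overflow."""
    m = max(true_logit, false_logit)
    true_exp = math.exp(true_logit - m)
    false_exp = math.exp(false_logit - m)
    return true_exp / (true_exp + false_exp)

# Example: a doc whose "true" logit beats "false" by 2 scores ~0.88.
print(round(yes_probability(5.0, 3.0), 4))  # 0.8808
```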
reranker/server.py
@@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional }
 Response: { "scores": [float], "meta": {...} }

 Backend selected via config: services.rerank.backend
-(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank), env RERANK_BACKEND.
+(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND.
 """

 import logging
scripts/benchmark_reranker_gguf_local.py 0 → 100644
@@ -0,0 +1,198 @@

#!/usr/bin/env python3
"""
Local tuning probe for GGUF reranker backends.

Runs the backend directly in a fresh process per config to measure:
- load time
- GPU memory used by this process
- single-request rerank latency

Example:
    ./.venv-reranker-gguf/bin/python scripts/benchmark_reranker_gguf_local.py
    ./.venv-reranker-gguf-06b/bin/python scripts/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400
"""

from __future__ import annotations

import argparse
import json
import os
import random
import statistics
import subprocess
import sys
import time
from pathlib import Path
from typing import Any


DEFAULT_TITLES = Path("/home/ubuntu/rerank_test/titles.1.8w")


def load_titles(path: Path) -> list[str]:
    items: list[str] = []
    with path.open(encoding="utf-8", errors="replace") as fh:
        for line in fh:
            text = line.strip()
            if text:
                items.append(text)
    return items


def gpu_mem_for_pid(pid: int) -> int:
    try:
        out = subprocess.check_output(
            [
                "nvidia-smi",
                "--query-compute-apps=pid,used_gpu_memory",
                "--format=csv,noheader,nounits",
            ],
            text=True,
        )
    except Exception:
        return -1
    for raw in out.splitlines():
        parts = [p.strip() for p in raw.split(",")]
        if len(parts) != 2:
            continue
        try:
            row_pid = int(parts[0])
            row_mem = int(parts[1])
        except ValueError:
            continue
        if row_pid == pid:
            return row_mem
    return -1


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("--backend-name", type=str, default="qwen3_gguf")
    parser.add_argument("--titles-file", type=Path, default=DEFAULT_TITLES)
    parser.add_argument("--query", type=str, default="白色oversized T-shirt")
    parser.add_argument("--docs", type=int, default=160)
    parser.add_argument("--repeat", type=int, default=1)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument(
        "--configs-json",
        type=str,
        default="",
        help="JSON array of config objects; when omitted, uses built-in scan set.",
    )
    args = parser.parse_args()

    if not args.titles_file.is_file():
        print(f"missing titles file: {args.titles_file}", file=sys.stderr)
        return 2

    titles = load_titles(args.titles_file)
    if len(titles) < args.docs:
        print(f"not enough titles: need {args.docs}, got {len(titles)}", file=sys.stderr)
        return 2

    random.seed(args.seed)
    docs = random.sample(titles, args.docs)

    if args.configs_json:
        configs = json.loads(args.configs_json)
    elif args.backend_name == "qwen3_gguf_06b":
        configs = [
            {"name": "gguf_06b_full_256", "n_ctx": 256, "n_batch": 256, "n_ubatch": 256, "n_gpu_layers": 999},
            {"name": "gguf_06b_full_320", "n_ctx": 320, "n_batch": 320, "n_ubatch": 320, "n_gpu_layers": 999},
            {"name": "gguf_06b_full_384", "n_ctx": 384, "n_batch": 384, "n_ubatch": 384, "n_gpu_layers": 999},
            {"name": "gguf_06b_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999},
        ]
    else:
        configs = [
            {"name": "gguf_t4_24g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 24},
            {"name": "gguf_t4_40g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 40},
            {"name": "gguf_t4_full", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 999},
            {"name": "gguf_t4_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 256, "n_gpu_layers": 999},
            {"name": "gguf_t4_full_512_u512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999},
            {"name": "gguf_t4_full_768", "n_ctx": 768, "n_batch": 768, "n_ubatch": 256, "n_gpu_layers": 999},
        ]

    from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend

    default_cfg_by_backend: dict[str, dict[str, Any]] = {
        "qwen3_gguf": {
            "_backend_name": "qwen3_gguf",
            "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF",
            "filename": "*Q8_0.gguf",
            "local_dir": "./models/reranker/qwen3-reranker-4b-gguf",
            "infer_batch_size": 8,
        },
        "qwen3_gguf_06b": {
            "_backend_name": "qwen3_gguf_06b",
            "repo_id": "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF",
            "filename": "qwen3-reranker-0.6b-q8_0.gguf",
            "local_dir": "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf",
            "infer_batch_size": 32,
        },
    }
    if args.backend_name not in default_cfg_by_backend:
        print(f"unsupported backend: {args.backend_name}", file=sys.stderr)
        return 2

    base_cfg: dict[str, Any] = {
        **default_cfg_by_backend[args.backend_name],
        "instruction": "Rank products by query with category & style match prioritized",
        "cache_dir": "./model_cache",
        "main_gpu": 0,
        "n_threads": 2,
        "n_threads_batch": 4,
        "flash_attn": True,
        "offload_kqv": True,
        "use_mmap": True,
        "use_mlock": False,
        "sort_by_doc_length": True,
        "length_sort_mode": "char",
        "enable_warmup": True,
        "verbose": False,
        "reuse_query_state": True,
    }

    all_results: list[dict[str, Any]] = []
    for cfg in configs:
        merged = dict(base_cfg)
        merged.update(cfg)
        name = str(merged.pop("name"))

        t0 = time.perf_counter()
        backend = Qwen3GGUFRerankerBackend(merged)
        load_ms = (time.perf_counter() - t0) * 1000.0
        gpu_mem_mib = gpu_mem_for_pid(os.getpid())

        runs: list[float] = []
        last_meta: dict[str, Any] = {}
        for _ in range(args.repeat):
            t1 = time.perf_counter()
            _scores, meta = backend.score_with_meta(args.query, docs, normalize=True)
            runs.append((time.perf_counter() - t1) * 1000.0)
            last_meta = dict(meta)

        result = {
            "name": name,
            "config": merged,
            "load_ms": round(load_ms, 2),
            "gpu_mem_mib": gpu_mem_mib,
            "latency_ms_min": round(min(runs), 2),
            "latency_ms_avg": round(statistics.mean(runs), 2),
            "latency_ms_max": round(max(runs), 2),
            "meta": last_meta,
        }
        all_results.append(result)
        print(json.dumps(result, ensure_ascii=False))
        del backend

    print("SUMMARY")
    for item in sorted(all_results, key=lambda x: x["latency_ms_avg"]):
        print(
            f'{item["name"]}: avg={item["latency_ms_avg"]}ms '
            f'gpu={item["gpu_mem_mib"]}MiB load={item["load_ms"]}ms'
        )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
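A hedged usage sketch (editorial, not part of the commit) for the `--configs-json` escape hatch above, driving a custom scan from Python; the venv path and `PYTHONPATH` are assumptions carried over from the verification commands in the tuning docs:

```python
# Sketch: run a custom parameter scan via --configs-json instead of the
# built-in config sets. "name" is required; other keys override the base config.
import json
import os
import subprocess

configs = [
    {"name": "probe_224", "n_ctx": 224, "n_batch": 224, "n_ubatch": 224, "n_gpu_layers": 999},
    {"name": "probe_288", "n_ctx": 288, "n_batch": 288, "n_ubatch": 288, "n_gpu_layers": 999},
]
subprocess.run(
    [
        "./.venv-reranker-gguf-06b/bin/python",
        "scripts/benchmark_reranker_gguf_local.py",
        "--backend-name", "qwen3_gguf_06b",
        "--docs", "80",
        "--configs-json", json.dumps(configs),
    ],
    check=True,
    env={**os.environ, "PYTHONPATH": "/data/saas-search"},
)
```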
scripts/lib/reranker_backend_env.sh
@@ -40,6 +40,7 @@ reranker_backend_venv_dir() {
   case "${backend}" in
     qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;;
     qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;;
+    qwen3_gguf_06b) printf '%s/.venv-reranker-gguf-06b\n' "${project_root}" ;;
     qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;;
     bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;;
     dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;;
@@ -54,6 +55,7 @@ reranker_backend_requirements_file() {
   case "${backend}" in
     qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;;
     qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;;
+    qwen3_gguf_06b) printf '%s/requirements_reranker_qwen3_gguf_06b.txt\n' "${project_root}" ;;
     qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;;
     bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;;
     dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;;
scripts/setup_reranker_venv.sh
@@ -50,6 +50,30 @@ echo "Using TMPDIR=${TMPDIR}"
 "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" --upgrade pip wheel
 "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r "${REQ_FILE}"

+if [[ "${BACKEND}" == qwen3_gguf* ]]; then
+  if [[ -x "/usr/local/cuda/bin/nvcc" ]]; then
+    "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" \
+      cmake \
+      ninja \
+      scikit-build-core \
+      flit_core \
+      setuptools-scm
+    echo "Rebuilding llama-cpp-python with CUDA support for ${BACKEND}"
+    PATH="/usr/local/cuda/bin:/usr/bin:/bin" \
+    CC="/usr/bin/x86_64-linux-gnu-gcc" \
+    CXX="/usr/bin/x86_64-linux-gnu-g++" \
+    CUDACXX="/usr/local/cuda/bin/nvcc" \
+    CMAKE_ARGS="-DGGML_CUDA=on" \
+    FORCE_CMAKE=1 \
+    "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" \
+      --force-reinstall \
+      --no-build-isolation \
+      "llama-cpp-python==0.3.18"
+  else
+    echo "WARNING: /usr/local/cuda/bin/nvcc not found; ${BACKEND} will be installed without CUDA support." >&2
+  fi
+fi
+
 echo
 echo "Done."
 echo "Backend: ${BACKEND}"
scripts/start_reranker.sh
@@ -43,6 +43,10 @@ export TMPDIR="${RERANKER_RUNTIME_DIR}/tmp"
 export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}"
 export PATH="${RERANKER_VENV}/bin:${PATH}"

+if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then
+  export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}"
+fi
+
 if [[ "${RERANK_BACKEND}" == "qwen3_vllm" ]]; then
   if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then
     echo "ERROR: qwen3_vllm backend requires NVIDIA GPU, but nvidia-smi is unavailable." >&2
@@ -64,16 +68,24 @@ PY
   fi
 fi

-if [[ "${RERANK_BACKEND}" == "qwen3_gguf" ]]; then
-  if ! "${PYTHON_BIN}" - <<'PY'
+if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then
+  gguf_check_status=0
+  "${PYTHON_BIN}" - <<'PY' || gguf_check_status=$?
 try:
-    import llama_cpp  # noqa: F401
+    import llama_cpp
+    if hasattr(llama_cpp, "llama_supports_gpu_offload") and not llama_cpp.llama_supports_gpu_offload():
+        raise SystemExit(2)
 except Exception:
     raise SystemExit(1)
 PY
-  then
-    echo "ERROR: qwen3_gguf backend requires llama-cpp-python in ${RERANKER_VENV}." >&2
-    echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
+  if [[ "${gguf_check_status}" != "0" ]]; then
+    if [[ "${gguf_check_status}" == "2" ]]; then
+      echo "ERROR: ${RERANK_BACKEND} backend detected a CPU-only llama-cpp-python build in ${RERANKER_VENV}." >&2
+      echo "Please rerun: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
+    else
+      echo "ERROR: ${RERANK_BACKEND} backend requires llama-cpp-python in ${RERANKER_VENV}." >&2
+      echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
+    fi
     exit 1
   fi
 fi
tests/test_reranker_qwen3_gguf_backend.py
@@ -12,6 +12,8 @@ class _FakeLlama:
         self.model_path = model_path
         self.kwargs = kwargs
         self.eval_logits = []
+        self._tokens = []
+        self.eval_call_count = 0

     @classmethod
     def from_pretrained(cls, repo_id: str, filename: str, local_dir=None, cache_dir=None, **kwargs):
@@ -31,16 +33,25 @@ class _FakeLlama:
         return [10 + (ord(ch) % 17) for ch in raw]

     def reset(self):
+        self._tokens = []
         return None

     def eval(self, prompt_tokens):
-        pos = float(sum(prompt_tokens) % 11) + 3.0
+        self.eval_call_count += 1
+        self._tokens.extend(prompt_tokens)
+        pos = float(sum(self._tokens) % 11) + 3.0
         neg = 1.0
         logits = [0.0] * 64
         logits[1] = pos
         logits[2] = neg
         self.eval_logits = [logits]

+    def save_state(self):
+        return list(self._tokens)
+
+    def load_state(self, state):
+        self._tokens = list(state)
+

 def _install_fake_llama_cpp(monkeypatch):
     fake_module = types.SimpleNamespace(Llama=_FakeLlama)
@@ -58,6 +69,21 @@ def test_qwen3_gguf_backend_factory_loads(monkeypatch):
         },
     )
     assert isinstance(backend, Qwen3GGUFRerankerBackend)
+    assert backend._backend_name == "qwen3_gguf"
+
+
+def test_qwen3_gguf_06b_backend_factory_loads(monkeypatch):
+    _install_fake_llama_cpp(monkeypatch)
+    backend = get_rerank_backend(
+        "qwen3_gguf_06b",
+        {
+            "enable_warmup": False,
+        },
+    )
+    assert isinstance(backend, Qwen3GGUFRerankerBackend)
+    assert backend._backend_name == "qwen3_gguf_06b"
+    assert backend._repo_id == "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF"
+    assert backend._filename == "qwen3-reranker-0.6b-q8_0.gguf"


 def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch):
@@ -69,6 +95,7 @@ def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch):
             "enable_warmup": False,
             "infer_batch_size": 2,
             "sort_by_doc_length": True,
+            "reuse_query_state": True,
         }
     )
@@ -88,3 +115,5 @@ def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch):
     assert meta["unique_docs"] == 2
     assert meta["backend"] == "qwen3_gguf"
     assert meta["inference_batches"] == 1
+    assert meta["reuse_query_state"] is True
+    assert backend._llm.eval_call_count == 3