Commit 3d508beb9fae5ebcd83a244ece6949be47296fed

Authored by tangwang
1 parent 87cacb1b

reranker-4b-gguf

config/config.yaml
@@ -381,7 +381,7 @@ services:
     max_docs: 1000
     normalize: true
     # Backend used inside the service (read when the reranker process starts)
-    backend: "qwen3_vllm"  # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
+    backend: "qwen3_vllm"  # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank
     backends:
       bge:
         model_name: "BAAI/bge-reranker-v2-m3"
@@ -420,6 +420,29 @@ services:
         use_fp16: true
         # sdpa: no flash-attn required by default; set flash_attention_2 if flash_attn is installed
         attn_implementation: "sdpa"
+      qwen3_gguf:
+        repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
+        filename: "*Q8_0.gguf"
+        cache_dir: "./model_cache"
+        local_dir: "./models/reranker/qwen3-reranker-4b-gguf"
+        instruction: "Rank products by query with category & style match prioritized"
+        # Conservative profile for a T4 16GB with roughly 5-6GB of free VRAM
+        n_ctx: 384
+        n_batch: 384
+        n_ubatch: 128
+        n_gpu_layers: 24
+        main_gpu: 0
+        n_threads: 2
+        n_threads_batch: 4
+        flash_attn: true
+        offload_kqv: true
+        use_mmap: true
+        use_mlock: false
+        infer_batch_size: 8
+        sort_by_doc_length: true
+        length_sort_mode: "char"
+        enable_warmup: true
+        verbose: false
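+        # Note: infer_batch_size / sort_by_doc_length can also be overridden at runtime via
+        # RERANK_GGUF_INFER_BATCH_SIZE / RERANK_GGUF_SORT_BY_DOC_LENGTH (read in backends/qwen3_gguf.py)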
       dashscope_rerank:
         model_name: "qwen3-rerank"
         # Pick the endpoint by region:
requirements_reranker_base.txt 0 → 100644
@@ -0,0 +1,7 @@
+# Shared base dependencies for reranker service venvs.
+
+fastapi>=0.100.0
+uvicorn[standard]>=0.23.0
+pydantic>=2.0.0
+numpy>=1.24.0
+pyyaml>=6.0
requirements_reranker_bge.txt 0 → 100644
@@ -0,0 +1,7 @@
+# Isolated dependencies for bge reranker backend.
+
+-r requirements_reranker_base.txt
+torch>=2.0.0
+transformers>=4.30.0
+sentence-transformers>=2.2.0
+modelscope>=1.9.0
requirements_reranker_dashscope.txt 0 → 100644
@@ -0,0 +1,3 @@
+# Isolated dependencies for dashscope_rerank backend.
+
+-r requirements_reranker_base.txt
requirements_reranker_qwen3_gguf.txt 0 → 100644
@@ -0,0 +1,4 @@
+# Isolated dependencies for qwen3_gguf reranker backend (.venv-reranker-gguf).
+
+-r requirements_reranker_base.txt
+llama-cpp-python>=0.3.16
requirements_reranker_qwen3_transformers.txt 0 → 100644
@@ -0,0 +1,5 @@
+# Isolated dependencies for qwen3_transformers reranker backend.
+
+-r requirements_reranker_base.txt
+torch>=2.0.0
+transformers>=4.51.0
requirements_reranker_qwen3_vllm.txt 0 → 100644
@@ -0,0 +1,5 @@
+# Isolated dependencies for qwen3_vllm reranker backend (.venv-reranker).
+
+-r requirements_reranker_base.txt
+transformers>=4.30.0
+vllm>=0.8.5
requirements_reranker_service.txt
-# Isolated dependencies for reranker service (.venv-reranker)
+# Legacy alias: qwen3_vllm reranker service env (.venv-reranker).
 #
-# Default backend is qwen3_vllm (Qwen3-Reranker-0.6B).
+# Prefer backend-specific requirements files:
+# - requirements_reranker_qwen3_vllm.txt
+# - requirements_reranker_qwen3_gguf.txt
+# - requirements_reranker_qwen3_transformers.txt
+# - requirements_reranker_bge.txt
+# - requirements_reranker_dashscope.txt
 
-fastapi>=0.100.0
-uvicorn[standard]>=0.23.0
-pydantic>=2.0.0
-numpy>=1.24.0
-pyyaml>=6.0
-transformers>=4.30.0
-vllm>=0.8.5
+-r requirements_reranker_qwen3_vllm.txt
reranker/DEPLOYMENT_AND_TUNING.md
-# Reranker Deployment & Performance Tuning Guide (Qwen3-vLLM)
+# Reranker Deployment & Performance Tuning Guide (Qwen3-vLLM / Qwen3-GGUF)
 
 This document captures reusable practices from this project's e-commerce search reranking scenario, covering:
 
 - Environment preparation and installation
-- `qwen3_vllm` configuration options and optimization ideas
+- `qwen3_vllm` / `qwen3_gguf` configuration options and optimization ideas
 - Benchmark workflow for the 1000-doc scenario
 - Key conclusions and recommended defaults
 - Common troubleshooting
 
 Scope:
 
-- Rerank backend: `services.rerank.backend: qwen3_vllm`
-- Model: `Qwen/Qwen3-Reranker-0.6B`
+- Rerank backend: `services.rerank.backend: qwen3_vllm` or `qwen3_gguf`
+- Model: `Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF`
 - Scenario: short queries (usually < 100 tokens); docs are product titles or title + short description; about 1000 docs per request
 
 ## 1. Environment Baseline
 
-Verified environment (2026-03-11):
+Verified environment (2026-03-25):
 
 - GPU: `Tesla T4 16GB`
 - Driver / CUDA: `570.158.01 / 12.8`
 - Python: `3.12.3`
-- Key dependencies: `vllm==0.17.0`, `torch==2.10.0+cu128`, `transformers==4.57.6`, `fastapi==0.135.1`, `uvicorn==0.41.0`
+- Key dependencies: `vllm==0.17.0`, `torch==2.10.0+cu128`, `transformers==4.57.6`, `llama-cpp-python>=0.3.16`, `fastapi==0.135.1`, `uvicorn==0.41.0`
 
 ## 2. Environment Preparation and Installation
 
 ### 2.1 Set up the isolated reranker virtualenv
 
 ```bash
-./scripts/setup_reranker_venv.sh
+./scripts/setup_reranker_venv.sh qwen3_vllm
+```
+
+If you use GGUF and need CUDA:
+
+```bash
+./scripts/setup_reranker_venv.sh qwen3_gguf
+PATH=/usr/local/cuda/bin:$PATH \
+CUDACXX=/usr/local/cuda/bin/nvcc \
+CMAKE_ARGS="-DGGML_CUDA=on" \
+FORCE_CMAKE=1 \
+./.venv-reranker-gguf/bin/pip install --no-cache-dir --force-reinstall --no-build-isolation llama-cpp-python==0.3.18
 ```
 
 ### 2.2 Basic checks
@@ -37,6 +48,7 @@
 nvidia-smi
 ./.venv-reranker/bin/python -c "import torch; print(torch.cuda.is_available())"
 ./.venv-reranker/bin/python -c "import vllm, transformers; print(vllm.__version__, transformers.__version__)"
+./.venv-reranker-gguf/bin/python -c "import llama_cpp; print(llama_cpp.__version__)"
 ```
 
 ## 3. Deployment and Operation
@@ -64,6 +76,29 @@ services:
         length_sort_mode: "char"  # char | token
 ```
 
+Recommended baseline for GGUF on a T4 with about `4.8~6GB` of free VRAM:
+
+```yaml
+services:
+  rerank:
+    backend: "qwen3_gguf"
+    backends:
+      qwen3_gguf:
+        repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
+        filename: "*Q8_0.gguf"
+        local_dir: "./models/reranker/qwen3-reranker-4b-gguf"
+        cache_dir: "./model_cache"
+        n_ctx: 384
+        n_batch: 384
+        n_ubatch: 128
+        n_gpu_layers: 24
+        flash_attn: true
+        offload_kqv: true
+        infer_batch_size: 8
+        sort_by_doc_length: true
+        length_sort_mode: "char"
+```
+
 ### 3.2 Start/stop commands
 
 Recommended unified entry point:
@@ -105,6 +140,13 @@ curl -sS http://127.0.0.1:6007/health
 - `service_ctl.sh` uses a dedicated startup path for the reranker
 - Added a "stable health check" (consecutive health probes) to avoid the false positive of "healthy for a moment, then exits"
 
+### 4.4 GGUF / T4 low-VRAM tuning principles
+
+- The `Q8_0` weights are about `4.28GB`, but the KV cache, CUDA workspace, and runtime fragmentation also need headroom, so feasibility cannot be judged by "model size < free VRAM" alone.
+- The current workload is short queries + product titles, so shrink `n_ctx` first; `384` is usually a much better deal than the default long context (see the sketch after this list).
+- On a low-VRAM T4, sweep `n_gpu_layers` first, then try raising `n_ctx`; in the current GGUF integration `infer_batch_size` is mainly a service-side work chunk, not a real llama.cpp operator batch.
+- Keep `flash_attn: true` and `offload_kqv: true` enabled by default; on OOM, lower `n_gpu_layers` first.
+
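+A back-of-envelope sketch of why shrinking `n_ctx` pays off (assumes the published Qwen3-4B shape of 36 layers, 8 KV heads, head_dim 128, and an f16 KV cache; verify against the model card before relying on it):
+
+```python
+# Hypothetical helper for budgeting, not part of the service code.
+def kv_cache_bytes(n_ctx, n_layers=36, n_kv_heads=8, head_dim=128, bytes_per_elem=2):
+    # K and V caches: per layer, per position, per KV head, per head dim
+    return 2 * n_layers * n_ctx * n_kv_heads * head_dim * bytes_per_elem
+
+print(kv_cache_bytes(384) / 2**20)    # ~54 MiB at n_ctx=384
+print(kv_cache_bytes(40960) / 2**30)  # ~5.6 GiB at a 40K-token default context
+```
+
+At `n_ctx=384` the KV cache is negligible next to the `4.28GB` of `Q8_0` weights, which is why `n_gpu_layers` is the first knob to sweep.
+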
 ## 5. Performance Tuning Workflow (standard procedure)
 
 ### 5.1 Use the one-shot benchmark script
@@ -125,6 +167,13 @@ curl -sS http://127.0.0.1:6007/health
 - `infer_batch_size`: `24 32 48 64`
 - Concurrency groups: `c=1` (single-request latency), `c=4` (concurrent throughput and tail latency)
 
+Suggested GGUF sweep (a loop sketch follows the list):
+
+- `n_gpu_layers`: `20 24 28`
+- `n_ctx`: `320 384 448`
+- `infer_batch_size`: `4 8 12` (secondary; only affects the service-side work chunk)
+- Sweep order: first fix `n_ctx=384` and find the largest `n_gpu_layers` that starts reliably; then try `n_ctx=448` if VRAM allows; fine-tune `infer_batch_size` last
+
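+A minimal sweep sketch, assuming `yq` v4 is available and that `service_ctl.sh restart reranker` (an illustrative invocation; check your script's actual subcommands) restarts the service; adapt both to your deployment:
+
+```bash
+for layers in 20 24 28; do
+  for ctx in 320 384 448; do
+    yq -i ".services.rerank.backends.qwen3_gguf.n_gpu_layers = ${layers}" config/config.yaml
+    yq -i ".services.rerank.backends.qwen3_gguf.n_ctx = ${ctx}" config/config.yaml
+    ./scripts/service_ctl.sh restart reranker || { echo "start failed: layers=${layers} ctx=${ctx}"; continue; }
+    ./scripts/benchmark_reranker_1000docs.sh
+  done
+done
+```
+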
 Overridable via environment variables:
 
 - `BATCH_SIZES`
@@ -140,23 +189,28 @@ curl -sS http://127.0.0.1:6007/health
 - `RERANK_VLLM_INFER_BATCH_SIZE`
 - `RERANK_VLLM_SORT_BY_DOC_LENGTH`
 
-## 6. Key Conclusions of This Round (2026-03-11)
-
-Based on the report:
-
-- `perf_reports/20260311/reranker_1000docs/report.md`
+## 6. Key Conclusions
 
-Conclusions
+vLLM (2026-03-11, see `perf_reports/20260311/reranker_1000docs/report.md`):
 
 - For single-request latency (`c=1`), which matters most for online reranking, `infer_batch_size=64` is best
 - `infer_batch_size=96` gives slightly higher throughput at higher concurrency but sacrifices single-request latency stability
 - The current default is `infer_batch_size=64` as the balance point
 
+GGUF (2026-03-25, this integration):
+
+- The `Q8_0` file of `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` is about `4.28GB`; with roughly `4823 MiB` of free VRAM measured on the current machine, the default is a non-aggressive partial GPU offload rather than offloading all layers.
+- Current recommended defaults: `n_ctx=384`, `n_batch=384`, `n_ubatch=128`, `n_gpu_layers=24`, `infer_batch_size=8`.
+- If free VRAM on site is closer to `6GB` with little fragmentation, try `n_gpu_layers=28` first; if startup fails, fall back to `24` or `20`.
+- The GGUF weights are not yet cached in this workspace, so no real throughput benchmark has been run; before going live, rerun the parameter sweep on the target machine and archive the report.
+
 ## 7. Production Recommendations
 
 - Keep the defaults: `infer_batch_size: 64`, `sort_by_doc_length: true`
 - Consider raising to `96` only when all of the following hold: the workload is throughput-first, higher single-request latency is acceptable, and the gain is verified by a benchmark on the same machine and data
 - After every change, rerun `benchmark_reranker_1000docs.sh` and archive the results
+- GGUF defaults to keep: `n_ctx: 384`, `n_gpu_layers: 24`, `infer_batch_size: 8`, `flash_attn: true`, `offload_kqv: true`
+- On GGUF OOM: lower `n_gpu_layers` first, then `n_ctx`, and only then `infer_batch_size`
 
 ## 8. Troubleshooting
 
@@ -194,6 +248,13 @@ lsof -i :6007 -P -n
 - Lower `infer_batch_size`
 - Check whether other processes occupy the same GPU
 
+For GGUF, adjust in this order (a CUDA-build sanity check follows the list):
+
+- Lower `n_gpu_layers`
+- Lower `n_ctx`
+- Lower `infer_batch_size`
+- Check whether other processes occupy the same GPU
+
 ## 9. Change and Verification Checklist
 
 After every reranker tuning change, complete at minimum:
reranker/README.md
@@ -4,10 +4,10 @@
 
 ---
 
-The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, DashScope cloud rerank). Callers access it over HTTP and do not care which backend is active.
+The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, Qwen3-GGUF, DashScope cloud rerank). Callers access it over HTTP and do not care which backend is active.
 
 **Features**
-- Multiple backends: `qwen3_vllm` (default, Qwen3-Reranker-0.6B + vLLM), `qwen3_transformers` (pure Transformers, no vLLM needed), `bge` (kept for compatibility)
+- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `bge` (kept for compatibility)
 - Cloud backend: `dashscope_rerank` (calls DashScope `/compatible-api/v1/reranks`; endpoint switchable by region)
 - Unified configuration: `config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<name>`
 - Document dedup, scores aligned with input order, FP16/GPU support (backend-dependent)
@@ -19,6 +19,7 @@ The reranker service exposes a unified `/rerank` API
 - `backends/bge.py`: BGE backend
 - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM backend
 - `backends/qwen3_transformers.py`: Qwen3-Reranker-0.6B pure Transformers backend (official Usage style)
+- `backends/qwen3_gguf.py`: Qwen3-Reranker-4B GGUF + llama.cpp backend
 - `backends/dashscope_rerank.py`: DashScope cloud rerank backend (HTTP calls)
 - `reranker/bge_reranker.py`: BGE core inference (wrapped by the bge backend)
 - `reranker/config.py`: service port, MAX_DOCS, NORMALIZE, etc. (backend params live in config.yaml)
@@ -27,18 +28,33 @@ The reranker service exposes a unified `/rerank` API
 - Common: `torch`, `transformers`, `fastapi`, `uvicorn` (isolated env: see `requirements_reranker_service.txt`; full ML env: see `requirements_ml.txt`)
 - **Qwen3-vLLM backend**: `vllm>=0.8.5`, `transformers>=4.51.0` (vLLM is only needed with `backend: qwen3_vllm`)
 - **Qwen3-Transformers backend**: `transformers>=4.51.0`, `torch` (no vLLM; suits CPU or small VRAM)
+- **Qwen3-GGUF backend**: `llama-cpp-python>=0.3.16`
+- Each backend now uses its own venv:
+  - `qwen3_vllm` -> `.venv-reranker`
+  - `qwen3_gguf` -> `.venv-reranker-gguf`
+  - `qwen3_transformers` -> `.venv-reranker-transformers`
+  - `bge` -> `.venv-reranker-bge`
+  - `dashscope_rerank` -> `.venv-reranker-dashscope`
   ```bash
-  ./scripts/setup_reranker_venv.sh
+  ./scripts/setup_reranker_venv.sh qwen3_gguf
+  ```
+  Recommended CUDA build:
+  ```bash
+  PATH=/usr/local/cuda/bin:$PATH \
+  CUDACXX=/usr/local/cuda/bin/nvcc \
+  CMAKE_ARGS="-DGGML_CUDA=on" \
+  FORCE_CMAKE=1 \
+  ./.venv-reranker-gguf/bin/pip install --no-cache-dir --force-reinstall --no-build-isolation llama-cpp-python==0.3.18
   ```
 
 ## Configuration
-- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `bge` | `dashscope_rerank`), or the env var `RERANK_BACKEND`.
+- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `bge` | `dashscope_rerank`), or the env var `RERANK_BACKEND`.
 - **Backend parameters**: `services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`, for example:
 
 ```yaml
 services:
   rerank:
-    backend: "qwen3_vllm"  # or bge
+    backend: "qwen3_gguf"  # or qwen3_vllm / bge
     backends:
       bge:
         model_name: "BAAI/bge-reranker-v2-m3"
@@ -65,6 +81,21 @@ services:
         tensor_parallel_size: 1
         gpu_memory_utilization: 0.8
         instruction: "Given a shopping query, rank product titles by relevance"
+      qwen3_gguf:
+        repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
+        filename: "*Q8_0.gguf"
+        local_dir: "./models/reranker/qwen3-reranker-4b-gguf"
+        cache_dir: "./model_cache"
+        instruction: "Rank products by query with category & style match prioritized"
+        n_ctx: 384
+        n_batch: 384
+        n_ubatch: 128
+        n_gpu_layers: 24
+        flash_attn: true
+        offload_kqv: true
+        infer_batch_size: 8
+        sort_by_doc_length: true
+        length_sort_mode: "char"
       dashscope_rerank:
         model_name: "qwen3-rerank"
         endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks"
@@ -94,7 +125,7 @@ DashScope authentication:
 ```bash
 ./scripts/start_reranker.sh
 ```
-The script uses the isolated env `.venv-reranker`; on first use, run `./scripts/setup_reranker_venv.sh` first.
+The script automatically picks the matching isolated venv from the current `services.rerank.backend`; on first use, run `./scripts/setup_reranker_venv.sh <backend>` first.
 
 ## Benchmark (1000 docs)
 ```bash
@@ -122,7 +153,7 @@ Content-Type: application/json
 ```
 
 `top_n` is optional:
-- Local backends (`qwen3_vllm` / `qwen3_transformers` / `bge`) usually ignore it and still return the full score list.
+- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `bge`) usually ignore it and still return the full score list.
 - For `dashscope_rerank` it can cap the candidates returned by the cloud; set it to `page+size` (e.g. pass `30` for pagination `from=20,size=10`).
 
 Response:
@@ -160,3 +191,4 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info
 - Batch parameters can be temporarily overridden at runtime via env vars: `RERANK_VLLM_INFER_BATCH_SIZE`, `RERANK_VLLM_SORT_BY_DOC_LENGTH`.
 - **Qwen3-vLLM**: see [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B); needs a GPU and fairly high VRAM; compared with BGE it suits long-text, high-throughput scenarios (vLLM prefix caching).
 - **Qwen3-Transformers**: the official Transformers Usage style, no vLLM; suits CPU or small VRAM. Default `attn_implementation: "sdpa"`; with `flash_attn` installed you may set `flash_attention_2` (the service falls back to sdpa automatically if it is missing).
+- **Qwen3-GGUF**: see [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF). On a single T4 with only about `4.8~6GB` of free VRAM, start from `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true`; on startup OOM, first drop `n_gpu_layers` to `20`, then `n_ctx` to `320`. In the GGUF backend, `infer_batch_size` is a service-side work chunk and usually matters less than `n_gpu_layers` / `n_ctx` (a scoring sketch follows).
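+
+For reference, a minimal sketch of how the GGUF backend turns logits into a score (this mirrors `_score_prompt` in `backends/qwen3_gguf.py`): the model is prompted to answer yes/no, and the score is P("yes") over the last-position logits of those two tokens:
+
+```python
+import math
+
+# Numerically stable two-way softmax over the "yes"/"no" token logits.
+def yes_probability(yes_logit: float, no_logit: float) -> float:
+    m = max(yes_logit, no_logit)
+    e_yes = math.exp(yes_logit - m)
+    e_no = math.exp(no_logit - m)
+    return e_yes / (e_yes + e_no)
+```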
reranker/backends/__init__.py
@@ -46,11 +46,14 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtocol:
     if name == "qwen3_transformers":
         from reranker.backends.qwen3_transformers import Qwen3TransformersRerankerBackend
         return Qwen3TransformersRerankerBackend(config)
+    if name == "qwen3_gguf":
+        from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend
+        return Qwen3GGUFRerankerBackend(config)
     if name == "dashscope_rerank":
         from reranker.backends.dashscope_rerank import DashScopeRerankBackend
         return DashScopeRerankBackend(config)
     raise ValueError(
-        f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, dashscope_rerank"
+        f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, dashscope_rerank"
     )
 
 
reranker/backends/qwen3_gguf.py 0 → 100644
@@ -0,0 +1,327 @@
+"""
+Qwen3-Reranker-4B GGUF backend using llama-cpp-python.
+
+Reference:
+- https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF
+- https://huggingface.co/Qwen/Qwen3-Reranker-4B
+"""
+
+from __future__ import annotations
+
+import logging
+import math
+import os
+import threading
+import time
+from typing import Any, Dict, List, Tuple
+
+
+logger = logging.getLogger("reranker.backends.qwen3_gguf")
+
+
+def deduplicate_with_positions(texts: List[str]) -> Tuple[List[str], List[int]]:
+    """Deduplicate texts globally while preserving first-seen order."""
+    unique_texts: List[str] = []
+    position_to_unique: List[int] = []
+    seen: Dict[str, int] = {}
+
+    for text in texts:
+        idx = seen.get(text)
+        if idx is None:
+            idx = len(unique_texts)
+            seen[text] = idx
+            unique_texts.append(text)
+        position_to_unique.append(idx)
+
+    return unique_texts, position_to_unique
+
+
+def _format_instruction(instruction: str, query: str, doc: str) -> str:
+    return "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
+        instruction=instruction,
+        query=query,
+        doc=doc,
+    )
+
+
+class Qwen3GGUFRerankerBackend:
+    """
+    Qwen3-Reranker-4B GGUF backend using llama.cpp through llama-cpp-python.
+
+    Tuned for short-query / short-doc reranking on a memory-constrained single T4.
+    Config from services.rerank.backends.qwen3_gguf.
+    """
+
+    def __init__(self, config: Dict[str, Any]) -> None:
+        self._config = config or {}
+        self._repo_id = str(
+            self._config.get("repo_id") or "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
+        ).strip()
+        self._filename = str(self._config.get("filename") or "*Q8_0.gguf").strip()
+        self._model_path = str(self._config.get("model_path") or "").strip()
+        self._cache_dir = str(self._config.get("cache_dir") or "").strip() or None
+        self._local_dir = str(self._config.get("local_dir") or "").strip() or None
+        self._instruction = str(
+            self._config.get("instruction")
+            or "Rank products by query with category & style match prioritized"
+        )
+        self._infer_batch_size = int(
+            os.getenv("RERANK_GGUF_INFER_BATCH_SIZE") or self._config.get("infer_batch_size", 8)
+        )
+        sort_by_doc_length = os.getenv("RERANK_GGUF_SORT_BY_DOC_LENGTH")
+        if sort_by_doc_length is None:
+            sort_by_doc_length = self._config.get("sort_by_doc_length", True)
+        self._sort_by_doc_length = str(sort_by_doc_length).strip().lower() in {
+            "1",
+            "true",
+            "yes",
+            "y",
+            "on",
+        }
+        self._length_sort_mode = str(self._config.get("length_sort_mode") or "char").strip().lower()
+
+        n_ctx = int(self._config.get("n_ctx", self._config.get("max_model_len", 384)))
+        n_batch = int(self._config.get("n_batch", min(n_ctx, 384)))
+        n_ubatch = int(self._config.get("n_ubatch", min(n_batch, 128)))
+        n_gpu_layers = int(self._config.get("n_gpu_layers", 24))
+        main_gpu = int(self._config.get("main_gpu", 0))
+        n_threads = int(self._config.get("n_threads", 2))
+        n_threads_batch = int(self._config.get("n_threads_batch", 4))
+        flash_attn = bool(self._config.get("flash_attn", True))
+        offload_kqv = bool(self._config.get("offload_kqv", True))
+        use_mmap = bool(self._config.get("use_mmap", True))
+        use_mlock = bool(self._config.get("use_mlock", False))
+        verbose = bool(self._config.get("verbose", False))
+        enable_warmup = bool(self._config.get("enable_warmup", True))
+
+        if self._infer_batch_size <= 0:
+            raise ValueError(f"infer_batch_size must be > 0, got {self._infer_batch_size}")
+        if n_ctx <= 0:
+            raise ValueError(f"n_ctx must be > 0, got {n_ctx}")
+        if n_batch <= 0 or n_ubatch <= 0:
+            raise ValueError(f"n_batch/n_ubatch must be > 0, got {n_batch}/{n_ubatch}")
+
+        try:
+            from llama_cpp import Llama
+        except Exception as exc:  # pragma: no cover - depends on optional dependency
+            raise RuntimeError(
+                "qwen3_gguf backend requires llama-cpp-python. "
+                "Install the qwen3_gguf backend venv first via scripts/setup_reranker_venv.sh qwen3_gguf."
+            ) from exc
+
+        self._llama_class = Llama
+        self._n_ctx = n_ctx
+        self._n_batch = n_batch
+        self._n_ubatch = n_ubatch
+        self._n_gpu_layers = n_gpu_layers
+        self._enable_warmup = enable_warmup
+        self._infer_lock = threading.Lock()
+
+        logger.info(
+            "[Qwen3_GGUF] Loading model repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s",
+            self._repo_id,
+            self._filename,
+            self._model_path or None,
+            n_ctx,
+            n_batch,
+            n_ubatch,
+            n_gpu_layers,
+            flash_attn,
+            offload_kqv,
+        )
+
+        llm_kwargs = {
+            "n_ctx": n_ctx,
+            "n_batch": n_batch,
+            "n_ubatch": n_ubatch,
+            "n_gpu_layers": n_gpu_layers,
+            "main_gpu": main_gpu,
+            "n_threads": n_threads,
+            "n_threads_batch": n_threads_batch,
+            "logits_all": True,
+            "offload_kqv": offload_kqv,
+            "flash_attn": flash_attn,
+            "use_mmap": use_mmap,
+            "use_mlock": use_mlock,
+            "verbose": verbose,
+        }
+        llm_kwargs = {key: value for key, value in llm_kwargs.items() if value is not None}
+        self._llm = self._load_model(llm_kwargs)
+        self._model_name = self._model_path or f"{self._repo_id}:{self._filename}"
+
+        self._prefix = (
+            "<|im_start|>system\n"
+            "Judge whether the Document meets the requirements based on the Query and the Instruct provided. "
+            'Note that the answer can only be "yes" or "no".'
+            "<|im_end|>\n<|im_start|>user\n"
+        )
+        self._suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+        self._prefix_tokens = self._tokenize(self._prefix, special=True)
+        self._suffix_tokens = self._tokenize(self._suffix, special=True)
+        self._effective_max_len = self._n_ctx - len(self._prefix_tokens) - len(self._suffix_tokens)
+        if self._effective_max_len <= 16:
+            raise RuntimeError(
+                f"n_ctx={self._n_ctx} is too small after prompt overhead; effective={self._effective_max_len}"
+            )
+
+        self._true_token = self._single_token_id("yes")
+        self._false_token = self._single_token_id("no")
+
+        if self._enable_warmup:
+            self._warmup()
+
+        logger.info(
+            "[Qwen3_GGUF] Model ready | model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s",
+            self._model_name,
+            self._effective_max_len,
+            self._infer_batch_size,
+            self._sort_by_doc_length,
+        )
+
+    def _load_model(self, llm_kwargs: Dict[str, Any]):
+        if self._model_path:
+            return self._llama_class(model_path=self._model_path, **llm_kwargs)
+        return self._llama_class.from_pretrained(
+            repo_id=self._repo_id,
+            filename=self._filename,
+            local_dir=self._local_dir,
+            cache_dir=self._cache_dir,
+            **llm_kwargs,
+        )
+
+    def _tokenize(self, text: str, *, special: bool) -> List[int]:
+        return list(
+            self._llm.tokenize(
+                text.encode("utf-8"),
+                add_bos=False,
+                special=special,
+            )
+        )
+
+    def _single_token_id(self, text: str) -> int:
+        token_ids = self._tokenize(text, special=False)
+        if len(token_ids) != 1:
+            raise RuntimeError(f"Expected {text!r} to be one token, got {token_ids}")
+        return int(token_ids[0])
+
+    def _warmup(self) -> None:
+        try:
+            prompt = self._build_prompt_tokens("warmup query", "warmup document")
+            with self._infer_lock:
+                self._eval_logits(prompt)
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.warning("[Qwen3_GGUF] Warmup failed: %s", exc)
+
+    def _build_prompt_tokens(self, query: str, doc: str) -> List[int]:
+        pair = _format_instruction(self._instruction, query, doc)
+        pair_tokens = self._tokenize(pair, special=False)
+        pair_tokens = pair_tokens[: self._effective_max_len]
+        return self._prefix_tokens + pair_tokens + self._suffix_tokens
+
+    def _eval_logits(self, prompt_tokens: List[int]) -> List[float]:
+        self._llm.reset()
+        self._llm.eval(prompt_tokens)
+        logits = self._llm.eval_logits
+        if not logits:
+            raise RuntimeError("llama.cpp returned empty logits")
+        return list(logits[-1])
+
+    def _score_prompt(self, prompt_tokens: List[int]) -> float:
+        logits = self._eval_logits(prompt_tokens)
+        true_logit = float(logits[self._true_token])
+        false_logit = float(logits[self._false_token])
+        max_logit = max(true_logit, false_logit)
+        true_exp = math.exp(true_logit - max_logit)
+        false_exp = math.exp(false_logit - max_logit)
+        return float(true_exp / (true_exp + false_exp))
+
+    def _estimate_doc_lengths(self, docs: List[str]) -> List[int]:
+        if self._length_sort_mode == "token":
+            return [len(self._tokenize(text, special=False)) for text in docs]
+        return [len(text) for text in docs]
+
+    def score_with_meta(
+        self,
+        query: str,
+        docs: List[str],
+        normalize: bool = True,
+    ) -> Tuple[List[float], Dict[str, Any]]:
+        start_ts = time.time()
+        total_docs = len(docs) if docs else 0
+        output_scores: List[float] = [0.0] * total_docs
+
+        query = "" if query is None else str(query).strip()
+        indexed: List[Tuple[int, str]] = []
+        for i, doc in enumerate(docs or []):
+            if doc is None:
+                continue
+            text = str(doc).strip()
+            if not text:
+                continue
+            indexed.append((i, text))
+
+        if not query or not indexed:
+            elapsed_ms = (time.time() - start_ts) * 1000.0
+            return output_scores, {
+                "input_docs": total_docs,
+                "usable_docs": len(indexed),
+                "unique_docs": 0,
+                "dedup_ratio": 0.0,
+                "elapsed_ms": round(elapsed_ms, 3),
+                "model": self._model_name,
+                "backend": "qwen3_gguf",
+                "normalize": normalize,
+                "infer_batch_size": self._infer_batch_size,
+                "inference_batches": 0,
+                "sort_by_doc_length": self._sort_by_doc_length,
+                "n_ctx": self._n_ctx,
+                "n_batch": self._n_batch,
+                "n_ubatch": self._n_ubatch,
+                "n_gpu_layers": self._n_gpu_layers,
+            }
+
+        indexed_texts = [text for _, text in indexed]
+        unique_texts, position_to_unique = deduplicate_with_positions(indexed_texts)
+
+        lengths = self._estimate_doc_lengths(unique_texts)
+        order = list(range(len(unique_texts)))
+        if self._sort_by_doc_length and len(unique_texts) > 1:
+            order = sorted(order, key=lambda i: lengths[i])
+
+        unique_scores: List[float] = [0.0] * len(unique_texts)
+        inference_batches = 0
+        for start in range(0, len(order), self._infer_batch_size):
+            batch_indices = order[start : start + self._infer_batch_size]
+            inference_batches += 1
+            for idx in batch_indices:
+                prompt = self._build_prompt_tokens(query, unique_texts[idx])
+                with self._infer_lock:
+                    unique_scores[idx] = self._score_prompt(prompt)
+
+        for (orig_idx, _), unique_idx in zip(indexed, position_to_unique):
+            output_scores[orig_idx] = float(unique_scores[unique_idx])
+
+        elapsed_ms = (time.time() - start_ts) * 1000.0
+        dedup_ratio = 0.0
+        if indexed:
+            dedup_ratio = 1.0 - (len(unique_texts) / float(len(indexed)))
+
+        meta = {
+            "input_docs": total_docs,
+            "usable_docs": len(indexed),
+            "unique_docs": len(unique_texts),
+            "dedup_ratio": round(dedup_ratio, 4),
+            "elapsed_ms": round(elapsed_ms, 3),
+            "model": self._model_name,
+            "backend": "qwen3_gguf",
+            "normalize": normalize,
+            "infer_batch_size": self._infer_batch_size,
+            "inference_batches": inference_batches,
+            "sort_by_doc_length": self._sort_by_doc_length,
+            "length_sort_mode": self._length_sort_mode,
+            "n_ctx": self._n_ctx,
+            "n_batch": self._n_batch,
+            "n_ubatch": self._n_ubatch,
+            "n_gpu_layers": self._n_gpu_layers,
+        }
+        return output_scores, meta
reranker/server.py
@@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional }
 Response: { "scores": [float], "meta": {...} }
 
 Backend selected via config: services.rerank.backend
-(bge | qwen3_vllm | qwen3_transformers | dashscope_rerank), env RERANK_BACKEND.
+(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank), env RERANK_BACKEND.
 """
 
 import logging
scripts/lib/reranker_backend_env.sh 0 → 100644
@@ -0,0 +1,62 @@
+#!/bin/bash
+#
+# Shared helpers for mapping reranker backends to isolated virtualenvs.
+#
+
+set -euo pipefail
+
+detect_rerank_backend() {
+  local project_root="$1"
+  local backend="${RERANK_BACKEND:-}"
+
+  if [[ -n "${backend}" ]]; then
+    printf '%s\n' "${backend}"
+    return 0
+  fi
+
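+  # Fall back to parsing services.rerank.backend out of config/config.yaml
+  # (assumes the two-space-indented "rerank:" block used by this repo's config).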
+  backend="$(
+    awk '
+      /^  rerank:$/ { in_rerank=1; next }
+      in_rerank && /^  [^ ]/ { in_rerank=0 }
+      in_rerank && /^    backend:/ {
+        gsub(/"/, "", $2)
+        print $2
+        exit
+      }
+    ' "${project_root}/config/config.yaml"
+  )"
+
+  if [[ -z "${backend}" ]]; then
+    backend="qwen3_vllm"
+  fi
+
+  printf '%s\n' "${backend}"
+}
+
+reranker_backend_venv_dir() {
+  local project_root="$1"
+  local backend="$2"
+
+  case "${backend}" in
+    qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;;
+    qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;;
+    qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;;
+    bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;;
+    dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;;
+    *) printf '%s/.venv-reranker-%s\n' "${project_root}" "${backend}" ;;
+  esac
+}
+
+reranker_backend_requirements_file() {
+  local project_root="$1"
+  local backend="$2"
+
+  case "${backend}" in
+    qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;;
+    qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;;
+    qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;;
+    bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;;
+    dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;;
+    *) return 1 ;;
+  esac
+}
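+
+# Usage sketch (illustrative; these helpers are meant to be sourced, not executed):
+#   source scripts/lib/reranker_backend_env.sh
+#   backend="$(detect_rerank_backend "${PROJECT_ROOT}")"
+#   venv="$(reranker_backend_venv_dir "${PROJECT_ROOT}" "${backend}")"
+#   req="$(reranker_backend_requirements_file "${PROJECT_ROOT}" "${backend}")"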
scripts/setup_reranker_venv.sh
 #!/bin/bash
 #
-# Create isolated venv for reranker service (.venv-reranker).
+# Create isolated venv for one reranker backend.
 #
 set -euo pipefail
 
 PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
 cd "${PROJECT_ROOT}"
 
-VENV_DIR="${PROJECT_ROOT}/.venv-reranker"
 PYTHON_BIN="${PYTHON_BIN:-python3}"
 TMP_DIR="${RERANKER_PIP_TMPDIR:-${PROJECT_ROOT}/.tmp/reranker-pip}"
 
+# shellcheck source=scripts/lib/load_env.sh
+source "${PROJECT_ROOT}/scripts/lib/load_env.sh"
+load_env_file "${PROJECT_ROOT}/.env"
+# shellcheck source=scripts/lib/reranker_backend_env.sh
+source "${PROJECT_ROOT}/scripts/lib/reranker_backend_env.sh"
+
+BACKEND="${1:-$(detect_rerank_backend "${PROJECT_ROOT}")}"
+VENV_DIR="${RERANKER_VENV:-$(reranker_backend_venv_dir "${PROJECT_ROOT}" "${BACKEND}")}"
+REQ_FILE="$(reranker_backend_requirements_file "${PROJECT_ROOT}" "${BACKEND}")"
+
+if [[ ! -f "${REQ_FILE}" ]]; then
+  echo "ERROR: requirements file not found for reranker backend ${BACKEND}: ${REQ_FILE}" >&2
+  exit 1
+fi
+
 if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then
   echo "ERROR: python not found: ${PYTHON_BIN}" >&2
   exit 1
@@ -34,9 +48,11 @@ PIP_ARGS=(--no-cache-dir)
 
 echo "Using TMPDIR=${TMPDIR}"
 "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" --upgrade pip wheel
-"${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r requirements_reranker_service.txt
+"${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r "${REQ_FILE}"
 
 echo
 echo "Done."
+echo "Backend: ${BACKEND}"
 echo "Reranker venv: ${VENV_DIR}"
+echo "Requirements: ${REQ_FILE}"
 echo "Start service: ./scripts/start_reranker.sh"
scripts/start_reranker.sh
 #!/bin/bash
 #
-# Start reranker service from isolated venv (.venv-reranker).
+# Start reranker service from its backend-specific isolated venv.
 #
 set -euo pipefail
 
 PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
 cd "${PROJECT_ROOT}"
 
-RERANKER_VENV="${RERANKER_VENV:-${PROJECT_ROOT}/.venv-reranker}"
-PYTHON_BIN="${RERANKER_VENV}/bin/python"
-
-if [[ ! -x "${PYTHON_BIN}" ]]; then
-  echo "ERROR: reranker venv not found: ${RERANKER_VENV}" >&2
-  echo "Please run: ./scripts/setup_reranker_venv.sh" >&2
-  exit 1
-fi
-
 # Load .env without activating main venv.
 # shellcheck source=scripts/lib/load_env.sh
 source "${PROJECT_ROOT}/scripts/lib/load_env.sh"
 load_env_file "${PROJECT_ROOT}/.env"
+# shellcheck source=scripts/lib/reranker_backend_env.sh
+source "${PROJECT_ROOT}/scripts/lib/reranker_backend_env.sh"
 
 RERANKER_HOST="${RERANKER_HOST:-0.0.0.0}"
 RERANKER_PORT="${RERANKER_PORT:-6007}"
-RERANK_BACKEND=$("${PYTHON_BIN}" -c "from config.services_config import get_rerank_backend_config; print(get_rerank_backend_config()[0])")
+RERANK_BACKEND="${RERANK_BACKEND:-$(detect_rerank_backend "${PROJECT_ROOT}")}"
+RERANKER_VENV="${RERANKER_VENV:-$(reranker_backend_venv_dir "${PROJECT_ROOT}" "${RERANK_BACKEND}")}"
+PYTHON_BIN="${RERANKER_VENV}/bin/python"
+
+if [[ ! -x "${PYTHON_BIN}" ]]; then
+  echo "ERROR: reranker venv not found for backend ${RERANK_BACKEND}: ${RERANKER_VENV}" >&2
+  echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
+  exit 1
+fi
 
 # Keep vLLM/triton/torch caches out of system disk.
 RERANKER_RUNTIME_DIR="${RERANKER_RUNTIME_DIR:-${PROJECT_ROOT}/.runtime/reranker}"
@@ -58,7 +59,21 @@ except Exception:
 PY
   then
     echo "ERROR: qwen3_vllm backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2
-    echo "Please run: ./scripts/setup_reranker_venv.sh and verify CUDA is available." >&2
+    echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND} and verify CUDA is available." >&2
+    exit 1
+  fi
+fi
+
+if [[ "${RERANK_BACKEND}" == "qwen3_gguf" ]]; then
+  if ! "${PYTHON_BIN}" - <<'PY'
+try:
+    import llama_cpp  # noqa: F401
+except Exception:
+    raise SystemExit(1)
+PY
+  then
+    echo "ERROR: qwen3_gguf backend requires llama-cpp-python in ${RERANKER_VENV}." >&2
+    echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
     exit 1
   fi
 fi
tests/test_reranker_qwen3_gguf_backend.py 0 → 100644
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+import sys
+import types
+
+from reranker.backends import get_rerank_backend
+from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend
+
+
+class _FakeLlama:
+    def __init__(self, model_path: str | None = None, **kwargs):
+        self.model_path = model_path
+        self.kwargs = kwargs
+        self.eval_logits = []
+
+    @classmethod
+    def from_pretrained(cls, repo_id: str, filename: str, local_dir=None, cache_dir=None, **kwargs):
+        inst = cls(model_path=f"{repo_id}/{filename}", **kwargs)
+        inst.repo_id = repo_id
+        inst.filename = filename
+        inst.local_dir = local_dir
+        inst.cache_dir = cache_dir
+        return inst
+
+    def tokenize(self, text: bytes, add_bos: bool = False, special: bool = False):
+        raw = text.decode("utf-8")
+        if raw == "yes":
+            return [1]
+        if raw == "no":
+            return [2]
+        return [10 + (ord(ch) % 17) for ch in raw]
+
+    def reset(self):
+        return None
+
+    def eval(self, prompt_tokens):
+        pos = float(sum(prompt_tokens) % 11) + 3.0
+        neg = 1.0
+        logits = [0.0] * 64
+        logits[1] = pos
+        logits[2] = neg
+        self.eval_logits = [logits]
+
+
+def _install_fake_llama_cpp(monkeypatch):
+    fake_module = types.SimpleNamespace(Llama=_FakeLlama)
+    monkeypatch.setitem(sys.modules, "llama_cpp", fake_module)
+
+
+def test_qwen3_gguf_backend_factory_loads(monkeypatch):
+    _install_fake_llama_cpp(monkeypatch)
+    backend = get_rerank_backend(
+        "qwen3_gguf",
+        {
+            "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF",
+            "filename": "*Q8_0.gguf",
+            "enable_warmup": False,
+        },
+    )
+    assert isinstance(backend, Qwen3GGUFRerankerBackend)
+
+
+def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch):
+    _install_fake_llama_cpp(monkeypatch)
+    backend = Qwen3GGUFRerankerBackend(
+        {
+            "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF",
+            "filename": "*Q8_0.gguf",
+            "enable_warmup": False,
+            "infer_batch_size": 2,
+            "sort_by_doc_length": True,
+        }
+    )
+
+    scores, meta = backend.score_with_meta(
+        query="wireless mouse",
+        docs=["doc-a", "doc-b", "doc-a", "", " ", None],
+        normalize=True,
+    )
+
+    assert len(scores) == 6
+    assert scores[0] == scores[2]
+    assert scores[0] > 0.5
+    assert scores[1] > 0.5
+    assert scores[3:] == [0.0, 0.0, 0.0]
+    assert meta["input_docs"] == 6
+    assert meta["usable_docs"] == 3
+    assert meta["unique_docs"] == 2
+    assert meta["backend"] == "qwen3_gguf"
+    assert meta["inference_batches"] == 1