diff --git a/config/config.yaml b/config/config.yaml
index 6cf5b49..6dd76a8 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -381,7 +381,7 @@ services:
     max_docs: 1000
     normalize: true
     # In-service backend (read when the reranker process starts)
-    backend: "qwen3_vllm"  # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
+    backend: "qwen3_vllm"  # bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank
     backends:
       bge:
         model_name: "BAAI/bge-reranker-v2-m3"
@@ -420,6 +420,29 @@ services:
         use_fp16: true
         # sdpa: no flash-attn needed by default; switch to flash_attention_2 if flash_attn is installed
         attn_implementation: "sdpa"
+      qwen3_gguf:
+        repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
+        filename: "*Q8_0.gguf"
+        cache_dir: "./model_cache"
+        local_dir: "./models/reranker/qwen3-reranker-4b-gguf"
+        instruction: "Rank products by query with category & style match prioritized"
+        # Conservative settings for a T4 16GB with roughly 5~6GB of free VRAM
+        n_ctx: 384
+        n_batch: 384
+        n_ubatch: 128
+        n_gpu_layers: 24
+        main_gpu: 0
+        n_threads: 2
+        n_threads_batch: 4
+        flash_attn: true
+        offload_kqv: true
+        use_mmap: true
+        use_mlock: false
+        infer_batch_size: 8
+        sort_by_doc_length: true
+        length_sort_mode: "char"
+        enable_warmup: true
+        verbose: false
       dashscope_rerank:
         model_name: "qwen3-rerank"
         # Choose the endpoint by region
         endpoint:
diff --git a/requirements_reranker_base.txt b/requirements_reranker_base.txt
new file mode 100644
index 0000000..510f4fb
--- /dev/null
+++ b/requirements_reranker_base.txt
@@ -0,0 +1,7 @@
+# Shared base dependencies for reranker service venvs.
+
+fastapi>=0.100.0
+uvicorn[standard]>=0.23.0
+pydantic>=2.0.0
+numpy>=1.24.0
+pyyaml>=6.0
diff --git a/requirements_reranker_bge.txt b/requirements_reranker_bge.txt
new file mode 100644
index 0000000..187ec43
--- /dev/null
+++ b/requirements_reranker_bge.txt
@@ -0,0 +1,7 @@
+# Isolated dependencies for bge reranker backend.
+
+-r requirements_reranker_base.txt
+torch>=2.0.0
+transformers>=4.30.0
+sentence-transformers>=2.2.0
+modelscope>=1.9.0
diff --git a/requirements_reranker_dashscope.txt b/requirements_reranker_dashscope.txt
new file mode 100644
index 0000000..ae1def7
--- /dev/null
+++ b/requirements_reranker_dashscope.txt
@@ -0,0 +1,3 @@
+# Isolated dependencies for dashscope_rerank backend.
+
+-r requirements_reranker_base.txt
diff --git a/requirements_reranker_qwen3_gguf.txt b/requirements_reranker_qwen3_gguf.txt
new file mode 100644
index 0000000..c97f222
--- /dev/null
+++ b/requirements_reranker_qwen3_gguf.txt
@@ -0,0 +1,4 @@
+# Isolated dependencies for qwen3_gguf reranker backend (.venv-reranker-gguf).
+
+-r requirements_reranker_base.txt
+llama-cpp-python>=0.3.16
diff --git a/requirements_reranker_qwen3_transformers.txt b/requirements_reranker_qwen3_transformers.txt
new file mode 100644
index 0000000..062c900
--- /dev/null
+++ b/requirements_reranker_qwen3_transformers.txt
@@ -0,0 +1,5 @@
+# Isolated dependencies for qwen3_transformers reranker backend.
+
+-r requirements_reranker_base.txt
+torch>=2.0.0
+transformers>=4.51.0
diff --git a/requirements_reranker_qwen3_vllm.txt b/requirements_reranker_qwen3_vllm.txt
new file mode 100644
index 0000000..322d24a
--- /dev/null
+++ b/requirements_reranker_qwen3_vllm.txt
@@ -0,0 +1,5 @@
+# Isolated dependencies for qwen3_vllm reranker backend (.venv-reranker).
+
+-r requirements_reranker_base.txt
+transformers>=4.30.0
+vllm>=0.8.5
diff --git a/requirements_reranker_service.txt b/requirements_reranker_service.txt
index 409542c..18b738d 100644
--- a/requirements_reranker_service.txt
+++ b/requirements_reranker_service.txt
@@ -1,11 +1,10 @@
-# Isolated dependencies for reranker service (.venv-reranker)
+# Legacy alias: qwen3_vllm reranker service env (.venv-reranker).
 #
-# Default backend is qwen3_vllm (Qwen3-Reranker-0.6B).
+# Prefer backend-specific requirements files:
+# - requirements_reranker_qwen3_vllm.txt
+# - requirements_reranker_qwen3_gguf.txt
+# - requirements_reranker_qwen3_transformers.txt
+# - requirements_reranker_bge.txt
+# - requirements_reranker_dashscope.txt
 
-fastapi>=0.100.0
-uvicorn[standard]>=0.23.0
-pydantic>=2.0.0
-numpy>=1.24.0
-pyyaml>=6.0
-transformers>=4.30.0
-vllm>=0.8.5
+-r requirements_reranker_qwen3_vllm.txt
diff --git a/reranker/DEPLOYMENT_AND_TUNING.md b/reranker/DEPLOYMENT_AND_TUNING.md
index e289135..c9a18eb 100644
--- a/reranker/DEPLOYMENT_AND_TUNING.md
+++ b/reranker/DEPLOYMENT_AND_TUNING.md
@@ -1,34 +1,45 @@
-# Reranker Deployment and Performance Tuning Guide (Qwen3-vLLM)
+# Reranker Deployment and Performance Tuning Guide (Qwen3-vLLM / Qwen3-GGUF)
 
 This document captures reusable practices from this project for e-commerce search reranking, covering:
 
 - Environment preparation and installation
-- `qwen3_vllm` configuration options and optimization ideas
+- `qwen3_vllm` / `qwen3_gguf` configuration options and optimization ideas
 - Benchmark workflow for the 1000-doc scenario
 - Key conclusions and recommended defaults
 - Common troubleshooting
 
 Scope:
 
-- Rerank backend: `services.rerank.backend: qwen3_vllm`
-- Model: `Qwen/Qwen3-Reranker-0.6B`
+- Rerank backend: `services.rerank.backend: qwen3_vllm` or `qwen3_gguf`
+- Model: `Qwen/Qwen3-Reranker-0.6B` / `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF`
- Scenario: short queries (usually < 100 tokens); docs are product titles or titles plus short descriptions; about 1000 docs per request
 
 ## 1. Environment baseline
 
-Verified environment (2026-03-11):
+Verified environment (2026-03-25):
 
 - GPU: `Tesla T4 16GB`
 - Driver / CUDA: `570.158.01 / 12.8`
 - Python: `3.12.3`
-- Key dependencies: `vllm==0.17.0`, `torch==2.10.0+cu128`, `transformers==4.57.6`, `fastapi==0.135.1`, `uvicorn==0.41.0`
+- Key dependencies: `vllm==0.17.0`, `torch==2.10.0+cu128`, `transformers==4.57.6`, `llama-cpp-python>=0.3.16`, `fastapi==0.135.1`, `uvicorn==0.41.0`
 
 ## 2. Environment preparation and installation
 
 ### 2.1 Create the isolated reranker virtualenv
 
 ```bash
-./scripts/setup_reranker_venv.sh
+./scripts/setup_reranker_venv.sh qwen3_vllm
+```
+
+If you use GGUF and need CUDA:
+
+```bash
+./scripts/setup_reranker_venv.sh qwen3_gguf
+PATH=/usr/local/cuda/bin:$PATH \
+CUDACXX=/usr/local/cuda/bin/nvcc \
+CMAKE_ARGS="-DGGML_CUDA=on" \
+FORCE_CMAKE=1 \
+./.venv-reranker-gguf/bin/pip install --no-cache-dir --force-reinstall --no-build-isolation llama-cpp-python==0.3.18
 ```
 
 ### 2.2 Basic checks
@@ -37,6 +48,7 @@
 nvidia-smi
 ./.venv-reranker/bin/python -c "import torch; print(torch.cuda.is_available())"
 ./.venv-reranker/bin/python -c "import vllm, transformers; print(vllm.__version__, transformers.__version__)"
+./.venv-reranker-gguf/bin/python -c "import llama_cpp; print(llama_cpp.__version__)"
 ```
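+
+### 2.3 (Optional) Pre-fetch the GGUF weights
+
+`Llama.from_pretrained` downloads the `Q8_0` file on first startup. If the deployment machine should not download at service start time, the file can be fetched ahead of time. The following is a minimal sketch, not part of the service itself; it assumes `huggingface_hub` is importable in the gguf venv (install it explicitly if it is not already present as a transitive dependency):
+
+```python
+from huggingface_hub import snapshot_download
+
+# Fetch only the Q8_0 quantization into the same local_dir configured in config.yaml.
+snapshot_download(
+    repo_id="DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF",
+    allow_patterns=["*Q8_0.gguf"],
+    local_dir="./models/reranker/qwen3-reranker-4b-gguf",
+    cache_dir="./model_cache",
+)
+```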
 
 ## 3. Deployment and operation
 
@@ -64,6 +76,29 @@ services:
         length_sort_mode: "char"   # char | token
 ```
 
+Recommended baseline for GGUF on a T4 with roughly `4.8~6GB` of free VRAM:
+
+```yaml
+services:
+  rerank:
+    backend: "qwen3_gguf"
+    backends:
+      qwen3_gguf:
+        repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
+        filename: "*Q8_0.gguf"
+        local_dir: "./models/reranker/qwen3-reranker-4b-gguf"
+        cache_dir: "./model_cache"
+        n_ctx: 384
+        n_batch: 384
+        n_ubatch: 128
+        n_gpu_layers: 24
+        flash_attn: true
+        offload_kqv: true
+        infer_batch_size: 8
+        sort_by_doc_length: true
+        length_sort_mode: "char"
+```
+
 ### 3.2 Start/stop commands
 
 The recommended, unified way:
 
@@ -105,6 +140,13 @@ curl -sS http://127.0.0.1:6007/health
 - `service_ctl.sh` uses a dedicated startup path for the reranker
 - A "stable health check" (consecutive health probes) avoids the false positive of "exits right after turning healthy"
 
+### 4.4 GGUF / T4 low-VRAM tuning principles
+
+- The `Q8_0` weights are about `4.28GB`, but KV cache, CUDA workspaces and runtime fragmentation also need headroom, so feasibility cannot be judged simply by "model size < free VRAM".
+- The current workload is short queries plus product titles, so shrink `n_ctx` first; `384` is usually a better trade-off than the default long context.
+- On a small-VRAM T4, sweep `n_gpu_layers` first and only then try raising `n_ctx`; in the current GGUF integration `infer_batch_size` is mainly a service-side work chunk, not the real operator batch inside llama.cpp.
+- Keep `flash_attn: true` and `offload_kqv: true` enabled by default; on OOM, lower `n_gpu_layers` first.
+
 ## 5. Performance tuning workflow (standard procedure)
 
 ### 5.1 Use the one-shot benchmark script
 
@@ -125,6 +167,13 @@ curl -sS http://127.0.0.1:6007/health
 - `infer_batch_size`: `24 32 48 64`
 - Concurrency groups: `c=1` (single-request latency), `c=4` (concurrent throughput and tail latency)
 
+Suggested sweep for GGUF:
+
+- `n_gpu_layers`: `20 24 28`
+- `n_ctx`: `320 384 448`
+- `infer_batch_size`: `4 8 12` (secondary; only affects the service-side work chunk)
+- Sweep order: fix `n_ctx=384` first and find the largest `n_gpu_layers` that starts reliably; then try `n_ctx=448` if VRAM allows; only fine-tune `infer_batch_size` last
+
 Can be overridden via environment variables:
 
 - `BATCH_SIZES`
@@ -140,23 +189,28 @@ curl -sS http://127.0.0.1:6007/health
 - `RERANK_VLLM_INFER_BATCH_SIZE`
 - `RERANK_VLLM_SORT_BY_DOC_LENGTH`
 
-## 6. Key conclusions from this round (2026-03-11)
-
-Based on the report:
-
-- `perf_reports/20260311/reranker_1000docs/report.md`
+## 6. Key conclusions from this round
 
-Conclusions:
+vLLM (2026-03-11, see `perf_reports/20260311/reranker_1000docs/report.md`):
 
 - For single-request latency (`c=1`), which matters most for online reranking, `infer_batch_size=64` is best
 - `infer_batch_size=96` gives slightly higher throughput at higher concurrency, but sacrifices single-request latency stability
 - The current default of `infer_batch_size=64` is kept as the balance point
 
+GGUF (2026-03-25, this integration):
+
+- The `Q8_0` file of `DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF` is about `4.28GB`; with roughly `4823 MiB` of free VRAM measured on the current machine, aggressive full GPU offload is not used by default.
+- Current recommended defaults: `n_ctx=384`, `n_batch=384`, `n_ubatch=128`, `n_gpu_layers=24`, `infer_batch_size=8`.
+- If free VRAM on site is closer to `6GB` with little fragmentation, try `n_gpu_layers=28` first; if startup fails, fall back to `24` or `20`.
+- Because this workspace has not yet cached the GGUF weights, no real throughput benchmark has been run in this round; before going live, re-run the parameter sweep on the deployment machine and archive the report.
+
 ## 7. Production recommendations
 
 - Keep the defaults: `infer_batch_size: 64`, `sort_by_doc_length: true`
 - Consider raising it to `96` only when all of the following hold: throughput matters more than latency, higher single-request latency is acceptable, and the gain has been verified on the same machine with the same data
 - After every change, re-run `benchmark_reranker_1000docs.sh` and archive the results
+- GGUF defaults to keep: `n_ctx: 384`, `n_gpu_layers: 24`, `infer_batch_size: 8`, `flash_attn: true`, `offload_kqv: true`
+- On GGUF OOM: lower `n_gpu_layers` first, then `n_ctx`, and only then `infer_batch_size`
 
 ## 8. Troubleshooting
 
@@ -194,6 +248,13 @@ lsof -i :6007 -P -n
 - Lower `infer_batch_size`
 - Check whether other processes are using the same GPU
 
+For GGUF, adjust in this order:
+
+- Lower `n_gpu_layers`
+- Lower `n_ctx`
+- Lower `infer_batch_size`
+- Check whether other processes are using the same GPU
+
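+A rough way to sanity-check a candidate configuration before restarting the service is to estimate its VRAM footprint. The sketch below is illustrative only: the weight share is scaled from the `Q8_0` file size, and the KV-cache term uses assumed Qwen3-4B-like attention dimensions (36 layers, 8 KV heads, head_dim 128) plus a guessed fixed overhead, so treat the result as a ballpark, not a guarantee.
+
+```python
+# Ballpark VRAM estimate for a partial GPU offload of the GGUF reranker.
+# All constants are assumptions for illustration, not measured values.
+def estimate_vram_gb(file_size_gb=4.28, n_layers=36, n_gpu_layers=24,
+                     n_ctx=384, kv_heads=8, head_dim=128, kv_bytes=2):
+    weights = file_size_gb * (n_gpu_layers / n_layers)      # offloaded weight share
+    kv = 2 * n_gpu_layers * n_ctx * kv_heads * head_dim * kv_bytes / 1e9  # K + V cache
+    overhead = 0.6                                           # CUDA context / workspaces (guess)
+    return round(weights + kv + overhead, 2)
+
+print(estimate_vram_gb())                 # baseline from this document
+print(estimate_vram_gb(n_gpu_layers=28))  # more aggressive candidate
+```
+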
 ## 9. Change and verification checklist
 
 After every reranker tuning change, complete at least:
diff --git a/reranker/README.md b/reranker/README.md
index 4be9625..0460b3c 100644
--- a/reranker/README.md
+++ b/reranker/README.md
@@ -4,10 +4,10 @@
 
 ---
 
-The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, DashScope cloud rerank). Callers talk to it over HTTP and do not care about the concrete backend.
+The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, Qwen3-GGUF, DashScope cloud rerank). Callers talk to it over HTTP and do not care about the concrete backend.
 
 **Features**
-- Multiple backends: `qwen3_vllm` (default, Qwen3-Reranker-0.6B + vLLM), `qwen3_transformers` (pure Transformers, no vLLM), `bge` (kept for compatibility)
+- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `bge` (kept for compatibility)
 - Cloud backend: `dashscope_rerank` (calls DashScope `/compatible-api/v1/reranks`, endpoint switchable by region)
 - Unified configuration: `config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<backend>`
 - Document dedup, scores aligned with input order, FP16/GPU support (depending on backend)
@@ -19,6 +19,7 @@
   - `backends/bge.py`: BGE backend
   - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM backend
   - `backends/qwen3_transformers.py`: Qwen3-Reranker-0.6B pure Transformers backend (official Usage style)
+  - `backends/qwen3_gguf.py`: Qwen3-Reranker-4B GGUF + llama.cpp backend
   - `backends/dashscope_rerank.py`: DashScope cloud rerank backend (HTTP calls)
   - `reranker/bge_reranker.py`: BGE core inference (wrapped by the bge backend)
   - `reranker/config.py`: service port, MAX_DOCS, NORMALIZE, etc. (backend parameters live in config.yaml)
@@ -27,18 +28,33 @@
 - Common: `torch`, `transformers`, `fastapi`, `uvicorn` (isolated env: see `requirements_reranker_service.txt`; full ML env: see `requirements_ml.txt`)
 - **Qwen3-vLLM backend**: `vllm>=0.8.5`, `transformers>=4.51.0` (vLLM is only needed with `backend: qwen3_vllm`)
 - **Qwen3-Transformers backend**: `transformers>=4.51.0`, `torch` (no vLLM; suits CPU or small VRAM)
+- **Qwen3-GGUF backend**: `llama-cpp-python>=0.3.16`
+- Each backend now uses its own isolated venv:
+  - `qwen3_vllm` -> `.venv-reranker`
+  - `qwen3_gguf` -> `.venv-reranker-gguf`
+  - `qwen3_transformers` -> `.venv-reranker-transformers`
+  - `bge` -> `.venv-reranker-bge`
+  - `dashscope_rerank` -> `.venv-reranker-dashscope`
   ```bash
-  ./scripts/setup_reranker_venv.sh
+  ./scripts/setup_reranker_venv.sh qwen3_gguf
+  ```
+  Recommended CUDA build:
+  ```bash
+  PATH=/usr/local/cuda/bin:$PATH \
+  CUDACXX=/usr/local/cuda/bin/nvcc \
+  CMAKE_ARGS="-DGGML_CUDA=on" \
+  FORCE_CMAKE=1 \
+  ./.venv-reranker-gguf/bin/pip install --no-cache-dir --force-reinstall --no-build-isolation llama-cpp-python==0.3.18
   ```
 
 ## Configuration
-- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` environment variable.
+- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` environment variable.
 - **Backend parameters**: `services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`, for example:
 ```yaml
 services:
   rerank:
-    backend: "qwen3_vllm"  # or bge
+    backend: "qwen3_gguf"  # or qwen3_vllm / bge
     backends:
       bge:
         model_name: "BAAI/bge-reranker-v2-m3"
@@ -65,6 +81,21 @@ services:
         tensor_parallel_size: 1
         gpu_memory_utilization: 0.8
         instruction: "Given a shopping query, rank product titles by relevance"
+      qwen3_gguf:
+        repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
+        filename: "*Q8_0.gguf"
+        local_dir: "./models/reranker/qwen3-reranker-4b-gguf"
+        cache_dir: "./model_cache"
+        instruction: "Rank products by query with category & style match prioritized"
+        n_ctx: 384
+        n_batch: 384
+        n_ubatch: 128
+        n_gpu_layers: 24
+        flash_attn: true
+        offload_kqv: true
+        infer_batch_size: 8
+        sort_by_doc_length: true
+        length_sort_mode: "char"
       dashscope_rerank:
         model_name: "qwen3-rerank"
         endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks"
"https://dashscope.aliyuncs.com/compatible-api/v1/reranks" @@ -94,7 +125,7 @@ DashScope 认证: ```bash ./scripts/start_reranker.sh ``` -该脚本会使用隔离环境 `.venv-reranker`;首次请先执行 `./scripts/setup_reranker_venv.sh`。 +该脚本会按当前 `services.rerank.backend` 自动选择对应的独立 venv;首次请先执行 `./scripts/setup_reranker_venv.sh `。 ## 性能压测(1000 docs) ```bash @@ -122,7 +153,7 @@ Content-Type: application/json ``` `top_n` 为可选字段: -- 对本地后端(`qwen3_vllm` / `qwen3_transformers` / `bge`)通常会忽略,仍返回全量分数。 +- 对本地后端(`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `bge`)通常会忽略,仍返回全量分数。 - 对 `dashscope_rerank` 可用于控制云端返回的候选量,建议设置为 `page+size`(例如分页 `from=20,size=10` 时传 `30`)。 Response: @@ -160,3 +191,4 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info - 运行时可用环境变量临时覆盖批量参数:`RERANK_VLLM_INFER_BATCH_SIZE`、`RERANK_VLLM_SORT_BY_DOC_LENGTH`。 - **Qwen3-vLLM**:参考 [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B),需 GPU 与较多显存;与 BGE 相比适合长文本、高吞吐场景(vLLM 前缀缓存)。 - **Qwen3-Transformers**:官方 Transformers Usage 方式,无需 vLLM;适合 CPU 或小显存。默认 `attn_implementation: "sdpa"`;若已安装 `flash_attn` 可设 `flash_attention_2`(未安装时服务会自动回退到 sdpa)。 +- **Qwen3-GGUF**:参考 [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF)。单卡 T4 且仅剩约 `4.8~6GB` 显存时,推荐 `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true` 起步;若启动 OOM,优先把 `n_gpu_layers` 下调到 `20`,再把 `n_ctx` 下调到 `320`。`infer_batch_size` 在 GGUF 后端是服务侧 work chunk,大多不如 `n_gpu_layers` / `n_ctx` 关键。 diff --git a/reranker/backends/__init__.py b/reranker/backends/__init__.py index f68d472..293eb1c 100644 --- a/reranker/backends/__init__.py +++ b/reranker/backends/__init__.py @@ -46,11 +46,14 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc if name == "qwen3_transformers": from reranker.backends.qwen3_transformers import Qwen3TransformersRerankerBackend return Qwen3TransformersRerankerBackend(config) + if name == "qwen3_gguf": + from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend + return Qwen3GGUFRerankerBackend(config) if name == "dashscope_rerank": from reranker.backends.dashscope_rerank import DashScopeRerankBackend return DashScopeRerankBackend(config) raise ValueError( - f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, dashscope_rerank" + f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, dashscope_rerank" ) diff --git a/reranker/backends/qwen3_gguf.py b/reranker/backends/qwen3_gguf.py new file mode 100644 index 0000000..c980506 --- /dev/null +++ b/reranker/backends/qwen3_gguf.py @@ -0,0 +1,327 @@ +""" +Qwen3-Reranker-4B GGUF backend using llama-cpp-python. 
diff --git a/reranker/backends/__init__.py b/reranker/backends/__init__.py
index f68d472..293eb1c 100644
--- a/reranker/backends/__init__.py
+++ b/reranker/backends/__init__.py
@@ -46,11 +46,14 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc
     if name == "qwen3_transformers":
         from reranker.backends.qwen3_transformers import Qwen3TransformersRerankerBackend
         return Qwen3TransformersRerankerBackend(config)
+    if name == "qwen3_gguf":
+        from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend
+        return Qwen3GGUFRerankerBackend(config)
     if name == "dashscope_rerank":
         from reranker.backends.dashscope_rerank import DashScopeRerankBackend
         return DashScopeRerankBackend(config)
     raise ValueError(
-        f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, dashscope_rerank"
+        f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_transformers, qwen3_gguf, dashscope_rerank"
     )
diff --git a/reranker/backends/qwen3_gguf.py b/reranker/backends/qwen3_gguf.py
new file mode 100644
index 0000000..c980506
--- /dev/null
+++ b/reranker/backends/qwen3_gguf.py
@@ -0,0 +1,327 @@
+"""
+Qwen3-Reranker-4B GGUF backend using llama-cpp-python.
+
+Reference:
+- https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF
+- https://huggingface.co/Qwen/Qwen3-Reranker-4B
+"""
+
+from __future__ import annotations
+
+import logging
+import math
+import os
+import threading
+import time
+from typing import Any, Dict, List, Tuple
+
+
+logger = logging.getLogger("reranker.backends.qwen3_gguf")
+
+
+def deduplicate_with_positions(texts: List[str]) -> Tuple[List[str], List[int]]:
+    """Deduplicate texts globally while preserving first-seen order."""
+    unique_texts: List[str] = []
+    position_to_unique: List[int] = []
+    seen: Dict[str, int] = {}
+
+    for text in texts:
+        idx = seen.get(text)
+        if idx is None:
+            idx = len(unique_texts)
+            seen[text] = idx
+            unique_texts.append(text)
+        position_to_unique.append(idx)
+
+    return unique_texts, position_to_unique
+
+
+def _format_instruction(instruction: str, query: str, doc: str) -> str:
+    return "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
+        instruction=instruction,
+        query=query,
+        doc=doc,
+    )
+
+
+class Qwen3GGUFRerankerBackend:
+    """
+    Qwen3-Reranker-4B GGUF backend using llama.cpp through llama-cpp-python.
+
+    Tuned for short-query / short-doc reranking on a memory-constrained single T4.
+    Config from services.rerank.backends.qwen3_gguf.
+    """
+
+    def __init__(self, config: Dict[str, Any]) -> None:
+        self._config = config or {}
+        self._repo_id = str(
+            self._config.get("repo_id") or "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
+        ).strip()
+        self._filename = str(self._config.get("filename") or "*Q8_0.gguf").strip()
+        self._model_path = str(self._config.get("model_path") or "").strip()
+        self._cache_dir = str(self._config.get("cache_dir") or "").strip() or None
+        self._local_dir = str(self._config.get("local_dir") or "").strip() or None
+        self._instruction = str(
+            self._config.get("instruction")
+            or "Rank products by query with category & style match prioritized"
+        )
+        self._infer_batch_size = int(
+            os.getenv("RERANK_GGUF_INFER_BATCH_SIZE") or self._config.get("infer_batch_size", 8)
+        )
+        sort_by_doc_length = os.getenv("RERANK_GGUF_SORT_BY_DOC_LENGTH")
+        if sort_by_doc_length is None:
+            sort_by_doc_length = self._config.get("sort_by_doc_length", True)
+        self._sort_by_doc_length = str(sort_by_doc_length).strip().lower() in {
+            "1",
+            "true",
+            "yes",
+            "y",
+            "on",
+        }
+        self._length_sort_mode = str(self._config.get("length_sort_mode") or "char").strip().lower()
+
+        n_ctx = int(self._config.get("n_ctx", self._config.get("max_model_len", 384)))
+        n_batch = int(self._config.get("n_batch", min(n_ctx, 384)))
+        n_ubatch = int(self._config.get("n_ubatch", min(n_batch, 128)))
+        n_gpu_layers = int(self._config.get("n_gpu_layers", 24))
+        main_gpu = int(self._config.get("main_gpu", 0))
+        n_threads = int(self._config.get("n_threads", 2))
+        n_threads_batch = int(self._config.get("n_threads_batch", 4))
+        flash_attn = bool(self._config.get("flash_attn", True))
+        offload_kqv = bool(self._config.get("offload_kqv", True))
+        use_mmap = bool(self._config.get("use_mmap", True))
+        use_mlock = bool(self._config.get("use_mlock", False))
+        verbose = bool(self._config.get("verbose", False))
+        enable_warmup = bool(self._config.get("enable_warmup", True))
+
+        if self._infer_batch_size <= 0:
+            raise ValueError(f"infer_batch_size must be > 0, got {self._infer_batch_size}")
+        if n_ctx <= 0:
+            raise ValueError(f"n_ctx must be > 0, got {n_ctx}")
+        if n_batch <= 0 or n_ubatch <= 0:
+            raise ValueError(f"n_batch/n_ubatch must be > 0, got {n_batch}/{n_ubatch}")
+
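+        # llama_cpp is imported here (not at module level) so that merely
+        # importing this module never requires llama-cpp-python; a missing
+        # install surfaces as a clear RuntimeError pointing at the per-backend
+        # venv setup instead of an opaque ImportError at service start.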
+        try:
+            from llama_cpp import Llama
+        except Exception as exc:  # pragma: no cover - depends on optional dependency
+            raise RuntimeError(
+                "qwen3_gguf backend requires llama-cpp-python. "
+                "Install the qwen3_gguf backend venv first via scripts/setup_reranker_venv.sh qwen3_gguf."
+            ) from exc
+
+        self._llama_class = Llama
+        self._n_ctx = n_ctx
+        self._n_batch = n_batch
+        self._n_ubatch = n_ubatch
+        self._n_gpu_layers = n_gpu_layers
+        self._enable_warmup = enable_warmup
+        self._infer_lock = threading.Lock()
+
+        logger.info(
+            "[Qwen3_GGUF] Loading model repo=%s filename=%s model_path=%s n_ctx=%s n_batch=%s n_ubatch=%s n_gpu_layers=%s flash_attn=%s offload_kqv=%s",
+            self._repo_id,
+            self._filename,
+            self._model_path or None,
+            n_ctx,
+            n_batch,
+            n_ubatch,
+            n_gpu_layers,
+            flash_attn,
+            offload_kqv,
+        )
+
+        llm_kwargs = {
+            "n_ctx": n_ctx,
+            "n_batch": n_batch,
+            "n_ubatch": n_ubatch,
+            "n_gpu_layers": n_gpu_layers,
+            "main_gpu": main_gpu,
+            "n_threads": n_threads,
+            "n_threads_batch": n_threads_batch,
+            "logits_all": True,
+            "offload_kqv": offload_kqv,
+            "flash_attn": flash_attn,
+            "use_mmap": use_mmap,
+            "use_mlock": use_mlock,
+            "verbose": verbose,
+        }
+        llm_kwargs = {key: value for key, value in llm_kwargs.items() if value is not None}
+        self._llm = self._load_model(llm_kwargs)
+        self._model_name = self._model_path or f"{self._repo_id}:{self._filename}"
+
+        self._prefix = (
+            "<|im_start|>system\n"
+            "Judge whether the Document meets the requirements based on the Query and the Instruct provided. "
+            'Note that the answer can only be "yes" or "no".'
+            "<|im_end|>\n<|im_start|>user\n"
+        )
+        self._suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+        self._prefix_tokens = self._tokenize(self._prefix, special=True)
+        self._suffix_tokens = self._tokenize(self._suffix, special=True)
+        self._effective_max_len = self._n_ctx - len(self._prefix_tokens) - len(self._suffix_tokens)
+        if self._effective_max_len <= 16:
+            raise RuntimeError(
+                f"n_ctx={self._n_ctx} is too small after prompt overhead; effective={self._effective_max_len}"
+            )
+
+        self._true_token = self._single_token_id("yes")
+        self._false_token = self._single_token_id("no")
+
+        if self._enable_warmup:
+            self._warmup()
+
+        logger.info(
+            "[Qwen3_GGUF] Model ready | model=%s effective_max_len=%s infer_batch_size=%s sort_by_doc_length=%s",
+            self._model_name,
+            self._effective_max_len,
+            self._infer_batch_size,
+            self._sort_by_doc_length,
+        )
+
+    def _load_model(self, llm_kwargs: Dict[str, Any]):
+        if self._model_path:
+            return self._llama_class(model_path=self._model_path, **llm_kwargs)
+        return self._llama_class.from_pretrained(
+            repo_id=self._repo_id,
+            filename=self._filename,
+            local_dir=self._local_dir,
+            cache_dir=self._cache_dir,
+            **llm_kwargs,
+        )
+
+    def _tokenize(self, text: str, *, special: bool) -> List[int]:
+        return list(
+            self._llm.tokenize(
+                text.encode("utf-8"),
+                add_bos=False,
+                special=special,
+            )
+        )
+
+    def _single_token_id(self, text: str) -> int:
+        token_ids = self._tokenize(text, special=False)
+        if len(token_ids) != 1:
+            raise RuntimeError(f"Expected {text!r} to be one token, got {token_ids}")
+        return int(token_ids[0])
+
+    def _warmup(self) -> None:
+        try:
+            prompt = self._build_prompt_tokens("warmup query", "warmup document")
+            with self._infer_lock:
+                self._eval_logits(prompt)
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.warning("[Qwen3_GGUF] Warmup failed: %s", exc)
+
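+    # Prompt layout follows the official Qwen3-Reranker usage: a fixed system
+    # prefix asking for a yes/no judgement, a user block built by
+    # _format_instruction (<Instruct>/<Query>/<Document>), and an assistant
+    # suffix with an empty <think> section. Only the user pair is truncated,
+    # so the yes/no logits are always read from the final position.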
+    def _build_prompt_tokens(self, query: str, doc: str) -> List[int]:
+        pair = _format_instruction(self._instruction, query, doc)
+        pair_tokens = self._tokenize(pair, special=False)
+        pair_tokens = pair_tokens[: self._effective_max_len]
+        return self._prefix_tokens + pair_tokens + self._suffix_tokens
+
+    def _eval_logits(self, prompt_tokens: List[int]) -> List[float]:
+        self._llm.reset()
+        self._llm.eval(prompt_tokens)
+        logits = self._llm.eval_logits
+        if not logits:
+            raise RuntimeError("llama.cpp returned empty logits")
+        return list(logits[-1])
+
+    def _score_prompt(self, prompt_tokens: List[int]) -> float:
+        logits = self._eval_logits(prompt_tokens)
+        true_logit = float(logits[self._true_token])
+        false_logit = float(logits[self._false_token])
+        max_logit = max(true_logit, false_logit)
+        true_exp = math.exp(true_logit - max_logit)
+        false_exp = math.exp(false_logit - max_logit)
+        return float(true_exp / (true_exp + false_exp))
+
+    def _estimate_doc_lengths(self, docs: List[str]) -> List[int]:
+        if self._length_sort_mode == "token":
+            return [len(self._tokenize(text, special=False)) for text in docs]
+        return [len(text) for text in docs]
+
+    def score_with_meta(
+        self,
+        query: str,
+        docs: List[str],
+        normalize: bool = True,
+    ) -> Tuple[List[float], Dict[str, Any]]:
+        start_ts = time.time()
+        total_docs = len(docs) if docs else 0
+        output_scores: List[float] = [0.0] * total_docs
+
+        query = "" if query is None else str(query).strip()
+        indexed: List[Tuple[int, str]] = []
+        for i, doc in enumerate(docs or []):
+            if doc is None:
+                continue
+            text = str(doc).strip()
+            if not text:
+                continue
+            indexed.append((i, text))
+
+        if not query or not indexed:
+            elapsed_ms = (time.time() - start_ts) * 1000.0
+            return output_scores, {
+                "input_docs": total_docs,
+                "usable_docs": len(indexed),
+                "unique_docs": 0,
+                "dedup_ratio": 0.0,
+                "elapsed_ms": round(elapsed_ms, 3),
+                "model": self._model_name,
+                "backend": "qwen3_gguf",
+                "normalize": normalize,
+                "infer_batch_size": self._infer_batch_size,
+                "inference_batches": 0,
+                "sort_by_doc_length": self._sort_by_doc_length,
+                "n_ctx": self._n_ctx,
+                "n_batch": self._n_batch,
+                "n_ubatch": self._n_ubatch,
+                "n_gpu_layers": self._n_gpu_layers,
+            }
+
+        indexed_texts = [text for _, text in indexed]
+        unique_texts, position_to_unique = deduplicate_with_positions(indexed_texts)
+
+        lengths = self._estimate_doc_lengths(unique_texts)
+        order = list(range(len(unique_texts)))
+        if self._sort_by_doc_length and len(unique_texts) > 1:
+            order = sorted(order, key=lambda i: lengths[i])
+
+        unique_scores: List[float] = [0.0] * len(unique_texts)
+        inference_batches = 0
+        for start in range(0, len(order), self._infer_batch_size):
+            batch_indices = order[start : start + self._infer_batch_size]
+            inference_batches += 1
+            for idx in batch_indices:
+                prompt = self._build_prompt_tokens(query, unique_texts[idx])
+                with self._infer_lock:
+                    unique_scores[idx] = self._score_prompt(prompt)
+
+        for (orig_idx, _), unique_idx in zip(indexed, position_to_unique):
+            output_scores[orig_idx] = float(unique_scores[unique_idx])
+
+        elapsed_ms = (time.time() - start_ts) * 1000.0
+        dedup_ratio = 0.0
+        if indexed:
+            dedup_ratio = 1.0 - (len(unique_texts) / float(len(indexed)))
+
+        meta = {
+            "input_docs": total_docs,
+            "usable_docs": len(indexed),
+            "unique_docs": len(unique_texts),
+            "dedup_ratio": round(dedup_ratio, 4),
+            "elapsed_ms": round(elapsed_ms, 3),
+            "model": self._model_name,
+            "backend": "qwen3_gguf",
+            "normalize": normalize,
+            "infer_batch_size": self._infer_batch_size,
+            "inference_batches": inference_batches,
+            "sort_by_doc_length": self._sort_by_doc_length,
+            "length_sort_mode": self._length_sort_mode,
+            "n_ctx": self._n_ctx,
"n_batch": self._n_batch, + "n_ubatch": self._n_ubatch, + "n_gpu_layers": self._n_gpu_layers, + } + return output_scores, meta diff --git a/reranker/server.py b/reranker/server.py index 11034db..42d3ec0 100644 --- a/reranker/server.py +++ b/reranker/server.py @@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional Response: { "scores": [float], "meta": {...} } Backend selected via config: services.rerank.backend -(bge | qwen3_vllm | qwen3_transformers | dashscope_rerank), env RERANK_BACKEND. +(bge | qwen3_vllm | qwen3_transformers | qwen3_gguf | dashscope_rerank), env RERANK_BACKEND. """ import logging diff --git a/scripts/lib/reranker_backend_env.sh b/scripts/lib/reranker_backend_env.sh new file mode 100644 index 0000000..6464f8b --- /dev/null +++ b/scripts/lib/reranker_backend_env.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# +# Shared helpers for mapping reranker backends to isolated virtualenvs. +# + +set -euo pipefail + +detect_rerank_backend() { + local project_root="$1" + local backend="${RERANK_BACKEND:-}" + + if [[ -n "${backend}" ]]; then + printf '%s\n' "${backend}" + return 0 + fi + + backend="$( + awk ' + /^ rerank:$/ { in_rerank=1; next } + in_rerank && /^ [^ ]/ { in_rerank=0 } + in_rerank && /^ backend:/ { + gsub(/"/, "", $2) + print $2 + exit + } + ' "${project_root}/config/config.yaml" + )" + + if [[ -z "${backend}" ]]; then + backend="qwen3_vllm" + fi + + printf '%s\n' "${backend}" +} + +reranker_backend_venv_dir() { + local project_root="$1" + local backend="$2" + + case "${backend}" in + qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;; + qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;; + qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;; + bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;; + dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;; + *) printf '%s/.venv-reranker-%s\n' "${project_root}" "${backend}" ;; + esac +} + +reranker_backend_requirements_file() { + local project_root="$1" + local backend="$2" + + case "${backend}" in + qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;; + qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;; + qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;; + bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;; + dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;; + *) return 1 ;; + esac +} diff --git a/scripts/setup_reranker_venv.sh b/scripts/setup_reranker_venv.sh index 9df7735..1b338f8 100755 --- a/scripts/setup_reranker_venv.sh +++ b/scripts/setup_reranker_venv.sh @@ -1,16 +1,30 @@ #!/bin/bash # -# Create isolated venv for reranker service (.venv-reranker). +# Create isolated venv for one reranker backend. # set -euo pipefail PROJECT_ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" cd "${PROJECT_ROOT}" -VENV_DIR="${PROJECT_ROOT}/.venv-reranker" PYTHON_BIN="${PYTHON_BIN:-python3}" TMP_DIR="${RERANKER_PIP_TMPDIR:-${PROJECT_ROOT}/.tmp/reranker-pip}" +# shellcheck source=scripts/lib/load_env.sh +source "${PROJECT_ROOT}/scripts/lib/load_env.sh" +load_env_file "${PROJECT_ROOT}/.env" +# shellcheck source=scripts/lib/reranker_backend_env.sh +source "${PROJECT_ROOT}/scripts/lib/reranker_backend_env.sh" + +BACKEND="${1:-$(detect_rerank_backend "${PROJECT_ROOT}")}" +VENV_DIR="${RERANKER_VENV:-$(reranker_backend_venv_dir "${PROJECT_ROOT}" "${BACKEND}")}" +REQ_FILE="$(reranker_backend_requirements_file "${PROJECT_ROOT}" "${BACKEND}")" + +if [[ ! -f "${REQ_FILE}" ]]; then + echo "ERROR: requirements file not found for reranker backend ${BACKEND}: ${REQ_FILE}" >&2 + exit 1 +fi + if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then echo "ERROR: python not found: ${PYTHON_BIN}" >&2 exit 1 @@ -34,9 +48,11 @@ PIP_ARGS=(--no-cache-dir) echo "Using TMPDIR=${TMPDIR}" "${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" --upgrade pip wheel -"${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r requirements_reranker_service.txt +"${VENV_DIR}/bin/python" -m pip install "${PIP_ARGS[@]}" -r "${REQ_FILE}" echo echo "Done." +echo "Backend: ${BACKEND}" echo "Reranker venv: ${VENV_DIR}" +echo "Requirements: ${REQ_FILE}" echo "Start service: ./scripts/start_reranker.sh" diff --git a/scripts/start_reranker.sh b/scripts/start_reranker.sh index 9d58998..2fd6649 100755 --- a/scripts/start_reranker.sh +++ b/scripts/start_reranker.sh @@ -1,29 +1,30 @@ #!/bin/bash # -# Start reranker service from isolated venv (.venv-reranker). +# Start reranker service from its backend-specific isolated venv. # set -euo pipefail PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)" cd "${PROJECT_ROOT}" -RERANKER_VENV="${RERANKER_VENV:-${PROJECT_ROOT}/.venv-reranker}" -PYTHON_BIN="${RERANKER_VENV}/bin/python" - -if [[ ! -x "${PYTHON_BIN}" ]]; then - echo "ERROR: reranker venv not found: ${RERANKER_VENV}" >&2 - echo "Please run: ./scripts/setup_reranker_venv.sh" >&2 - exit 1 -fi - # Load .env without activating main venv. # shellcheck source=scripts/lib/load_env.sh source "${PROJECT_ROOT}/scripts/lib/load_env.sh" load_env_file "${PROJECT_ROOT}/.env" +# shellcheck source=scripts/lib/reranker_backend_env.sh +source "${PROJECT_ROOT}/scripts/lib/reranker_backend_env.sh" RERANKER_HOST="${RERANKER_HOST:-0.0.0.0}" RERANKER_PORT="${RERANKER_PORT:-6007}" -RERANK_BACKEND=$("${PYTHON_BIN}" -c "from config.services_config import get_rerank_backend_config; print(get_rerank_backend_config()[0])") +RERANK_BACKEND="${RERANK_BACKEND:-$(detect_rerank_backend "${PROJECT_ROOT}")}" +RERANKER_VENV="${RERANKER_VENV:-$(reranker_backend_venv_dir "${PROJECT_ROOT}" "${RERANK_BACKEND}")}" +PYTHON_BIN="${RERANKER_VENV}/bin/python" + +if [[ ! -x "${PYTHON_BIN}" ]]; then + echo "ERROR: reranker venv not found for backend ${RERANK_BACKEND}: ${RERANKER_VENV}" >&2 + echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 + exit 1 +fi # Keep vLLM/triton/torch caches out of system disk. RERANKER_RUNTIME_DIR="${RERANKER_RUNTIME_DIR:-${PROJECT_ROOT}/.runtime/reranker}" @@ -58,7 +59,21 @@ except Exception: PY then echo "ERROR: qwen3_vllm backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2 - echo "Please run: ./scripts/setup_reranker_venv.sh and verify CUDA is available." >&2 + echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND} and verify CUDA is available." 
>&2 + exit 1 + fi +fi + +if [[ "${RERANK_BACKEND}" == "qwen3_gguf" ]]; then + if ! "${PYTHON_BIN}" - <<'PY' +try: + import llama_cpp # noqa: F401 +except Exception: + raise SystemExit(1) +PY + then + echo "ERROR: qwen3_gguf backend requires llama-cpp-python in ${RERANKER_VENV}." >&2 + echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2 exit 1 fi fi diff --git a/tests/test_reranker_qwen3_gguf_backend.py b/tests/test_reranker_qwen3_gguf_backend.py new file mode 100644 index 0000000..3886d1d --- /dev/null +++ b/tests/test_reranker_qwen3_gguf_backend.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import sys +import types + +from reranker.backends import get_rerank_backend +from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend + + +class _FakeLlama: + def __init__(self, model_path: str | None = None, **kwargs): + self.model_path = model_path + self.kwargs = kwargs + self.eval_logits = [] + + @classmethod + def from_pretrained(cls, repo_id: str, filename: str, local_dir=None, cache_dir=None, **kwargs): + inst = cls(model_path=f"{repo_id}/{filename}", **kwargs) + inst.repo_id = repo_id + inst.filename = filename + inst.local_dir = local_dir + inst.cache_dir = cache_dir + return inst + + def tokenize(self, text: bytes, add_bos: bool = False, special: bool = False): + raw = text.decode("utf-8") + if raw == "yes": + return [1] + if raw == "no": + return [2] + return [10 + (ord(ch) % 17) for ch in raw] + + def reset(self): + return None + + def eval(self, prompt_tokens): + pos = float(sum(prompt_tokens) % 11) + 3.0 + neg = 1.0 + logits = [0.0] * 64 + logits[1] = pos + logits[2] = neg + self.eval_logits = [logits] + + +def _install_fake_llama_cpp(monkeypatch): + fake_module = types.SimpleNamespace(Llama=_FakeLlama) + monkeypatch.setitem(sys.modules, "llama_cpp", fake_module) + + +def test_qwen3_gguf_backend_factory_loads(monkeypatch): + _install_fake_llama_cpp(monkeypatch) + backend = get_rerank_backend( + "qwen3_gguf", + { + "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF", + "filename": "*Q8_0.gguf", + "enable_warmup": False, + }, + ) + assert isinstance(backend, Qwen3GGUFRerankerBackend) + + +def test_qwen3_gguf_backend_score_with_meta_dedup_and_restore(monkeypatch): + _install_fake_llama_cpp(monkeypatch) + backend = Qwen3GGUFRerankerBackend( + { + "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF", + "filename": "*Q8_0.gguf", + "enable_warmup": False, + "infer_batch_size": 2, + "sort_by_doc_length": True, + } + ) + + scores, meta = backend.score_with_meta( + query="wireless mouse", + docs=["doc-a", "doc-b", "doc-a", "", " ", None], + normalize=True, + ) + + assert len(scores) == 6 + assert scores[0] == scores[2] + assert scores[0] > 0.5 + assert scores[1] > 0.5 + assert scores[3:] == [0.0, 0.0, 0.0] + assert meta["input_docs"] == 6 + assert meta["usable_docs"] == 3 + assert meta["unique_docs"] == 2 + assert meta["backend"] == "qwen3_gguf" + assert meta["inference_batches"] == 1 -- libgit2 0.21.2