Commit 4823f4631caed65dc708e201adc75d4e76036293
1 parent 9de5ef49
qwen3_vllm_score + dedicated vLLM 0.18 environment
Showing 11 changed files with 528 additions and 20 deletions
config/config.yaml
| ... | ... | @@ -381,7 +381,7 @@ services: |
| 381 | 381 | max_docs: 1000 |
| 382 | 382 | normalize: true |
| 383 | 383 | # In-service backend (read when the reranker process starts) |
| 384 | - backend: "qwen3_vllm_score" # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank | |
| 384 | + backend: "qwen3_vllm_score" # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank | |
| 385 | 385 | backends: |
| 386 | 386 | bge: |
| 387 | 387 | model_name: "BAAI/bge-reranker-v2-m3" |
| ... | ... | @@ -411,11 +411,14 @@ services: |
| 411 | 411 | # instruction: "Relevance ranking: category & style match first" |
| 412 | 412 | # instruction: "Score product relevance by query with category & style match prioritized" |
| 413 | 413 | instruction: "Rank products by query with category & style match prioritized" |
| 414 | - # vLLM LLM.score() (cross-encoder scoring); shares .venv-reranker and the same model weights with qwen3_vllm (vLLM 0.17+ uses runner/convert=auto; older versions used task=score) | |
| 414 | + # vLLM LLM.score() (cross-encoder scoring). Dedicated high-performance env .venv-reranker-score (pinned vllm 0.18): ./scripts/setup_reranker_venv.sh qwen3_vllm_score | |
| 415 | + # Can share the same model_name / HF cache as qwen3_vllm; the venv is kept separate so vLLM can be upgraded without touching the generate backend. | |
| 415 | 416 | qwen3_vllm_score: |
| 416 | 417 | model_name: "Qwen/Qwen3-Reranker-0.6B" |
| 417 | 418 | # The original Hub release requires true; set to false if using converted seq-cls weights (e.g. tomaarsen/...-seq-cls) |
| 418 | 419 | use_original_qwen3_hf_overrides: true |
| 420 | + # vLLM 0.18: on compute capability < 8 (e.g. T4) TRITON_ATTN is picked automatically; on Ampere+ omit this or set auto. Can also be set via the RERANK_VLLM_ATTENTION_BACKEND env var | |
| 421 | + # vllm_attention_backend: "auto" | |
| 419 | 422 | # Optional: align with vLLM; usually keep auto |
| 420 | 423 | # vllm_runner: "auto" |
| 421 | 424 | # vllm_convert: "auto" |
| ... | ... | @@ -440,6 +443,20 @@ services: |
| 440 | 443 | use_fp16: true |
| 441 | 444 | # sdpa: no flash-attn needed by default; switch to flash_attention_2 if flash_attn is installed |
| 442 | 445 | attn_implementation: "sdpa" |
| 446 | + # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask. | |
| 447 | + # For 1 query + many short docs (for example 400 product titles), this usually reduces | |
| 448 | + # repeated prefix work and padding waste compared with pairwise batching. | |
| 449 | + qwen3_transformers_packed: | |
| 450 | + model_name: "Qwen/Qwen3-Reranker-0.6B" | |
| 451 | + instruction: "Rank products by query with category & style match prioritized" | |
| 452 | + max_model_len: 4096 | |
| 453 | + max_doc_len: 160 | |
| 454 | + max_docs_per_pack: 0 | |
| 455 | + use_fp16: true | |
| 456 | + sort_by_doc_length: true | |
| 457 | + # Packed mode relies on a custom 4D attention mask. "eager" is the safest default. | |
| 458 | + # If your torch/transformers stack validates it, you can benchmark "sdpa". | |
| 459 | + attn_implementation: "eager" | |
| 443 | 460 | qwen3_gguf: |
| 444 | 461 | repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" |
| 445 | 462 | filename: "*Q8_0.gguf" |
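For orientation, a rough capacity estimate for the packed backend settings above (a sketch only; the prefix and suffix token counts are illustrative assumptions, not measured values):

```python
# How many max-length docs fit into one pack under the config above.
max_model_len = 4096      # total token budget per packed forward pass
max_doc_len = 160         # per-document truncation cap
query_prefix_tokens = 96  # assumed: system prompt + <Instruct> + <Query> prefix
suffix_tokens = 12        # assumed: per-document "<|im_end|>...assistant" suffix

per_doc = max_doc_len + suffix_tokens
docs_per_pack = (max_model_len - query_prefix_tokens) // per_doc
print(docs_per_pack)  # ~23 docs per pack when every title hits the 160-token cap
```

With `max_docs_per_pack: 0` there is no extra per-pack cap, so a `1 query + 400 titles` request would split into roughly 18 packs in this worst case (fewer when titles are shorter).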
requirements_reranker_qwen3_vllm_score.txt
0 → 100644
| ... | ... | @@ -0,0 +1,14 @@ |
| 1 | +# Dedicated high-performance venv for qwen3_vllm_score: .venv-reranker-score | |
| 2 | +# | |
| 3 | +# Create / refresh: | |
| 4 | +# ./scripts/setup_reranker_venv.sh qwen3_vllm_score | |
| 5 | +# | |
| 6 | +# vLLM 0.17+ replaces LLM(task="score") with runner/convert auto + LLM.score(). | |
| 7 | +# Pin vLLM for reproducible perf baselines; bump after validating CUDA/driver on your hosts. | |
| 8 | +# If pip cannot find a wheel for your CUDA version, edit the vllm line or install from: | |
| 9 | +# https://docs.vllm.ai/en/latest/getting_started/installation.html | |
| 10 | + | |
| 11 | +-r requirements_reranker_base.txt | |
| 12 | +vllm==0.18.0 | |
| 13 | +# Match vLLM 0.18 stack; cap <5 to avoid pip prefetching incompatible transformers 5.x. | |
| 14 | +transformers>=4.51.0,<5 | |
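A quick sanity check after building the venv (run inside `.venv-reranker-score`; it assumes nothing beyond the packages pinned above):

```python
# Confirm the pinned qwen3_vllm_score stack imports and sees the GPU.
import torch
import transformers
import vllm

print("vllm:", vllm.__version__)                  # expected: 0.18.0 (pinned above)
print("transformers:", transformers.__version__)  # expected: >=4.51.0,<5
print("cuda available:", torch.cuda.is_available())
```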
requirements_reranker_service.txt
| ... | ... | @@ -2,6 +2,7 @@ |
| 2 | 2 | # |
| 3 | 3 | # Prefer backend-specific requirements files: |
| 4 | 4 | # - requirements_reranker_qwen3_vllm.txt |
| 5 | +# - requirements_reranker_qwen3_vllm_score.txt | |
| 5 | 6 | # - requirements_reranker_qwen3_gguf.txt |
| 6 | 7 | # - requirements_reranker_qwen3_transformers.txt |
| 7 | 8 | # - requirements_reranker_bge.txt |
reranker/README.md
| ... | ... | @@ -7,7 +7,7 @@ |
| 7 | 7 | The Reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, Qwen3-GGUF, DashScope cloud rerank). Callers use it over HTTP and do not need to know which backend is active. |
| 8 | 8 | |
| 9 | 9 | **Features** |
| 10 | -- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `qwen3_gguf_06b` (Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp), `bge` (kept for compatibility) | |
| 10 | +- Multiple backends: `qwen3_vllm`, `qwen3_vllm_score` (same model, vLLM `LLM.score()` + dedicated `.venv-reranker-score`), `qwen3_transformers`, `qwen3_transformers_packed` (shared prefix + packed attention mask), `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `qwen3_gguf_06b` (Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp), `bge` (kept for compatibility) | |
| 11 | 11 | - Cloud backend: `dashscope_rerank` (calls DashScope `/compatible-api/v1/reranks`, endpoint switchable by region) |
| 12 | 12 | - Unified configuration: `config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<name>` |
| 13 | 13 | - Document deduplication, scores aligned with input order, FP16/GPU support (backend dependent) |
| ... | ... | @@ -17,8 +17,10 @@ The Reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwe |
| 17 | 17 | - `reranker/backends/`: backend implementations and factory |
| 18 | 18 | - `backends/__init__.py`: `get_rerank_backend(name, config)` |
| 19 | 19 | - `backends/bge.py`: BGE backend |
| 20 | - - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM backend | |
| 20 | + - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM (generate + logprobs) | |
| 21 | + - `backends/qwen3_vllm_score.py`: same model + vLLM `LLM.score()` (`requirements_reranker_qwen3_vllm_score.txt` / `.venv-reranker-score`) | |
| 21 | 22 | - `backends/qwen3_transformers.py`: Qwen3-Reranker-0.6B pure Transformers backend (official Usage recipe) |
| 23 | + - `backends/qwen3_transformers_packed.py`: Qwen3-Reranker-0.6B + Transformers packed inference (shared query prefix, suited to `1 query + 400 docs`) | |
| 22 | 24 | - `backends/qwen3_gguf.py`: Qwen3-Reranker GGUF + llama.cpp backend (supports `qwen3_gguf` / `qwen3_gguf_06b`) |
| 23 | 25 | - `backends/dashscope_rerank.py`: DashScope cloud rerank backend (HTTP calls) |
| 24 | 26 | - `reranker/bge_reranker.py`: BGE core inference (wrapped by the bge backend) |
| ... | ... | @@ -26,14 +28,18 @@ The Reranker service exposes a unified /rerank API with pluggable backends (BGE, Qwe |
| 26 | 28 | |
| 27 | 29 | ## Dependencies |
| 28 | 30 | - Common: `torch`, `transformers`, `fastapi`, `uvicorn` (isolated env: see `requirements_reranker_service.txt`; full ML env: see `requirements_ml.txt`) |
| 29 | -- **Qwen3-vLLM backend**: `vllm>=0.8.5`, `transformers>=4.51.0` (vLLM only needed when `backend: qwen3_vllm`) | |
| 31 | +- **Qwen3-vLLM backend**: `vllm>=0.8.5`, `transformers>=4.51.0` (`qwen3_vllm` → `.venv-reranker`) | |
| 32 | +- **Qwen3-vLLM-score backend**: pinned `vllm==0.18.0` (`qwen3_vllm_score` → `.venv-reranker-score`, see `requirements_reranker_qwen3_vllm_score.txt`) | |
| 30 | 33 | - **Qwen3-Transformers backend**: `transformers>=4.51.0`, `torch` (no vLLM required; suited to CPU or small VRAM) |
| 34 | +- **Qwen3-Transformers-Packed backend**: reuses the Transformers dependencies (`qwen3_transformers_packed` → `.venv-reranker-transformers-packed`) | |
| 31 | 35 | - **Qwen3-GGUF backend**: `llama-cpp-python>=0.3.16` |
| 32 | 36 | - Each backend now uses its own venv: |
| 33 | 37 | - `qwen3_vllm` -> `.venv-reranker` |
| 38 | + - `qwen3_vllm_score` -> `.venv-reranker-score` | |
| 34 | 39 | - `qwen3_gguf` -> `.venv-reranker-gguf` |
| 35 | 40 | - `qwen3_gguf_06b` -> `.venv-reranker-gguf-06b` |
| 36 | 41 | - `qwen3_transformers` -> `.venv-reranker-transformers` |
| 42 | + - `qwen3_transformers_packed` -> `.venv-reranker-transformers-packed` | |
| 37 | 43 | - `bge` -> `.venv-reranker-bge` |
| 38 | 44 | - `dashscope_rerank` -> `.venv-reranker-dashscope` |
| 39 | 45 | ```bash |
| ... | ... | @@ -49,7 +55,7 @@ The Reranker service exposes a unified /rerank API with pluggable backends (BGE, Qwe |
| 49 | 55 | ``` |
| 50 | 56 | |
| 51 | 57 | ## Configuration |
| 52 | -- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` env var. | |
| 58 | +- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_vllm_score` | `qwen3_transformers` | `qwen3_transformers_packed` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` env var. | |
| 53 | 59 | - **Backend parameters**: `services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`, for example: |
| 54 | 60 | |
| 55 | 61 | ```yaml |
| ... | ... | @@ -82,6 +88,15 @@ services: |
| 82 | 88 | tensor_parallel_size: 1 |
| 83 | 89 | gpu_memory_utilization: 0.8 |
| 84 | 90 | instruction: "Given a shopping query, rank product titles by relevance" |
| 91 | + qwen3_transformers_packed: | |
| 92 | + model_name: "Qwen/Qwen3-Reranker-0.6B" | |
| 93 | + instruction: "Rank products by query with category & style match prioritized" | |
| 94 | + max_model_len: 4096 | |
| 95 | + max_doc_len: 160 | |
| 96 | + max_docs_per_pack: 0 | |
| 97 | + use_fp16: true | |
| 98 | + sort_by_doc_length: true | |
| 99 | + attn_implementation: "eager" | |
| 85 | 100 | qwen3_gguf: |
| 86 | 101 | repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" |
| 87 | 102 | filename: "*Q8_0.gguf" |
| ... | ... | @@ -168,7 +183,7 @@ Content-Type: application/json |
| 168 | 183 | ``` |
| 169 | 184 | |
| 170 | 185 | `top_n` is an optional field: |
| 171 | -- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`) usually ignore it and still return scores for all docs. | |
| 186 | +- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_transformers_packed` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`) usually ignore it and still return scores for all docs. | |
| 172 | 187 | - For `dashscope_rerank` it limits how many candidates the cloud service returns; set it to `page+size` (e.g. pass `30` when paging with `from=20,size=10`). |
| 173 | 188 | |
| 174 | 189 | Response: |
| ... | ... | @@ -206,5 +221,6 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info |
| 206 | 221 | - Batch parameters can be temporarily overridden at runtime via env vars: `RERANK_VLLM_INFER_BATCH_SIZE`, `RERANK_VLLM_SORT_BY_DOC_LENGTH`. |
| 207 | 222 | - **Qwen3-vLLM**: see [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B); needs a GPU and fairly large VRAM; compared with BGE it suits long-text, high-throughput scenarios (vLLM prefix caching). |
| 208 | 223 | - **Qwen3-Transformers**: the official Transformers Usage recipe, no vLLM required; suited to CPU or small VRAM. Default `attn_implementation: "sdpa"`; set `flash_attention_2` if `flash_attn` is installed (the service falls back to sdpa when it is missing). |
| 224 | +- **Qwen3-Transformers-Packed**: still uses Hugging Face Transformers and PyTorch CUDA kernels, customizing only the packed inputs, `position_ids`, and 4D `attention_mask`. It is best suited to the online-retrieval case of one query against a few hundred short docs; it defaults to `attn_implementation: "eager"` for custom-mask compatibility, and `"sdpa"` can be benchmarked once your torch/transformers versions are verified to support it. | |
| 209 | 225 | - **Qwen3-GGUF**: see [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF). On a single T4 with only about `4.8~6GB` of free VRAM, start with `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true`; if startup OOMs, lower `n_gpu_layers` to `20` first, then `n_ctx` to `320`. `infer_batch_size` is a service-side work chunk in the GGUF backend and usually matters less than `n_gpu_layers` / `n_ctx`. |
| 210 | 226 | - **Qwen3-GGUF-0.6B**: see [ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF](https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF). Its advantages are small weights and low VRAM use (about `0.9~1.1 GiB` measured per process), but with the current serial llama.cpp scoring path the measured latency for `1 query + 400 titles` is still around `265s`, so it fits better as a low-VRAM fallback than as the primary low-latency online reranker. |
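For reference, a minimal client sketch against the `/rerank` API described above. It assumes the service is running locally on port 6007 (as in the `uvicorn` command) and that `requests` is available in the calling environment; `requests` is not part of the reranker requirements files.

```python
import requests

payload = {
    "query": "red floral summer dress",
    "docs": ["Floral print midi dress", "Men's running shoes", "Red summer dress"],
    "normalize": True,
    # "top_n": 30,  # mainly useful for dashscope_rerank; local backends usually ignore it
}
resp = requests.post("http://127.0.0.1:6007/rerank", json=payload, timeout=30)
resp.raise_for_status()
data = resp.json()
print(data["scores"])           # one score per input doc, same order as `docs`
print(data["meta"]["backend"])  # e.g. "qwen3_transformers_packed"
```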
reranker/backends/__init__.py
| ... | ... | @@ -49,6 +49,11 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc |
| 49 | 49 | if name == "qwen3_transformers": |
| 50 | 50 | from reranker.backends.qwen3_transformers import Qwen3TransformersRerankerBackend |
| 51 | 51 | return Qwen3TransformersRerankerBackend(config) |
| 52 | + if name == "qwen3_transformers_packed": | |
| 53 | + from reranker.backends.qwen3_transformers_packed import ( | |
| 54 | + Qwen3TransformersPackedRerankerBackend, | |
| 55 | + ) | |
| 56 | + return Qwen3TransformersPackedRerankerBackend(config) | |
| 52 | 57 | if name == "qwen3_gguf": |
| 53 | 58 | from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend |
| 54 | 59 | gguf_config = dict(config or {}) |
| ... | ... | @@ -63,7 +68,7 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc |
| 63 | 68 | from reranker.backends.dashscope_rerank import DashScopeRerankBackend |
| 64 | 69 | return DashScopeRerankBackend(config) |
| 65 | 70 | raise ValueError( |
| 66 | - f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_vllm_score, qwen3_transformers, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank" | |
| 71 | + f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_vllm_score, qwen3_transformers, qwen3_transformers_packed, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank" | |
| 67 | 72 | ) |
| 68 | 73 | |
| 69 | 74 | |
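A minimal sketch of using the factory above to wire up the new packed backend directly (the service normally does this in `reranker/server.py`; the config keys mirror the `qwen3_transformers_packed` block in `config/config.yaml`, and a CUDA GPU is required):

```python
from reranker.backends import get_rerank_backend

config = {
    "model_name": "Qwen/Qwen3-Reranker-0.6B",
    "instruction": "Rank products by query with category & style match prioritized",
    "max_model_len": 4096,
    "max_doc_len": 160,
    "max_docs_per_pack": 0,
    "use_fp16": True,
    "sort_by_doc_length": True,
    "attn_implementation": "eager",
}
backend = get_rerank_backend("qwen3_transformers_packed", config)
scores, meta = backend.score_with_meta(
    "red floral summer dress",
    ["Floral print midi dress", "Men's running shoes", "Red summer dress"],
)
print(scores, meta["packed_batches"])
```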
reranker/backends/qwen3_transformers_packed.py
0 → 100644
| ... | ... | @@ -0,0 +1,398 @@ |
| 1 | +""" | |
| 2 | +Qwen3-Reranker backend using packed inference with Transformers. | |
| 3 | + | |
| 4 | +This backend implements the sequence stitching optimization described in | |
| 5 | +Qwen3-Reranker packed inference examples: | |
| 6 | +1. Share the query/instruction prefix across many documents. | |
| 7 | +2. Reset document ``position_ids`` relative to the shared prefix. | |
| 8 | +3. Use a custom causal attention mask so each document can attend to the | |
| 9 | + prefix and itself, but never to other documents. | |
| 10 | + | |
| 11 | +Compared with the standard per-pair batching path, this reduces repeated | |
| 12 | +prefix computation and removes inter-sample padding waste. For online search | |
| 13 | +requests like ``1 query + 400 docs``, the backend further packs documents into | |
| 14 | +multiple chunks under a configurable total token budget. | |
| 15 | +""" | |
| 16 | + | |
| 17 | +from __future__ import annotations | |
| 18 | + | |
| 19 | +import logging | |
| 20 | +import threading | |
| 21 | +import time | |
| 22 | +from typing import Any, Dict, List, Sequence, Tuple | |
| 23 | + | |
| 24 | +import torch | |
| 25 | +from transformers import AutoModelForCausalLM, AutoTokenizer | |
| 26 | + | |
| 27 | +logger = logging.getLogger("reranker.backends.qwen3_transformers_packed") | |
| 28 | + | |
| 29 | +_DEFAULT_PREFIX = ( | |
| 30 | + "<|im_start|>system\n" | |
| 31 | + "Judge whether the Document meets the requirements based on the Query and the Instruct " | |
| 32 | + 'provided. Note that the answer can only be "yes" or "no".' | |
| 33 | + "<|im_end|>\n<|im_start|>user\n" | |
| 34 | +) | |
| 35 | +_DEFAULT_SUFFIX = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n" | |
| 36 | +_DEFAULT_PAIR_PREFIX_TEMPLATE = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n<Document>: " | |
| 37 | + | |
| 38 | + | |
| 39 | +def _deduplicate_with_positions(texts: Sequence[str]) -> Tuple[List[str], List[int]]: | |
| 40 | + unique_texts: List[str] = [] | |
| 41 | + position_to_unique: List[int] = [] | |
| 42 | + seen: Dict[str, int] = {} | |
| 43 | + | |
| 44 | + for text in texts: | |
| 45 | + idx = seen.get(text) | |
| 46 | + if idx is None: | |
| 47 | + idx = len(unique_texts) | |
| 48 | + seen[text] = idx | |
| 49 | + unique_texts.append(text) | |
| 50 | + position_to_unique.append(idx) | |
| 51 | + | |
| 52 | + return unique_texts, position_to_unique | |
| 53 | + | |
| 54 | + | |
| 55 | +class Qwen3TransformersPackedRerankerBackend: | |
| 56 | + """ | |
| 57 | + Qwen3-Reranker packed inference backend using Transformers. | |
| 58 | + | |
| 59 | + Config from ``services.rerank.backends.qwen3_transformers_packed``. | |
| 60 | + """ | |
| 61 | + | |
| 62 | + def __init__(self, config: Dict[str, Any]) -> None: | |
| 63 | + self._config = config or {} | |
| 64 | + model_name = str(self._config.get("model_name") or "Qwen/Qwen3-Reranker-0.6B") | |
| 65 | + self._instruction = str( | |
| 66 | + self._config.get("instruction") | |
| 67 | + or "Rank products by query with category & style match prioritized" | |
| 68 | + ) | |
| 69 | + self._prefix = str(self._config.get("prompt_prefix") or _DEFAULT_PREFIX) | |
| 70 | + self._suffix = str(self._config.get("prompt_suffix") or _DEFAULT_SUFFIX) | |
| 71 | + self._pair_prefix_template = str( | |
| 72 | + self._config.get("pair_prefix_template") or _DEFAULT_PAIR_PREFIX_TEMPLATE | |
| 73 | + ) | |
| 74 | + | |
| 75 | + max_model_len = int(self._config.get("max_model_len", 4096)) | |
| 76 | + max_doc_len = int(self._config.get("max_doc_len", 160)) | |
| 77 | + max_docs_per_pack = int(self._config.get("max_docs_per_pack", 0)) | |
| 78 | + use_fp16 = bool(self._config.get("use_fp16", True)) | |
| 79 | + device = self._config.get("device") | |
| 80 | + attn_impl = str(self._config.get("attn_implementation") or "eager").strip() | |
| 81 | + sort_by_doc_length = self._config.get("sort_by_doc_length", True) | |
| 82 | + | |
| 83 | + self._model_name = model_name | |
| 84 | + self._max_model_len = max_model_len | |
| 85 | + self._max_doc_len = max_doc_len | |
| 86 | + self._max_docs_per_pack = max_docs_per_pack | |
| 87 | + self._sort_by_doc_length = str(sort_by_doc_length).strip().lower() in { | |
| 88 | + "1", | |
| 89 | + "true", | |
| 90 | + "yes", | |
| 91 | + "y", | |
| 92 | + "on", | |
| 93 | + } | |
| 94 | + self._attn_impl = attn_impl | |
| 95 | + | |
| 96 | + logger.info( | |
| 97 | + "[Qwen3_Transformers_Packed] Loading model %s (max_model_len=%s, max_doc_len=%s, " | |
| 98 | + "max_docs_per_pack=%s, fp16=%s, attn_impl=%s)", | |
| 99 | + model_name, | |
| 100 | + max_model_len, | |
| 101 | + max_doc_len, | |
| 102 | + max_docs_per_pack, | |
| 103 | + use_fp16, | |
| 104 | + attn_impl, | |
| 105 | + ) | |
| 106 | + | |
| 107 | + self._tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") | |
| 108 | + self._tokenizer.pad_token = self._tokenizer.eos_token | |
| 109 | + | |
| 110 | + self._prefix_tokens = self._tokenizer.encode(self._prefix, add_special_tokens=False) | |
| 111 | + self._suffix_tokens = self._tokenizer.encode(self._suffix, add_special_tokens=False) | |
| 112 | + self._suffix_len = len(self._suffix_tokens) | |
| 113 | + | |
| 114 | + if not torch.cuda.is_available(): | |
| 115 | + raise RuntimeError( | |
| 116 | + "qwen3_transformers_packed backend requires CUDA GPU, " | |
| 117 | + "but torch.cuda.is_available() is False" | |
| 118 | + ) | |
| 119 | + | |
| 120 | + kwargs: Dict[str, Any] = {} | |
| 121 | + if use_fp16: | |
| 122 | + kwargs["torch_dtype"] = torch.float16 | |
| 123 | + if attn_impl: | |
| 124 | + kwargs["attn_implementation"] = attn_impl | |
| 125 | + | |
| 126 | + self._model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs).eval() | |
| 127 | + target_device = str(device).strip() if device is not None else "cuda" | |
| 128 | + if not target_device.startswith("cuda"): | |
| 129 | + raise ValueError( | |
| 130 | + "qwen3_transformers_packed backend is GPU-only. " | |
| 131 | + f"Unsupported device setting: {target_device!r}" | |
| 132 | + ) | |
| 133 | + self._model = self._model.to(target_device) | |
| 134 | + self._device = next(self._model.parameters()).device | |
| 135 | + if self._device.type != "cuda": | |
| 136 | + raise RuntimeError( | |
| 137 | + "qwen3_transformers_packed backend failed to place model on CUDA. " | |
| 138 | + f"Current device: {self._device}" | |
| 139 | + ) | |
| 140 | + | |
| 141 | + self._token_true_id = self._tokenizer.convert_tokens_to_ids("yes") | |
| 142 | + self._token_false_id = self._tokenizer.convert_tokens_to_ids("no") | |
| 143 | + if self._token_true_id is None or self._token_false_id is None: | |
| 144 | + raise RuntimeError("Failed to resolve Qwen3 reranker classifier token ids for yes/no") | |
| 145 | + | |
| 146 | + prefix_budget = len(self._prefix_tokens) + self._suffix_len + 1 | |
| 147 | + if self._max_model_len <= prefix_budget: | |
| 148 | + raise ValueError( | |
| 149 | + "max_model_len is too small for packed reranking. " | |
| 150 | + f"Need > {prefix_budget}, got {self._max_model_len}." | |
| 151 | + ) | |
| 152 | + if self._max_doc_len <= 0: | |
| 153 | + raise ValueError(f"max_doc_len must be > 0, got {self._max_doc_len}") | |
| 154 | + if self._max_docs_per_pack < 0: | |
| 155 | + raise ValueError( | |
| 156 | + f"max_docs_per_pack must be >= 0, got {self._max_docs_per_pack}" | |
| 157 | + ) | |
| 158 | + | |
| 159 | + self._infer_lock = threading.Lock() | |
| 160 | + | |
| 161 | + logger.info( | |
| 162 | + "[Qwen3_Transformers_Packed] Model ready | model=%s device=%s", | |
| 163 | + model_name, | |
| 164 | + self._device, | |
| 165 | + ) | |
| 166 | + | |
| 167 | + def _build_pair_prefix_tokens(self, query: str) -> List[int]: | |
| 168 | + pair_prefix = self._pair_prefix_template.format( | |
| 169 | + prefix=self._prefix, | |
| 170 | + instruction=self._instruction, | |
| 171 | + query=query, | |
| 172 | + ) | |
| 173 | + return self._tokenizer.encode(pair_prefix, add_special_tokens=False) | |
| 174 | + | |
| 175 | + def _tokenize_documents(self, docs: Sequence[str], query_prefix_len: int) -> List[List[int]]: | |
| 176 | + max_doc_tokens = min( | |
| 177 | + self._max_doc_len, | |
| 178 | + max(1, self._max_model_len - query_prefix_len - self._suffix_len), | |
| 179 | + ) | |
| 180 | + tokenized = self._tokenizer( | |
| 181 | + list(docs), | |
| 182 | + padding=False, | |
| 183 | + truncation=True, | |
| 184 | + max_length=max_doc_tokens, | |
| 185 | + add_special_tokens=False, | |
| 186 | + return_attention_mask=False, | |
| 187 | + ) | |
| 188 | + return [list(ids) for ids in tokenized["input_ids"]] | |
| 189 | + | |
| 190 | + def _build_pack_plan( | |
| 191 | + self, | |
| 192 | + query_prefix_len: int, | |
| 193 | + doc_tokens: Sequence[Sequence[int]], | |
| 194 | + ) -> List[List[int]]: | |
| 195 | + order = list(range(len(doc_tokens))) | |
| 196 | + if self._sort_by_doc_length and len(order) > 1: | |
| 197 | + order.sort(key=lambda idx: len(doc_tokens[idx])) | |
| 198 | + | |
| 199 | + packs: List[List[int]] = [] | |
| 200 | + current_pack: List[int] = [] | |
| 201 | + current_len = query_prefix_len | |
| 202 | + for idx in order: | |
| 203 | + packed_doc_len = len(doc_tokens[idx]) + self._suffix_len | |
| 204 | + if packed_doc_len <= 0: | |
| 205 | + continue | |
| 206 | + | |
| 207 | + over_docs_cap = self._max_docs_per_pack > 0 and len(current_pack) >= self._max_docs_per_pack | |
| 208 | + over_token_cap = current_pack and (current_len + packed_doc_len > self._max_model_len) | |
| 209 | + if over_docs_cap or over_token_cap: | |
| 210 | + packs.append(current_pack) | |
| 211 | + current_pack = [] | |
| 212 | + current_len = query_prefix_len | |
| 213 | + | |
| 214 | + if query_prefix_len + packed_doc_len > self._max_model_len: | |
| 215 | + raise ValueError( | |
| 216 | + "Packed doc still exceeds max_model_len after truncation. " | |
| 217 | + f"query_prefix_len={query_prefix_len}, doc_len={packed_doc_len}, " | |
| 218 | + f"max_model_len={self._max_model_len}" | |
| 219 | + ) | |
| 220 | + | |
| 221 | + current_pack.append(idx) | |
| 222 | + current_len += packed_doc_len | |
| 223 | + | |
| 224 | + if current_pack: | |
| 225 | + packs.append(current_pack) | |
| 226 | + return packs | |
| 227 | + | |
| 228 | + def _build_pack_inputs( | |
| 229 | + self, | |
| 230 | + query_prefix_tokens: Sequence[int], | |
| 231 | + doc_tokens: Sequence[Sequence[int]], | |
| 232 | + doc_indices: Sequence[int], | |
| 233 | + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: | |
| 234 | + prefix_len = len(query_prefix_tokens) | |
| 235 | + input_ids_list = list(query_prefix_tokens) | |
| 236 | + position_ids_list = list(range(prefix_len)) | |
| 237 | + spans: List[Tuple[int, int]] = [] | |
| 238 | + current_len = prefix_len | |
| 239 | + | |
| 240 | + for idx in doc_indices: | |
| 241 | + doc_with_suffix = list(doc_tokens[idx]) + self._suffix_tokens | |
| 242 | + start = current_len | |
| 243 | + end = start + len(doc_with_suffix) | |
| 244 | + spans.append((start, end)) | |
| 245 | + input_ids_list.extend(doc_with_suffix) | |
| 246 | + position_ids_list.extend(range(prefix_len, prefix_len + len(doc_with_suffix))) | |
| 247 | + current_len = end | |
| 248 | + | |
| 249 | + total_len = len(input_ids_list) | |
| 250 | + device = self._device | |
| 251 | + neg_inf = torch.finfo(torch.float32).min | |
| 252 | + | |
| 253 | + allowed = torch.zeros((total_len, total_len), dtype=torch.bool, device=device) | |
| 254 | + prefix_causal = torch.tril( | |
| 255 | + torch.ones((prefix_len, prefix_len), dtype=torch.bool, device=device) | |
| 256 | + ) | |
| 257 | + allowed[:prefix_len, :prefix_len] = prefix_causal | |
| 258 | + for start, end in spans: | |
| 259 | + allowed[start:end, :prefix_len] = True | |
| 260 | + doc_len = end - start | |
| 261 | + allowed[start:end, start:end] = torch.tril( | |
| 262 | + torch.ones((doc_len, doc_len), dtype=torch.bool, device=device) | |
| 263 | + ) | |
| 264 | + | |
| 265 | + attention_mask = torch.full( | |
| 266 | + (total_len, total_len), | |
| 267 | + neg_inf, | |
| 268 | + dtype=torch.float32, | |
| 269 | + device=device, | |
| 270 | + ) | |
| 271 | + attention_mask.masked_fill_(allowed, 0.0) | |
| 272 | + | |
| 273 | + inputs = { | |
| 274 | + "input_ids": torch.tensor([input_ids_list], dtype=torch.long, device=device), | |
| 275 | + "position_ids": torch.tensor([position_ids_list], dtype=torch.long, device=device), | |
| 276 | + "attention_mask": attention_mask.view(1, 1, total_len, total_len), | |
| 277 | + } | |
| 278 | + logits_ids = torch.tensor( | |
| 279 | + [end - 1 for _, end in spans], | |
| 280 | + dtype=torch.long, | |
| 281 | + device=device, | |
| 282 | + ) | |
| 283 | + return inputs, logits_ids | |
| 284 | + | |
| 285 | + @torch.no_grad() | |
| 286 | + def _score_pack( | |
| 287 | + self, | |
| 288 | + query_prefix_tokens: Sequence[int], | |
| 289 | + doc_tokens: Sequence[Sequence[int]], | |
| 290 | + doc_indices: Sequence[int], | |
| 291 | + ) -> Tuple[List[float], int]: | |
| 292 | + inputs, logits_ids = self._build_pack_inputs( | |
| 293 | + query_prefix_tokens=query_prefix_tokens, | |
| 294 | + doc_tokens=doc_tokens, | |
| 295 | + doc_indices=doc_indices, | |
| 296 | + ) | |
| 297 | + outputs = self._model(**inputs) | |
| 298 | + scores = outputs.logits[0, logits_ids, :] | |
| 299 | + true_vector = scores[:, self._token_true_id] | |
| 300 | + false_vector = scores[:, self._token_false_id] | |
| 301 | + pair_scores = torch.stack([false_vector, true_vector], dim=1) | |
| 302 | + pair_scores = torch.nn.functional.log_softmax(pair_scores, dim=1) | |
| 303 | + return pair_scores[:, 1].exp().tolist(), int(inputs["input_ids"].shape[1]) | |
| 304 | + | |
| 305 | + def score_with_meta( | |
| 306 | + self, | |
| 307 | + query: str, | |
| 308 | + docs: List[str], | |
| 309 | + normalize: bool = True, | |
| 310 | + ) -> Tuple[List[float], Dict[str, Any]]: | |
| 311 | + start_ts = time.time() | |
| 312 | + total_docs = len(docs) if docs else 0 | |
| 313 | + output_scores: List[float] = [0.0] * total_docs | |
| 314 | + | |
| 315 | + query = "" if query is None else str(query).strip() | |
| 316 | + indexed: List[Tuple[int, str]] = [] | |
| 317 | + for i, doc in enumerate(docs or []): | |
| 318 | + if doc is None: | |
| 319 | + continue | |
| 320 | + text = str(doc).strip() | |
| 321 | + if not text: | |
| 322 | + continue | |
| 323 | + indexed.append((i, text)) | |
| 324 | + | |
| 325 | + if not query or not indexed: | |
| 326 | + elapsed_ms = (time.time() - start_ts) * 1000.0 | |
| 327 | + return output_scores, { | |
| 328 | + "input_docs": total_docs, | |
| 329 | + "usable_docs": len(indexed), | |
| 330 | + "unique_docs": 0, | |
| 331 | + "dedup_ratio": 0.0, | |
| 332 | + "elapsed_ms": round(elapsed_ms, 3), | |
| 333 | + "model": self._model_name, | |
| 334 | + "backend": "qwen3_transformers_packed", | |
| 335 | + "normalize": normalize, | |
| 336 | + "packed_batches": 0, | |
| 337 | + "max_model_len": self._max_model_len, | |
| 338 | + "max_doc_len": self._max_doc_len, | |
| 339 | + "sort_by_doc_length": self._sort_by_doc_length, | |
| 340 | + } | |
| 341 | + | |
| 342 | + indexed_texts = [text for _, text in indexed] | |
| 343 | + unique_texts, position_to_unique = _deduplicate_with_positions(indexed_texts) | |
| 344 | + | |
| 345 | + query_prefix_tokens = self._build_pair_prefix_tokens(query) | |
| 346 | + doc_tokens = self._tokenize_documents(unique_texts, query_prefix_len=len(query_prefix_tokens)) | |
| 347 | + pack_plan = self._build_pack_plan( | |
| 348 | + query_prefix_len=len(query_prefix_tokens), | |
| 349 | + doc_tokens=doc_tokens, | |
| 350 | + ) | |
| 351 | + | |
| 352 | + unique_scores: List[float] = [0.0] * len(unique_texts) | |
| 353 | + pack_lengths: List[int] = [] | |
| 354 | + with self._infer_lock: | |
| 355 | + for pack_doc_indices in pack_plan: | |
| 356 | + batch_scores, pack_seq_len = self._score_pack( | |
| 357 | + query_prefix_tokens=query_prefix_tokens, | |
| 358 | + doc_tokens=doc_tokens, | |
| 359 | + doc_indices=pack_doc_indices, | |
| 360 | + ) | |
| 361 | + if len(batch_scores) != len(pack_doc_indices): | |
| 362 | + raise RuntimeError( | |
| 363 | + "Packed reranker score size mismatch: " | |
| 364 | + f"expected {len(pack_doc_indices)}, got {len(batch_scores)}" | |
| 365 | + ) | |
| 366 | + for idx, score in zip(pack_doc_indices, batch_scores): | |
| 367 | + unique_scores[idx] = float(score) | |
| 368 | + pack_lengths.append(pack_seq_len) | |
| 369 | + | |
| 370 | + for (orig_idx, _), unique_idx in zip(indexed, position_to_unique): | |
| 371 | + output_scores[orig_idx] = float(unique_scores[unique_idx]) | |
| 372 | + | |
| 373 | + elapsed_ms = (time.time() - start_ts) * 1000.0 | |
| 374 | + dedup_ratio = 0.0 | |
| 375 | + if indexed: | |
| 376 | + dedup_ratio = 1.0 - (len(unique_texts) / float(len(indexed))) | |
| 377 | + | |
| 378 | + meta = { | |
| 379 | + "input_docs": total_docs, | |
| 380 | + "usable_docs": len(indexed), | |
| 381 | + "unique_docs": len(unique_texts), | |
| 382 | + "dedup_ratio": round(dedup_ratio, 4), | |
| 383 | + "elapsed_ms": round(elapsed_ms, 3), | |
| 384 | + "model": self._model_name, | |
| 385 | + "backend": "qwen3_transformers_packed", | |
| 386 | + "normalize": normalize, | |
| 387 | + "packed_batches": len(pack_plan), | |
| 388 | + "packed_max_seq_len": max(pack_lengths) if pack_lengths else 0, | |
| 389 | + "packed_avg_seq_len": round(sum(pack_lengths) / len(pack_lengths), 3) | |
| 390 | + if pack_lengths | |
| 391 | + else 0.0, | |
| 392 | + "max_model_len": self._max_model_len, | |
| 393 | + "max_doc_len": self._max_doc_len, | |
| 394 | + "max_docs_per_pack": self._max_docs_per_pack, | |
| 395 | + "sort_by_doc_length": self._sort_by_doc_length, | |
| 396 | + "attn_implementation": self._attn_impl, | |
| 397 | + } | |
| 398 | + return output_scores, meta | |
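To visualize the attention pattern that `_build_pack_inputs` constructs, here is a standalone toy sketch with made-up token counts: causal attention inside the shared prefix, each document attending to the prefix and causally to itself, and no attention between documents.

```python
import torch

prefix_len = 3     # toy shared query/instruction prefix length
doc_lens = [2, 2]  # toy per-document lengths (doc tokens + suffix merged)

total = prefix_len + sum(doc_lens)
allowed = torch.zeros((total, total), dtype=torch.bool)
# Causal mask over the shared prefix.
allowed[:prefix_len, :prefix_len] = torch.tril(
    torch.ones(prefix_len, prefix_len, dtype=torch.bool)
)

start = prefix_len
for n in doc_lens:
    allowed[start:start + n, :prefix_len] = True  # every doc sees the full prefix
    allowed[start:start + n, start:start + n] = torch.tril(
        torch.ones(n, n, dtype=torch.bool)
    )
    start += n

print(allowed.int())  # rows of doc 2 are all zero over doc 1's columns
```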
reranker/backends/qwen3_vllm_score.py
| 1 | 1 | """ |
| 2 | -Qwen3-Reranker via vLLM ``task="score"`` (official pooling/score API). | |
| 2 | +Qwen3-Reranker via vLLM ``LLM.score()`` (pooling / cross-encoder score API). | |
| 3 | 3 | |
| 4 | -Matches vLLM ``examples/offline_inference/qwen3_reranker.py``: paired ``llm.score(query_texts, doc_texts)`` | |
| 5 | -with the recommended prefix/suffix templates. Same venv and default model as ``qwen3_vllm``. | |
| 4 | +Matches vLLM ``examples/offline_inference/qwen3_reranker.py``: paired | |
| 5 | +``llm.score(query_texts, doc_texts)`` with the recommended prefix/suffix templates. | |
| 6 | +Requires vLLM >= 0.17 (uses ``runner``/``convert`` auto, not legacy ``task="score"``). | |
| 6 | 7 | |
| 7 | -Reference: https://docs.vllm.ai/ (Qwen3 reranker example) | |
| 8 | -https://docs.vllm.com.cn/en/latest/examples/offline_inference/qwen3_reranker.html | |
| 8 | +Dedicated venv: ``.venv-reranker-score`` + ``requirements_reranker_qwen3_vllm_score.txt`` | |
| 9 | +(see ``./scripts/setup_reranker_venv.sh qwen3_vllm_score``). Default ``model_name`` can match | |
| 10 | +``qwen3_vllm``; only the Python env differs for pinned high-performance vLLM. | |
| 11 | + | |
| 12 | +Reference: https://docs.vllm.ai/ — Qwen3 reranker example | |
| 9 | 13 | """ |
| 10 | 14 | |
| 11 | 15 | from __future__ import annotations |
| ... | ... | @@ -35,9 +39,44 @@ _DEFAULT_QUERY_TEMPLATE = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n |
| 35 | 39 | _DEFAULT_DOCUMENT_TEMPLATE = "<Document>: {doc}{suffix}" |
| 36 | 40 | |
| 37 | 41 | |
| 42 | +def _resolve_vllm_attention_config(config: Dict[str, Any]) -> Dict[str, Any] | None: | |
| 43 | + """ | |
| 44 | + vLLM 0.18 defaults to Flash-Attention paths that require compute capability >= 8 (Ampere+). | |
| 45 | + Turing / Volta (e.g. T4 sm_75) must use a non-FA backend such as TRITON_ATTN. | |
| 46 | + """ | |
| 47 | + env = (os.getenv("RERANK_VLLM_ATTENTION_BACKEND") or "").strip() | |
| 48 | + raw = config.get("vllm_attention_backend") | |
| 49 | + if env: | |
| 50 | + choice = env | |
| 51 | + elif raw is not None and str(raw).strip() and str(raw).strip().lower() != "auto": | |
| 52 | + choice = str(raw).strip() | |
| 53 | + else: | |
| 54 | + choice = "" | |
| 55 | + if choice: | |
| 56 | + backend = choice.strip().upper() | |
| 57 | + if backend == "AUTO": | |
| 58 | + choice = "" | |
| 59 | + else: | |
| 60 | + logger.info("[Qwen3_VLLM_SCORE] attention_config.backend=%s (from config/env)", backend) | |
| 61 | + return {"backend": backend} | |
| 62 | + | |
| 63 | + major, minor = torch.cuda.get_device_capability() | |
| 64 | + if major < 8: | |
| 65 | + logger.info( | |
| 66 | + "[Qwen3_VLLM_SCORE] GPU compute capability %d.%d < 8.0; using attention backend " | |
| 67 | + "TRITON_ATTN (Flash-Attention 2 requires sm >= 80). " | |
| 68 | + "Override with services.rerank.backends.qwen3_vllm_score.vllm_attention_backend " | |
| 69 | + "or RERANK_VLLM_ATTENTION_BACKEND.", | |
| 70 | + major, | |
| 71 | + minor, | |
| 72 | + ) | |
| 73 | + return {"backend": "TRITON_ATTN"} | |
| 74 | + return None | |
| 75 | + | |
| 76 | + | |
| 38 | 77 | class Qwen3VLLMScoreRerankerBackend: |
| 39 | 78 | """ |
| 40 | - Qwen3 reranker using vLLM ``LLM(..., task="score")`` and ``llm.score(queries, documents)``. | |
| 79 | + Qwen3 reranker using vLLM ``LLM.score()`` (pooling runner) for cross-encoder scores. | |
| 41 | 80 | |
| 42 | 81 | Config from ``services.rerank.backends.qwen3_vllm_score``. |
| 43 | 82 | """ |
| ... | ... | @@ -139,6 +178,10 @@ class Qwen3VLLMScoreRerankerBackend: |
| 139 | 178 | if hf_overrides: |
| 140 | 179 | llm_kwargs["hf_overrides"] = hf_overrides |
| 141 | 180 | |
| 181 | + attn_cfg = _resolve_vllm_attention_config(self._config) | |
| 182 | + if attn_cfg is not None: | |
| 183 | + llm_kwargs["attention_config"] = attn_cfg | |
| 184 | + | |
| 142 | 185 | self._llm = LLM(**llm_kwargs) |
| 143 | 186 | # vLLM score path: single-process safety (mirrors generate backend until verified). |
| 144 | 187 | self._infer_lock = threading.Lock() |
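For completeness, the override precedence implemented by `_resolve_vllm_attention_config` above, plus a hedged sketch of forcing the Triton path on a pre-Ampere GPU (whether a given attention backend is actually available depends on the installed vLLM build):

```python
import os

# Precedence:
# 1. RERANK_VLLM_ATTENTION_BACKEND env var (non-empty wins)
# 2. services.rerank.backends.qwen3_vllm_score.vllm_attention_backend (unless "auto")
# 3. auto-detect: TRITON_ATTN when GPU compute capability < 8.0, else vLLM's default
os.environ["RERANK_VLLM_ATTENTION_BACKEND"] = "TRITON_ATTN"  # e.g. force Triton on a T4 (sm_75)
```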
reranker/server.py
| ... | ... | @@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional |
| 7 | 7 | Response: { "scores": [float], "meta": {...} } |
| 8 | 8 | |
| 9 | 9 | Backend selected via config: services.rerank.backend |
| 10 | -(bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND. | |
| 10 | +(bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND. | |
| 11 | 11 | """ |
| 12 | 12 | |
| 13 | 13 | import logging |
scripts/lib/reranker_backend_env.sh
| ... | ... | @@ -38,10 +38,12 @@ reranker_backend_venv_dir() { |
| 38 | 38 | local backend="$2" |
| 39 | 39 | |
| 40 | 40 | case "${backend}" in |
| 41 | - qwen3_vllm|qwen3_vllm_score) printf '%s/.venv-reranker\n' "${project_root}" ;; | |
| 41 | + qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;; | |
| 42 | + qwen3_vllm_score) printf '%s/.venv-reranker-score\n' "${project_root}" ;; | |
| 42 | 43 | qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;; |
| 43 | 44 | qwen3_gguf_06b) printf '%s/.venv-reranker-gguf-06b\n' "${project_root}" ;; |
| 44 | 45 | qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;; |
| 46 | + qwen3_transformers_packed) printf '%s/.venv-reranker-transformers-packed\n' "${project_root}" ;; | |
| 45 | 47 | bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;; |
| 46 | 48 | dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;; |
| 47 | 49 | *) printf '%s/.venv-reranker-%s\n' "${project_root}" "${backend}" ;; |
| ... | ... | @@ -53,10 +55,12 @@ reranker_backend_requirements_file() { |
| 53 | 55 | local backend="$2" |
| 54 | 56 | |
| 55 | 57 | case "${backend}" in |
| 56 | - qwen3_vllm|qwen3_vllm_score) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;; | |
| 58 | + qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;; | |
| 59 | + qwen3_vllm_score) printf '%s/requirements_reranker_qwen3_vllm_score.txt\n' "${project_root}" ;; | |
| 57 | 60 | qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;; |
| 58 | 61 | qwen3_gguf_06b) printf '%s/requirements_reranker_qwen3_gguf_06b.txt\n' "${project_root}" ;; |
| 59 | 62 | qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;; |
| 63 | + qwen3_transformers_packed) printf '%s/requirements_reranker_qwen3_transformers_packed.txt\n' "${project_root}" ;; | |
| 60 | 64 | bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;; |
| 61 | 65 | dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;; |
| 62 | 66 | *) return 1 ;; |
scripts/start_reranker.sh
| ... | ... | @@ -47,22 +47,29 @@ if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then |
| 47 | 47 | export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}" |
| 48 | 48 | fi |
| 49 | 49 | |
| 50 | -if [[ "${RERANK_BACKEND}" == "qwen3_vllm" || "${RERANK_BACKEND}" == "qwen3_vllm_score" ]]; then | |
| 50 | +if [[ "${RERANK_BACKEND}" == "qwen3_vllm" || "${RERANK_BACKEND}" == "qwen3_vllm_score" || "${RERANK_BACKEND}" == "qwen3_transformers_packed" ]]; then | |
| 51 | 51 | if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then |
| 52 | 52 | echo "ERROR: ${RERANK_BACKEND} backend requires NVIDIA GPU, but nvidia-smi is unavailable." >&2 |
| 53 | 53 | exit 1 |
| 54 | 54 | fi |
| 55 | 55 | if ! "${PYTHON_BIN}" - <<'PY' |
| 56 | 56 | try: |
| 57 | - import vllm # noqa: F401 | |
| 58 | 57 | import torch |
| 58 | + try: | |
| 59 | + import vllm # noqa: F401 | |
| 60 | + except Exception: | |
| 61 | + pass | |
| 59 | 62 | if not torch.cuda.is_available(): |
| 60 | 63 | raise SystemExit(1) |
| 61 | 64 | except Exception: |
| 62 | 65 | raise SystemExit(1) |
| 63 | 66 | PY |
| 64 | 67 | then |
| 65 | - echo "ERROR: ${RERANK_BACKEND} backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2 | |
| 68 | + if [[ "${RERANK_BACKEND}" == "qwen3_transformers_packed" ]]; then | |
| 69 | + echo "ERROR: ${RERANK_BACKEND} backend requires torch + CUDA runtime in ${RERANKER_VENV}." >&2 | |
| 70 | + else | |
| 71 | + echo "ERROR: ${RERANK_BACKEND} backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2 | |
| 72 | + fi | |
| 66 | 73 | echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND} and verify CUDA is available." >&2 |
| 67 | 74 | exit 1 |
| 68 | 75 | fi |