Commit 4823f4631caed65dc708e201adc75d4e76036293

Authored by tangwang
1 parent 9de5ef49

qwen3_vllm_score + dedicated vLLM 0.18 environment

config/config.yaml
... ... @@ -381,7 +381,7 @@ services:
381 381 max_docs: 1000
382 382 normalize: true
383 383 # In-service backend (read when the reranker process starts)
384   - backend: "qwen3_vllm_score" # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank
  384 + backend: "qwen3_vllm_score" # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank
385 385 backends:
386 386 bge:
387 387 model_name: "BAAI/bge-reranker-v2-m3"
... ... @@ -411,11 +411,14 @@ services:
411 411 # instruction: "Relevance ranking: category & style match first"
412 412 # instruction: "Score product relevance by query with category & style match prioritized"
413 413 instruction: "Rank products by query with category & style match prioritized"
414   - # vLLM LLM.score() (cross-encoder scoring); shares .venv-reranker and the same model weights with qwen3_vllm (vLLM 0.17+ uses runner/convert=auto; older versions used task=score)
  414 + # vLLM LLM.score() (cross-encoder scoring). Dedicated high-performance env .venv-reranker-score (pinned vllm 0.18): ./scripts/setup_reranker_venv.sh qwen3_vllm_score
  415 + # Can share the same model_name / HF cache as qwen3_vllm; the venv is kept separate so vLLM can be upgraded without touching the generate backend.
415 416 qwen3_vllm_score:
416 417 model_name: "Qwen/Qwen3-Reranker-0.6B"
417 418 # Original Hub weights require true; set to false if using converted seq-cls weights (e.g. tomaarsen/...-seq-cls)
418 419 use_original_qwen3_hf_overrides: true
  420 + # vLLM 0.18: compute capability < 8 (e.g. T4) automatically falls back to TRITON_ATTN; on Ampere+ omit this or set auto. Can also be set via env var RERANK_VLLM_ATTENTION_BACKEND
  421 + # vllm_attention_backend: "auto"
419 422 # Optional: keep aligned with vLLM; usually leave as auto
420 423 # vllm_runner: "auto"
421 424 # vllm_convert: "auto"
... ... @@ -440,6 +443,20 @@ services:
440 443 use_fp16: true
441 444 # sdpa: no flash-attn required by default; switch to flash_attention_2 if flash_attn is installed
442 445 attn_implementation: "sdpa"
  446 + # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask.
  447 + # For 1 query + many short docs (for example 400 product titles), this usually reduces
  448 + # repeated prefix work and padding waste compared with pairwise batching.
  449 + qwen3_transformers_packed:
  450 + model_name: "Qwen/Qwen3-Reranker-0.6B"
  451 + instruction: "Rank products by query with category & style match prioritized"
  452 + max_model_len: 4096
  453 + max_doc_len: 160
  454 + max_docs_per_pack: 0
  455 + use_fp16: true
  456 + sort_by_doc_length: true
  457 + # Packed mode relies on a custom 4D attention mask. "eager" is the safest default.
  458 + # If your torch/transformers stack validates it, you can benchmark "sdpa".
  459 + attn_implementation: "eager"
443 460 qwen3_gguf:
444 461 repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
445 462 filename: "*Q8_0.gguf"
... ...
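
Note: the packed backend's knobs above (`max_model_len`, `max_doc_len`, `max_docs_per_pack`, `sort_by_doc_length`) drive a greedy token-budget packing of many documents behind one shared query prefix. A minimal sketch of that planning step, with purely illustrative token counts (the real logic is `_build_pack_plan` in `reranker/backends/qwen3_transformers_packed.py`, added later in this commit):

```python
# Sketch of the greedy pack planning the config above controls; token counts are illustrative.
def plan_packs(doc_lens, prefix_len, suffix_len, max_model_len=4096, max_docs_per_pack=0):
    order = sorted(range(len(doc_lens)), key=lambda i: doc_lens[i])  # sort_by_doc_length: true
    packs, current, used = [], [], prefix_len
    for i in order:
        need = doc_lens[i] + suffix_len  # each doc also carries the assistant suffix
        full = (max_docs_per_pack > 0 and len(current) >= max_docs_per_pack) or (
            current and used + need > max_model_len
        )
        if full:
            packs.append(current)
            current, used = [], prefix_len  # a new pack only re-budgets the shared prefix
        current.append(i)
        used += need
    if current:
        packs.append(current)
    return packs

# e.g. 400 short titles of ~20 tokens each behind a ~60-token query prefix
print(len(plan_packs([20] * 400, prefix_len=60, suffix_len=8)))  # a handful of packs
```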
requirements_reranker_qwen3_transformers_packed.txt 0 → 100644
... ... @@ -0,0 +1,3 @@
  1 +# Isolated dependencies for qwen3_transformers_packed reranker backend.
  2 +
  3 +-r requirements_reranker_qwen3_transformers.txt
... ...
requirements_reranker_qwen3_vllm_score.txt 0 → 100644
... ... @@ -0,0 +1,14 @@
  1 +# Dedicated high-performance venv for qwen3_vllm_score: .venv-reranker-score
  2 +#
  3 +# Create / refresh:
  4 +# ./scripts/setup_reranker_venv.sh qwen3_vllm_score
  5 +#
  6 +# vLLM 0.17+ replaces LLM(task="score") with runner/convert auto + LLM.score().
  7 +# Pin vLLM for reproducible perf baselines; bump after validating CUDA/driver on your hosts.
  8 +# If pip cannot find a wheel for your CUDA version, edit the vllm line or install from:
  9 +# https://docs.vllm.ai/en/latest/getting_started/installation.html
  10 +
  11 +-r requirements_reranker_base.txt
  12 +vllm==0.18.0
  13 +# Match vLLM 0.18 stack; cap <5 to avoid pip prefetching incompatible transformers 5.x.
  14 +transformers>=4.51.0,<5
... ...
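
After creating `.venv-reranker-score`, a quick sanity check (a sketch; run it inside that venv) confirms the pinned versions and whether the GPU will hit the sub-Ampere attention fallback used by the qwen3_vllm_score backend later in this commit:

```python
# Sanity-check sketch for the pinned score venv: print versions and compute capability.
import torch
import transformers
import vllm

print("vllm", vllm.__version__, "| transformers", transformers.__version__, "| torch", torch.__version__)
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    note = " (expect TRITON_ATTN fallback)" if major < 8 else ""
    print(f"GPU compute capability {major}.{minor}{note}")
else:
    print("WARNING: CUDA not available; qwen3_vllm_score is GPU-only")
```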
requirements_reranker_service.txt
... ... @@ -2,6 +2,7 @@
2 2 #
3 3 # Prefer backend-specific requirements files:
4 4 # - requirements_reranker_qwen3_vllm.txt
  5 +# - requirements_reranker_qwen3_vllm_score.txt
5 6 # - requirements_reranker_qwen3_gguf.txt
6 7 # - requirements_reranker_qwen3_transformers.txt
7 8 # - requirements_reranker_bge.txt
... ...
reranker/README.md
... ... @@ -7,7 +7,7 @@
7 7 The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwen3-vLLM, Qwen3-Transformers, Qwen3-GGUF, DashScope cloud rerank). Callers access it over HTTP and do not need to know which backend is in use.
8 8  
9 9 **Features**
10   -- Multiple backends: `qwen3_vllm`, `qwen3_transformers`, `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `qwen3_gguf_06b` (Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp), `bge` (kept for compatibility)
  10 +- Multiple backends: `qwen3_vllm`, `qwen3_vllm_score` (same model, vLLM `LLM.score()` + dedicated `.venv-reranker-score`), `qwen3_transformers`, `qwen3_transformers_packed` (shared prefix + packed attention mask), `qwen3_gguf` (Qwen3-Reranker-4B GGUF + llama.cpp), `qwen3_gguf_06b` (Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp), `bge` (kept for compatibility)
11 11 - Cloud backend: `dashscope_rerank` (calls DashScope `/compatible-api/v1/reranks`, endpoint switchable by region)
12 12 - Unified configuration: `config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<name>`
13 13 - Document deduplication, scores aligned with input order, FP16/GPU support (backend dependent)
... ... @@ -17,8 +17,10 @@ The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwe
17 17 - `reranker/backends/`: backend implementations and the factory
18 18 - `backends/__init__.py`: `get_rerank_backend(name, config)`
19 19 - `backends/bge.py`: BGE backend
20   - - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM backend
  20 + - `backends/qwen3_vllm.py`: Qwen3-Reranker-0.6B + vLLM (generate + logprobs)
  21 + - `backends/qwen3_vllm_score.py`: same model + vLLM `LLM.score()` (`requirements_reranker_qwen3_vllm_score.txt` / `.venv-reranker-score`)
21 22 - `backends/qwen3_transformers.py`: Qwen3-Reranker-0.6B pure Transformers backend (official Usage style)
  23 + - `backends/qwen3_transformers_packed.py`: Qwen3-Reranker-0.6B + Transformers packed inference (shared query prefix, suited to `1 query + 400 docs`)
22 24 - `backends/qwen3_gguf.py`: Qwen3-Reranker GGUF + llama.cpp backend (supports `qwen3_gguf` / `qwen3_gguf_06b`)
23 25 - `backends/dashscope_rerank.py`: DashScope cloud rerank backend (HTTP calls)
24 26 - `reranker/bge_reranker.py`: BGE core inference (wrapped by the bge backend)
... ... @@ -26,14 +28,18 @@ The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwe
26 28  
27 29 ## Dependencies
28 30 - Common: `torch`, `transformers`, `fastapi`, `uvicorn` (isolated env: `requirements_reranker_service.txt`; full ML env: `requirements_ml.txt`)
29   -- **Qwen3-vLLM backend**: `vllm>=0.8.5`, `transformers>=4.51.0` (vLLM is only required when using `backend: qwen3_vllm`)
  31 +- **Qwen3-vLLM backend**: `vllm>=0.8.5`, `transformers>=4.51.0` (`qwen3_vllm` → `.venv-reranker`)
  32 +- **Qwen3-vLLM-score backend**: pinned `vllm==0.18.0` (`qwen3_vllm_score` → `.venv-reranker-score`, see `requirements_reranker_qwen3_vllm_score.txt`)
30 33 - **Qwen3-Transformers backend**: `transformers>=4.51.0`, `torch` (no vLLM needed; suited to CPU or small VRAM)
  34 +- **Qwen3-Transformers-Packed backend**: reuses the Transformers dependencies (`qwen3_transformers_packed` → `.venv-reranker-transformers-packed`)
31 35 - **Qwen3-GGUF backend**: `llama-cpp-python>=0.3.16`
32 36 - Each backend now uses its own venv:
33 37 - `qwen3_vllm` -> `.venv-reranker`
  38 + - `qwen3_vllm_score` -> `.venv-reranker-score`
34 39 - `qwen3_gguf` -> `.venv-reranker-gguf`
35 40 - `qwen3_gguf_06b` -> `.venv-reranker-gguf-06b`
36 41 - `qwen3_transformers` -> `.venv-reranker-transformers`
  42 + - `qwen3_transformers_packed` -> `.venv-reranker-transformers-packed`
37 43 - `bge` -> `.venv-reranker-bge`
38 44 - `dashscope_rerank` -> `.venv-reranker-dashscope`
39 45 ```bash
... ... @@ -49,7 +55,7 @@ The reranker service exposes a unified `/rerank` API with pluggable backends (BGE, Qwe
49 55 ```
50 56  
51 57 ## Configuration
52   -- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` environment variable.
  58 +- **Backend selection**: `services.rerank.backend` in `config/config.yaml` (`qwen3_vllm` | `qwen3_vllm_score` | `qwen3_transformers` | `qwen3_transformers_packed` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`), or the `RERANK_BACKEND` environment variable.
53 59 - **Backend parameters**: `services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`, for example:
54 60  
55 61 ```yaml
... ... @@ -82,6 +88,15 @@ services:
82 88 tensor_parallel_size: 1
83 89 gpu_memory_utilization: 0.8
84 90 instruction: "Given a shopping query, rank product titles by relevance"
  91 + qwen3_transformers_packed:
  92 + model_name: "Qwen/Qwen3-Reranker-0.6B"
  93 + instruction: "Rank products by query with category & style match prioritized"
  94 + max_model_len: 4096
  95 + max_doc_len: 160
  96 + max_docs_per_pack: 0
  97 + use_fp16: true
  98 + sort_by_doc_length: true
  99 + attn_implementation: "eager"
85 100 qwen3_gguf:
86 101 repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
87 102 filename: "*Q8_0.gguf"
... ... @@ -168,7 +183,7 @@ Content-Type: application/json
168 183 ```
169 184  
170 185 `top_n` is an optional field:
171   -- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`) usually ignore it and still return scores for all docs.
  186 +- Local backends (`qwen3_vllm` / `qwen3_transformers` / `qwen3_transformers_packed` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`) usually ignore it and still return scores for all docs.
172 187 - For `dashscope_rerank` it caps how many candidates the cloud returns; set it to `page+size` (e.g. pass `30` for pagination `from=20,size=10`).
173 188  
174 189 Response:
... ... @@ -206,5 +221,6 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info
206 221 - Batch parameters can be temporarily overridden at runtime via environment variables: `RERANK_VLLM_INFER_BATCH_SIZE`, `RERANK_VLLM_SORT_BY_DOC_LENGTH`.
207 222 - **Qwen3-vLLM**: see [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B); needs a GPU and more VRAM; compared with BGE it suits long texts and high-throughput scenarios (vLLM prefix caching).
208 223 - **Qwen3-Transformers**: official Transformers Usage style, no vLLM required; suited to CPU or small VRAM. Default `attn_implementation: "sdpa"`; set `flash_attention_2` if `flash_attn` is installed (the service falls back to sdpa automatically when it is not).
  224 +- **Qwen3-Transformers-Packed**: still uses Hugging Face Transformers and PyTorch CUDA kernels, only customizing the packed inputs, `position_ids`, and 4D `attention_mask`. It is best suited to the online-search case of one query against a few hundred short docs. The default `attn_implementation: "eager"` keeps the custom mask compatible; once your `torch/transformers` versions are verified to support it, you can benchmark `"sdpa"`.
209 225 - **Qwen3-GGUF**: see [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF). On a single T4 with only about `4.8~6GB` of VRAM left, start with `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true`; if it OOMs at startup, first lower `n_gpu_layers` to `20`, then lower `n_ctx` to `320`. `infer_batch_size` is a service-side work chunk in the GGUF backend and usually matters less than `n_gpu_layers` / `n_ctx`.
210 226 - **Qwen3-GGUF-0.6B**: see [ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF](https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF). Its strengths are small weights and low VRAM usage, measured at roughly `0.9~1.1 GiB` per process; but with the current serial llama.cpp scoring approach, measured latency for `1 query + 400 titles` is still around `265s`. It therefore works better as a low-VRAM functional fallback than as the primary low-latency online reranker.
... ...
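
For completeness, a sketch of calling the service once it is up (request fields from `reranker/server.py`; host and port follow the README's uvicorn example; query and docs are illustrative):

```python
# Sketch: POST /rerank with an optional top_n (ignored by most local backends, used by dashscope_rerank).
import requests

resp = requests.post(
    "http://127.0.0.1:6007/rerank",
    json={
        "query": "red running shoes",
        "docs": ["red running shoes men", "blue jeans", "running shoes red"],
        "normalize": True,
        "top_n": 30,
    },
    timeout=30,
)
payload = resp.json()
print(payload["scores"])           # one score per input doc, in input order
print(payload["meta"]["backend"])  # e.g. "qwen3_transformers_packed"
```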
reranker/backends/__init__.py
... ... @@ -49,6 +49,11 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc
49 49 if name == "qwen3_transformers":
50 50 from reranker.backends.qwen3_transformers import Qwen3TransformersRerankerBackend
51 51 return Qwen3TransformersRerankerBackend(config)
  52 + if name == "qwen3_transformers_packed":
  53 + from reranker.backends.qwen3_transformers_packed import (
  54 + Qwen3TransformersPackedRerankerBackend,
  55 + )
  56 + return Qwen3TransformersPackedRerankerBackend(config)
52 57 if name == "qwen3_gguf":
53 58 from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend
54 59 gguf_config = dict(config or {})
... ... @@ -63,7 +68,7 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc
63 68 from reranker.backends.dashscope_rerank import DashScopeRerankBackend
64 69 return DashScopeRerankBackend(config)
65 70 raise ValueError(
66   - f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_vllm_score, qwen3_transformers, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank"
  71 + f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_vllm_score, qwen3_transformers, qwen3_transformers_packed, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank"
67 72 )
68 73  
69 74  
... ...
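
A sketch of resolving the new backend through this factory, using the same keys as `services.rerank.backends.qwen3_transformers_packed` in `config/config.yaml` (values copied from the config above; the call assumes a CUDA host):

```python
# Sketch: factory dispatch for the packed backend; unknown names raise the ValueError above.
from reranker.backends import get_rerank_backend

backend = get_rerank_backend(
    "qwen3_transformers_packed",
    {
        "model_name": "Qwen/Qwen3-Reranker-0.6B",
        "instruction": "Rank products by query with category & style match prioritized",
        "max_model_len": 4096,
        "max_doc_len": 160,
        "max_docs_per_pack": 0,
        "use_fp16": True,
        "sort_by_doc_length": True,
        "attn_implementation": "eager",
    },
)
scores, meta = backend.score_with_meta("red running shoes", ["red shoes", "blue jeans"])
```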
reranker/backends/qwen3_transformers_packed.py 0 → 100644
... ... @@ -0,0 +1,398 @@
  1 +"""
  2 +Qwen3-Reranker backend using packed inference with Transformers.
  3 +
  4 +This backend implements the sequence stitching optimization described in
  5 +Qwen3-Reranker packed inference examples:
  6 +1. Share the query/instruction prefix across many documents.
  7 +2. Reset document ``position_ids`` relative to the shared prefix.
  8 +3. Use a custom causal attention mask so each document can attend to the
  9 + prefix and itself, but never to other documents.
  10 +
  11 +Compared with the standard per-pair batching path, this reduces repeated
  12 +prefix computation and removes inter-sample padding waste. For online search
  13 +requests like ``1 query + 400 docs``, the backend further packs documents into
  14 +multiple chunks under a configurable total token budget.
  15 +"""
  16 +
  17 +from __future__ import annotations
  18 +
  19 +import logging
  20 +import threading
  21 +import time
  22 +from typing import Any, Dict, List, Sequence, Tuple
  23 +
  24 +import torch
  25 +from transformers import AutoModelForCausalLM, AutoTokenizer
  26 +
  27 +logger = logging.getLogger("reranker.backends.qwen3_transformers_packed")
  28 +
  29 +_DEFAULT_PREFIX = (
  30 + "<|im_start|>system\n"
  31 + "Judge whether the Document meets the requirements based on the Query and the Instruct "
  32 + 'provided. Note that the answer can only be "yes" or "no".'
  33 + "<|im_end|>\n<|im_start|>user\n"
  34 +)
  35 +_DEFAULT_SUFFIX = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
  36 +_DEFAULT_PAIR_PREFIX_TEMPLATE = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n<Document>: "
  37 +
  38 +
  39 +def _deduplicate_with_positions(texts: Sequence[str]) -> Tuple[List[str], List[int]]:
  40 + unique_texts: List[str] = []
  41 + position_to_unique: List[int] = []
  42 + seen: Dict[str, int] = {}
  43 +
  44 + for text in texts:
  45 + idx = seen.get(text)
  46 + if idx is None:
  47 + idx = len(unique_texts)
  48 + seen[text] = idx
  49 + unique_texts.append(text)
  50 + position_to_unique.append(idx)
  51 +
  52 + return unique_texts, position_to_unique
  53 +
  54 +
  55 +class Qwen3TransformersPackedRerankerBackend:
  56 + """
  57 + Qwen3-Reranker packed inference backend using Transformers.
  58 +
  59 + Config from ``services.rerank.backends.qwen3_transformers_packed``.
  60 + """
  61 +
  62 + def __init__(self, config: Dict[str, Any]) -> None:
  63 + self._config = config or {}
  64 + model_name = str(self._config.get("model_name") or "Qwen/Qwen3-Reranker-0.6B")
  65 + self._instruction = str(
  66 + self._config.get("instruction")
  67 + or "Rank products by query with category & style match prioritized"
  68 + )
  69 + self._prefix = str(self._config.get("prompt_prefix") or _DEFAULT_PREFIX)
  70 + self._suffix = str(self._config.get("prompt_suffix") or _DEFAULT_SUFFIX)
  71 + self._pair_prefix_template = str(
  72 + self._config.get("pair_prefix_template") or _DEFAULT_PAIR_PREFIX_TEMPLATE
  73 + )
  74 +
  75 + max_model_len = int(self._config.get("max_model_len", 4096))
  76 + max_doc_len = int(self._config.get("max_doc_len", 160))
  77 + max_docs_per_pack = int(self._config.get("max_docs_per_pack", 0))
  78 + use_fp16 = bool(self._config.get("use_fp16", True))
  79 + device = self._config.get("device")
  80 + attn_impl = str(self._config.get("attn_implementation") or "eager").strip()
  81 + sort_by_doc_length = self._config.get("sort_by_doc_length", True)
  82 +
  83 + self._model_name = model_name
  84 + self._max_model_len = max_model_len
  85 + self._max_doc_len = max_doc_len
  86 + self._max_docs_per_pack = max_docs_per_pack
  87 + self._sort_by_doc_length = str(sort_by_doc_length).strip().lower() in {
  88 + "1",
  89 + "true",
  90 + "yes",
  91 + "y",
  92 + "on",
  93 + }
  94 + self._attn_impl = attn_impl
  95 +
  96 + logger.info(
  97 + "[Qwen3_Transformers_Packed] Loading model %s (max_model_len=%s, max_doc_len=%s, "
  98 + "max_docs_per_pack=%s, fp16=%s, attn_impl=%s)",
  99 + model_name,
  100 + max_model_len,
  101 + max_doc_len,
  102 + max_docs_per_pack,
  103 + use_fp16,
  104 + attn_impl,
  105 + )
  106 +
  107 + self._tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
  108 + self._tokenizer.pad_token = self._tokenizer.eos_token
  109 +
  110 + self._prefix_tokens = self._tokenizer.encode(self._prefix, add_special_tokens=False)
  111 + self._suffix_tokens = self._tokenizer.encode(self._suffix, add_special_tokens=False)
  112 + self._suffix_len = len(self._suffix_tokens)
  113 +
  114 + if not torch.cuda.is_available():
  115 + raise RuntimeError(
  116 + "qwen3_transformers_packed backend requires CUDA GPU, "
  117 + "but torch.cuda.is_available() is False"
  118 + )
  119 +
  120 + kwargs: Dict[str, Any] = {}
  121 + if use_fp16:
  122 + kwargs["torch_dtype"] = torch.float16
  123 + if attn_impl:
  124 + kwargs["attn_implementation"] = attn_impl
  125 +
  126 + self._model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs).eval()
  127 + target_device = str(device).strip() if device is not None else "cuda"
  128 + if not target_device.startswith("cuda"):
  129 + raise ValueError(
  130 + "qwen3_transformers_packed backend is GPU-only. "
  131 + f"Unsupported device setting: {target_device!r}"
  132 + )
  133 + self._model = self._model.to(target_device)
  134 + self._device = next(self._model.parameters()).device
  135 + if self._device.type != "cuda":
  136 + raise RuntimeError(
  137 + "qwen3_transformers_packed backend failed to place model on CUDA. "
  138 + f"Current device: {self._device}"
  139 + )
  140 +
  141 + self._token_true_id = self._tokenizer.convert_tokens_to_ids("yes")
  142 + self._token_false_id = self._tokenizer.convert_tokens_to_ids("no")
  143 + if self._token_true_id is None or self._token_false_id is None:
  144 + raise RuntimeError("Failed to resolve Qwen3 reranker classifier token ids for yes/no")
  145 +
  146 + prefix_budget = len(self._prefix_tokens) + self._suffix_len + 1
  147 + if self._max_model_len <= prefix_budget:
  148 + raise ValueError(
  149 + "max_model_len is too small for packed reranking. "
  150 + f"Need > {prefix_budget}, got {self._max_model_len}."
  151 + )
  152 + if self._max_doc_len <= 0:
  153 + raise ValueError(f"max_doc_len must be > 0, got {self._max_doc_len}")
  154 + if self._max_docs_per_pack < 0:
  155 + raise ValueError(
  156 + f"max_docs_per_pack must be >= 0, got {self._max_docs_per_pack}"
  157 + )
  158 +
  159 + self._infer_lock = threading.Lock()
  160 +
  161 + logger.info(
  162 + "[Qwen3_Transformers_Packed] Model ready | model=%s device=%s",
  163 + model_name,
  164 + self._device,
  165 + )
  166 +
  167 + def _build_pair_prefix_tokens(self, query: str) -> List[int]:
  168 + pair_prefix = self._pair_prefix_template.format(
  169 + prefix=self._prefix,
  170 + instruction=self._instruction,
  171 + query=query,
  172 + )
  173 + return self._tokenizer.encode(pair_prefix, add_special_tokens=False)
  174 +
  175 + def _tokenize_documents(self, docs: Sequence[str], query_prefix_len: int) -> List[List[int]]:
  176 + max_doc_tokens = min(
  177 + self._max_doc_len,
  178 + max(1, self._max_model_len - query_prefix_len - self._suffix_len),
  179 + )
  180 + tokenized = self._tokenizer(
  181 + list(docs),
  182 + padding=False,
  183 + truncation=True,
  184 + max_length=max_doc_tokens,
  185 + add_special_tokens=False,
  186 + return_attention_mask=False,
  187 + )
  188 + return [list(ids) for ids in tokenized["input_ids"]]
  189 +
  190 + def _build_pack_plan(
  191 + self,
  192 + query_prefix_len: int,
  193 + doc_tokens: Sequence[Sequence[int]],
  194 + ) -> List[List[int]]:
  195 + order = list(range(len(doc_tokens)))
  196 + if self._sort_by_doc_length and len(order) > 1:
  197 + order.sort(key=lambda idx: len(doc_tokens[idx]))
  198 +
  199 + packs: List[List[int]] = []
  200 + current_pack: List[int] = []
  201 + current_len = query_prefix_len
  202 + for idx in order:
  203 + packed_doc_len = len(doc_tokens[idx]) + self._suffix_len
  204 + if packed_doc_len <= 0:
  205 + continue
  206 +
  207 + over_docs_cap = self._max_docs_per_pack > 0 and len(current_pack) >= self._max_docs_per_pack
  208 + over_token_cap = current_pack and (current_len + packed_doc_len > self._max_model_len)
  209 + if over_docs_cap or over_token_cap:
  210 + packs.append(current_pack)
  211 + current_pack = []
  212 + current_len = query_prefix_len
  213 +
  214 + if query_prefix_len + packed_doc_len > self._max_model_len:
  215 + raise ValueError(
  216 + "Packed doc still exceeds max_model_len after truncation. "
  217 + f"query_prefix_len={query_prefix_len}, doc_len={packed_doc_len}, "
  218 + f"max_model_len={self._max_model_len}"
  219 + )
  220 +
  221 + current_pack.append(idx)
  222 + current_len += packed_doc_len
  223 +
  224 + if current_pack:
  225 + packs.append(current_pack)
  226 + return packs
  227 +
  228 + def _build_pack_inputs(
  229 + self,
  230 + query_prefix_tokens: Sequence[int],
  231 + doc_tokens: Sequence[Sequence[int]],
  232 + doc_indices: Sequence[int],
  233 + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
  234 + prefix_len = len(query_prefix_tokens)
  235 + input_ids_list = list(query_prefix_tokens)
  236 + position_ids_list = list(range(prefix_len))
  237 + spans: List[Tuple[int, int]] = []
  238 + current_len = prefix_len
  239 +
  240 + for idx in doc_indices:
  241 + doc_with_suffix = list(doc_tokens[idx]) + self._suffix_tokens
  242 + start = current_len
  243 + end = start + len(doc_with_suffix)
  244 + spans.append((start, end))
  245 + input_ids_list.extend(doc_with_suffix)
  246 + position_ids_list.extend(range(prefix_len, prefix_len + len(doc_with_suffix)))
  247 + current_len = end
  248 +
  249 + total_len = len(input_ids_list)
  250 + device = self._device
  251 + neg_inf = torch.finfo(torch.float32).min
  252 +
  253 + allowed = torch.zeros((total_len, total_len), dtype=torch.bool, device=device)
  254 + prefix_causal = torch.tril(
  255 + torch.ones((prefix_len, prefix_len), dtype=torch.bool, device=device)
  256 + )
  257 + allowed[:prefix_len, :prefix_len] = prefix_causal
  258 + for start, end in spans:
  259 + allowed[start:end, :prefix_len] = True
  260 + doc_len = end - start
  261 + allowed[start:end, start:end] = torch.tril(
  262 + torch.ones((doc_len, doc_len), dtype=torch.bool, device=device)
  263 + )
  264 +
  265 + attention_mask = torch.full(
  266 + (total_len, total_len),
  267 + neg_inf,
  268 + dtype=torch.float32,
  269 + device=device,
  270 + )
  271 + attention_mask.masked_fill_(allowed, 0.0)
  272 +
  273 + inputs = {
  274 + "input_ids": torch.tensor([input_ids_list], dtype=torch.long, device=device),
  275 + "position_ids": torch.tensor([position_ids_list], dtype=torch.long, device=device),
  276 + "attention_mask": attention_mask.view(1, 1, total_len, total_len),
  277 + }
  278 + logits_ids = torch.tensor(
  279 + [end - 1 for _, end in spans],
  280 + dtype=torch.long,
  281 + device=device,
  282 + )
  283 + return inputs, logits_ids
  284 +
  285 + @torch.no_grad()
  286 + def _score_pack(
  287 + self,
  288 + query_prefix_tokens: Sequence[int],
  289 + doc_tokens: Sequence[Sequence[int]],
  290 + doc_indices: Sequence[int],
  291 + ) -> Tuple[List[float], int]:
  292 + inputs, logits_ids = self._build_pack_inputs(
  293 + query_prefix_tokens=query_prefix_tokens,
  294 + doc_tokens=doc_tokens,
  295 + doc_indices=doc_indices,
  296 + )
  297 + outputs = self._model(**inputs)
  298 + scores = outputs.logits[0, logits_ids, :]
  299 + true_vector = scores[:, self._token_true_id]
  300 + false_vector = scores[:, self._token_false_id]
  301 + pair_scores = torch.stack([false_vector, true_vector], dim=1)
  302 + pair_scores = torch.nn.functional.log_softmax(pair_scores, dim=1)
  303 + return pair_scores[:, 1].exp().tolist(), int(inputs["input_ids"].shape[1])
  304 +
  305 + def score_with_meta(
  306 + self,
  307 + query: str,
  308 + docs: List[str],
  309 + normalize: bool = True,
  310 + ) -> Tuple[List[float], Dict[str, Any]]:
  311 + start_ts = time.time()
  312 + total_docs = len(docs) if docs else 0
  313 + output_scores: List[float] = [0.0] * total_docs
  314 +
  315 + query = "" if query is None else str(query).strip()
  316 + indexed: List[Tuple[int, str]] = []
  317 + for i, doc in enumerate(docs or []):
  318 + if doc is None:
  319 + continue
  320 + text = str(doc).strip()
  321 + if not text:
  322 + continue
  323 + indexed.append((i, text))
  324 +
  325 + if not query or not indexed:
  326 + elapsed_ms = (time.time() - start_ts) * 1000.0
  327 + return output_scores, {
  328 + "input_docs": total_docs,
  329 + "usable_docs": len(indexed),
  330 + "unique_docs": 0,
  331 + "dedup_ratio": 0.0,
  332 + "elapsed_ms": round(elapsed_ms, 3),
  333 + "model": self._model_name,
  334 + "backend": "qwen3_transformers_packed",
  335 + "normalize": normalize,
  336 + "packed_batches": 0,
  337 + "max_model_len": self._max_model_len,
  338 + "max_doc_len": self._max_doc_len,
  339 + "sort_by_doc_length": self._sort_by_doc_length,
  340 + }
  341 +
  342 + indexed_texts = [text for _, text in indexed]
  343 + unique_texts, position_to_unique = _deduplicate_with_positions(indexed_texts)
  344 +
  345 + query_prefix_tokens = self._build_pair_prefix_tokens(query)
  346 + doc_tokens = self._tokenize_documents(unique_texts, query_prefix_len=len(query_prefix_tokens))
  347 + pack_plan = self._build_pack_plan(
  348 + query_prefix_len=len(query_prefix_tokens),
  349 + doc_tokens=doc_tokens,
  350 + )
  351 +
  352 + unique_scores: List[float] = [0.0] * len(unique_texts)
  353 + pack_lengths: List[int] = []
  354 + with self._infer_lock:
  355 + for pack_doc_indices in pack_plan:
  356 + batch_scores, pack_seq_len = self._score_pack(
  357 + query_prefix_tokens=query_prefix_tokens,
  358 + doc_tokens=doc_tokens,
  359 + doc_indices=pack_doc_indices,
  360 + )
  361 + if len(batch_scores) != len(pack_doc_indices):
  362 + raise RuntimeError(
  363 + "Packed reranker score size mismatch: "
  364 + f"expected {len(pack_doc_indices)}, got {len(batch_scores)}"
  365 + )
  366 + for idx, score in zip(pack_doc_indices, batch_scores):
  367 + unique_scores[idx] = float(score)
  368 + pack_lengths.append(pack_seq_len)
  369 +
  370 + for (orig_idx, _), unique_idx in zip(indexed, position_to_unique):
  371 + output_scores[orig_idx] = float(unique_scores[unique_idx])
  372 +
  373 + elapsed_ms = (time.time() - start_ts) * 1000.0
  374 + dedup_ratio = 0.0
  375 + if indexed:
  376 + dedup_ratio = 1.0 - (len(unique_texts) / float(len(indexed)))
  377 +
  378 + meta = {
  379 + "input_docs": total_docs,
  380 + "usable_docs": len(indexed),
  381 + "unique_docs": len(unique_texts),
  382 + "dedup_ratio": round(dedup_ratio, 4),
  383 + "elapsed_ms": round(elapsed_ms, 3),
  384 + "model": self._model_name,
  385 + "backend": "qwen3_transformers_packed",
  386 + "normalize": normalize,
  387 + "packed_batches": len(pack_plan),
  388 + "packed_max_seq_len": max(pack_lengths) if pack_lengths else 0,
  389 + "packed_avg_seq_len": round(sum(pack_lengths) / len(pack_lengths), 3)
  390 + if pack_lengths
  391 + else 0.0,
  392 + "max_model_len": self._max_model_len,
  393 + "max_doc_len": self._max_doc_len,
  394 + "max_docs_per_pack": self._max_docs_per_pack,
  395 + "sort_by_doc_length": self._sort_by_doc_length,
  396 + "attn_implementation": self._attn_impl,
  397 + }
  398 + return output_scores, meta
... ...
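
The `meta` returned by `score_with_meta` exposes packing statistics that help tune `max_model_len` / `max_docs_per_pack`. A short sketch of reading them (construction with defaults, assuming a CUDA host; query and docs are illustrative):

```python
# Sketch: inspect packing stats to see how well the token budget is being used.
from reranker.backends.qwen3_transformers_packed import Qwen3TransformersPackedRerankerBackend

backend = Qwen3TransformersPackedRerankerBackend({"model_name": "Qwen/Qwen3-Reranker-0.6B"})
scores, meta = backend.score_with_meta(
    "wireless earbuds", ["wireless earbuds pro", "usb charging cable", "bluetooth earbuds"]
)
print(meta["packed_batches"], meta["packed_max_seq_len"], meta["packed_avg_seq_len"])
print(meta["dedup_ratio"], meta["elapsed_ms"])
# Few packed_batches with packed_max_seq_len close to max_model_len means little budget is wasted.
```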
reranker/backends/qwen3_vllm_score.py
1 1 """
2   -Qwen3-Reranker via vLLM ``task="score"`` (official pooling/score API).
  2 +Qwen3-Reranker via vLLM ``LLM.score()`` (pooling / cross-encoder score API).
3 3  
4   -Matches vLLM ``examples/offline_inference/qwen3_reranker.py``: paired ``llm.score(query_texts, doc_texts)``
5   -with the recommended prefix/suffix templates. Same venv and default model as ``qwen3_vllm``.
  4 +Matches vLLM ``examples/offline_inference/qwen3_reranker.py``: paired
  5 +``llm.score(query_texts, doc_texts)`` with the recommended prefix/suffix templates.
  6 +Requires vLLM >= 0.17 (uses ``runner``/``convert`` auto, not legacy ``task="score"``).
6 7  
7   -Reference: https://docs.vllm.ai/ (Qwen3 reranker example)
8   -https://docs.vllm.com.cn/en/latest/examples/offline_inference/qwen3_reranker.html
  8 +Dedicated venv: ``.venv-reranker-score`` + ``requirements_reranker_qwen3_vllm_score.txt``
  9 +(see ``./scripts/setup_reranker_venv.sh qwen3_vllm_score``). Default ``model_name`` can match
  10 +``qwen3_vllm``; only the Python environment differs, so this backend can pin a high-performance vLLM.
  11 +
  12 +Reference: https://docs.vllm.ai/ — Qwen3 reranker example
9 13 """
10 14  
11 15 from __future__ import annotations
... ... @@ -35,9 +39,44 @@ _DEFAULT_QUERY_TEMPLATE = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n
35 39 _DEFAULT_DOCUMENT_TEMPLATE = "<Document>: {doc}{suffix}"
36 40  
37 41  
  42 +def _resolve_vllm_attention_config(config: Dict[str, Any]) -> Dict[str, Any] | None:
  43 + """
  44 + vLLM 0.18 defaults to Flash-Attention paths that require compute capability >= 8 (Ampere+).
  45 + Turing / Volta (e.g. T4 sm_75) must use a non-FA backend such as TRITON_ATTN.
  46 + """
  47 + env = (os.getenv("RERANK_VLLM_ATTENTION_BACKEND") or "").strip()
  48 + raw = config.get("vllm_attention_backend")
  49 + if env:
  50 + choice = env
  51 + elif raw is not None and str(raw).strip() and str(raw).strip().lower() != "auto":
  52 + choice = str(raw).strip()
  53 + else:
  54 + choice = ""
  55 + if choice:
  56 + backend = choice.strip().upper()
  57 + if backend == "AUTO":
  58 + choice = ""
  59 + else:
  60 + logger.info("[Qwen3_VLLM_SCORE] attention_config.backend=%s (from config/env)", backend)
  61 + return {"backend": backend}
  62 +
  63 + major, minor = torch.cuda.get_device_capability()
  64 + if major < 8:
  65 + logger.info(
  66 + "[Qwen3_VLLM_SCORE] GPU compute capability %d.%d < 8.0; using attention backend "
  67 + "TRITON_ATTN (Flash-Attention 2 requires sm >= 80). "
  68 + "Override with services.rerank.backends.qwen3_vllm_score.vllm_attention_backend "
  69 + "or RERANK_VLLM_ATTENTION_BACKEND.",
  70 + major,
  71 + minor,
  72 + )
  73 + return {"backend": "TRITON_ATTN"}
  74 + return None
  75 +
  76 +
38 77 class Qwen3VLLMScoreRerankerBackend:
39 78 """
40   - Qwen3 reranker using vLLM ``LLM(..., task="score")`` and ``llm.score(queries, documents)``.
  79 + Qwen3 reranker using vLLM ``LLM.score()`` (pooling runner) for cross-encoder scores.
41 80  
42 81 Config from ``services.rerank.backends.qwen3_vllm_score``.
43 82 """
... ... @@ -139,6 +178,10 @@ class Qwen3VLLMScoreRerankerBackend:
139 178 if hf_overrides:
140 179 llm_kwargs["hf_overrides"] = hf_overrides
141 180  
  181 + attn_cfg = _resolve_vllm_attention_config(self._config)
  182 + if attn_cfg is not None:
  183 + llm_kwargs["attention_config"] = attn_cfg
  184 +
142 185 self._llm = LLM(**llm_kwargs)
143 186 # vLLM score path: single-process safety (mirrors generate backend until verified).
144 187 self._infer_lock = threading.Lock()
... ...
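
For reference, a sketch of the pairwise `LLM.score()` call this backend wraps. The prompt strings and the `outputs.score` access follow vLLM's offline `qwen3_reranker` example and are assumptions here; the real backend additionally passes `hf_overrides` and the `attention_config` resolved above.

```python
# Sketch of the underlying score call (templates copied from the module-level constants in this file).
from vllm import LLM

query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
document_template = "<Document>: {doc}{suffix}"
prefix = (
    "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query "
    'and the Instruct provided. Note that the answer can only be "yes" or "no".'
    "<|im_end|>\n<|im_start|>user\n"
)
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
instruction = "Rank products by query with category & style match prioritized"

query_text = query_template.format(prefix=prefix, instruction=instruction, query="red running shoes")
doc_texts = [document_template.format(doc=d, suffix=suffix) for d in ["red shoes", "blue jeans"]]

llm = LLM(model="Qwen/Qwen3-Reranker-0.6B")  # the backend also sets hf_overrides / attention_config
outputs = llm.score([query_text] * len(doc_texts), doc_texts)
print([o.outputs.score for o in outputs])  # field access per the vLLM example; may vary by version
```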
reranker/server.py
... ... @@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional
7 7 Response: { "scores": [float], "meta": {...} }
8 8  
9 9 Backend selected via config: services.rerank.backend
10   -(bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND.
  10 +(bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND.
11 11 """
12 12  
13 13 import logging
... ...
scripts/lib/reranker_backend_env.sh
... ... @@ -38,10 +38,12 @@ reranker_backend_venv_dir() {
38 38 local backend="$2"
39 39  
40 40 case "${backend}" in
41   - qwen3_vllm|qwen3_vllm_score) printf '%s/.venv-reranker\n' "${project_root}" ;;
  41 + qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;;
  42 + qwen3_vllm_score) printf '%s/.venv-reranker-score\n' "${project_root}" ;;
42 43 qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;;
43 44 qwen3_gguf_06b) printf '%s/.venv-reranker-gguf-06b\n' "${project_root}" ;;
44 45 qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;;
  46 + qwen3_transformers_packed) printf '%s/.venv-reranker-transformers-packed\n' "${project_root}" ;;
45 47 bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;;
46 48 dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;;
47 49 *) printf '%s/.venv-reranker-%s\n' "${project_root}" "${backend}" ;;
... ... @@ -53,10 +55,12 @@ reranker_backend_requirements_file() {
53 55 local backend="$2"
54 56  
55 57 case "${backend}" in
56   - qwen3_vllm|qwen3_vllm_score) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;;
  58 + qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;;
  59 + qwen3_vllm_score) printf '%s/requirements_reranker_qwen3_vllm_score.txt\n' "${project_root}" ;;
57 60 qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;;
58 61 qwen3_gguf_06b) printf '%s/requirements_reranker_qwen3_gguf_06b.txt\n' "${project_root}" ;;
59 62 qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;;
  63 + qwen3_transformers_packed) printf '%s/requirements_reranker_qwen3_transformers_packed.txt\n' "${project_root}" ;;
60 64 bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;;
61 65 dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;;
62 66 *) return 1 ;;
... ...
scripts/start_reranker.sh
... ... @@ -47,22 +47,29 @@ if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then
47 47 export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}"
48 48 fi
49 49  
50   -if [[ "${RERANK_BACKEND}" == "qwen3_vllm" || "${RERANK_BACKEND}" == "qwen3_vllm_score" ]]; then
  50 +if [[ "${RERANK_BACKEND}" == "qwen3_vllm" || "${RERANK_BACKEND}" == "qwen3_vllm_score" || "${RERANK_BACKEND}" == "qwen3_transformers_packed" ]]; then
51 51 if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then
52 52 echo "ERROR: ${RERANK_BACKEND} backend requires NVIDIA GPU, but nvidia-smi is unavailable." >&2
53 53 exit 1
54 54 fi
55 55 if ! "${PYTHON_BIN}" - <<'PY'
56 56 try:
57   - import vllm # noqa: F401
58 57 import torch
  58 + try:
  59 + import vllm # noqa: F401
  60 + except Exception:
  61 + pass
59 62 if not torch.cuda.is_available():
60 63 raise SystemExit(1)
61 64 except Exception:
62 65 raise SystemExit(1)
63 66 PY
64 67 then
65   - echo "ERROR: ${RERANK_BACKEND} backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2
  68 + if [[ "${RERANK_BACKEND}" == "qwen3_transformers_packed" ]]; then
  69 + echo "ERROR: ${RERANK_BACKEND} backend requires torch + CUDA runtime in ${RERANKER_VENV}." >&2
  70 + else
  71 + echo "ERROR: ${RERANK_BACKEND} backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2
  72 + fi
66 73 echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND} and verify CUDA is available." >&2
67 74 exit 1
68 75 fi
... ...