diff --git a/config/config.yaml b/config/config.yaml index e5dbf4c..776e7b5 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -381,7 +381,7 @@ services: max_docs: 1000 normalize: true # 服务内后端(reranker 进程启动时读取) - backend: "qwen3_vllm_score" # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank + backend: "qwen3_vllm_score" # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank backends: bge: model_name: "BAAI/bge-reranker-v2-m3" @@ -411,11 +411,14 @@ services: # instruction: "Relevance ranking: category & style match first" # instruction: "Score product relevance by query with category & style match prioritized" instruction: "Rank products by query with category & style match prioritized" - # vLLM LLM.score()(跨编码打分);与 qwen3_vllm 共用 .venv-reranker 与同模型权重(vLLM 0.17+ 用 runner/convert=auto,旧版曾用 task=score) + # vLLM LLM.score()(跨编码打分)。独立高性能环境 .venv-reranker-score(vllm 0.18 固定版):./scripts/setup_reranker_venv.sh qwen3_vllm_score + # 与 qwen3_vllm 可共用同一 model_name / HF 缓存;venv 分离以便升级 vLLM 而不影响 generate 后端。 qwen3_vllm_score: model_name: "Qwen/Qwen3-Reranker-0.6B" # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false use_original_qwen3_hf_overrides: true + # vLLM 0.18:算力 < 8(如 T4)默认自动用 TRITON_ATTN;Ampere+ 可省略或设 auto。也可设环境变量 RERANK_VLLM_ATTENTION_BACKEND + # vllm_attention_backend: "auto" # 可选:与 vLLM 对齐;一般保持 auto # vllm_runner: "auto" # vllm_convert: "auto" @@ -440,6 +443,20 @@ services: use_fp16: true # sdpa:默认无需 flash-attn;若已安装 flash_attn 可改为 flash_attention_2 attn_implementation: "sdpa" + # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask. + # For 1 query + many short docs (for example 400 product titles), this usually reduces + # repeated prefix work and padding waste compared with pairwise batching. + qwen3_transformers_packed: + model_name: "Qwen/Qwen3-Reranker-0.6B" + instruction: "Rank products by query with category & style match prioritized" + max_model_len: 4096 + max_doc_len: 160 + max_docs_per_pack: 0 + use_fp16: true + sort_by_doc_length: true + # Packed mode relies on a custom 4D attention mask. "eager" is the safest default. + # If your torch/transformers stack validates it, you can benchmark "sdpa". + attn_implementation: "eager" qwen3_gguf: repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" filename: "*Q8_0.gguf" diff --git a/requirements_reranker_qwen3_transformers_packed.txt b/requirements_reranker_qwen3_transformers_packed.txt new file mode 100644 index 0000000..025981f --- /dev/null +++ b/requirements_reranker_qwen3_transformers_packed.txt @@ -0,0 +1,3 @@ +# Isolated dependencies for qwen3_transformers_packed reranker backend. + +-r requirements_reranker_qwen3_transformers.txt diff --git a/requirements_reranker_qwen3_vllm_score.txt b/requirements_reranker_qwen3_vllm_score.txt new file mode 100644 index 0000000..0e2c2e8 --- /dev/null +++ b/requirements_reranker_qwen3_vllm_score.txt @@ -0,0 +1,14 @@ +# Dedicated high-performance venv for qwen3_vllm_score: .venv-reranker-score +# +# Create / refresh: +# ./scripts/setup_reranker_venv.sh qwen3_vllm_score +# +# vLLM 0.17+ replaces LLM(task="score") with runner/convert auto + LLM.score(). +# Pin vLLM for reproducible perf baselines; bump after validating CUDA/driver on your hosts. 
+# If pip cannot find a wheel for your CUDA version, edit the vllm line or install from: +# https://docs.vllm.ai/en/latest/getting_started/installation.html + +-r requirements_reranker_base.txt +vllm==0.18.0 +# Match vLLM 0.18 stack; cap <5 to avoid pip prefetching incompatible transformers 5.x. +transformers>=4.51.0,<5 diff --git a/requirements_reranker_service.txt b/requirements_reranker_service.txt index 18b738d..e7234e0 100644 --- a/requirements_reranker_service.txt +++ b/requirements_reranker_service.txt @@ -2,6 +2,7 @@ # # Prefer backend-specific requirements files: # - requirements_reranker_qwen3_vllm.txt +# - requirements_reranker_qwen3_vllm_score.txt # - requirements_reranker_qwen3_gguf.txt # - requirements_reranker_qwen3_transformers.txt # - requirements_reranker_bge.txt diff --git a/reranker/README.md b/reranker/README.md index d77ef7f..39acbeb 100644 --- a/reranker/README.md +++ b/reranker/README.md @@ -7,7 +7,7 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwen3-vLLM、Qwen3-Transformers、Qwen3-GGUF、DashScope 云重排)。调用方通过 HTTP 访问,不关心具体后端。 **特性** -- 多后端:`qwen3_vllm`、`qwen3_transformers`、`qwen3_gguf`(Qwen3-Reranker-4B GGUF + llama.cpp)、`qwen3_gguf_06b`(Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp)、`bge`(兼容保留) +- 多后端:`qwen3_vllm`、`qwen3_vllm_score`(同模型,vLLM ``LLM.score()`` + 独立 `.venv-reranker-score`)、`qwen3_transformers`、`qwen3_transformers_packed`(共享前缀 + packed attention mask)、`qwen3_gguf`(Qwen3-Reranker-4B GGUF + llama.cpp)、`qwen3_gguf_06b`(Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp)、`bge`(兼容保留) - 云后端:`dashscope_rerank`(调用 DashScope `/compatible-api/v1/reranks`,支持按地域切换 endpoint) - 统一配置:`config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.` - 文档去重、分数与输入顺序一致、FP16/GPU 支持(视后端) @@ -17,8 +17,10 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe - `reranker/backends/`:后端实现与工厂 - `backends/__init__.py`:`get_rerank_backend(name, config)` - `backends/bge.py`:BGE 后端 - - `backends/qwen3_vllm.py`:Qwen3-Reranker-0.6B + vLLM 后端 + - `backends/qwen3_vllm.py`:Qwen3-Reranker-0.6B + vLLM(generate + logprobs) + - `backends/qwen3_vllm_score.py`:同上模型 + vLLM ``LLM.score()``(`requirements_reranker_qwen3_vllm_score.txt` / `.venv-reranker-score`) - `backends/qwen3_transformers.py`:Qwen3-Reranker-0.6B 纯 Transformers 后端(官方 Usage 方式) + - `backends/qwen3_transformers_packed.py`:Qwen3-Reranker-0.6B + Transformers packed 推理(共享 query prefix,适合 `1 query + 400 docs`) - `backends/qwen3_gguf.py`:Qwen3-Reranker GGUF + llama.cpp 后端(支持 `qwen3_gguf` / `qwen3_gguf_06b`) - `backends/dashscope_rerank.py`:DashScope 云重排后端(HTTP 调用) - `reranker/bge_reranker.py`:BGE 核心推理(被 bge 后端封装) @@ -26,14 +28,18 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe ## 依赖 - 通用:`torch`、`transformers`、`fastapi`、`uvicorn`(隔离环境见 `requirements_reranker_service.txt`;全量 ML 环境另见 `requirements_ml.txt`) -- **Qwen3-vLLM 后端**:`vllm>=0.8.5`、`transformers>=4.51.0`(仅当使用 `backend: qwen3_vllm` 时需 vLLM) +- **Qwen3-vLLM 后端**:`vllm>=0.8.5`、`transformers>=4.51.0`(`qwen3_vllm` → `.venv-reranker`) +- **Qwen3-vLLM-score 后端**:固定 `vllm==0.18.0`(`qwen3_vllm_score` → `.venv-reranker-score`,见 `requirements_reranker_qwen3_vllm_score.txt`) - **Qwen3-Transformers 后端**:`transformers>=4.51.0`、`torch`(无需 vLLM,适合 CPU 或小显存) +- **Qwen3-Transformers-Packed 后端**:复用 Transformers 依赖(`qwen3_transformers_packed` → `.venv-reranker-transformers-packed`) - **Qwen3-GGUF 后端**:`llama-cpp-python>=0.3.16` - 现在按 backend 使用独立 venv: - `qwen3_vllm` -> `.venv-reranker` + - `qwen3_vllm_score` -> `.venv-reranker-score` - `qwen3_gguf` -> `.venv-reranker-gguf` - `qwen3_gguf_06b` -> 
`.venv-reranker-gguf-06b` - `qwen3_transformers` -> `.venv-reranker-transformers` + - `qwen3_transformers_packed` -> `.venv-reranker-transformers-packed` - `bge` -> `.venv-reranker-bge` - `dashscope_rerank` -> `.venv-reranker-dashscope` ```bash @@ -49,7 +55,7 @@ Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwe ``` ## 配置 -- **后端选择**:`config/config.yaml` 中 `services.rerank.backend`(`qwen3_vllm` | `qwen3_transformers` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`),或环境变量 `RERANK_BACKEND`。 +- **后端选择**:`config/config.yaml` 中 `services.rerank.backend`(`qwen3_vllm` | `qwen3_vllm_score` | `qwen3_transformers` | `qwen3_transformers_packed` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`),或环境变量 `RERANK_BACKEND`。 - **后端参数**:`services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`,例如: ```yaml @@ -82,6 +88,15 @@ services: tensor_parallel_size: 1 gpu_memory_utilization: 0.8 instruction: "Given a shopping query, rank product titles by relevance" + qwen3_transformers_packed: + model_name: "Qwen/Qwen3-Reranker-0.6B" + instruction: "Rank products by query with category & style match prioritized" + max_model_len: 4096 + max_doc_len: 160 + max_docs_per_pack: 0 + use_fp16: true + sort_by_doc_length: true + attn_implementation: "eager" qwen3_gguf: repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" filename: "*Q8_0.gguf" @@ -168,7 +183,7 @@ Content-Type: application/json ``` `top_n` 为可选字段: -- 对本地后端(`qwen3_vllm` / `qwen3_transformers` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`)通常会忽略,仍返回全量分数。 +- 对本地后端(`qwen3_vllm` / `qwen3_transformers` / `qwen3_transformers_packed` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`)通常会忽略,仍返回全量分数。 - 对 `dashscope_rerank` 可用于控制云端返回的候选量,建议设置为 `page+size`(例如分页 `from=20,size=10` 时传 `30`)。 Response: @@ -206,5 +221,6 @@ uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info - 运行时可用环境变量临时覆盖批量参数:`RERANK_VLLM_INFER_BATCH_SIZE`、`RERANK_VLLM_SORT_BY_DOC_LENGTH`。 - **Qwen3-vLLM**:参考 [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B),需 GPU 与较多显存;与 BGE 相比适合长文本、高吞吐场景(vLLM 前缀缓存)。 - **Qwen3-Transformers**:官方 Transformers Usage 方式,无需 vLLM;适合 CPU 或小显存。默认 `attn_implementation: "sdpa"`;若已安装 `flash_attn` 可设 `flash_attention_2`(未安装时服务会自动回退到 sdpa)。 +- **Qwen3-Transformers-Packed**:仍使用 Hugging Face Transformers 与 PyTorch CUDA 内核,只定制 packed 输入、`position_ids` 和 4D `attention_mask`。它更适合在线检索里的“一个 query 对几百个短 doc”场景;默认 `attn_implementation: "eager"` 以保证自定义 mask 兼容性,若你的 `torch/transformers` 版本已验证支持,可再压测 `"sdpa"`。 - **Qwen3-GGUF**:参考 [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF)。单卡 T4 且仅剩约 `4.8~6GB` 显存时,推荐 `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true` 起步;若启动 OOM,优先把 `n_gpu_layers` 下调到 `20`,再把 `n_ctx` 下调到 `320`。`infer_batch_size` 在 GGUF 后端是服务侧 work chunk,大多不如 `n_gpu_layers` / `n_ctx` 关键。 - **Qwen3-GGUF-0.6B**:参考 [ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF](https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF)。它的优点是权重小、显存占用低,单进程实测约 `0.9~1.1 GiB`;但在当前 llama.cpp 串行打分接法下,`1 query + 400 titles` 的实测延迟仍约 `265s`。因此它更适合低显存功能后备,不适合作为在线低延迟主 reranker。 diff --git a/reranker/backends/__init__.py b/reranker/backends/__init__.py index f0d4499..7d23091 100644 --- a/reranker/backends/__init__.py +++ b/reranker/backends/__init__.py @@ -49,6 +49,11 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc if name == "qwen3_transformers": from reranker.backends.qwen3_transformers import Qwen3TransformersRerankerBackend return 
Qwen3TransformersRerankerBackend(config) + if name == "qwen3_transformers_packed": + from reranker.backends.qwen3_transformers_packed import ( + Qwen3TransformersPackedRerankerBackend, + ) + return Qwen3TransformersPackedRerankerBackend(config) if name == "qwen3_gguf": from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend gguf_config = dict(config or {}) @@ -63,7 +68,7 @@ def get_rerank_backend(name: str, config: Dict[str, Any]) -> RerankBackendProtoc from reranker.backends.dashscope_rerank import DashScopeRerankBackend return DashScopeRerankBackend(config) raise ValueError( - f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_vllm_score, qwen3_transformers, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank" + f"Unknown rerank backend: {name!r}. Supported: bge, qwen3_vllm, qwen3_vllm_score, qwen3_transformers, qwen3_transformers_packed, qwen3_gguf, qwen3_gguf_06b, dashscope_rerank" ) diff --git a/reranker/backends/qwen3_transformers_packed.py b/reranker/backends/qwen3_transformers_packed.py new file mode 100644 index 0000000..8af572a --- /dev/null +++ b/reranker/backends/qwen3_transformers_packed.py @@ -0,0 +1,398 @@ +""" +Qwen3-Reranker backend using packed inference with Transformers. + +This backend implements the sequence stitching optimization described in +Qwen3-Reranker packed inference examples: +1. Share the query/instruction prefix across many documents. +2. Reset document ``position_ids`` relative to the shared prefix. +3. Use a custom causal attention mask so each document can attend to the + prefix and itself, but never to other documents. + +Compared with the standard per-pair batching path, this reduces repeated +prefix computation and removes inter-sample padding waste. For online search +requests like ``1 query + 400 docs``, the backend further packs documents into +multiple chunks under a configurable total token budget. +""" + +from __future__ import annotations + +import logging +import threading +import time +from typing import Any, Dict, List, Sequence, Tuple + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +logger = logging.getLogger("reranker.backends.qwen3_transformers_packed") + +_DEFAULT_PREFIX = ( + "<|im_start|>system\n" + "Judge whether the Document meets the requirements based on the Query and the Instruct " + 'provided. Note that the answer can only be "yes" or "no".' + "<|im_end|>\n<|im_start|>user\n" ) +_DEFAULT_SUFFIX = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n" +_DEFAULT_PAIR_PREFIX_TEMPLATE = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n<Document>: " + + +def _deduplicate_with_positions(texts: Sequence[str]) -> Tuple[List[str], List[int]]: + unique_texts: List[str] = [] + position_to_unique: List[int] = [] + seen: Dict[str, int] = {} + + for text in texts: + idx = seen.get(text) + if idx is None: + idx = len(unique_texts) + seen[text] = idx + unique_texts.append(text) + position_to_unique.append(idx) + + return unique_texts, position_to_unique + + +class Qwen3TransformersPackedRerankerBackend: + """ + Qwen3-Reranker packed inference backend using Transformers. + + Config from ``services.rerank.backends.qwen3_transformers_packed``.
+ """ + + def __init__(self, config: Dict[str, Any]) -> None: + self._config = config or {} + model_name = str(self._config.get("model_name") or "Qwen/Qwen3-Reranker-0.6B") + self._instruction = str( + self._config.get("instruction") + or "Rank products by query with category & style match prioritized" + ) + self._prefix = str(self._config.get("prompt_prefix") or _DEFAULT_PREFIX) + self._suffix = str(self._config.get("prompt_suffix") or _DEFAULT_SUFFIX) + self._pair_prefix_template = str( + self._config.get("pair_prefix_template") or _DEFAULT_PAIR_PREFIX_TEMPLATE + ) + + max_model_len = int(self._config.get("max_model_len", 4096)) + max_doc_len = int(self._config.get("max_doc_len", 160)) + max_docs_per_pack = int(self._config.get("max_docs_per_pack", 0)) + use_fp16 = bool(self._config.get("use_fp16", True)) + device = self._config.get("device") + attn_impl = str(self._config.get("attn_implementation") or "eager").strip() + sort_by_doc_length = self._config.get("sort_by_doc_length", True) + + self._model_name = model_name + self._max_model_len = max_model_len + self._max_doc_len = max_doc_len + self._max_docs_per_pack = max_docs_per_pack + self._sort_by_doc_length = str(sort_by_doc_length).strip().lower() in { + "1", + "true", + "yes", + "y", + "on", + } + self._attn_impl = attn_impl + + logger.info( + "[Qwen3_Transformers_Packed] Loading model %s (max_model_len=%s, max_doc_len=%s, " + "max_docs_per_pack=%s, fp16=%s, attn_impl=%s)", + model_name, + max_model_len, + max_doc_len, + max_docs_per_pack, + use_fp16, + attn_impl, + ) + + self._tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") + self._tokenizer.pad_token = self._tokenizer.eos_token + + self._prefix_tokens = self._tokenizer.encode(self._prefix, add_special_tokens=False) + self._suffix_tokens = self._tokenizer.encode(self._suffix, add_special_tokens=False) + self._suffix_len = len(self._suffix_tokens) + + if not torch.cuda.is_available(): + raise RuntimeError( + "qwen3_transformers_packed backend requires CUDA GPU, " + "but torch.cuda.is_available() is False" + ) + + kwargs: Dict[str, Any] = {} + if use_fp16: + kwargs["torch_dtype"] = torch.float16 + if attn_impl: + kwargs["attn_implementation"] = attn_impl + + self._model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs).eval() + target_device = str(device).strip() if device is not None else "cuda" + if not target_device.startswith("cuda"): + raise ValueError( + "qwen3_transformers_packed backend is GPU-only. " + f"Unsupported device setting: {target_device!r}" + ) + self._model = self._model.to(target_device) + self._device = next(self._model.parameters()).device + if self._device.type != "cuda": + raise RuntimeError( + "qwen3_transformers_packed backend failed to place model on CUDA. " + f"Current device: {self._device}" + ) + + self._token_true_id = self._tokenizer.convert_tokens_to_ids("yes") + self._token_false_id = self._tokenizer.convert_tokens_to_ids("no") + if self._token_true_id is None or self._token_false_id is None: + raise RuntimeError("Failed to resolve Qwen3 reranker classifier token ids for yes/no") + + prefix_budget = len(self._prefix_tokens) + self._suffix_len + 1 + if self._max_model_len <= prefix_budget: + raise ValueError( + "max_model_len is too small for packed reranking. " + f"Need > {prefix_budget}, got {self._max_model_len}." 
+ ) + if self._max_doc_len <= 0: + raise ValueError(f"max_doc_len must be > 0, got {self._max_doc_len}") + if self._max_docs_per_pack < 0: + raise ValueError( + f"max_docs_per_pack must be >= 0, got {self._max_docs_per_pack}" + ) + + self._infer_lock = threading.Lock() + + logger.info( + "[Qwen3_Transformers_Packed] Model ready | model=%s device=%s", + model_name, + self._device, + ) + + def _build_pair_prefix_tokens(self, query: str) -> List[int]: + pair_prefix = self._pair_prefix_template.format( + prefix=self._prefix, + instruction=self._instruction, + query=query, + ) + return self._tokenizer.encode(pair_prefix, add_special_tokens=False) + + def _tokenize_documents(self, docs: Sequence[str], query_prefix_len: int) -> List[List[int]]: + max_doc_tokens = min( + self._max_doc_len, + max(1, self._max_model_len - query_prefix_len - self._suffix_len), + ) + tokenized = self._tokenizer( + list(docs), + padding=False, + truncation=True, + max_length=max_doc_tokens, + add_special_tokens=False, + return_attention_mask=False, + ) + return [list(ids) for ids in tokenized["input_ids"]] + + def _build_pack_plan( + self, + query_prefix_len: int, + doc_tokens: Sequence[Sequence[int]], + ) -> List[List[int]]: + order = list(range(len(doc_tokens))) + if self._sort_by_doc_length and len(order) > 1: + order.sort(key=lambda idx: len(doc_tokens[idx])) + + packs: List[List[int]] = [] + current_pack: List[int] = [] + current_len = query_prefix_len + for idx in order: + packed_doc_len = len(doc_tokens[idx]) + self._suffix_len + if packed_doc_len <= 0: + continue + + over_docs_cap = self._max_docs_per_pack > 0 and len(current_pack) >= self._max_docs_per_pack + over_token_cap = current_pack and (current_len + packed_doc_len > self._max_model_len) + if over_docs_cap or over_token_cap: + packs.append(current_pack) + current_pack = [] + current_len = query_prefix_len + + if query_prefix_len + packed_doc_len > self._max_model_len: + raise ValueError( + "Packed doc still exceeds max_model_len after truncation. 
" + f"query_prefix_len={query_prefix_len}, doc_len={packed_doc_len}, " + f"max_model_len={self._max_model_len}" + ) + + current_pack.append(idx) + current_len += packed_doc_len + + if current_pack: + packs.append(current_pack) + return packs + + def _build_pack_inputs( + self, + query_prefix_tokens: Sequence[int], + doc_tokens: Sequence[Sequence[int]], + doc_indices: Sequence[int], + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + prefix_len = len(query_prefix_tokens) + input_ids_list = list(query_prefix_tokens) + position_ids_list = list(range(prefix_len)) + spans: List[Tuple[int, int]] = [] + current_len = prefix_len + + for idx in doc_indices: + doc_with_suffix = list(doc_tokens[idx]) + self._suffix_tokens + start = current_len + end = start + len(doc_with_suffix) + spans.append((start, end)) + input_ids_list.extend(doc_with_suffix) + position_ids_list.extend(range(prefix_len, prefix_len + len(doc_with_suffix))) + current_len = end + + total_len = len(input_ids_list) + device = self._device + neg_inf = torch.finfo(torch.float32).min + + allowed = torch.zeros((total_len, total_len), dtype=torch.bool, device=device) + prefix_causal = torch.tril( + torch.ones((prefix_len, prefix_len), dtype=torch.bool, device=device) + ) + allowed[:prefix_len, :prefix_len] = prefix_causal + for start, end in spans: + allowed[start:end, :prefix_len] = True + doc_len = end - start + allowed[start:end, start:end] = torch.tril( + torch.ones((doc_len, doc_len), dtype=torch.bool, device=device) + ) + + attention_mask = torch.full( + (total_len, total_len), + neg_inf, + dtype=torch.float32, + device=device, + ) + attention_mask.masked_fill_(allowed, 0.0) + + inputs = { + "input_ids": torch.tensor([input_ids_list], dtype=torch.long, device=device), + "position_ids": torch.tensor([position_ids_list], dtype=torch.long, device=device), + "attention_mask": attention_mask.view(1, 1, total_len, total_len), + } + logits_ids = torch.tensor( + [end - 1 for _, end in spans], + dtype=torch.long, + device=device, + ) + return inputs, logits_ids + + @torch.no_grad() + def _score_pack( + self, + query_prefix_tokens: Sequence[int], + doc_tokens: Sequence[Sequence[int]], + doc_indices: Sequence[int], + ) -> Tuple[List[float], int]: + inputs, logits_ids = self._build_pack_inputs( + query_prefix_tokens=query_prefix_tokens, + doc_tokens=doc_tokens, + doc_indices=doc_indices, + ) + outputs = self._model(**inputs) + scores = outputs.logits[0, logits_ids, :] + true_vector = scores[:, self._token_true_id] + false_vector = scores[:, self._token_false_id] + pair_scores = torch.stack([false_vector, true_vector], dim=1) + pair_scores = torch.nn.functional.log_softmax(pair_scores, dim=1) + return pair_scores[:, 1].exp().tolist(), int(inputs["input_ids"].shape[1]) + + def score_with_meta( + self, + query: str, + docs: List[str], + normalize: bool = True, + ) -> Tuple[List[float], Dict[str, Any]]: + start_ts = time.time() + total_docs = len(docs) if docs else 0 + output_scores: List[float] = [0.0] * total_docs + + query = "" if query is None else str(query).strip() + indexed: List[Tuple[int, str]] = [] + for i, doc in enumerate(docs or []): + if doc is None: + continue + text = str(doc).strip() + if not text: + continue + indexed.append((i, text)) + + if not query or not indexed: + elapsed_ms = (time.time() - start_ts) * 1000.0 + return output_scores, { + "input_docs": total_docs, + "usable_docs": len(indexed), + "unique_docs": 0, + "dedup_ratio": 0.0, + "elapsed_ms": round(elapsed_ms, 3), + "model": self._model_name, + "backend": 
"qwen3_transformers_packed", + "normalize": normalize, + "packed_batches": 0, + "max_model_len": self._max_model_len, + "max_doc_len": self._max_doc_len, + "sort_by_doc_length": self._sort_by_doc_length, + } + + indexed_texts = [text for _, text in indexed] + unique_texts, position_to_unique = _deduplicate_with_positions(indexed_texts) + + query_prefix_tokens = self._build_pair_prefix_tokens(query) + doc_tokens = self._tokenize_documents(unique_texts, query_prefix_len=len(query_prefix_tokens)) + pack_plan = self._build_pack_plan( + query_prefix_len=len(query_prefix_tokens), + doc_tokens=doc_tokens, + ) + + unique_scores: List[float] = [0.0] * len(unique_texts) + pack_lengths: List[int] = [] + with self._infer_lock: + for pack_doc_indices in pack_plan: + batch_scores, pack_seq_len = self._score_pack( + query_prefix_tokens=query_prefix_tokens, + doc_tokens=doc_tokens, + doc_indices=pack_doc_indices, + ) + if len(batch_scores) != len(pack_doc_indices): + raise RuntimeError( + "Packed reranker score size mismatch: " + f"expected {len(pack_doc_indices)}, got {len(batch_scores)}" + ) + for idx, score in zip(pack_doc_indices, batch_scores): + unique_scores[idx] = float(score) + pack_lengths.append(pack_seq_len) + + for (orig_idx, _), unique_idx in zip(indexed, position_to_unique): + output_scores[orig_idx] = float(unique_scores[unique_idx]) + + elapsed_ms = (time.time() - start_ts) * 1000.0 + dedup_ratio = 0.0 + if indexed: + dedup_ratio = 1.0 - (len(unique_texts) / float(len(indexed))) + + meta = { + "input_docs": total_docs, + "usable_docs": len(indexed), + "unique_docs": len(unique_texts), + "dedup_ratio": round(dedup_ratio, 4), + "elapsed_ms": round(elapsed_ms, 3), + "model": self._model_name, + "backend": "qwen3_transformers_packed", + "normalize": normalize, + "packed_batches": len(pack_plan), + "packed_max_seq_len": max(pack_lengths) if pack_lengths else 0, + "packed_avg_seq_len": round(sum(pack_lengths) / len(pack_lengths), 3) + if pack_lengths + else 0.0, + "max_model_len": self._max_model_len, + "max_doc_len": self._max_doc_len, + "max_docs_per_pack": self._max_docs_per_pack, + "sort_by_doc_length": self._sort_by_doc_length, + "attn_implementation": self._attn_impl, + } + return output_scores, meta diff --git a/reranker/backends/qwen3_vllm_score.py b/reranker/backends/qwen3_vllm_score.py index c26580e..aab8e40 100644 --- a/reranker/backends/qwen3_vllm_score.py +++ b/reranker/backends/qwen3_vllm_score.py @@ -1,11 +1,15 @@ """ -Qwen3-Reranker via vLLM ``task="score"`` (official pooling/score API). +Qwen3-Reranker via vLLM ``LLM.score()`` (pooling / cross-encoder score API). -Matches vLLM ``examples/offline_inference/qwen3_reranker.py``: paired ``llm.score(query_texts, doc_texts)`` -with the recommended prefix/suffix templates. Same venv and default model as ``qwen3_vllm``. +Matches vLLM ``examples/offline_inference/qwen3_reranker.py``: paired +``llm.score(query_texts, doc_texts)`` with the recommended prefix/suffix templates. +Requires vLLM >= 0.17 (uses ``runner``/``convert`` auto, not legacy ``task="score"``). -Reference: https://docs.vllm.ai/ (Qwen3 reranker example) -https://docs.vllm.com.cn/en/latest/examples/offline_inference/qwen3_reranker.html +Dedicated venv: ``.venv-reranker-score`` + ``requirements_reranker_qwen3_vllm_score.txt`` +(see ``./scripts/setup_reranker_venv.sh qwen3_vllm_score``). Default ``model_name`` can match +``qwen3_vllm``; only the Python env differs for pinned high-performance vLLM. 
+ +Reference: https://docs.vllm.ai/ (Qwen3 reranker example) """ from __future__ import annotations @@ -35,9 +39,44 @@ _DEFAULT_QUERY_TEMPLATE = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n" _DEFAULT_DOCUMENT_TEMPLATE = "<Document>: {doc}{suffix}" +def _resolve_vllm_attention_config(config: Dict[str, Any]) -> Dict[str, Any] | None: + """ + vLLM 0.18 defaults to Flash-Attention paths that require compute capability >= 8 (Ampere+). + Turing / Volta (e.g. T4 sm_75) must use a non-FA backend such as TRITON_ATTN. + """ + env = (os.getenv("RERANK_VLLM_ATTENTION_BACKEND") or "").strip() + raw = config.get("vllm_attention_backend") + if env: + choice = env + elif raw is not None and str(raw).strip() and str(raw).strip().lower() != "auto": + choice = str(raw).strip() + else: + choice = "" + if choice: + backend = choice.strip().upper() + if backend == "AUTO": + choice = "" + else: + logger.info("[Qwen3_VLLM_SCORE] attention_config.backend=%s (from config/env)", backend) + return {"backend": backend} + + major, minor = torch.cuda.get_device_capability() + if major < 8: + logger.info( + "[Qwen3_VLLM_SCORE] GPU compute capability %d.%d < 8.0; using attention backend " + "TRITON_ATTN (Flash-Attention 2 requires sm >= 80). " + "Override with services.rerank.backends.qwen3_vllm_score.vllm_attention_backend " + "or RERANK_VLLM_ATTENTION_BACKEND.", + major, + minor, + ) + return {"backend": "TRITON_ATTN"} + return None + + class Qwen3VLLMScoreRerankerBackend: """ - Qwen3 reranker using vLLM ``LLM(..., task="score")`` and ``llm.score(queries, documents)``. + Qwen3 reranker using vLLM ``LLM.score()`` (pooling runner) for cross-encoder scores. Config from ``services.rerank.backends.qwen3_vllm_score``. """ @@ -139,6 +178,10 @@ class Qwen3VLLMScoreRerankerBackend: if hf_overrides: llm_kwargs["hf_overrides"] = hf_overrides + attn_cfg = _resolve_vllm_attention_config(self._config) + if attn_cfg is not None: + llm_kwargs["attention_config"] = attn_cfg + self._llm = LLM(**llm_kwargs) # vLLM score path: single-process safety (mirrors generate backend until verified). self._infer_lock = threading.Lock() diff --git a/reranker/server.py b/reranker/server.py index 4b60af3..ec76b4d 100644 --- a/reranker/server.py +++ b/reranker/server.py @@ -7,7 +7,7 @@ Request: { "query": "...", "docs": ["doc1", "doc2", ...], "normalize": optional Response: { "scores": [float], "meta": {...} } Backend selected via config: services.rerank.backend -(bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND. +(bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank), env RERANK_BACKEND.
""" import logging diff --git a/scripts/lib/reranker_backend_env.sh b/scripts/lib/reranker_backend_env.sh index f5812b3..343b6c8 100644 --- a/scripts/lib/reranker_backend_env.sh +++ b/scripts/lib/reranker_backend_env.sh @@ -38,10 +38,12 @@ reranker_backend_venv_dir() { local backend="$2" case "${backend}" in - qwen3_vllm|qwen3_vllm_score) printf '%s/.venv-reranker\n' "${project_root}" ;; + qwen3_vllm) printf '%s/.venv-reranker\n' "${project_root}" ;; + qwen3_vllm_score) printf '%s/.venv-reranker-score\n' "${project_root}" ;; qwen3_gguf) printf '%s/.venv-reranker-gguf\n' "${project_root}" ;; qwen3_gguf_06b) printf '%s/.venv-reranker-gguf-06b\n' "${project_root}" ;; qwen3_transformers) printf '%s/.venv-reranker-transformers\n' "${project_root}" ;; + qwen3_transformers_packed) printf '%s/.venv-reranker-transformers-packed\n' "${project_root}" ;; bge) printf '%s/.venv-reranker-bge\n' "${project_root}" ;; dashscope_rerank) printf '%s/.venv-reranker-dashscope\n' "${project_root}" ;; *) printf '%s/.venv-reranker-%s\n' "${project_root}" "${backend}" ;; @@ -53,10 +55,12 @@ reranker_backend_requirements_file() { local backend="$2" case "${backend}" in - qwen3_vllm|qwen3_vllm_score) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;; + qwen3_vllm) printf '%s/requirements_reranker_qwen3_vllm.txt\n' "${project_root}" ;; + qwen3_vllm_score) printf '%s/requirements_reranker_qwen3_vllm_score.txt\n' "${project_root}" ;; qwen3_gguf) printf '%s/requirements_reranker_qwen3_gguf.txt\n' "${project_root}" ;; qwen3_gguf_06b) printf '%s/requirements_reranker_qwen3_gguf_06b.txt\n' "${project_root}" ;; qwen3_transformers) printf '%s/requirements_reranker_qwen3_transformers.txt\n' "${project_root}" ;; + qwen3_transformers_packed) printf '%s/requirements_reranker_qwen3_transformers_packed.txt\n' "${project_root}" ;; bge) printf '%s/requirements_reranker_bge.txt\n' "${project_root}" ;; dashscope_rerank) printf '%s/requirements_reranker_dashscope.txt\n' "${project_root}" ;; *) return 1 ;; diff --git a/scripts/start_reranker.sh b/scripts/start_reranker.sh index e86428b..b26686f 100755 --- a/scripts/start_reranker.sh +++ b/scripts/start_reranker.sh @@ -47,22 +47,29 @@ if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}" fi -if [[ "${RERANK_BACKEND}" == "qwen3_vllm" || "${RERANK_BACKEND}" == "qwen3_vllm_score" ]]; then +if [[ "${RERANK_BACKEND}" == "qwen3_vllm" || "${RERANK_BACKEND}" == "qwen3_vllm_score" || "${RERANK_BACKEND}" == "qwen3_transformers_packed" ]]; then if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then echo "ERROR: ${RERANK_BACKEND} backend requires NVIDIA GPU, but nvidia-smi is unavailable." >&2 exit 1 fi if ! "${PYTHON_BIN}" - <<'PY' try: - import vllm # noqa: F401 import torch + try: + import vllm # noqa: F401 + except Exception: + pass if not torch.cuda.is_available(): raise SystemExit(1) except Exception: raise SystemExit(1) PY then - echo "ERROR: ${RERANK_BACKEND} backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2 + if [[ "${RERANK_BACKEND}" == "qwen3_transformers_packed" ]]; then + echo "ERROR: ${RERANK_BACKEND} backend requires torch + CUDA runtime in ${RERANKER_VENV}." >&2 + else + echo "ERROR: ${RERANK_BACKEND} backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2 + fi echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND} and verify CUDA is available." >&2 exit 1 fi -- libgit2 0.21.2