Commit b0972ff9ec69731615d76b0d73cb4b9f59317bea
1 parent: 540fb5af
qwen3_vllm_score attention TRITON_ATTN -> FLASHINFER
(The attention backend had previously been switched back to TRITON_ATTN because of an error, which made performance worse than the earlier vLLM setup. That error turned out to be fixable; it is now fixed and the backend stays on FLASHINFER.)
Showing 7 changed files with 456 additions and 466 deletions.
config/config.yaml
| @@ -381,7 +381,7 @@ services: | @@ -381,7 +381,7 @@ services: | ||
| 381 | max_docs: 1000 | 381 | max_docs: 1000 |
| 382 | normalize: true | 382 | normalize: true |
| 383 | # 服务内后端(reranker 进程启动时读取) | 383 | # 服务内后端(reranker 进程启动时读取) |
| 384 | - backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank | 384 | + backend: "qwen3_vllm_score" # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank |
| 385 | backends: | 385 | backends: |
| 386 | bge: | 386 | bge: |
| 387 | model_name: "BAAI/bge-reranker-v2-m3" | 387 | model_name: "BAAI/bge-reranker-v2-m3" |
| @@ -394,7 +394,7 @@ services: | @@ -394,7 +394,7 @@ services: | ||
| 394 | qwen3_vllm: | 394 | qwen3_vllm: |
| 395 | model_name: "Qwen/Qwen3-Reranker-0.6B" | 395 | model_name: "Qwen/Qwen3-Reranker-0.6B" |
| 396 | engine: "vllm" | 396 | engine: "vllm" |
| 397 | - max_model_len: 160 | 397 | + max_model_len: 256 |
| 398 | tensor_parallel_size: 1 | 398 | tensor_parallel_size: 1 |
| 399 | gpu_memory_utilization: 0.20 | 399 | gpu_memory_utilization: 0.20 |
| 400 | dtype: "float16" | 400 | dtype: "float16" |
| @@ -402,9 +402,8 @@ services: | @@ -402,9 +402,8 @@ services: | ||
| 402 | enforce_eager: false | 402 | enforce_eager: false |
| 403 | infer_batch_size: 100 | 403 | infer_batch_size: 100 |
| 404 | sort_by_doc_length: true | 404 | sort_by_doc_length: true |
| 405 | - # 与 reranker/backends/qwen3_vllm.py 一致:standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct) | ||
| 406 | - # instruction_format: compact | ||
| 407 | - instruction_format: standard | 405 | + # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct) |
| 406 | + instruction_format: standard # compact standard | ||
| 408 | # instruction: "Given a query, score the product for relevance" | 407 | # instruction: "Given a query, score the product for relevance" |
| 409 | # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点 | 408 | # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点 |
| 410 | # instruction: "rank products by given query, category match first" | 409 | # instruction: "rank products by given query, category match first" |
| @@ -420,18 +419,12 @@ services: | @@ -420,18 +419,12 @@ services: | ||
| 420 | model_name: "Qwen/Qwen3-Reranker-0.6B" | 419 | model_name: "Qwen/Qwen3-Reranker-0.6B" |
| 421 | # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false | 420 | # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false |
| 422 | use_original_qwen3_hf_overrides: true | 421 | use_original_qwen3_hf_overrides: true |
| 423 | - # vLLM 0.18:算力 < 8(如 T4)默认注入 TRITON_ATTN,避免 FA2 在 sm<80 上报错;若更慢可关回退让 vLLM 自选: | ||
| 424 | - # auto_triton_attn_on_sm_lt_8: false | ||
| 425 | - # 关回退时 vLLM 可能走 FLASHINFER,首次 score 会 JIT,需 PATH 上有 ninja(requirements 已列 ninja;请用 ./scripts/start_reranker.sh 或 source venv/bin/activate,勿裸跑 /usr/bin 解析后的 python 且 PATH 无 venv/bin) | ||
| 426 | - # 或环境变量 RERANK_VLLM_AUTO_TRITON_ATTN=0;仍可直接指定后端:RERANK_VLLM_ATTENTION_BACKEND / vllm_attention_backend | ||
| 427 | - # vllm_attention_backend: "auto" | ||
| 428 | - # 可选:与 vLLM 对齐;一般保持 auto | ||
| 429 | # vllm_runner: "auto" | 422 | # vllm_runner: "auto" |
| 430 | # vllm_convert: "auto" | 423 | # vllm_convert: "auto" |
| 431 | # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并 | 424 | # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并 |
| 432 | # hf_overrides: {} | 425 | # hf_overrides: {} |
| 433 | engine: "vllm" | 426 | engine: "vllm" |
| 434 | - max_model_len: 160 | 427 | + max_model_len: 256 |
| 435 | tensor_parallel_size: 1 | 428 | tensor_parallel_size: 1 |
| 436 | gpu_memory_utilization: 0.20 | 429 | gpu_memory_utilization: 0.20 |
| 437 | dtype: "float16" | 430 | dtype: "float16" |
| @@ -439,9 +432,8 @@ services: | @@ -439,9 +432,8 @@ services: | ||
| 439 | enforce_eager: false | 432 | enforce_eager: false |
| 440 | infer_batch_size: 100 | 433 | infer_batch_size: 100 |
| 441 | sort_by_doc_length: true | 434 | sort_by_doc_length: true |
| 442 | - # 与 qwen3_vllm 同名项语义一致;默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致 | ||
| 443 | - # instruction_format: compact | ||
| 444 | - instruction_format: standard | 435 | + # 默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致 |
| 436 | + instruction_format: standard # compact standard | ||
| 445 | instruction: "Rank products by query with category & style match prioritized" | 437 | instruction: "Rank products by query with category & style match prioritized" |
| 446 | qwen3_transformers: | 438 | qwen3_transformers: |
| 447 | model_name: "Qwen/Qwen3-Reranker-0.6B" | 439 | model_name: "Qwen/Qwen3-Reranker-0.6B" |
| @@ -458,7 +450,7 @@ services: | @@ -458,7 +450,7 @@ services: | ||
| 458 | qwen3_transformers_packed: | 450 | qwen3_transformers_packed: |
| 459 | model_name: "Qwen/Qwen3-Reranker-0.6B" | 451 | model_name: "Qwen/Qwen3-Reranker-0.6B" |
| 460 | instruction: "Rank products by query with category & style match prioritized" | 452 | instruction: "Rank products by query with category & style match prioritized" |
| 461 | - max_model_len: 4096 | 453 | + max_model_len: 256 |
| 462 | max_doc_len: 160 | 454 | max_doc_len: 160 |
| 463 | max_docs_per_pack: 0 | 455 | max_docs_per_pack: 0 |
| 464 | use_fp16: true | 456 | use_fp16: true |
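The `config.yaml` hunks above keep `infer_batch_size`, `sort_by_doc_length`, and the adjusted `max_model_len` as the main service-side knobs. A minimal sketch of what that service-side path does with these settings (global dedup, length-sorted batching, scatter back to input order), using illustrative function names rather than the actual APIs in `reranker/backends/*.py`:

```python
# Sketch of the batching that infer_batch_size / sort_by_doc_length control.
# Names are illustrative, not the repo's real functions.
from typing import Callable


def rerank_batched(
    query: str,
    docs: list[str],
    score_pairs: Callable[[str, list[str]], list[float]],  # e.g. wraps LLM.score()
    infer_batch_size: int = 100,
    sort_by_doc_length: bool = True,
) -> list[float]:
    # 1) global dedup: score each unique doc once, remember original slots
    unique: dict[str, list[int]] = {}
    for i, d in enumerate(docs):
        unique.setdefault(d, []).append(i)
    items = list(unique.keys())

    # 2) sort by character length so each batch has similar padding
    if sort_by_doc_length:
        items.sort(key=len)

    # 3) batched inference
    doc_score: dict[str, float] = {}
    for start in range(0, len(items), infer_batch_size):
        chunk = items[start:start + infer_batch_size]
        for d, s in zip(chunk, score_pairs(query, chunk)):
            doc_score[d] = s

    # 4) scatter scores back to the original input order
    scores = [0.0] * len(docs)
    for d, positions in unique.items():
        for i in positions:
            scores[i] = doc_score[d]
    return scores
```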
perf_reports/reranker_vllm_instruction/2026-03-25/RESULTS.md
| @@ -51,7 +51,7 @@ JSON aggregates (means, stdev, raw `values_ms`): same directory, `qwen3_vllm_{co | @@ -51,7 +51,7 @@ JSON aggregates (means, stdev, raw `values_ms`): same directory, `qwen3_vllm_{co | ||
| 51 | 51 | ||
| 52 | 1. **`compact` vs `standard`:** For both backends, **`compact` is faster** on this setup (shorter / different chat template vs fixed yes/no system prompt + user block — see `reranker/backends/qwen3_vllm.py` / `qwen3_vllm_score.py`). | 52 | 1. **`compact` vs `standard`:** For both backends, **`compact` is faster** on this setup (shorter / different chat template vs fixed yes/no system prompt + user block — see `reranker/backends/qwen3_vllm.py` / `qwen3_vllm_score.py`). |
| 53 | 2. **`qwen3_vllm` vs `qwen3_vllm_score`:** At **`n=1000`**, **`qwen3_vllm` + `compact`** is the fastest row (~2162 ms mean); **`qwen3_vllm_score` + `standard`** is the slowest (~2932 ms). Ordering can change on other GPUs / vLLM versions / batching. | 53 | 2. **`qwen3_vllm` vs `qwen3_vllm_score`:** At **`n=1000`**, **`qwen3_vllm` + `compact`** is the fastest row (~2162 ms mean); **`qwen3_vllm_score` + `standard`** is the slowest (~2932 ms). Ordering can change on other GPUs / vLLM versions / batching. |
| 54 | -3. **Repo default** after tests: `services.rerank.backend: qwen3_vllm_score`, `instruction_format: compact` on **both** `qwen3_vllm` and `qwen3_vllm_score` blocks (patch script keeps them aligned). | 54 | +3. **Repo / 运维默认(当前)**:`services.rerank.backend` 多为 `qwen3_vllm_score`;**score** 块推荐 **`instruction_format: compact`**(与后端代码默认值一致)。`qwen3_vllm` 块的 `instruction_format` 可与 generate 后端单独配置,不必与 score 强制相同。 |
| 55 | 55 | ||
| 56 | ## Tooling added / changed | 56 | ## Tooling added / changed |
| 57 | 57 | ||
| @@ -59,3 +59,38 @@ JSON aggregates (means, stdev, raw `values_ms`): same directory, `qwen3_vllm_{co | @@ -59,3 +59,38 @@ JSON aggregates (means, stdev, raw `values_ms`): same directory, `qwen3_vllm_{co | ||
| 59 | - `scripts/benchmark_reranker_random_titles.py`: `--tag`, `--json-summary-out`, `--quiet-runs`. | 59 | - `scripts/benchmark_reranker_random_titles.py`: `--tag`, `--json-summary-out`, `--quiet-runs`. |
| 60 | - `scripts/patch_rerank_vllm_benchmark_config.py`: surgical YAML patch (preserves newlines). | 60 | - `scripts/patch_rerank_vllm_benchmark_config.py`: surgical YAML patch (preserves newlines). |
| 61 | - `scripts/run_reranker_vllm_instruction_benchmark.sh`: full matrix driver (continues if a benchmark exits non-zero; uses `--timeout 360`). | 61 | - `scripts/run_reranker_vllm_instruction_benchmark.sh`: full matrix driver (continues if a benchmark exits non-zero; uses `--timeout 360`). |
| 62 | + | ||
| 63 | +--- | ||
| 64 | + | ||
| 65 | +## Addendum: `qwen3_vllm_score` after attention auto-select (FLASHINFER on T4) | ||
| 66 | + | ||
| 67 | +**Do not replace the table above** — it records the **older** `qwen3_vllm_score` behaviour (roughly: sm<8 时向 vLLM 注入 `attention_config` / `TRITON_ATTN`,且代码里 `instruction_format` 默认曾为 `standard`). | ||
| 68 | + | ||
| 69 | +### What changed in code / ops | ||
| 70 | + | ||
| 71 | +| Area | Before (baseline table) | After (this addendum) | | ||
| 72 | +|------|-------------------------|------------------------| | ||
| 73 | +| Attention | Backend forced / steered attention on T4 (e.g. `TRITON_ATTN` path) | **No** `attention_config` in `LLM(...)`; vLLM **auto** — on this T4 run, logs show **`FLASHINFER`** | | ||
| 74 | +| Config surface | `vllm_attention_backend` / `RERANK_VLLM_ATTENTION_BACKEND` 等 | **Removed**(少 YAML/环境变量分支,逻辑收敛) | | ||
| 75 | +| Code default `instruction_format` | `qwen3_vllm_score` 默认 `standard` | 与 `qwen3_vllm` 对齐为 **`compact`**(仍可在 YAML 写 `standard`) | | ||
| 76 | +| Smoke / 启动 | — | `scripts/smoke_qwen3_vllm_score_backend.py`;`scripts/start_reranker.sh` 将 **venv `bin` 置于 `PATH`**(FLASHINFER JIT 依赖 venv 内的 `ninja`) | | ||
| 77 | + | ||
| 78 | +Micro-benchmark (same machine, isolated): **~927.5 ms → ~673.1 ms** at **n=400** docs on `LLM.score()` steady state (~**28%**), after removing the forced attention path and letting vLLM pick **FLASHINFER**. | ||
| 79 | + | ||
| 80 | +### Re-benchmark (HTTP `POST /rerank`, same methodology as §Methodology) | ||
| 81 | + | ||
| 82 | +- **Purpose:** Same comparison axis as the main table (`qwen3_vllm_score` only), **after** the FLASHINFER-friendly backend. | ||
| 83 | +- **Controlled for `max_model_len`:** `services.rerank.backends.qwen3_vllm_score.max_model_len` set to **160** for this run so numbers are comparable to the **baseline** rows (also 160). Production `config.yaml` may use a different value (e.g. **196**); adjust YAML before repeating the benchmark if you need prod-shaped latency. | ||
| 84 | +- **Seed / repeats:** `--seed 99`, `--repeat 5`, same script and title file as §Methodology. | ||
| 85 | +- **Artifacts:** `qwen3_vllm_score_compact_post_flashinfer_opt.json`, `qwen3_vllm_score_standard_post_flashinfer_opt.json`. | ||
| 86 | + | ||
| 87 | +#### `qwen3_vllm_score` — mean latency (ms), post optimization | ||
| 88 | + | ||
| 89 | +| instruction_format | n=100 | n=200 | n=400 | n=600 | n=800 | n=1000 | vs baseline same row (approx.) | | ||
| 90 | +|--------------------|------:|------:|------:|------:|------:|-------:|--------------------------------| | ||
| 91 | +| `compact` | 178.5 | 351.7 | **688.2** | 1024.0 | 1375.8 | **1752.4** | e.g. n=400 **−28.8%**, n=1000 **−27.8%** vs 966.2 / 2428.4 | | ||
| 92 | +| `standard` | 198.4 | 386.4 | **778.8** | 1174.6 | 1548.1 | **1956.6** | e.g. n=400 **−33.9%**, n=1000 **−33.3%** vs 1178.9 / 2931.7 | | ||
| 93 | + | ||
| 94 | +**`instruction_format: standard` 的优化点(本版):** 与 `compact` **共享**同一套 vLLM attention 自动选择;不再在 T4 上单独锁死 `TRITON_ATTN`。Prompt 仍比 `compact` 更长(固定 yes/no system + 官方前缀模板),因此 **absolute 延迟仍高于 `compact`**,但相对旧版 **standard** 行降幅与 **compact** 同量级(上表)。 | ||
| 95 | + | ||
| 96 | +**Takeaway:** Under T4 + vLLM 0.18 score path, **auto attention (FLASHINFER)** plus **`compact` default** brings `qwen3_vllm_score` much closer to `qwen3_vllm` timings from the baseline matrix; re-run the full 4-way matrix if you need refreshed `qwen3_vllm` rows on the same commit. |
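The "vs baseline" column in the addendum table can be reproduced from the stated means; a minimal check using only the numbers quoted above (baseline means 966.2 / 2428.4 for `compact` and 1178.9 / 2931.7 for `standard`):

```python
# Reproduce the "vs baseline" percentages quoted in the addendum table.
pairs = {
    ("compact", "n=400"): (966.2, 688.2),
    ("compact", "n=1000"): (2428.4, 1752.4),
    ("standard", "n=400"): (1178.9, 778.8),
    ("standard", "n=1000"): (2931.7, 1956.6),
}
for key, (baseline_ms, post_ms) in pairs.items():
    delta = (post_ms - baseline_ms) / baseline_ms * 100.0
    print(key, f"{delta:+.1f}%")  # -28.8%, -27.8%, -33.9%, -33.3%
```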
reranker/README.md
| @@ -6,221 +6,433 @@ | @@ -6,221 +6,433 @@ | ||
| 6 | 6 | ||
| 7 | Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwen3-vLLM、Qwen3-Transformers、Qwen3-GGUF、DashScope 云重排)。调用方通过 HTTP 访问,不关心具体后端。 | 7 | Reranker 服务提供统一的 `/rerank` API,支持可插拔后端(BGE、Qwen3-vLLM、Qwen3-Transformers、Qwen3-GGUF、DashScope 云重排)。调用方通过 HTTP 访问,不关心具体后端。 |
| 8 | 8 | ||
| 9 | -**特性** | ||
| 10 | -- 多后端:`qwen3_vllm`、`qwen3_vllm_score`(同模型,vLLM ``LLM.score()`` + 独立 `.venv-reranker-score`)、`qwen3_transformers`、`qwen3_transformers_packed`(共享前缀 + packed attention mask)、`qwen3_gguf`(Qwen3-Reranker-4B GGUF + llama.cpp)、`qwen3_gguf_06b`(Qwen3-Reranker-0.6B Q8_0 GGUF + llama.cpp)、`bge`(兼容保留) | ||
| 11 | -- 云后端:`dashscope_rerank`(调用 DashScope `/compatible-api/v1/reranks`,支持按地域切换 endpoint) | ||
| 12 | -- 统一配置:`config/config.yaml` → `services.rerank.backend` / `services.rerank.backends.<name>` | ||
| 13 | -- 文档去重、分数与输入顺序一致、FP16/GPU 支持(视后端) | 9 | +## 当前结论 |
| 10 | + | ||
| 11 | +在当前项目的线上形态里,**首选后端是 `qwen3_vllm_score`**,**次选后端是 `qwen3_vllm`**。 | ||
| 12 | + | ||
| 13 | +原因不是“`LLM.score()` 理论上更高级”,而是这轮优化后,`qwen3_vllm_score` 在当前硬件和依赖栈上形成了一套更干净、更稳定、也更快的组合: | ||
| 14 | + | ||
| 15 | +- 模型:`Qwen/Qwen3-Reranker-0.6B` | ||
| 16 | +- GPU:Tesla T4 16GB | ||
| 17 | +- CUDA:12.8 | ||
| 18 | +- PyTorch:`2.10.0+cu128` | ||
| 19 | +- vLLM-score 环境:`vllm==0.18.0` | ||
| 20 | +- attention:**由 vLLM 运行时自动选择**后端实现;在已验证的 T4 栈上日志可见 **`FLASHINFER`** | ||
| 21 | + | ||
| 22 | +这次经验沉淀的核心结论有 4 条: | ||
| 23 | + | ||
| 24 | +1. **`qwen3_vllm_score` 的 attention 实现由 vLLM 在运行时按 GPU 与版本自动选择**。 | ||
| 25 | +2. 在已验证栈(T4 + vLLM 0.18.x 等)上,日志可见选用 **`FLASHINFER`** 等由运行时选定的路径。 | ||
| 26 | +3. 无论 `score` 还是 `generate`,真正有价值的优化点都不是 prompt 小改,而是: | ||
| 27 | + 去重、按 doc 长度排序分批、合适的 `infer_batch_size`、合理的 `max_model_len`、前缀缓存、隔离 venv 与运行时缓存目录。 | ||
| 28 | +4. 本项目当前统一把 `instruction_format` 配成 `standard`。代码仍兼容两种格式,但**它不是本轮性能优化的重点,也不是推荐继续投入精力的方向**。 | ||
| 29 | + | ||
| 30 | +## 后端总览 | ||
| 31 | + | ||
| 32 | +| 后端 | 当前定位 | 结论 | | ||
| 33 | +|------|----------|------| | ||
| 34 | +| `qwen3_vllm_score` | 主推荐 | 走 vLLM **`LLM.score()`** 的 **pooling / classify** 路径:对每条 (query, doc) **直接产出相关分**,不经 causal LM 的整步 **generate**。相对 **`qwen3_vllm`**(`generate(max_tokens=1)` + **yes/no** 的 logprob 推导),**省去**每对样本上**大词表 softmax / 采样约束**那一层的常规开销,语义与 cross-encoder 式 rerank 更一致;在当前栈与 T4 上延迟表现最好 | | ||
| 35 | +| `qwen3_vllm` | 次推荐 | 稳定、成熟、好排障,是很好的 fallback 和对照组 | | ||
| 36 | +| `qwen3_transformers` | 兼容方案 | | | ||
| 37 | +| `qwen3_transformers_packed` | 特定场景方案 | 实现可能还有问题,尚未调好 | ||
| 38 | +| `qwen3_gguf` / `qwen3_gguf_06b` | 低显存 / 功能兜底 | 更适合资源受限场景,不适合作为当前主在线方案 | | ||
| 39 | +| `dashscope_rerank` | 云服务方案 | 运维简单,但依赖外部服务和网络 | | ||
| 14 | 40 | ||
| 15 | ## 目录与入口 | 41 | ## 目录与入口 |
| 42 | + | ||
| 16 | - `reranker/server.py`:FastAPI 服务,启动时按配置加载一个后端 | 43 | - `reranker/server.py`:FastAPI 服务,启动时按配置加载一个后端 |
| 17 | - `reranker/backends/`:后端实现与工厂 | 44 | - `reranker/backends/`:后端实现与工厂 |
| 18 | - `backends/__init__.py`:`get_rerank_backend(name, config)` | 45 | - `backends/__init__.py`:`get_rerank_backend(name, config)` |
| 19 | - - `backends/bge.py`:BGE 后端 | ||
| 20 | - - `backends/qwen3_vllm.py`:Qwen3-Reranker-0.6B + vLLM(generate + logprobs) | ||
| 21 | - - `backends/qwen3_vllm_score.py`:同上模型 + vLLM ``LLM.score()``(`requirements_reranker_qwen3_vllm_score.txt` / `.venv-reranker-score`) | ||
| 22 | - - `backends/qwen3_transformers.py`:Qwen3-Reranker-0.6B 纯 Transformers 后端(官方 Usage 方式) | ||
| 23 | - - `backends/qwen3_transformers_packed.py`:Qwen3-Reranker-0.6B + Transformers packed 推理(共享 query prefix,适合 `1 query + 400 docs`) | ||
| 24 | - - `backends/qwen3_gguf.py`:Qwen3-Reranker GGUF + llama.cpp 后端(支持 `qwen3_gguf` / `qwen3_gguf_06b`) | ||
| 25 | - - `backends/dashscope_rerank.py`:DashScope 云重排后端(HTTP 调用) | ||
| 26 | -- `reranker/bge_reranker.py`:BGE 核心推理(被 bge 后端封装) | ||
| 27 | -- `reranker/config.py`:服务端口、MAX_DOCS、NORMALIZE 等(后端参数在 config.yaml) | ||
| 28 | - | ||
| 29 | -## 依赖 | ||
| 30 | -- 通用:`torch`、`transformers`、`fastapi`、`uvicorn`(隔离环境见 `requirements_reranker_service.txt`;全量 ML 环境另见 `requirements_ml.txt`) | ||
| 31 | -- **Qwen3-vLLM 后端**:`vllm>=0.8.5`、`transformers>=4.51.0`(`qwen3_vllm` → `.venv-reranker`) | ||
| 32 | -- **Qwen3-vLLM-score 后端**:固定 `vllm==0.18.0`(`qwen3_vllm_score` → `.venv-reranker-score`,见 `requirements_reranker_qwen3_vllm_score.txt`) | ||
| 33 | -- **Qwen3-Transformers 后端**:`transformers>=4.51.0`、`torch`(无需 vLLM,适合 CPU 或小显存) | ||
| 34 | -- **Qwen3-Transformers-Packed 后端**:复用 Transformers 依赖(`qwen3_transformers_packed` → `.venv-reranker-transformers-packed`) | ||
| 35 | -- **Qwen3-GGUF 后端**:`llama-cpp-python>=0.3.16` | ||
| 36 | -- 现在按 backend 使用独立 venv: | ||
| 37 | - - `qwen3_vllm` -> `.venv-reranker` | ||
| 38 | - - `qwen3_vllm_score` -> `.venv-reranker-score` | ||
| 39 | - - `qwen3_gguf` -> `.venv-reranker-gguf` | ||
| 40 | - - `qwen3_gguf_06b` -> `.venv-reranker-gguf-06b` | ||
| 41 | - - `qwen3_transformers` -> `.venv-reranker-transformers` | ||
| 42 | - - `qwen3_transformers_packed` -> `.venv-reranker-transformers-packed` | ||
| 43 | - - `bge` -> `.venv-reranker-bge` | ||
| 44 | - - `dashscope_rerank` -> `.venv-reranker-dashscope` | ||
| 45 | - ```bash | ||
| 46 | - ./scripts/setup_reranker_venv.sh qwen3_gguf_06b | ||
| 47 | - ``` | ||
| 48 | - CUDA 构建建议: | ||
| 49 | - ```bash | ||
| 50 | - PATH=/usr/local/cuda/bin:$PATH \ | ||
| 51 | - CUDACXX=/usr/local/cuda/bin/nvcc \ | ||
| 52 | - CMAKE_ARGS="-DGGML_CUDA=on" \ | ||
| 53 | - FORCE_CMAKE=1 \ | ||
| 54 | - ./.venv-reranker-gguf/bin/pip install --no-cache-dir --force-reinstall --no-build-isolation llama-cpp-python==0.3.18 | ||
| 55 | - ``` | ||
| 56 | - | ||
| 57 | -## 配置 | ||
| 58 | -- **后端选择**:`config/config.yaml` 中 `services.rerank.backend`(`qwen3_vllm` | `qwen3_vllm_score` | `qwen3_transformers` | `qwen3_transformers_packed` | `qwen3_gguf` | `qwen3_gguf_06b` | `bge` | `dashscope_rerank`),或环境变量 `RERANK_BACKEND`。 | ||
| 59 | -- **后端参数**:`services.rerank.backends.bge` / `services.rerank.backends.qwen3_vllm`,例如: | 46 | + - `backends/qwen3_vllm_score.py`:当前最优的本地 GPU reranker |
| 47 | + - `backends/qwen3_vllm.py`:次优的本地 GPU reranker | ||
| 48 | + - `backends/qwen3_transformers.py`:Transformers 基线实现 | ||
| 49 | + - `backends/qwen3_transformers_packed.py`:packed 推理实现 | ||
| 50 | + - `backends/qwen3_gguf.py`:GGUF + llama.cpp 后端 | ||
| 51 | + - `backends/dashscope_rerank.py`:DashScope 云端重排后端 | ||
| 52 | +- `scripts/setup_reranker_venv.sh`:按后端创建独立 venv | ||
| 53 | +- `scripts/start_reranker.sh`:启动 reranker 服务 | ||
| 54 | +- `scripts/smoke_qwen3_vllm_score_backend.py`:`qwen3_vllm_score` 本地 smoke | ||
| 55 | +- `scripts/benchmark_reranker_random_titles.py`:随机标题压测脚本 | ||
| 56 | +- `scripts/run_reranker_vllm_instruction_benchmark.sh`:历史矩阵脚本 | ||
| 57 | + | ||
| 58 | +## 环境基线 | ||
| 59 | + | ||
| 60 | +当前验证环境: | ||
| 61 | + | ||
| 62 | +- GPU:`Tesla T4 16GB` | ||
| 63 | +- Driver / CUDA:`570.158.01 / 12.8` | ||
| 64 | +- Python:`3.12.3` | ||
| 65 | +- `torch`:`2.10.0+cu128` | ||
| 66 | +- `transformers`:`4.51+` | ||
| 67 | +- `qwen3_vllm_score` 环境:`vllm==0.18.0` | ||
| 68 | +- `qwen3_vllm` 环境:`vllm>=0.8.5` | ||
| 69 | + | ||
| 70 | +独立 venv 约定: | ||
| 71 | + | ||
| 72 | +- `qwen3_vllm` -> `.venv-reranker` | ||
| 73 | +- `qwen3_vllm_score` -> `.venv-reranker-score` | ||
| 74 | +- `qwen3_transformers` -> `.venv-reranker-transformers` | ||
| 75 | +- `qwen3_transformers_packed` -> `.venv-reranker-transformers-packed` | ||
| 76 | +- `qwen3_gguf` -> `.venv-reranker-gguf` | ||
| 77 | +- `qwen3_gguf_06b` -> `.venv-reranker-gguf-06b` | ||
| 78 | +- `bge` -> `.venv-reranker-bge` | ||
| 79 | +- `dashscope_rerank` -> `.venv-reranker-dashscope` | ||
| 80 | + | ||
| 81 | +这样做不是形式主义,而是因为: | ||
| 82 | + | ||
| 83 | +- 不同后端的 CUDA / vLLM / llama.cpp 依赖耦合很深,混装后更难定位性能和兼容性问题 | ||
| 84 | +- `qwen3_vllm_score` 与 `qwen3_vllm` 分成两个环境,是因为 `qwen3_vllm_score` 固定使用 vllm 0.18;后续测试表明两者性能相当,理论上可以共用一个环境,目前尚未合并 | ||
| 85 | + | ||
| 86 | +## 安装与部署 | ||
| 87 | + | ||
| 88 | +### 1. 创建后端环境 | ||
| 89 | + | ||
| 90 | +`qwen3_vllm_score`: | ||
| 91 | + | ||
| 92 | +```bash | ||
| 93 | +./scripts/setup_reranker_venv.sh qwen3_vllm_score | ||
| 94 | +``` | ||
| 95 | + | ||
| 96 | +`qwen3_vllm`: | ||
| 97 | + | ||
| 98 | +```bash | ||
| 99 | +./scripts/setup_reranker_venv.sh qwen3_vllm | ||
| 100 | +``` | ||
| 101 | + | ||
| 102 | +### 2. 基础检查 | ||
| 103 | + | ||
| 104 | +```bash | ||
| 105 | +nvidia-smi | ||
| 106 | +./.venv-reranker-score/bin/python -c "import torch, vllm; print(torch.cuda.is_available(), torch.cuda.get_device_name(0), vllm.__version__)" | ||
| 107 | +./.venv-reranker/bin/python -c "import torch, vllm; print(torch.cuda.is_available(), torch.cuda.get_device_name(0), vllm.__version__)" | ||
| 108 | +``` | ||
| 109 | + | ||
| 110 | +### 3. 启动服务 | ||
| 111 | + | ||
| 112 | +```bash | ||
| 113 | +./scripts/start_reranker.sh | ||
| 114 | +``` | ||
| 115 | + | ||
| 116 | +`scripts/start_reranker.sh` 做了几件对性能和稳定性都很关键的事: | ||
| 117 | + | ||
| 118 | +- 自动选择当前 backend 对应的独立 venv | ||
| 119 | +- 为 vLLM / triton / torch.compile 指定独立缓存目录 | ||
| 120 | +- 把后端 venv 的 `bin` 放到 `PATH` 前面 | ||
| 121 | + | ||
| 122 | +最后这一点很重要。对 `qwen3_vllm_score` 来说,T4 上 vLLM 自动选择 `FLASHINFER` 时,首次 JIT 需要 `ninja`,而 `ninja` 是装在对应 venv 里的。如果裸跑一个没有正确 `PATH` 的 Python 进程,就可能出现“环境明明装了,worker 里却找不到编译工具”的问题。 | ||
| 123 | + | ||
| 124 | +### 4. Smoke | ||
| 125 | + | ||
| 126 | +```bash | ||
| 127 | +PYTHONPATH=. ./.venv-reranker-score/bin/python scripts/smoke_qwen3_vllm_score_backend.py --gpu-memory-utilization 0.2 | ||
| 128 | +``` | ||
| 129 | + | ||
| 130 | +如果显卡上还有别的重进程,`gpu_memory_utilization` 可以临时调小或调大做排查;smoke 本身建议单独跑,不要和大压测并发。 | ||
| 131 | + | ||
| 132 | +## 当前最优方案:`qwen3_vllm_score` | ||
| 133 | + | ||
| 134 | +### 它为什么是当前最优 | ||
| 135 | + | ||
| 136 | +`qwen3_vllm_score.py` 的优势,来自这几个组合在一起: | ||
| 137 | + | ||
| 138 | +1. 使用 vLLM 的 **`LLM.score()`**(pooling / classify),对 (query, doc) **直接打分**,而非借 **generate** 在整词表上走最后一步分布再抠 **yes/no**——**省掉**那一类路径上的常规算力与模板绕路。 | ||
| 139 | +2. 使用独立的 `.venv-reranker-score`,把 `vllm==0.18.0` 固定下来,避免和其他后端互相污染。 | ||
| 140 | +3. **attention 后端由 vLLM 按 GPU 与版本自动选择**;`config.yaml` 里与 rerank 相关的调参集中在批量、长度、缓存、显存占比等项。 | ||
| 141 | +4. 在已验证的 T4 依赖栈上,运行时通常选用 **`FLASHINFER`**(见服务日志);与 FlashAttention 2 等路径的取舍由 vLLM 内部策略完成。 | ||
| 142 | +5. 服务层保留高杠杆优化: | ||
| 143 | + 全局去重、按 doc 长度排序、分批推理、前缀缓存、单进程锁保护。 | ||
| 144 | + | ||
| 145 | +### 关键实现点 | ||
| 146 | + | ||
| 147 | +`qwen3_vllm_score.py` 里值得关注的地方: | ||
| 148 | + | ||
| 149 | +- `runner` / `convert` 保持 **auto**:走 **pooling / classify** 与 **`LLM.score()`** 的推荐接法(vLLM 0.17+) | ||
| 150 | +- `hf_overrides`:把原始 Qwen3 reranker 权重按官方要求映射到 `Qwen3ForSequenceClassification` | ||
| 151 | +- `LLM(...)` 仅使用本后端所需的模型与并行等参数;**attention 后端由 vLLM 内部按运行环境选用** | ||
| 152 | +- `deduplicate_with_positions(...)`:先去重,再回填原始顺序 | ||
| 153 | +- `sort_by_doc_length`:减少 padding 浪费 | ||
| 154 | +- `infer_batch_size`:控制服务层分批 | ||
| 155 | +- `enable_prefix_caching`:对重复前缀场景有收益 | ||
| 156 | +- `self._infer_lock`:避免当前进程模型下并发调用破坏 vLLM engine 稳定性 | ||
| 157 | + | ||
| 158 | +### Attention 与算力路径(现状) | ||
| 159 | + | ||
| 160 | +- **vLLM** 根据 **GPU 算力架构**与**当前 wheel 中的实现**(如随发行版提供的 **flashinfer** 等)自动选用 attention 路径。 | ||
| 161 | +- 在 **Tesla T4(`sm_75`)** + **vLLM 0.18.x** 的已验证环境中,服务日志中可见选用 **`FLASHINFER`**。 | ||
| 162 | +- **最佳实践**:性能调优放在 **`max_model_len`**、**`infer_batch_size`**、**`gpu_memory_utilization`**、去重、长度排序、prefix cache 等**服务可见**参数上。与 **400 docs** 量级相关的稳态 HTTP 数字见 `perf_reports/reranker_vllm_instruction/2026-03-25/RESULTS.md`(主表方法论 + **Addendum** 中 `qwen3_vllm_score` 补充行)。 | ||
| 163 | + | ||
| 164 | +### 推荐配置 | ||
| 165 | + | ||
| 166 | +当前项目统一使用 `standard`,README 也按这个基线描述: | ||
| 60 | 167 | ||
| 61 | ```yaml | 168 | ```yaml |
| 62 | services: | 169 | services: |
| 63 | rerank: | 170 | rerank: |
| 64 | - backend: "qwen3_gguf" # 或 qwen3_vllm / bge | 171 | + backend: "qwen3_vllm_score" |
| 65 | backends: | 172 | backends: |
| 66 | - bge: | ||
| 67 | - model_name: "BAAI/bge-reranker-v2-m3" | ||
| 68 | - device: null | ||
| 69 | - use_fp16: true | ||
| 70 | - batch_size: 64 | ||
| 71 | - max_length: 512 | ||
| 72 | - cache_dir: "./model_cache" | ||
| 73 | - enable_warmup: true | ||
| 74 | - qwen3_vllm: | 173 | + qwen3_vllm_score: |
| 75 | model_name: "Qwen/Qwen3-Reranker-0.6B" | 174 | model_name: "Qwen/Qwen3-Reranker-0.6B" |
| 175 | + use_original_qwen3_hf_overrides: true | ||
| 176 | + engine: "vllm" | ||
| 76 | max_model_len: 256 | 177 | max_model_len: 256 |
| 77 | - infer_batch_size: 64 | ||
| 78 | - sort_by_doc_length: true | 178 | + tensor_parallel_size: 1 |
| 179 | + gpu_memory_utilization: 0.20 | ||
| 180 | + dtype: "float16" | ||
| 79 | enable_prefix_caching: true | 181 | enable_prefix_caching: true |
| 80 | enforce_eager: false | 182 | enforce_eager: false |
| 81 | - instruction: "Given a shopping query, rank product titles by relevance" | ||
| 82 | - qwen3_transformers: | ||
| 83 | - model_name: "Qwen/Qwen3-Reranker-0.6B" | ||
| 84 | - instruction: "Given a shopping query, rank product titles by relevance" | ||
| 85 | - max_length: 8192 | ||
| 86 | - batch_size: 64 | ||
| 87 | - use_fp16: true | ||
| 88 | - tensor_parallel_size: 1 | ||
| 89 | - gpu_memory_utilization: 0.8 | ||
| 90 | - instruction: "Given a shopping query, rank product titles by relevance" | ||
| 91 | - qwen3_transformers_packed: | ||
| 92 | - model_name: "Qwen/Qwen3-Reranker-0.6B" | ||
| 93 | - instruction: "Rank products by query with category & style match prioritized" | ||
| 94 | - max_model_len: 4096 | ||
| 95 | - max_doc_len: 160 | ||
| 96 | - max_docs_per_pack: 0 | ||
| 97 | - use_fp16: true | 183 | + infer_batch_size: 100 |
| 98 | sort_by_doc_length: true | 184 | sort_by_doc_length: true |
| 99 | - attn_implementation: "eager" | ||
| 100 | - qwen3_gguf: | ||
| 101 | - repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" | ||
| 102 | - filename: "*Q8_0.gguf" | ||
| 103 | - local_dir: "./models/reranker/qwen3-reranker-4b-gguf" | ||
| 104 | - cache_dir: "./model_cache" | 185 | + instruction_format: standard |
| 105 | instruction: "Rank products by query with category & style match prioritized" | 186 | instruction: "Rank products by query with category & style match prioritized" |
| 106 | - n_ctx: 384 | ||
| 107 | - n_batch: 384 | ||
| 108 | - n_ubatch: 128 | ||
| 109 | - n_gpu_layers: 24 | ||
| 110 | - flash_attn: true | ||
| 111 | - offload_kqv: true | ||
| 112 | - infer_batch_size: 8 | 187 | +``` |
| 188 | + | ||
| 189 | +### 优点 | ||
| 190 | + | ||
| 191 | +- 当前本地 GPU 方案里性能最好 | ||
| 192 | +- attention 由 vLLM 在引擎内统一决策;仓库侧配置只覆盖批量、长度、缓存、显存等,实现路径短 | ||
| 193 | +- **score / classify** 路径与 rerank 任务对齐;相对 **generate + 词表 logprob** 少一层常规开销 | ||
| 194 | +- 服务层优化(去重、排序分批、缓存)与后端解耦清晰,易维护 | ||
| 195 | + | ||
| 196 | +### 缺点 | ||
| 197 | + | ||
| 198 | +- 依赖更新的 vLLM 栈,升级时要重新验证 | ||
| 199 | +- 首次启动会经历 compile / JIT / graph capture,冷启动偏慢 | ||
| 200 | +- 对环境完整性更敏感,尤其是 CUDA、worker 进程和 `ninja` | ||
| 201 | + | ||
| 202 | +## 次优方案:`qwen3_vllm` | ||
| 203 | + | ||
| 204 | +### 它为什么仍然很有价值 | ||
| 205 | + | ||
| 206 | +`qwen3_vllm.py` 是当前最好的次优方案,不只是“备用”,而是一个很重要的稳定对照组。 | ||
| 207 | + | ||
| 208 | +它走的是: | ||
| 209 | + | ||
| 210 | +- causal LM | ||
| 211 | +- `generate(max_tokens=1)` | ||
| 212 | +- 只允许输出 `yes/no` | ||
| 213 | +- 用最后一步 logprobs 反推出相关性分数 | ||
| 214 | + | ||
| 215 | +这条路径的优点是工程上非常稳: | ||
| 216 | + | ||
| 217 | +- 行为更容易理解 | ||
| 218 | +- 更容易和 Hugging Face tokenizer 对齐 | ||
| 219 | +- 排查问题时更直观 | ||
| 220 | +- 在一些旧版本 vLLM 或其他 GPU 组合上,表现可能仍然很好 | ||
| 221 | + | ||
| 222 | +### 它为什么排在第二 | ||
| 223 | + | ||
| 224 | +它不是当前第一名,主要不是因为模型差,而是路径更“绕”: | ||
| 225 | + | ||
| 226 | +- 要先走 chat template | ||
| 227 | +- 要自己维护 `yes/no` token | ||
| 228 | +- 要做一次短 decode | ||
| 229 | +- 要从 logprobs 里手工算概率 | ||
| 230 | + | ||
| 231 | +也就是说,`qwen3_vllm` 的打分是“借 generate 模式实现 rerank”,而不是原生 score 路径。它依然有效,但从结构上不如 `qwen3_vllm_score` 直接。 | ||
| 232 | + | ||
| 233 | +### 关键实现点 | ||
| 234 | + | ||
| 235 | +- `AutoTokenizer.apply_chat_template(...)` | ||
| 236 | +- `SamplingParams(max_tokens=1, allowed_token_ids=[yes, no])` | ||
| 237 | +- `generate(...)` 后从最后一步 logprobs 计算 yes/no 概率 | ||
| 238 | +- 同样具备去重、按长度排序、分批推理、前缀缓存、单进程锁等优化 | ||
| 239 | + | ||
| 240 | +### 推荐配置 | ||
| 241 | + | ||
| 242 | +```yaml | ||
| 243 | +services: | ||
| 244 | + rerank: | ||
| 245 | + backends: | ||
| 246 | + qwen3_vllm: | ||
| 247 | + model_name: "Qwen/Qwen3-Reranker-0.6B" | ||
| 248 | + engine: "vllm" | ||
| 249 | + max_model_len: 256 | ||
| 250 | + tensor_parallel_size: 1 | ||
| 251 | + gpu_memory_utilization: 0.20 | ||
| 252 | + dtype: "float16" | ||
| 253 | + enable_prefix_caching: true | ||
| 254 | + enforce_eager: false | ||
| 255 | + infer_batch_size: 100 | ||
| 113 | sort_by_doc_length: true | 256 | sort_by_doc_length: true |
| 114 | - length_sort_mode: "char" | ||
| 115 | - qwen3_gguf_06b: | ||
| 116 | - repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" | ||
| 117 | - filename: "qwen3-reranker-0.6b-q8_0.gguf" | ||
| 118 | - local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf" | ||
| 119 | - cache_dir: "./model_cache" | 257 | + instruction_format: standard |
| 120 | instruction: "Rank products by query with category & style match prioritized" | 258 | instruction: "Rank products by query with category & style match prioritized" |
| 121 | - n_ctx: 256 | ||
| 122 | - n_batch: 256 | ||
| 123 | - n_ubatch: 256 | ||
| 124 | - n_gpu_layers: 999 | ||
| 125 | - infer_batch_size: 32 | ||
| 126 | - sort_by_doc_length: true | ||
| 127 | - length_sort_mode: "char" | ||
| 128 | - reuse_query_state: false | ||
| 129 | - dashscope_rerank: | ||
| 130 | - model_name: "qwen3-rerank" | ||
| 131 | - endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks" | ||
| 132 | - api_key_env: "RERANK_DASHSCOPE_API_KEY_CN" | ||
| 133 | - timeout_sec: 15.0 | ||
| 134 | - top_n_cap: 0 | ||
| 135 | - batchsize: 64 # 0关闭;>0并发小包调度(top_n/top_n_cap 仍生效,分包后全局截断) | ||
| 136 | - instruct: "Given a shopping query, rank product titles by relevance" | ||
| 137 | - max_retries: 2 | ||
| 138 | - retry_backoff_sec: 0.2 | ||
| 139 | ``` | 259 | ``` |
| 140 | 260 | ||
| 141 | -DashScope endpoint 地域示例: | ||
| 142 | -- 中国:`https://dashscope.aliyuncs.com/compatible-api/v1/reranks` | ||
| 143 | -- 新加坡:`https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks` | ||
| 144 | -- 美国:`https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks` | 261 | +### 优点 |
| 145 | 262 | ||
| 146 | -DashScope 认证: | ||
| 147 | -- `api_key_env` 必填,表示该后端读取哪个环境变量作为 API Key | ||
| 148 | -- 推荐按地域分别注入: | ||
| 149 | - - `RERANK_DASHSCOPE_API_KEY_CN=...` | ||
| 150 | - - `RERANK_DASHSCOPE_API_KEY_US=...` | 263 | +- 路径成熟,易理解,易排障 |
| 264 | +- 作为 fallback 很合适 | ||
| 265 | +- 和 `qwen3_vllm_score` 共用很多服务层优化经验 | ||
| 151 | 266 | ||
| 152 | -- 服务端口、请求限制等仍在 `reranker/config.py`(或环境变量 `RERANKER_PORT`、`RERANKER_HOST`)。 | 267 | +### 缺点 |
| 153 | 268 | ||
| 154 | -## 运行 | ||
| 155 | -```bash | ||
| 156 | -./scripts/start_reranker.sh | ||
| 157 | -``` | ||
| 158 | -该脚本会按当前 `services.rerank.backend` 自动选择对应的独立 venv;首次请先执行 `./scripts/setup_reranker_venv.sh <backend>`。 | 269 | +- 不是原生 reranker score 路径 |
| 270 | +- 比 `qwen3_vllm_score` 多一层 tokenizer / generate / logprob 推导成本 | ||
| 271 | +- 当前环境下性能略逊 | ||
| 159 | 272 | ||
| 160 | -## 性能压测(1000 docs) | ||
| 161 | -```bash | ||
| 162 | -./scripts/benchmark_reranker_1000docs.sh | ||
| 163 | -``` | ||
| 164 | -输出目录:`perf_reports/<date>/reranker_1000docs/`。 | 273 | +## 这轮优化里真正有价值的方法 |
| 165 | 274 | ||
| 166 | -## API | ||
| 167 | -### Health | ||
| 168 | -``` | ||
| 169 | -GET /health | ||
| 170 | -``` | ||
| 171 | -Response 含 `backend`(当前后端名)、`model`、`model_loaded`、`status`。 | 275 | +下面这些是跨后端都值得保留的经验,优先级高于 prompt 微调。 |
| 172 | 276 | ||
| 173 | -### Rerank | ||
| 174 | -``` | ||
| 175 | -POST /rerank | ||
| 176 | -Content-Type: application/json | ||
| 177 | - | ||
| 178 | -{ | ||
| 179 | - "query": "wireless mouse", | ||
| 180 | - "docs": ["logitech mx master", "usb cable", "wireless mouse bluetooth"], | ||
| 181 | - "top_n": 10 | ||
| 182 | -} | ||
| 183 | -``` | 277 | +### 1. 全局去重 |
| 184 | 278 | ||
| 185 | -`top_n` 为可选字段: | ||
| 186 | -- 对本地后端(`qwen3_vllm` / `qwen3_transformers` / `qwen3_transformers_packed` / `qwen3_gguf` / `qwen3_gguf_06b` / `bge`)通常会忽略,仍返回全量分数。 | ||
| 187 | -- 对 `dashscope_rerank` 可用于控制云端返回的候选量,建议设置为 `page+size`(例如分页 `from=20,size=10` 时传 `30`)。 | 279 | +对 doc 先做全局去重,再按原始索引回填,是收益最高、风险最低的优化之一。 |
| 188 | 280 | ||
| 189 | -Response: | ||
| 190 | -``` | ||
| 191 | -{ | ||
| 192 | - "scores": [0.93, 0.02, 0.88], | ||
| 193 | - "meta": { | ||
| 194 | - "input_docs": 3, | ||
| 195 | - "usable_docs": 3, | ||
| 196 | - "unique_docs": 3, | ||
| 197 | - "dedup_ratio": 0.0, | ||
| 198 | - "elapsed_ms": 12.4, | ||
| 199 | - "model": "BAAI/bge-reranker-v2-m3", | ||
| 200 | - "device": "cuda", | ||
| 201 | - "fp16": true, | ||
| 202 | - "batch_size": 64, | ||
| 203 | - "max_length": 512, | ||
| 204 | - "normalize": true, | ||
| 205 | - "service_elapsed_ms": 13.1 | ||
| 206 | - } | ||
| 207 | -} | ||
| 208 | -``` | 281 | +适用原因: |
| 209 | 282 | ||
| 210 | -## Logging | ||
| 211 | -The service uses standard Python logging. For structured logs and full output, | ||
| 212 | -run uvicorn with: | ||
| 213 | -```bash | ||
| 214 | -uvicorn reranker.server:app --host 0.0.0.0 --port 6007 --log-level info | ||
| 215 | -``` | 283 | +- 商品标题、变体标题、重复 SKU 文案很常见 |
| 284 | +- 去重不会改变 API 契约 | ||
| 285 | +- 能直接减少模型真实推理次数 | ||
| 286 | + | ||
| 287 | +### 2. 按 doc 长度排序再分批 | ||
| 288 | + | ||
| 289 | +`sort_by_doc_length: true` 建议保持开启。 | ||
| 290 | + | ||
| 291 | +原因: | ||
| 292 | + | ||
| 293 | +- 同一批里长度更接近,padding 更少 | ||
| 294 | +- 对 `infer_batch_size` 较大时收益更明显 | ||
| 295 | +- 实现成本低,行为稳定 | ||
| 296 | + | ||
| 297 | +当前实现里长度估计采用字符长度近似,这已经足够实用。没有必要为了这一层再引入额外 tokenizer 计算开销。 | ||
| 298 | + | ||
| 299 | +### 3. `infer_batch_size` 作为核心调参项 | ||
| 300 | + | ||
| 301 | +对当前业务形态,`infer_batch_size` 是最值得扫的参数。 | ||
| 302 | + | ||
| 303 | +建议: | ||
| 304 | + | ||
| 305 | +- 先固定其他参数,再扫 `64 / 80 / 96 / 100 / 128` | ||
| 306 | +- 看的是单请求延迟和稳定性,不只是吞吐 | ||
| 307 | +- 不要只拿一次结果下结论,至少 warm-up 后 repeat 5 次 | ||
| 308 | + | ||
| 309 | +### 4. `max_model_len` 不要盲目开大 | ||
| 310 | + | ||
| 311 | +当前场景是短 query + 商品标题/短描述,不需要把 `max_model_len` 拉得很高。 | ||
| 312 | + | ||
| 313 | +经验: | ||
| 314 | + | ||
| 315 | +- `160` 适合做对比实验 | ||
| 316 | +- `256` 更像当前线上保守值 | ||
| 317 | +- 再往上加,对当前场景通常是成本大于收益 | ||
| 318 | + | ||
| 319 | +### 5. `enable_prefix_caching` | ||
| 320 | + | ||
| 321 | +建议开启。 | ||
| 322 | + | ||
| 323 | +原因: | ||
| 324 | + | ||
| 325 | +- 一个请求里通常是同一个 query 对很多 doc | ||
| 326 | +- 前缀共享明显 | ||
| 327 | +- vLLM 在这类场景里能吃到 prefix cache 的收益 | ||
| 328 | + | ||
| 329 | +### 6. `enforce_eager` | ||
| 330 | + | ||
| 331 | +建议: | ||
| 332 | + | ||
| 333 | +- 线上常规运行:`false` | ||
| 334 | +- smoke / 排障 / 显存紧张时:可临时 `true` | ||
| 335 | + | ||
| 336 | +因为: | ||
| 337 | + | ||
| 338 | +- `false` 时可使用 compile / graph capture,稳态性能更好 | ||
| 339 | +- `true` 时启动更直接,问题更容易定位 | ||
| 340 | + | ||
| 341 | +### 7. 独立 venv + 独立运行时缓存 | ||
| 342 | + | ||
| 343 | +这不是“环境洁癖”,而是性能优化的一部分。 | ||
| 344 | + | ||
| 345 | +收益: | ||
| 346 | + | ||
| 347 | +- 避免不同 vLLM 版本互相污染 | ||
| 348 | +- compile / triton / flashinfer 缓存可复用 | ||
| 349 | +- 便于精确复现实验结果 | ||
| 350 | + | ||
| 351 | +## 性能数据应该怎么看 | ||
| 352 | + | ||
| 353 | +`perf_reports/reranker_vllm_instruction/2026-03-25/RESULTS.md` 建议按三部分读: | ||
| 354 | + | ||
| 355 | +- **方法论**:脚本、预热、`--seed`、HTTP 客户端延迟与 `/health` 核对等(可复用于后续复跑)。 | ||
| 356 | +- **主表**:`qwen3_vllm` / `qwen3_vllm_score` × `instruction_format` 的矩阵基线(固定 `max_model_len` 等条件见该文)。 | ||
| 357 | +- **Addendum**:同一方法下对 **`qwen3_vllm_score` 当前实现**的补充测数(含 compact/standard),便于与主表对照**同一指标口径**。 | ||
| 358 | + | ||
| 359 | +对外结论应基于:**当前代码 revision**、**文档中注明的 `max_model_len` / GPU 占用**、尽量**避免与大压测或其他 GPU 重进程并发**时的样本。 | ||
| 360 | + | ||
| 361 | +## benchmark 建议流程 | ||
| 362 | + | ||
| 363 | +推荐流程: | ||
| 364 | + | ||
| 365 | +1. 确认目标 backend 已切换到正确配置 | ||
| 366 | +2. `./scripts/start_reranker.sh` | ||
| 367 | +3. `curl http://127.0.0.1:6007/health` | ||
| 368 | +4. 跑 benchmark 脚本 | ||
| 369 | +5. 保存 JSON 和 Markdown 结果 | ||
| 370 | +6. 记录当时的 GPU 占用情况和 `nvidia-smi` | ||
| 371 | + | ||
| 372 | +重点观察: | ||
| 373 | + | ||
| 374 | +- 单请求延迟 | ||
| 375 | +- 稳态均值 | ||
| 376 | +- 波动大小 | ||
| 377 | +- 冷启动与热启动差异 | ||
| 378 | +- 是否有显存竞争导致的异常样本 | ||
| 379 | + | ||
| 380 | +## 常见问题 | ||
| 381 | + | ||
| 382 | +### 1. 为什么第一次启动很慢 | ||
| 383 | + | ||
| 384 | +因为第一次会叠加: | ||
| 385 | + | ||
| 386 | +- 模型加载 | ||
| 387 | +- torch.compile | ||
| 388 | +- CUDA graph capture | ||
| 389 | +- flashinfer / triton JIT | ||
| 390 | + | ||
| 391 | +这不是异常。看性能时要区分冷启动和稳态。 | ||
| 392 | + | ||
| 393 | +### 2. 为什么 smoke 有时会 OOM | ||
| 394 | + | ||
| 395 | +常见原因不是参数本身,而是: | ||
| 396 | + | ||
| 397 | +- GPU 上同时还有 embedding / translator / 其他 vLLM 进程 | ||
| 398 | +- smoke 和 benchmark 并发跑 | ||
| 399 | +- `gpu_memory_utilization` 设得不适合当前剩余显存 | ||
| 400 | + | ||
| 401 | +处理方式: | ||
| 402 | + | ||
| 403 | +- 先单独跑 smoke | ||
| 404 | +- 看 `nvidia-smi` | ||
| 405 | +- 适当调整 `gpu_memory_utilization` | ||
| 406 | + | ||
| 407 | +### 3. `qwen3_vllm_score` 的 attention 要在哪里调 | ||
| 408 | + | ||
| 409 | +**由 vLLM 在运行时按 GPU 与版本自动选择**;与延迟和稳定性更直接相关、且建议在仓库里动的,是 **`max_model_len`**、**`infer_batch_size`**、**`gpu_memory_utilization`**、去重、排序分批、prefix cache 等。 | ||
| 410 | + | ||
| 411 | +## 代码阅读建议 | ||
| 412 | + | ||
| 413 | +如果要快速理解当前主线实现,建议按这个顺序读: | ||
| 414 | + | ||
| 415 | +1. `reranker/backends/qwen3_vllm_score.py` | ||
| 416 | +2. `reranker/backends/qwen3_vllm.py` | ||
| 417 | +3. `scripts/start_reranker.sh` | ||
| 418 | +4. `scripts/setup_reranker_venv.sh` | ||
| 419 | +5. `config/config.yaml` 里的 `services.rerank.backends.*` | ||
| 420 | + | ||
| 421 | +阅读重点: | ||
| 422 | + | ||
| 423 | +- 后端如何构造 prompt(`instruction_format` compact / standard) | ||
| 424 | +- 后端调用 **`score()`** 还是 **`generate()`**,以及是否经过**整词表**上的最后一步分布 | ||
| 425 | +- `qwen3_vllm_score` 里 **`LLM(...)` 传了哪些字段**(模型、并行、dtype、缓存等),以及 attention 如何由 vLLM 内部承接 | ||
| 426 | +- 服务层去重 / 排序 / 分批 / 回填怎么做 | ||
| 427 | + | ||
| 428 | +## 最终建议 | ||
| 429 | + | ||
| 430 | +如果你的目标是“当前仓库在 T4 上的在线 reranker 最优落地”,建议直接遵循下面这条线: | ||
| 216 | 431 | ||
| 217 | -## Notes | ||
| 218 | -- 无请求级缓存;输入按字符串去重后推理,再按原始顺序回填分数。 | ||
| 219 | -- 空或 null 的 doc 跳过并计为 0。 | ||
| 220 | -- **Qwen3-vLLM 分批策略**:`docs` 请求体可为 1000+,服务端会按 `infer_batch_size` 拆分;当 `sort_by_doc_length=true` 时,会先按文档长度排序后分批,减少 padding 开销,最终再按输入顺序回填分数。 | ||
| 221 | -- 运行时可用环境变量临时覆盖批量参数:`RERANK_VLLM_INFER_BATCH_SIZE`、`RERANK_VLLM_SORT_BY_DOC_LENGTH`。 | ||
| 222 | -- **Qwen3-vLLM**:参考 [Qwen3-Reranker-0.6B](https://huggingface.co/Qwen/Qwen3-Reranker-0.6B),需 GPU 与较多显存;与 BGE 相比适合长文本、高吞吐场景(vLLM 前缀缓存)。 | ||
| 223 | -- **Qwen3-Transformers**:官方 Transformers Usage 方式,无需 vLLM;适合 CPU 或小显存。默认 `attn_implementation: "sdpa"`;若已安装 `flash_attn` 可设 `flash_attention_2`(未安装时服务会自动回退到 sdpa)。 | ||
| 224 | -- **Qwen3-Transformers-Packed**:仍使用 Hugging Face Transformers 与 PyTorch CUDA 内核,只定制 packed 输入、`position_ids` 和 4D `attention_mask`。它更适合在线检索里的“一个 query 对几百个短 doc”场景;默认 `attn_implementation: "eager"` 以保证自定义 mask 兼容性,若你的 `torch/transformers` 版本已验证支持,可再压测 `"sdpa"`。 | ||
| 225 | -- **Qwen3-GGUF**:参考 [DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF](https://huggingface.co/DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF)。单卡 T4 且仅剩约 `4.8~6GB` 显存时,推荐 `Q8_0 + n_ctx=384 + n_gpu_layers=24 + flash_attn=true + offload_kqv=true` 起步;若启动 OOM,优先把 `n_gpu_layers` 下调到 `20`,再把 `n_ctx` 下调到 `320`。`infer_batch_size` 在 GGUF 后端是服务侧 work chunk,大多不如 `n_gpu_layers` / `n_ctx` 关键。 | ||
| 226 | -- **Qwen3-GGUF-0.6B**:参考 [ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF](https://huggingface.co/ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF)。它的优点是权重小、显存占用低,单进程实测约 `0.9~1.1 GiB`;但在当前 llama.cpp 串行打分接法下,`1 query + 400 titles` 的实测延迟仍约 `265s`。因此它更适合低显存功能后备,不适合作为在线低延迟主 reranker。 | 432 | +- 主后端:`qwen3_vllm_score` |
| 433 | +- 模型:`Qwen/Qwen3-Reranker-0.6B` | ||
| 434 | +- 配置:`instruction_format` 以 `standard` 为项目统一基线;细调优先放在批量与长度相关项 | ||
| 435 | +- attention:由 vLLM 运行时自动选择;调参见 `max_model_len` / `infer_batch_size` / `gpu_memory_utilization` 等 | ||
| 436 | +- 关键参数:`max_model_len`、`infer_batch_size`、`gpu_memory_utilization` | ||
| 437 | +- 服务层优化:去重、长度排序、分批、prefix cache | ||
| 438 | +- 工程约束:独立 venv、正确 `PATH`、缓存目录隔离、单独 smoke、完整 benchmark 归档 |
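The README above contrasts the `LLM.score()` pooling path (`qwen3_vllm_score`) with the `generate(max_tokens=1)` + yes/no logprob path (`qwen3_vllm`). A minimal sketch of the two call shapes, assuming a recent vLLM Python API; prompt templates, helper names, and output handling in the actual backends differ:

```python
# Illustration of the score() vs generate()+logprobs idea, not the backend code.
import math

from vllm import LLM, SamplingParams


def score_path(llm: LLM, query: str, docs: list[str]) -> list[float]:
    # qwen3_vllm_score: pooling/classify runner, one relevance score per (query, doc).
    outputs = llm.score(query, docs)
    return [o.outputs.score for o in outputs]


def generate_path(llm: LLM, prompts: list[str], yes_id: int, no_id: int) -> list[float]:
    # qwen3_vllm: causal LM, 1-token decode restricted to yes/no, score from logprobs.
    # `prompts` are assumed to be already chat-templated; yes_id / no_id come from
    # the tokenizer ("yes" / "no" token ids).
    params = SamplingParams(
        max_tokens=1,
        logprobs=2,
        allowed_token_ids=[yes_id, no_id],
        temperature=0.0,
    )
    scores = []
    for out in llm.generate(prompts, params):
        lp = out.outputs[0].logprobs[0]  # dict: token_id -> Logprob
        yes = math.exp(lp[yes_id].logprob) if yes_id in lp else 0.0
        no = math.exp(lp[no_id].logprob) if no_id in lp else 0.0
        scores.append(yes / (yes + no) if (yes + no) > 0 else 0.0)
    return scores
```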
reranker/backends/qwen3_vllm_score.py
| @@ -15,7 +15,6 @@ Reference: https://docs.vllm.ai/ — Qwen3 reranker example | @@ -15,7 +15,6 @@ Reference: https://docs.vllm.ai/ — Qwen3 reranker example | ||
| 15 | from __future__ import annotations | 15 | from __future__ import annotations |
| 16 | 16 | ||
| 17 | import logging | 17 | import logging |
| 18 | -import os | ||
| 19 | import threading | 18 | import threading |
| 20 | import time | 19 | import time |
| 21 | from typing import Any, Dict, List, Tuple | 20 | from typing import Any, Dict, List, Tuple |
| @@ -41,89 +40,6 @@ _DEFAULT_DOCUMENT_TEMPLATE = "<Document>: {doc}{suffix}" | @@ -41,89 +40,6 @@ _DEFAULT_DOCUMENT_TEMPLATE = "<Document>: {doc}{suffix}" | ||
| 41 | _IM_USER_START = "<|im_end|>\n<|im_start|>user\n" | 40 | _IM_USER_START = "<|im_end|>\n<|im_start|>user\n" |
| 42 | 41 | ||
| 43 | 42 | ||
| 44 | -def _parse_env_bool(raw: str | None) -> bool | None: | ||
| 45 | - if raw is None: | ||
| 46 | - return None | ||
| 47 | - s = str(raw).strip().lower() | ||
| 48 | - if not s: | ||
| 49 | - return None | ||
| 50 | - if s in {"1", "true", "yes", "y", "on"}: | ||
| 51 | - return True | ||
| 52 | - if s in {"0", "false", "no", "n", "off"}: | ||
| 53 | - return False | ||
| 54 | - return None | ||
| 55 | - | ||
| 56 | - | ||
| 57 | -def _auto_triton_on_sm_lt_8_enabled(config: Dict[str, Any]) -> bool: | ||
| 58 | - """ | ||
| 59 | - When True (default), sm < 8 injects TRITON_ATTN to avoid FA2-only paths that error on T4/V100. | ||
| 60 | - | ||
| 61 | - When False, vLLM may choose FLASHINFER on Turing; first ``score()`` can JIT-compile and needs | ||
| 62 | - ``ninja`` on PATH (``requirements_reranker_qwen3_vllm_score.txt``). Use | ||
| 63 | - ``./scripts/start_reranker.sh`` (prepends the backend venv's ``bin`` to ``PATH``) or | ||
| 64 | - ``source .../bin/activate``. | ||
| 65 | - """ | ||
| 66 | - env = _parse_env_bool(os.getenv("RERANK_VLLM_AUTO_TRITON_ATTN")) | ||
| 67 | - if env is not None: | ||
| 68 | - return env | ||
| 69 | - raw = config.get("auto_triton_attn_on_sm_lt_8") | ||
| 70 | - if raw is None: | ||
| 71 | - return True | ||
| 72 | - if isinstance(raw, bool): | ||
| 73 | - return raw | ||
| 74 | - parsed = _parse_env_bool(str(raw)) | ||
| 75 | - return True if parsed is None else parsed | ||
| 76 | - | ||
| 77 | - | ||
| 78 | -def _resolve_vllm_attention_config(config: Dict[str, Any]) -> Dict[str, Any] | None: | ||
| 79 | - """ | ||
| 80 | - Optional explicit backend via vllm_attention_backend / RERANK_VLLM_ATTENTION_BACKEND. | ||
| 81 | - | ||
| 82 | - On compute capability < 8, vLLM may default to Flash-Attention 2, which is not supported on | ||
| 83 | - Turing/Volta; this module historically injected TRITON_ATTN. That can be slower than vLLM's | ||
| 84 | - other fallbacks — disable with auto_triton_attn_on_sm_lt_8: false or | ||
| 85 | - RERANK_VLLM_AUTO_TRITON_ATTN=0 if your stack runs without errors. | ||
| 86 | - """ | ||
| 87 | - env = (os.getenv("RERANK_VLLM_ATTENTION_BACKEND") or "").strip() | ||
| 88 | - raw = config.get("vllm_attention_backend") | ||
| 89 | - if env: | ||
| 90 | - choice = env | ||
| 91 | - elif raw is not None and str(raw).strip() and str(raw).strip().lower() != "auto": | ||
| 92 | - choice = str(raw).strip() | ||
| 93 | - else: | ||
| 94 | - choice = "" | ||
| 95 | - if choice: | ||
| 96 | - backend = choice.strip().upper() | ||
| 97 | - if backend == "AUTO": | ||
| 98 | - choice = "" | ||
| 99 | - else: | ||
| 100 | - logger.info("[Qwen3_VLLM_SCORE] attention_config.backend=%s (from config/env)", backend) | ||
| 101 | - return {"backend": backend} | ||
| 102 | - | ||
| 103 | - major, minor = torch.cuda.get_device_capability() | ||
| 104 | - if major < 8 and _auto_triton_on_sm_lt_8_enabled(config): | ||
| 105 | - logger.info( | ||
| 106 | - "[Qwen3_VLLM_SCORE] GPU compute capability %d.%d < 8.0; using attention backend " | ||
| 107 | - "TRITON_ATTN (Flash-Attention 2 requires sm >= 80). " | ||
| 108 | - "To use vLLM default instead: auto_triton_attn_on_sm_lt_8: false or " | ||
| 109 | - "RERANK_VLLM_AUTO_TRITON_ATTN=0; or set vllm_attention_backend / " | ||
| 110 | - "RERANK_VLLM_ATTENTION_BACKEND.", | ||
| 111 | - major, | ||
| 112 | - minor, | ||
| 113 | - ) | ||
| 114 | - return {"backend": "TRITON_ATTN"} | ||
| 115 | - if major < 8 and not _auto_triton_on_sm_lt_8_enabled(config): | ||
| 116 | - logger.info( | ||
| 117 | - "[Qwen3_VLLM_SCORE] GPU compute capability %d.%d < 8.0; auto TRITON_ATTN disabled — " | ||
| 118 | - "leaving attention backend to vLLM (no attention_config). " | ||
| 119 | - "If the first score() fails on 'ninja', install ninja in the score venv, ensure " | ||
| 120 | - "PATH includes that venv's bin (see start_reranker.sh), or use system ninja-build.", | ||
| 121 | - major, | ||
| 122 | - minor, | ||
| 123 | - ) | ||
| 124 | - return None | ||
| 125 | - | ||
| 126 | - | ||
| 127 | class Qwen3VLLMScoreRerankerBackend: | 43 | class Qwen3VLLMScoreRerankerBackend: |
| 128 | """ | 44 | """ |
| 129 | Qwen3 reranker using vLLM ``LLM.score()`` (pooling runner) for cross-encoder scores. | 45 | Qwen3 reranker using vLLM ``LLM.score()`` (pooling runner) for cross-encoder scores. |
| @@ -149,7 +65,7 @@ class Qwen3VLLMScoreRerankerBackend: | @@ -149,7 +65,7 @@ class Qwen3VLLMScoreRerankerBackend: | ||
| 149 | self._config.get("instruction") | 65 | self._config.get("instruction") |
| 150 | or "Given a query, score the product for relevance" | 66 | or "Given a query, score the product for relevance" |
| 151 | ) | 67 | ) |
| 152 | - _fmt = str(self._config.get("instruction_format") or "standard").strip().lower() | 68 | + _fmt = str(self._config.get("instruction_format") or "compact").strip().lower() |
| 153 | if _fmt not in {"standard", "compact"}: | 69 | if _fmt not in {"standard", "compact"}: |
| 154 | raise ValueError( | 70 | raise ValueError( |
| 155 | f"instruction_format must be 'standard' or 'compact', got {_fmt!r}" | 71 | f"instruction_format must be 'standard' or 'compact', got {_fmt!r}" |
| @@ -162,21 +78,11 @@ class Qwen3VLLMScoreRerankerBackend: | @@ -162,21 +78,11 @@ class Qwen3VLLMScoreRerankerBackend: | ||
| 162 | self._config.get("document_template") or _DEFAULT_DOCUMENT_TEMPLATE | 78 | self._config.get("document_template") or _DEFAULT_DOCUMENT_TEMPLATE |
| 163 | ) | 79 | ) |
| 164 | 80 | ||
| 165 | - infer_batch_size = os.getenv("RERANK_VLLM_INFER_BATCH_SIZE") or self._config.get( | ||
| 166 | - "infer_batch_size", 64 | ||
| 167 | - ) | ||
| 168 | - sort_by_doc_length = os.getenv("RERANK_VLLM_SORT_BY_DOC_LENGTH") | ||
| 169 | - if sort_by_doc_length is None: | ||
| 170 | - sort_by_doc_length = self._config.get("sort_by_doc_length", True) | 81 | + infer_batch_size = self._config.get("infer_batch_size", 64) |
| 82 | + sort_by_doc_length = self._config.get("sort_by_doc_length", True) | ||
| 171 | 83 | ||
| 172 | self._infer_batch_size = int(infer_batch_size) | 84 | self._infer_batch_size = int(infer_batch_size) |
| 173 | - self._sort_by_doc_length = str(sort_by_doc_length).strip().lower() in { | ||
| 174 | - "1", | ||
| 175 | - "true", | ||
| 176 | - "yes", | ||
| 177 | - "y", | ||
| 178 | - "on", | ||
| 179 | - } | 85 | + self._sort_by_doc_length = bool(sort_by_doc_length) |
| 180 | 86 | ||
| 181 | if not torch.cuda.is_available(): | 87 | if not torch.cuda.is_available(): |
| 182 | raise RuntimeError( | 88 | raise RuntimeError( |
| @@ -199,7 +105,7 @@ class Qwen3VLLMScoreRerankerBackend: | @@ -199,7 +105,7 @@ class Qwen3VLLMScoreRerankerBackend: | ||
| 199 | logger.info( | 105 | logger.info( |
| 200 | "[Qwen3_VLLM_SCORE] Loading model %s (LLM.score API, runner=%s, convert=%s, " | 106 | "[Qwen3_VLLM_SCORE] Loading model %s (LLM.score API, runner=%s, convert=%s, " |
| 201 | "hf_overrides=%s, max_model_len=%s, tp=%s, gpu_mem=%.2f, dtype=%s, prefix_caching=%s, " | 107 | "hf_overrides=%s, max_model_len=%s, tp=%s, gpu_mem=%.2f, dtype=%s, prefix_caching=%s, " |
| 202 | - "instruction_format=%s)", | 108 | + "instruction_format=%s, cuda_capability=%d.%d, attention_backend=vllm_auto)", |
| 203 | model_name, | 109 | model_name, |
| 204 | runner, | 110 | runner, |
| 205 | convert, | 111 | convert, |
| @@ -210,6 +116,7 @@ class Qwen3VLLMScoreRerankerBackend: | @@ -210,6 +116,7 @@ class Qwen3VLLMScoreRerankerBackend: | ||
| 210 | dtype, | 116 | dtype, |
| 211 | enable_prefix_caching, | 117 | enable_prefix_caching, |
| 212 | self._instruction_format, | 118 | self._instruction_format, |
| 119 | + *torch.cuda.get_device_capability(), | ||
| 213 | ) | 120 | ) |
| 214 | 121 | ||
| 215 | # vLLM 0.17+ uses runner/convert instead of LLM(..., task="score"). With the official | 122 | # vLLM 0.17+ uses runner/convert instead of LLM(..., task="score"). With the official |
| @@ -236,10 +143,6 @@ class Qwen3VLLMScoreRerankerBackend: | @@ -236,10 +143,6 @@ class Qwen3VLLMScoreRerankerBackend: | ||
| 236 | if hf_overrides: | 143 | if hf_overrides: |
| 237 | llm_kwargs["hf_overrides"] = hf_overrides | 144 | llm_kwargs["hf_overrides"] = hf_overrides |
| 238 | 145 | ||
| 239 | - attn_cfg = _resolve_vllm_attention_config(self._config) | ||
| 240 | - if attn_cfg is not None: | ||
| 241 | - llm_kwargs["attention_config"] = attn_cfg | ||
| 242 | - | ||
| 243 | self._llm = LLM(**llm_kwargs) | 146 | self._llm = LLM(**llm_kwargs) |
| 244 | # vLLM score path: single-process safety (mirrors generate backend until verified). | 147 | # vLLM score path: single-process safety (mirrors generate backend until verified). |
| 245 | self._infer_lock = threading.Lock() | 148 | self._infer_lock = threading.Lock() |
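The new log line above records the CUDA compute capability next to `attention_backend=vllm_auto`. A quick way to check that value on the target GPU (a Tesla T4 reports `(7, 5)`, i.e. sm_75, below the sm_80 floor for FlashAttention 2, which is why vLLM's auto-selection falls back to another backend such as FLASHINFER on this stack):

```python
# Prints the value the new log line records; (7, 5) on a Tesla T4.
import torch

print(torch.cuda.get_device_capability())
```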
reranker/性能优化版本的qwen3_vllm_score 为什么反而更慢.md deleted
| @@ -1,141 +0,0 @@ | @@ -1,141 +0,0 @@ | ||
| 1 | - | ||
| 2 | -结论先说:**YAML 里能对齐的项(`model_name`、`max_model_len`、`infer_batch_size`、`prefix_caching` 等)你们已经基本对齐了**;`qwen3_vllm_score` 更慢,主要来自**两条后端走的不是同一条 vLLM 推理路径**,以及 **score 后端在 T4 上强制了 attention 后端**,和 **generate 路径更容易吃到「同 query、多 doc」的优化**。 | ||
| 3 | - | ||
| 4 | ---- | ||
| 5 | - | ||
| 6 | -## 1. 配置层面:哪些「对等」、哪些根本不存在于另一侧 | ||
| 7 | - | ||
| 8 | -两边共用的逻辑在代码里是一致的:`infer_batch_size`、`sort_by_doc_length`、去重、`instruction` / `instruction_format` 的语义(在各自实现里)是对齐设计的。 | ||
| 9 | - | ||
| 10 | -差异在于 **`qwen3_vllm_score` 必须多出来的 LLM 构造参数**:`runner` / `convert` / `hf_overrides`(把 Hub 模型改成 `Qwen3ForSequenceClassification` 那条链路)。`qwen3_vllm` 没有这些,因为它是**普通 causal LM + `generate`**。这不是 `config.yaml` 漏配,而是两种 API 的必要差别。 | ||
| 11 | - | ||
| 12 | -```132:140:reranker/backends/qwen3_vllm.py | ||
| 13 | - self._llm = LLM( | ||
| 14 | - model=model_name, | ||
| 15 | - tensor_parallel_size=tensor_parallel_size, | ||
| 16 | - max_model_len=max_model_len, | ||
| 17 | - gpu_memory_utilization=gpu_memory_utilization, | ||
| 18 | - enable_prefix_caching=enable_prefix_caching, | ||
| 19 | - enforce_eager=enforce_eager, | ||
| 20 | - dtype=dtype, | ||
| 21 | - ) | ||
| 22 | -``` | ||
| 23 | - | ||
| 24 | -```167:195:reranker/backends/qwen3_vllm_score.py | ||
| 25 | - llm_kwargs: Dict[str, Any] = { | ||
| 26 | - "model": model_name, | ||
| 27 | - "runner": runner, | ||
| 28 | - "convert": convert, | ||
| 29 | - "tensor_parallel_size": tensor_parallel_size, | ||
| 30 | - "max_model_len": max_model_len, | ||
| 31 | - "gpu_memory_utilization": gpu_memory_utilization, | ||
| 32 | - "enable_prefix_caching": enable_prefix_caching, | ||
| 33 | - "enforce_eager": enforce_eager, | ||
| 34 | - "dtype": dtype, | ||
| 35 | - } | ||
| 36 | - hf_overrides: Dict[str, Any] = dict(self._config.get("hf_overrides") or {}) | ||
| 37 | - if use_hf_overrides: | ||
| 38 | - hf_overrides = { | ||
| 39 | - **hf_overrides, | ||
| 40 | - "architectures": ["Qwen3ForSequenceClassification"], | ||
| 41 | - "classifier_from_token": ["no", "yes"], | ||
| 42 | - "is_original_qwen3_reranker": True, | ||
| 43 | - } | ||
| 44 | - if hf_overrides: | ||
| 45 | - llm_kwargs["hf_overrides"] = hf_overrides | ||
| 46 | - | ||
| 47 | - attn_cfg = _resolve_vllm_attention_config(self._config) | ||
| 48 | - if attn_cfg is not None: | ||
| 49 | - llm_kwargs["attention_config"] = attn_cfg | ||
| 50 | - | ||
| 51 | - self._llm = LLM(**llm_kwargs) | ||
| 52 | -``` | ||
| 53 | - | ||
| 54 | -**小坑(仅当有人删掉 YAML 字段时):** | ||
| 55 | -`instruction_format` 的**代码默认值不一致**——`qwen3_vllm` 默认 `compact`,`qwen3_vllm_score` 默认 `standard`。你贴的片段里两边都写了 `standard`,所以当前是对齐的。 | ||
| 56 | - | ||
| 57 | -```93:98:reranker/backends/qwen3_vllm.py | ||
| 58 | - _fmt = str(self._config.get("instruction_format") or "compact").strip().lower() | ||
| 59 | -``` | ||
| 60 | - | ||
| 61 | -```104:109:reranker/backends/qwen3_vllm_score.py | ||
| 62 | - _fmt = str(self._config.get("instruction_format") or "standard").strip().lower() | ||
| 63 | -``` | ||
| 64 | - | ||
| 65 | ---- | ||
| 66 | - | ||
| 67 | -## 2. 为什么「按理 score 更快」在你们机器上反过来 | ||
| 68 | - | ||
| 69 | -你们自己的报告里写的是 **Tesla T4**(算力 **sm_75 < 8.0**)。这一点和代码里的行为直接相关。 | ||
| 70 | - | ||
| 71 | -### (1)只有 score 后端在 sm<8 时**强制** `TRITON_ATTN` | ||
| 72 | - | ||
| 73 | -```65:75:reranker/backends/qwen3_vllm_score.py | ||
| 74 | - major, minor = torch.cuda.get_device_capability() | ||
| 75 | - if major < 8: | ||
| 76 | - logger.info( | ||
| 77 | - "[Qwen3_VLLM_SCORE] GPU compute capability %d.%d < 8.0; using attention backend " | ||
| 78 | - "TRITON_ATTN (Flash-Attention 2 requires sm >= 80). " | ||
| 79 | - ... | ||
| 80 | - ) | ||
| 81 | - return {"backend": "TRITON_ATTN"} | ||
| 82 | -``` | ||
| 83 | - | ||
| 84 | -`qwen3_vllm` **没有**这段逻辑,**不写** `attention_config`,完全交给 vLLM 在 **generate** 路径上自己选实现。 | ||
| 85 | -因此在 T4 上很容易出现:**两条路径实际用的 attention / kernel 组合并不相同**;若默认路径比强制的 `TRITON_ATTN` 更适合你们的 batch 与序列长度,就会出现 **score 更慢**。 | ||
| 86 | -若要验证,可在 score 的 YAML 里试 `vllm_attention_backend`(或与 `RERANK_VLLM_ATTENTION_BACKEND` 对齐到和 generate 实际一致的后端),或在 Ampere+ 上复测矩阵。 | ||
| 87 | - | ||
| 88 | -### (2)工作量与 vLLM 优化重心不同(这是主因之一) | ||
| 89 | - | ||
| 90 | -- **generate 后端**:`max_tokens=1`、`allowed_token_ids` 只有 yes/no,本质是 **prefill + 极短 decode**,且 logprobs 只关心最后一步的分布。 | ||
| 91 | -- **score 后端**:`LLM.score()` 走 **pooling / cross-encoder 式**的打分图,是另一条 runner,**不等于**「比 1-token generate 一定更少算」;在 vLLM 里通常 **causal generate 路径打磨得更狠**。 | ||
| 92 | - | ||
| 93 | -所以「score API 更高级所以一定更快」在这个模型用法下**不一定成立**。 | ||
| 94 | - | ||
| 95 | -### (3)`enable_prefix_caching: true` 对两边的「可缓存前缀」不对称 | ||
| 96 | - | ||
| 97 | -同一 query、多个 doc 时,**generate** 路径用 chat template 拼出来的 prompt,**从 system 到 query 的长前缀在 batch 内完全相同**,很容易成为 prefix caching 的理想场景。 | ||
| 98 | - | ||
| 99 | -**score** 路径把内容拆成 `queries` / `documents` 两列交给 `score()`,内部如何切块、是否能把「同一 query 对应多 doc」映射成与 generate 同等强度的前缀复用,依赖 vLLM 实现;很多版本下 **generate + 共享前缀** 更占便宜。你们 `max_model_len: 160` 很短,prefill 成本敏感,**谁更吃到缓存**会明显拉开差距。 | ||
| 100 | - | ||
| 101 | -### (4)Tokenizer 侧:后者多了一步「批量模板」优化 | ||
| 102 | - | ||
| 103 | -`qwen3_vllm` 对整批 `apply_chat_template` 一次做完再 `generate`: | ||
| 104 | - | ||
| 105 | -```171:180:reranker/backends/qwen3_vllm.py | ||
| 106 | - messages_batch = [ | ||
| 107 | - self._format_messages(self._instruction, q, d) for q, d in pairs | ||
| 108 | - ] | ||
| 109 | - tokenized = self._tokenizer.apply_chat_template( | ||
| 110 | - messages_batch, | ||
| 111 | - tokenize=True, | ||
| 112 | - add_generation_prompt=False, | ||
| 113 | - enable_thinking=False, | ||
| 114 | - ) | ||
| 115 | -``` | ||
| 116 | - | ||
| 117 | -`qwen3_vllm_score` 在 Python 里逐对拼字符串,再进 `score()`(tokenization 在 vLLM 内)。这一项通常不是第一瓶颈,但在 **batch 大、序列短** 时也会有一点差别。 | ||
| 118 | - | ||
| 119 | -### (5)两个 venv 的 vLLM 版本不同 | ||
| 120 | - | ||
| 121 | -- `.venv-reranker`:`vllm>=0.8.5`(实际装的几版本会变) | ||
| 122 | -- `.venv-reranker-score`:固定 `vllm==0.18.0` | ||
| 123 | - | ||
| 124 | -对比「谁更快」时,**版本 + 代码路径**是绑在一起的;不能假设「新 vLLM + score」在 T4 上一定赢过「旧 vLLM + 1-token generate」。 | ||
| 125 | - | ||
| 126 | ---- | ||
| 127 | - | ||
| 128 | -## 3. 和你们 `RESULTS.md` 的对应关系 | ||
| 129 | - | ||
| 130 | -`perf_reports/.../RESULTS.md` 里:**同一 `instruction_format` 下 `qwen3_vllm` 全程低于 `qwen3_vllm_score`**,与上面 **T4 + attention 强制 + 不同 runner + prefix cache 利用率** 的解释一致;报告里也写了在别的 GPU / vLLM 版本下排序可能变,这是合理的。 | ||
| 131 | - | ||
| 132 | ---- | ||
| 133 | - | ||
| 134 | -## 4. 若要「对齐实验」可以怎么做(方向性) | ||
| 135 | - | ||
| 136 | -1. **在 Ampere(A10/A100 等 sm≥80)上跑同一脚本**,看 score 是否反超(FlashAttention 路径更完整时,score 路径有时会更合理)。 | ||
| 137 | -2. **在 score 侧显式设置 `vllm_attention_backend`**(或与 env 对齐),避免在 T4 上只有 score 被锁死 `TRITON_ATTN` 而 generate 走另一条。 | ||
| 138 | -3. **固定两边 `pip show vllm` 版本**再比,否则「版本差」会污染结论。 | ||
| 139 | -4. 用 vLLM 的 profiler / 日志确认 **prefix cache hit** 在两种后端上的差异(若你们要量化「缓存」这一条)。 | ||
| 140 | - | ||
| 141 | -**总结:** 不是 `config.yaml` 里少抄了几个键;而是 **推理图不同、T4 上 attention 策略不对称、以及 generate 对「同 query 多 doc」更友好**,导致在你们当前环境下 **`qwen3_vllm` 比 `qwen3_vllm_score` 更快是合理现象**,与「score API 理论上更干净」并不矛盾。 | ||
| 142 | \ No newline at end of file | 0 | \ No newline at end of file |
scripts/smoke_qwen3_vllm_score_backend.py
| @@ -6,8 +6,9 @@ Usage (from repo root, score venv): | @@ -6,8 +6,9 @@ Usage (from repo root, score venv): | ||
| 6 | PYTHONPATH=. ./.venv-reranker-score/bin/python scripts/smoke_qwen3_vllm_score_backend.py | 6 | PYTHONPATH=. ./.venv-reranker-score/bin/python scripts/smoke_qwen3_vllm_score_backend.py |
| 7 | 7 | ||
| 8 | Same as production: vLLM child processes need the venv's ``bin`` on PATH (for pip's ``ninja`` when | 8 | Same as production: vLLM child processes need the venv's ``bin`` on PATH (for pip's ``ninja`` when |
| 9 | -using FLASHINFER). ``start_reranker.sh`` exports that; this script prepends ``sysconfig.get_path("scripts")`` | ||
| 10 | -(the stdlib location for this environment's console scripts, independent of ``python`` symlink targets). | 9 | +vLLM auto-selects FLASHINFER on T4/Turing). ``start_reranker.sh`` exports that; this script prepends |
| 10 | +``sysconfig.get_path("scripts")`` (the stdlib location for this environment's console scripts, | ||
| 11 | +independent of ``python`` symlink targets). | ||
| 11 | """ | 12 | """ |
| 12 | 13 | ||
| 13 | from __future__ import annotations | 14 | from __future__ import annotations |
| @@ -30,18 +31,12 @@ import torch | @@ -30,18 +31,12 @@ import torch | ||
| 30 | 31 | ||
| 31 | from reranker.backends.qwen3_vllm_score import ( | 32 | from reranker.backends.qwen3_vllm_score import ( |
| 32 | Qwen3VLLMScoreRerankerBackend, | 33 | Qwen3VLLMScoreRerankerBackend, |
| 33 | - _resolve_vllm_attention_config, | ||
| 34 | ) | 34 | ) |
| 35 | 35 | ||
| 36 | 36 | ||
| 37 | def main() -> int: | 37 | def main() -> int: |
| 38 | p = argparse.ArgumentParser() | 38 | p = argparse.ArgumentParser() |
| 39 | p.add_argument( | 39 | p.add_argument( |
| 40 | - "--no-auto-triton", | ||
| 41 | - action="store_true", | ||
| 42 | - help="Set auto_triton_attn_on_sm_lt_8=False (match config opt-out)", | ||
| 43 | - ) | ||
| 44 | - p.add_argument( | ||
| 45 | "--gpu-memory-utilization", | 40 | "--gpu-memory-utilization", |
| 46 | type=float, | 41 | type=float, |
| 47 | default=0.12, | 42 | default=0.12, |
| @@ -66,14 +61,8 @@ def main() -> int: | @@ -66,14 +61,8 @@ def main() -> int: | ||
| 66 | "enable_prefix_caching": False, | 61 | "enable_prefix_caching": False, |
| 67 | "enforce_eager": True, | 62 | "enforce_eager": True, |
| 68 | "infer_batch_size": 4, | 63 | "infer_batch_size": 4, |
| 69 | - "instruction_format": "standard", | 64 | + "instruction_format": "compact", |
| 70 | } | 65 | } |
| 71 | - if args.no_auto_triton: | ||
| 72 | - cfg["auto_triton_attn_on_sm_lt_8"] = False | ||
| 73 | - | ||
| 74 | - attn = _resolve_vllm_attention_config(cfg) | ||
| 75 | - print("attention_config:", attn) | ||
| 76 | - | ||
| 77 | print("Loading backend ...") | 66 | print("Loading backend ...") |
| 78 | backend = Qwen3VLLMScoreRerankerBackend(cfg) | 67 | backend = Qwen3VLLMScoreRerankerBackend(cfg) |
| 79 | scores, meta = backend.score_with_meta("smoke query", ["title one", "title two"], normalize=False) | 68 | scores, meta = backend.score_with_meta("smoke query", ["title one", "title two"], normalize=False) |
scripts/start_reranker.sh
| @@ -41,8 +41,8 @@ export TRITON_CACHE_DIR="${RERANKER_RUNTIME_DIR}/triton" | @@ -41,8 +41,8 @@ export TRITON_CACHE_DIR="${RERANKER_RUNTIME_DIR}/triton" | ||
| 41 | export TORCHINDUCTOR_CACHE_DIR="${RERANKER_RUNTIME_DIR}/torch_compile" | 41 | export TORCHINDUCTOR_CACHE_DIR="${RERANKER_RUNTIME_DIR}/torch_compile" |
| 42 | export TMPDIR="${RERANKER_RUNTIME_DIR}/tmp" | 42 | export TMPDIR="${RERANKER_RUNTIME_DIR}/tmp" |
| 43 | export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}" | 43 | export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}" |
| 44 | -# venv bin must be on PATH before Python starts: vLLM worker inherits it; FlashInfer JIT needs | ||
| 45 | -# pip-installed ninja when qwen3_vllm_score does not force TRITON_ATTN (e.g. T4 + auto_triton off). | 44 | +# venv bin must be on PATH before Python starts: vLLM worker inherits it; on T4/Turing, |
| 45 | +# qwen3_vllm_score now relies on vLLM auto-selecting FLASHINFER, whose JIT needs pip-installed ninja. | ||
| 46 | export PATH="${RERANKER_VENV}/bin:${PATH}" | 46 | export PATH="${RERANKER_VENV}/bin:${PATH}" |
| 47 | 47 | ||
| 48 | if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then | 48 | if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then |
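The comment change above is the whole point of the PATH export: on T4/Turing, the first `score()` call may trigger a FLASHINFER JIT build that shells out to `ninja`, which is installed inside the score venv. A small pre-flight check one could run from that venv (illustrative only, not a script that exists in the repo):

```python
# Pre-flight check for the PATH requirement described above.
import os
import shutil
import sysconfig

venv_bin = sysconfig.get_path("scripts")  # e.g. .../.venv-reranker-score/bin
os.environ["PATH"] = venv_bin + os.pathsep + os.environ.get("PATH", "")

ninja = shutil.which("ninja")
print("ninja resolved to:", ninja)
assert ninja is not None, "ninja not on PATH; FLASHINFER JIT will fail on first score()"
```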