#!/usr/bin/env python3
"""
Smoke test: load Qwen3VLLMScoreRerankerBackend (must run as a file, not stdin — vLLM spawn).
Usage (from repo root, score venv):
PYTHONPATH=. ./.venv-reranker-score/bin/python scripts/smoke_qwen3_vllm_score_backend.py
Same as production: vLLM child processes need the venv's ``bin`` on PATH (for pip's ``ninja`` when
vLLM auto-selects FLASHINFER on T4/Turing). ``start_reranker.sh`` exports that; this script prepends
``sysconfig.get_path("scripts")`` (the stdlib location for this environment's console scripts,
independent of ``python`` symlink targets).
"""
from __future__ import annotations
import argparse
import logging
import os
import sys
import sysconfig
from pathlib import Path
# Repo root on sys.path so `reranker.*` imports resolve when run as scripts/smoke_*.py.
_ROOT = Path(__file__).resolve().parents[1]
if str(_ROOT) not in sys.path:
    sys.path.insert(0, str(_ROOT))

# Basic console logging; INFO level so backend-load progress is visible.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
import torch
from reranker.backends.qwen3_vllm_score import (
Qwen3VLLMScoreRerankerBackend,
)
def main() -> int:
    """Smoke-test loading Qwen3VLLMScoreRerankerBackend and scoring two docs.

    Returns:
        0 on success, and also 0 (skip) when CUDA is unavailable so the
        script can run harmlessly on CPU-only hosts.
    """
    p = argparse.ArgumentParser()
    p.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=0.12,
        help="vLLM gpu_memory_utilization (default 0.12 for tight GPUs)",
    )
    args = p.parse_args()

    # vLLM child processes need this venv's console-script dir on PATH
    # (e.g. pip's `ninja` when vLLM auto-selects FLASHINFER) — see module docstring.
    scripts = sysconfig.get_path("scripts")
    if scripts:
        os.environ["PATH"] = scripts + os.pathsep + os.environ.get("PATH", "")

    if not torch.cuda.is_available():
        print("SKIP: CUDA not available")
        return 0

    # Deliberately tiny footprint: short context, eager mode, small batches,
    # so the smoke test fits alongside other GPU tenants.
    cfg = {
        "model_name": "Qwen/Qwen3-Reranker-0.6B",
        "max_model_len": 160,
        "tensor_parallel_size": 1,
        "gpu_memory_utilization": args.gpu_memory_utilization,
        "dtype": "float16",
        "enable_prefix_caching": False,
        "enforce_eager": True,
        "infer_batch_size": 4,
        "instruction_format": "compact",
    }
    print("Loading backend ...")
    backend = Qwen3VLLMScoreRerankerBackend(cfg)
    scores, meta = backend.score_with_meta("smoke query", ["title one", "title two"], normalize=False)
    print("scores:", scores)
    print("meta:", {k: meta[k] for k in ("backend", "infer_batch_size", "instruction_format") if k in meta})
    print("OK")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())