Blame view

scripts/smoke_qwen3_vllm_score_backend.py 2.62 KB
540fb5af   tangwang   添加了可关闭的开关:保留默认行为(...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
  #!/usr/bin/env python3
  """
  Smoke test: load Qwen3VLLMScoreRerankerBackend (must run as a file, not stdin  vLLM spawn).
  
  Usage (from repo root, score venv):
    PYTHONPATH=. ./.venv-reranker-score/bin/python scripts/smoke_qwen3_vllm_score_backend.py
  
  Same as production: vLLM child processes need the venv's ``bin`` on PATH (for pip's ``ninja`` when
  using FLASHINFER). ``start_reranker.sh`` exports that; this script prepends ``sysconfig.get_path("scripts")``
  (the stdlib location for this environment's console scripts, independent of ``python`` symlink targets).
  """
  
  from __future__ import annotations
  
  import argparse
  import logging
  import os
  import sys
  import sysconfig
  from pathlib import Path
  
# Repo root on sys.path when run as scripts/smoke_*.py
# parents[1] = repo root, since this file lives in <root>/scripts/.
_ROOT = Path(__file__).resolve().parents[1]
if str(_ROOT) not in sys.path:
    # Prepend so the repo's `reranker` package wins over any installed copy.
    sys.path.insert(0, str(_ROOT))

# Configure logging BEFORE importing the backend so its import-time log
# records (if any) are emitted rather than dropped.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
  
  import torch
  
  from reranker.backends.qwen3_vllm_score import (
      Qwen3VLLMScoreRerankerBackend,
      _resolve_vllm_attention_config,
  )
  
  
  def main() -> int:
      p = argparse.ArgumentParser()
      p.add_argument(
          "--no-auto-triton",
          action="store_true",
          help="Set auto_triton_attn_on_sm_lt_8=False (match config opt-out)",
      )
      p.add_argument(
          "--gpu-memory-utilization",
          type=float,
          default=0.12,
          help="vLLM gpu_memory_utilization (default 0.12 for tight GPUs)",
      )
      args = p.parse_args()
  
      scripts = sysconfig.get_path("scripts")
      if scripts:
          os.environ["PATH"] = scripts + os.pathsep + os.environ.get("PATH", "")
  
      if not torch.cuda.is_available():
          print("SKIP: CUDA not available")
          return 0
  
      cfg = {
          "model_name": "Qwen/Qwen3-Reranker-0.6B",
          "max_model_len": 160,
          "tensor_parallel_size": 1,
          "gpu_memory_utilization": args.gpu_memory_utilization,
          "dtype": "float16",
          "enable_prefix_caching": False,
          "enforce_eager": True,
          "infer_batch_size": 4,
          "instruction_format": "standard",
      }
      if args.no_auto_triton:
          cfg["auto_triton_attn_on_sm_lt_8"] = False
  
      attn = _resolve_vllm_attention_config(cfg)
      print("attention_config:", attn)
  
      print("Loading backend ...")
      backend = Qwen3VLLMScoreRerankerBackend(cfg)
      scores, meta = backend.score_with_meta("smoke query", ["title one", "title two"], normalize=False)
      print("scores:", scores)
      print("meta:", {k: meta[k] for k in ("backend", "infer_batch_size", "instruction_format") if k in meta})
      print("OK")
      return 0
  
  
  if __name__ == "__main__":
      raise SystemExit(main())