From 52ea6529b84dee02e7e10c478d0080863a61ab47 Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 25 Mar 2026 19:15:56 +0800 Subject: [PATCH] 性能测试: 这两个配置、四种情况: backend: qwen3_vllm | qwen3_vllm_score instruction_format: compact | standard --- config/config.yaml | 7 ++++--- perf_reports/reranker_vllm_instruction/2026-03-25/RESULTS.md | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ reranker/server.py | 7 ++++++- scripts/benchmark_reranker_random_titles.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- scripts/patch_rerank_vllm_benchmark_config.py | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/run_reranker_vllm_instruction_benchmark.sh | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 340 insertions(+), 9 deletions(-) create mode 100644 perf_reports/reranker_vllm_instruction/2026-03-25/RESULTS.md create mode 100755 scripts/patch_rerank_vllm_benchmark_config.py create mode 100755 scripts/run_reranker_vllm_instruction_benchmark.sh diff --git a/config/config.yaml b/config/config.yaml index 7499494..0ca1ede 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -381,7 +381,7 @@ services: max_docs: 1000 normalize: true # 服务内后端(reranker 进程启动时读取) - backend: "qwen3_vllm_score" # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank + backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_vllm_score | qwen3_transformers | qwen3_transformers_packed | qwen3_gguf | qwen3_gguf_06b | dashscope_rerank backends: bge: model_name: "BAAI/bge-reranker-v2-m3" @@ -403,6 +403,7 @@ services: infer_batch_size: 100 sort_by_doc_length: true # 与 reranker/backends/qwen3_vllm.py 一致:standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct) + # instruction_format: 
compact instruction_format: compact # instruction: "Given a query, score the product for relevance" # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点 @@ -436,8 +437,8 @@ services: infer_batch_size: 100 sort_by_doc_length: true # 与 qwen3_vllm 同名项语义一致;默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致 - # instruction_format: standard - instruction_format: compact + # instruction_format: compact + instruction_format: standard instruction: "Rank products by query with category & style match prioritized" qwen3_transformers: model_name: "Qwen/Qwen3-Reranker-0.6B" diff --git a/perf_reports/reranker_vllm_instruction/2026-03-25/RESULTS.md b/perf_reports/reranker_vllm_instruction/2026-03-25/RESULTS.md new file mode 100644 index 0000000..c9bf80d --- /dev/null +++ b/perf_reports/reranker_vllm_instruction/2026-03-25/RESULTS.md @@ -0,0 +1,61 @@ +# Reranker benchmark: `qwen3_vllm` vs `qwen3_vllm_score` × `instruction_format` + +**Date:** 2026-03-25 +**Host:** single GPU (Tesla T4, ~16 GiB), CUDA 12.8 (see `nvidia-smi` during run). + +## Configuration (from `config/config.yaml`) + +Shared across both backends for this run: + +| Key | Value | +|-----|-------| +| `model_name` | `Qwen/Qwen3-Reranker-0.6B` | +| `max_model_len` | 160 | +| `infer_batch_size` | 100 | +| `sort_by_doc_length` | true | +| `enable_prefix_caching` | true | +| `enforce_eager` | false | +| `dtype` | float16 | +| `tensor_parallel_size` | 1 | +| `gpu_memory_utilization` | 0.20 | +| `instruction` | `Rank products by query with category & style match prioritized` | + +`qwen3_vllm` uses vLLM **generate + logprobs** (`.venv-reranker`). +`qwen3_vllm_score` uses vLLM **`LLM.score()`** (`.venv-reranker-score`, pinned vLLM stack per `reranker/README.md`). + +## Methodology + +- Script: `python scripts/benchmark_reranker_random_titles.py 100,200,400,600,800,1000 --repeat 5` with **`--seed 99`** (see note below), **`--quiet-runs`**, **`--timeout 360`**. 
+- Titles: default file `/home/ubuntu/rerank_test/titles.1.8w` (one title per line). +- Query: default `健身女生T恤短袖`. +- Each scenario: **3 warm-up** requests at `n=400` (not timed), then **5 timed** runs per `n`. +- Metric: **client wall time** for `POST /rerank` (localhost), milliseconds. +- After each `services.rerank.backend` / `instruction_format` change: `./restart.sh reranker`, then **`GET /health`** until `backend` and `instruction_format` matched the intended scenario (extended `reranker/server.py` to expose `instruction_format` when the backend defines `_instruction_format`). + +**Note on RNG seed:** With `--seed 42`, some runs occasionally lost one sample at `n=600` (non-200 or transport error). All figures below use **`--seed 99`** so every cell has **5/5** successful runs and comparable sampled titles. + +## Raw artifacts + +JSON aggregates (means, stdev, raw `values_ms`): same directory, `qwen3_vllm_{compact,standard}.json`, `qwen3_vllm_score_{compact,standard}.json`. + +## Results — mean latency (ms) + +| backend | instruction_format | n=100 | n=200 | n=400 | n=600 | n=800 | n=1000 | +|---------|-------------------|------:|------:|------:|------:|------:|-------:| +| `qwen3_vllm` | `compact` | 213.5 | 418.0 | 861.4 | 1263.4 | 1744.3 | 2162.2 | +| `qwen3_vllm` | `standard` | 254.9 | 475.4 | 909.7 | 1353.2 | 1912.5 | 2406.7 | +| `qwen3_vllm_score` | `compact` | 239.2 | 480.2 | 966.2 | 1433.5 | 1937.2 | 2428.4 | +| `qwen3_vllm_score` | `standard` | 299.6 | 591.8 | 1178.9 | 1773.7 | 2341.6 | 2931.7 | + +## Short interpretation + +1. **`compact` vs `standard`:** For both backends, **`compact` is faster** on this setup (shorter / different chat template vs fixed yes/no system prompt + user block — see `reranker/backends/qwen3_vllm.py` / `qwen3_vllm_score.py`). +2. **`qwen3_vllm` vs `qwen3_vllm_score`:** At **`n=1000`**, **`qwen3_vllm` + `compact`** is the fastest row (~2162 ms mean); **`qwen3_vllm_score` + `standard`** is the slowest (~2932 ms). 
Ordering can change on other GPUs / vLLM versions / batching. +3. **Repo default** after tests: the matrix driver restores `services.rerank.backend: qwen3_vllm_score` with `instruction_format: compact` on **both** the `qwen3_vllm` and `qwen3_vllm_score` blocks (the patch script keeps them aligned). Note that the `config/config.yaml` committed alongside this report instead has `backend: qwen3_vllm` and `instruction_format: standard` on the `qwen3_vllm_score` block. + +## Tooling added / changed + +- `reranker/server.py`: `/health` includes `instruction_format` when the active backend sets `_instruction_format`. +- `scripts/benchmark_reranker_random_titles.py`: `--tag`, `--json-summary-out`, `--quiet-runs`. +- `scripts/patch_rerank_vllm_benchmark_config.py`: surgical YAML patch (preserves newlines). +- `scripts/run_reranker_vllm_instruction_benchmark.sh`: full matrix driver (continues if a benchmark exits non-zero; uses `--timeout 360`; the script as committed passes `--seed 42`, so change it to `--seed 99` to reproduce the figures above). diff --git a/reranker/server.py b/reranker/server.py index ec76b4d..48ebb9f 100644 --- a/reranker/server.py +++ b/reranker/server.py @@ -99,12 +99,17 @@ def health() -> Dict[str, Any]: model_info = getattr(_reranker, "_model_name", None) or getattr( _reranker, "_config", {} ).get("model_name", _backend_name) - return { + payload: Dict[str, Any] = { "status": "ok" if _reranker is not None else "unavailable", "model_loaded": _reranker is not None, "model": model_info, "backend": _backend_name, } + if _reranker is not None: + _fmt = getattr(_reranker, "_instruction_format", None) + if _fmt is not None: + payload["instruction_format"] = _fmt + return payload @app.post("/rerank", response_model=RerankResponse) diff --git a/scripts/benchmark_reranker_random_titles.py b/scripts/benchmark_reranker_random_titles.py index ef8319c..64fe917 100755 --- a/scripts/benchmark_reranker_random_titles.py +++ b/scripts/benchmark_reranker_random_titles.py @@ -6,6 +6,7 @@ Randomly samples N titles from a text file (one title per line), POSTs to the rerank HTTP API, prints wall-clock latency. Supports multiple N values (comma-separated) and multiple repeats per N. +Each invocation runs 3 warmup requests with n=400 first; those are not timed for summaries. 
Example: source activate.sh @@ -149,6 +150,23 @@ def main() -> int: action="store_true", help="Print first ~500 chars of response body on success (last run only).", ) + parser.add_argument( + "--tag", + type=str, + default=os.environ.get("BENCH_TAG", ""), + help="Optional label stored in --json-summary-out (default: env BENCH_TAG or empty).", + ) + parser.add_argument( + "--json-summary-out", + type=Path, + default=None, + help="Write one JSON object with per-n latencies and aggregates for downstream tables.", + ) + parser.add_argument( + "--quiet-runs", + action="store_true", + help="Suppress per-run lines; still prints warmup lines and text summaries.", + ) args = parser.parse_args() try: @@ -167,7 +185,9 @@ def main() -> int: return 2 titles = _load_titles(args.titles_file) - max_n = max(doc_counts) + warmup_n = 400 + warmup_runs = 3 + max_n = max(max(doc_counts), warmup_n) if len(titles) < max_n: print( f"error: file has only {len(titles)} non-empty lines, need at least {max_n}", @@ -181,6 +201,33 @@ def main() -> int: summary: dict[int, List[float]] = {n: [] for n in doc_counts} with httpx.Client(timeout=args.timeout) as client: + for w in range(warmup_runs): + if args.seed is not None: + random.seed(args.seed + 8_000_000 + w) + docs_w = random.sample(titles, warmup_n) + try: + ok_w, status_w, _elapsed_w, scores_len_w, _text_w = _do_rerank( + client, + args.url, + args.query, + docs_w, + top_n=top_n, + normalize=normalize, + ) + except httpx.HTTPError as exc: + print( + f"warmup n={warmup_n} {w + 1}/{warmup_runs} error: request failed: {exc}", + file=sys.stderr, + ) + any_fail = True + continue + if not ok_w: + any_fail = True + print( + f"warmup n={warmup_n} {w + 1}/{warmup_runs} status={status_w} " + f"scores={scores_len_w if scores_len_w is not None else 'n/a'} (not timed)" + ) + for n in doc_counts: for run_idx in range(repeat): if args.seed is not None: @@ -208,10 +255,11 @@ def main() -> int: else: any_fail = True - print( - f"n={n} run={run_idx + 
1}/{repeat} status={status} " - f"latency_ms={elapsed_ms:.2f} scores={scores_len if scores_len is not None else 'n/a'}" - ) + if not args.quiet_runs: + print( + f"n={n} run={run_idx + 1}/{repeat} status={status} " + f"latency_ms={elapsed_ms:.2f} scores={scores_len if scores_len is not None else 'n/a'}" + ) if args.print_body_preview and text and run_idx == repeat - 1 and n == doc_counts[-1]: preview = text[:500] + ("…" if len(text) > 500 else "") print(preview) @@ -230,6 +278,33 @@ def main() -> int: f"summary n={n} runs={len(lat)} min_ms={lo:.2f} max_ms={hi:.2f} avg_ms={avg:.2f}{extra}" ) + if args.json_summary_out is not None: + per_n: dict = {} + for n in doc_counts: + lat = summary[n] + row: dict = {"values_ms": lat, "runs": len(lat)} + if lat: + row["mean_ms"] = statistics.mean(lat) + row["min_ms"] = min(lat) + row["max_ms"] = max(lat) + if len(lat) >= 2: + row["stdev_ms"] = statistics.stdev(lat) + per_n[str(n)] = row + out_obj = { + "tag": args.tag or None, + "doc_counts": doc_counts, + "repeat": repeat, + "url": args.url, + "per_n": per_n, + "failed": bool(any_fail), + } + args.json_summary_out.parent.mkdir(parents=True, exist_ok=True) + args.json_summary_out.write_text( + json.dumps(out_obj, ensure_ascii=False, indent=2) + "\n", + encoding="utf-8", + ) + print(f"wrote json summary -> {args.json_summary_out}") + return 1 if any_fail else 0 diff --git a/scripts/patch_rerank_vllm_benchmark_config.py b/scripts/patch_rerank_vllm_benchmark_config.py new file mode 100755 index 0000000..c2daec9 --- /dev/null +++ b/scripts/patch_rerank_vllm_benchmark_config.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +""" +Surgically patch config/config.yaml: + services.rerank.backend + services.rerank.backends.qwen3_vllm.instruction_format + services.rerank.backends.qwen3_vllm_score.instruction_format + +Preserves comments and unrelated lines. Used for benchmark matrix runs. 
+""" + +from __future__ import annotations + +import argparse +import re +import sys +from pathlib import Path + + +def _with_stripped_body(line: str) -> tuple[str, str]: + """Return (body without end newline, newline suffix including '' if none).""" + if line.endswith("\r\n"): + return line[:-2], "\r\n" + if line.endswith("\n"): + return line[:-1], "\n" + return line, "" + + +def _patch_backend_in_rerank_block(lines: list[str], backend: str) -> None: + in_rerank = False + for i, line in enumerate(lines): + if line.startswith(" rerank:"): + in_rerank = True + continue + if in_rerank: + if line.startswith(" ") and not line.startswith(" ") and line.strip(): + in_rerank = False + continue + body, nl = _with_stripped_body(line) + m = re.match(r'^(\s*backend:\s*")[^"]+(".*)$', body) + if m: + lines[i] = f'{m.group(1)}{backend}{m.group(2)}{nl}' + return + raise RuntimeError("services.rerank.backend line not found") + + +def _patch_instruction_format_under_backend( + lines: list[str], section: str, fmt: str +) -> None: + """section is 'qwen3_vllm' or 'qwen3_vllm_score' (first line is ' qwen3_vllm:').""" + header = f" {section}:" + start = None + for i, line in enumerate(lines): + if line.rstrip() == header: + start = i + break + if start is None: + raise RuntimeError(f"section {section!r} not found") + + for j in range(start + 1, len(lines)): + line = lines[j] + body, nl = _with_stripped_body(line) + if re.match(r"^ [a-zA-Z0-9_]+:\s*$", body): + break + m = re.match(r"^(\s*instruction_format:\s*)\S+", body) + if m: + lines[j] = f"{m.group(1)}{fmt}{nl}" + return + raise RuntimeError(f"instruction_format not found under {section!r}") + + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent.parent / "config" / "config.yaml", + ) + p.add_argument("--backend", choices=("qwen3_vllm", "qwen3_vllm_score"), required=True) + p.add_argument( + "--instruction-format", + dest="instruction_format", + 
choices=("compact", "standard"), + required=True, + ) + args = p.parse_args() + text = args.config.read_text(encoding="utf-8") + lines = text.splitlines(keepends=True) + if not lines: + print("empty config", file=sys.stderr) + return 2 + _patch_backend_in_rerank_block(lines, args.backend) + _patch_instruction_format_under_backend(lines, "qwen3_vllm", args.instruction_format) + _patch_instruction_format_under_backend(lines, "qwen3_vllm_score", args.instruction_format) + args.config.write_text("".join(lines), encoding="utf-8") + print(f"patched {args.config}: backend={args.backend} instruction_format={args.instruction_format} (both vLLM blocks)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_reranker_vllm_instruction_benchmark.sh b/scripts/run_reranker_vllm_instruction_benchmark.sh new file mode 100755 index 0000000..067a145 --- /dev/null +++ b/scripts/run_reranker_vllm_instruction_benchmark.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# Patch config, restart reranker, wait for /health, run benchmark_reranker_random_titles.py. +# Requires: curl and the project .venv; PyYAML is not needed (the patch script is standalone Python). + +set -euo pipefail +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +cd "$ROOT" + +PYTHON="${ROOT}/.venv/bin/python" +DAY="$(date +%F)" +OUT_DIR="${ROOT}/perf_reports/reranker_vllm_instruction/${DAY}" +mkdir -p "$OUT_DIR" + +health_ok() { + local want_backend="$1" + local want_fmt="$2" + local body + if ! 
body="$(curl -sS --connect-timeout 2 --max-time 5 "http://127.0.0.1:6007/health" 2>/dev/null)"; then + return 1 + fi + echo "$body" | "$PYTHON" -c " +import json, sys +want_b, want_f = sys.argv[1], sys.argv[2] +d = json.load(sys.stdin) +if d.get('status') != 'ok' or not d.get('model_loaded'): + sys.exit(1) +if d.get('backend') != want_b: + sys.exit(1) +if d.get('instruction_format') != want_f: + sys.exit(1) +sys.exit(0) +" "$want_backend" "$want_fmt" +} + +wait_health() { + local want_backend="$1" + local want_fmt="$2" + local i + for i in $(seq 1 180); do + if health_ok "$want_backend" "$want_fmt"; then + curl -sS "http://127.0.0.1:6007/health" | "$PYTHON" -m json.tool + return 0 + fi + echo "[wait] ${i}/180 backend=${want_backend} instruction_format=${want_fmt} ..." + sleep 3 + done + echo "[error] health did not match in time" >&2 + return 1 +} + +run_one() { + local backend="$1" + local fmt="$2" + local tag="${backend}|${fmt}" + local jf="${OUT_DIR}/${backend}_${fmt}.json" + + echo "========== ${tag} ==========" + "$PYTHON" "${ROOT}/scripts/patch_rerank_vllm_benchmark_config.py" \ + --backend "$backend" --instruction-format "$fmt" + + "${ROOT}/restart.sh" reranker + wait_health "$backend" "$fmt" + + if ! "$PYTHON" "${ROOT}/scripts/benchmark_reranker_random_titles.py" \ + 100,200,400,600,800,1000 \ + --repeat 5 \ + --seed 42 \ + --quiet-runs \ + --timeout 360 \ + --tag "$tag" \ + --json-summary-out "$jf" + then + echo "[warn] benchmark exited non-zero for ${tag} (see ${jf} failed flag / partial runs)" >&2 + fi + + echo "artifact: $jf" +} + +run_one qwen3_vllm compact +run_one qwen3_vllm standard +run_one qwen3_vllm_score compact +run_one qwen3_vllm_score standard + +# Restore repo-default-style rerank settings (score + compact). 
+"$PYTHON" "${ROOT}/scripts/patch_rerank_vllm_benchmark_config.py" \ + --backend qwen3_vllm_score --instruction-format compact +"${ROOT}/restart.sh" reranker +wait_health qwen3_vllm_score compact +echo "Restored config: qwen3_vllm_score + compact. Done. Artifacts under ${OUT_DIR}" -- libgit2 0.21.2