#!/usr/bin/env python3
"""
Local tuning probe for GGUF reranker backends.

Instantiates the backend directly in this process, once per config, to measure:
- load time
- GPU memory used by this process
- single-request rerank latency

All configs are evaluated sequentially inside one process, and GPU memory is
the figure ``nvidia-smi`` reports for this PID right after each load. If a
previous backend has not fully released its memory, later configs can read
high; for an isolated measurement run the script once per config by passing a
single-element array to ``--configs-json``.

Example:
  ./.venv-reranker-gguf/bin/python benchmarks/reranker/benchmark_reranker_gguf_local.py
  ./.venv-reranker-gguf-06b/bin/python benchmarks/reranker/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400
"""

from __future__ import annotations

import argparse
import json
import os
import random
import statistics
import subprocess
import sys
import time
from pathlib import Path
from typing import Any


DEFAULT_TITLES = Path("/home/ubuntu/rerank_test/titles.1.8w")

# Per-backend model defaults; the keys are the accepted --backend-name values.
_DEFAULT_CFG_BY_BACKEND: dict[str, dict[str, Any]] = {
    "qwen3_gguf": {
        "_backend_name": "qwen3_gguf",
        "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF",
        "filename": "*Q8_0.gguf",
        "local_dir": "./models/reranker/qwen3-reranker-4b-gguf",
        "infer_batch_size": 8,
    },
    "qwen3_gguf_06b": {
        "_backend_name": "qwen3_gguf_06b",
        "repo_id": "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF",
        "filename": "qwen3-reranker-0.6b-q8_0.gguf",
        "local_dir": "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf",
        "infer_batch_size": 32,
    },
}

# Settings shared by every run; individual scan configs override these keys.
_COMMON_CFG: dict[str, Any] = {
    "instruction": "Rank products by query with category & style match prioritized",
    "cache_dir": "./model_cache",
    "main_gpu": 0,
    "n_threads": 2,
    "n_threads_batch": 4,
    "flash_attn": True,
    "offload_kqv": True,
    "use_mmap": True,
    "use_mlock": False,
    "sort_by_doc_length": True,
    "length_sort_mode": "char",
    "enable_warmup": True,
    "verbose": False,
    "reuse_query_state": True,
}

# Built-in scan set used for --backend-name qwen3_gguf_06b.
_SCAN_CONFIGS_06B: list[dict[str, Any]] = [
    {"name": "gguf_06b_full_256", "n_ctx": 256, "n_batch": 256, "n_ubatch": 256, "n_gpu_layers": 999},
    {"name": "gguf_06b_full_320", "n_ctx": 320, "n_batch": 320, "n_ubatch": 320, "n_gpu_layers": 999},
    {"name": "gguf_06b_full_384", "n_ctx": 384, "n_batch": 384, "n_ubatch": 384, "n_gpu_layers": 999},
    {"name": "gguf_06b_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999},
]

# Built-in scan set used for every other backend name.
_SCAN_CONFIGS_DEFAULT: list[dict[str, Any]] = [
    {"name": "gguf_t4_24g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 24},
    {"name": "gguf_t4_40g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 40},
    {"name": "gguf_t4_full", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 999},
    {"name": "gguf_t4_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 256, "n_gpu_layers": 999},
    {"name": "gguf_t4_full_512_u512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999},
    {"name": "gguf_t4_full_768", "n_ctx": 768, "n_batch": 768, "n_ubatch": 256, "n_gpu_layers": 999},
]


def load_titles(path: Path) -> list[str]:
    """Return the non-blank lines of *path*, stripped of surrounding whitespace.

    The file is decoded as UTF-8; undecodable bytes are replaced rather than
    raising, so a partially corrupt corpus still loads.
    """
    with path.open(encoding="utf-8", errors="replace") as fh:
        stripped = (line.strip() for line in fh)
        return [text for text in stripped if text]


def gpu_mem_for_pid(pid: int) -> int:
    """Return the GPU memory ``nvidia-smi`` reports for *pid*, or -1 if unknown.

    The value is taken from the first ``pid,used_gpu_memory`` row whose PID
    matches; callers in this file treat it as MiB. -1 is returned when
    ``nvidia-smi`` cannot be run, exits with an error, or lists no row for
    *pid*. This is deliberately best-effort: the probe must keep working on
    hosts without a usable ``nvidia-smi``.
    """
    try:
        out = subprocess.check_output(
            [
                "nvidia-smi",
                "--query-compute-apps=pid,used_gpu_memory",
                "--format=csv,noheader,nounits",
            ],
            text=True,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # Missing binary, non-zero exit, or undecodable output.
        return -1

    for raw in out.splitlines():
        parts = [p.strip() for p in raw.split(",")]
        if len(parts) != 2:
            continue
        try:
            row_pid, row_mem = int(parts[0]), int(parts[1])
        except ValueError:
            # Rows with non-numeric fields are skipped.
            continue
        if row_pid == pid:
            return row_mem
    return -1


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the probe."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--backend-name", type=str, default="qwen3_gguf")
    parser.add_argument("--titles-file", type=Path, default=DEFAULT_TITLES)
    parser.add_argument("--query", type=str, default="白色oversized T-shirt")
    parser.add_argument("--docs", type=int, default=160)
    parser.add_argument("--repeat", type=int, default=1)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument(
        "--configs-json",
        type=str,
        default="",
        help="JSON array of config objects; when omitted, uses built-in scan set.",
    )
    return parser


def _resolve_configs(configs_json: str, backend_name: str) -> list[dict[str, Any]]:
    """Return the scan configs: parsed from *configs_json*, else the built-in set.

    Raises:
        ValueError: if *configs_json* is not valid JSON or is not an array of
            objects (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if not configs_json:
        if backend_name == "qwen3_gguf_06b":
            return _SCAN_CONFIGS_06B
        return _SCAN_CONFIGS_DEFAULT

    parsed = json.loads(configs_json)
    if not isinstance(parsed, list) or not all(isinstance(c, dict) for c in parsed):
        raise ValueError("expected a JSON array of objects")
    return parsed


def _run_config(
    backend_cls: Any,
    name: str,
    cfg: dict[str, Any],
    query: str,
    docs: list[str],
    repeat: int,
) -> dict[str, Any]:
    """Load one backend instance from *cfg* and time *repeat* rerank requests.

    *backend_cls* is called with *cfg* and must expose
    ``score_with_meta(query, docs, normalize=True)`` returning a
    ``(scores, meta)`` pair. The backend reference is dropped when this
    function returns so the next config starts from a released instance.
    """
    t0 = time.perf_counter()
    backend = backend_cls(cfg)
    load_ms = (time.perf_counter() - t0) * 1000.0
    # Sampled right after load, before any scoring request is issued.
    gpu_mem_mib = gpu_mem_for_pid(os.getpid())

    runs: list[float] = []
    last_meta: dict[str, Any] = {}
    for _ in range(repeat):
        t1 = time.perf_counter()
        _scores, meta = backend.score_with_meta(query, docs, normalize=True)
        runs.append((time.perf_counter() - t1) * 1000.0)
        last_meta = dict(meta)

    return {
        "name": name,
        "config": cfg,
        "load_ms": round(load_ms, 2),
        "gpu_mem_mib": gpu_mem_mib,
        "latency_ms_min": round(min(runs), 2),
        "latency_ms_avg": round(statistics.mean(runs), 2),
        "latency_ms_max": round(max(runs), 2),
        "meta": last_meta,
    }


def main(argv: list[str] | None = None) -> int:
    """Run the probe and return a process exit code.

    Args:
        argv: Argument list to parse; ``None`` (the default) uses ``sys.argv``.

    Returns:
        0 on success, 2 on invalid arguments or unusable input data.
    """
    args = _build_parser().parse_args(argv)

    # Validate cheap things first so a typo never costs a model import/load.
    if args.backend_name not in _DEFAULT_CFG_BY_BACKEND:
        print(f"unsupported backend: {args.backend_name}", file=sys.stderr)
        return 2
    if args.repeat < 1:
        # An empty run list would make min()/mean()/max() raise after loading.
        print(f"--repeat must be >= 1, got {args.repeat}", file=sys.stderr)
        return 2
    if args.docs < 1:
        print(f"--docs must be >= 1, got {args.docs}", file=sys.stderr)
        return 2

    try:
        configs = _resolve_configs(args.configs_json, args.backend_name)
    except ValueError as exc:
        print(f"invalid --configs-json: {exc}", file=sys.stderr)
        return 2

    if not args.titles_file.is_file():
        print(f"missing titles file: {args.titles_file}", file=sys.stderr)
        return 2
    titles = load_titles(args.titles_file)
    if len(titles) < args.docs:
        print(f"not enough titles: need {args.docs}, got {len(titles)}", file=sys.stderr)
        return 2

    # A private generator draws the same sample as seeding the module-level
    # one, without mutating global random state.
    docs = random.Random(args.seed).sample(titles, args.docs)

    # Imported lazily: argument errors above must not require the project
    # package (and its native dependencies) to be importable.
    from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend

    base_cfg: dict[str, Any] = {
        **_DEFAULT_CFG_BY_BACKEND[args.backend_name],
        **_COMMON_CFG,
    }

    all_results: list[dict[str, Any]] = []
    for index, cfg in enumerate(configs):
        merged = {**base_cfg, **cfg}
        # "name" is a label for the report, not a backend setting.
        name = str(merged.pop("name", f"config_{index}"))
        result = _run_config(
            Qwen3GGUFRerankerBackend, name, merged, args.query, docs, args.repeat
        )
        all_results.append(result)
        print(json.dumps(result, ensure_ascii=False))

    print("SUMMARY")
    for item in sorted(all_results, key=lambda x: x["latency_ms_avg"]):
        print(
            f'{item["name"]}: avg={item["latency_ms_avg"]}ms '
            f'gpu={item["gpu_mem_mib"]}MiB load={item["load_ms"]}ms'
        )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())