benchmarks/reranker/benchmark_reranker_gguf_local.py

#!/usr/bin/env python3
"""
Local tuning probe for GGUF reranker backends.
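Runs the backend directly, loading each config in turn inside this process
(pass a one-element --configs-json to probe a single config in a fresh process), to measure: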
- load time
- GPU memory used by this process
- single-request rerank latency
Example:
  ./.venv-reranker-gguf/bin/python benchmarks/reranker/benchmark_reranker_gguf_local.py
  ./.venv-reranker-gguf-06b/bin/python benchmarks/reranker/benchmark_reranker_gguf_local.py --backend-name qwen3_gguf_06b --docs 400
"""
from __future__ import annotations
import argparse
import json
import os
import random
import statistics
import subprocess
import sys
import time
from pathlib import Path
from typing import Any
# Default titles corpus (one title per line); used when --titles-file is omitted.
DEFAULT_TITLES = Path("/home/ubuntu/rerank_test/titles.1.8w")
def load_titles(path: Path) -> list[str]:
    """Read candidate titles from a text file, one title per line.

    Every line is stripped of surrounding whitespace and lines that end up
    empty are dropped. The file is decoded as UTF-8; undecodable bytes are
    replaced rather than raising.

    Args:
        path: Location of the titles file.

    Returns:
        The stripped, non-empty lines in file order.
    """
    with path.open(encoding="utf-8", errors="replace") as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [title for title in stripped_lines if title]
def gpu_mem_for_pid(pid: int) -> int:
    """Return the GPU memory that ``nvidia-smi`` attributes to ``pid``.

    Queries ``nvidia-smi --query-compute-apps=pid,used_gpu_memory`` and adds up
    every row whose PID matches. The query emits one row per (GPU, process)
    pair, so a process with memory on several GPUs appears more than once;
    summing reports its total footprint instead of only the first GPU listed.

    Args:
        pid: Process id to look up.

    Returns:
        Total memory for ``pid`` in the unit ``nvidia-smi`` prints with
        ``nounits`` (the caller stores it as MiB), or ``-1`` when
        ``nvidia-smi`` cannot be run or no parsable row matches ``pid``.
    """
    query = [
        "nvidia-smi",
        "--query-compute-apps=pid,used_gpu_memory",
        "--format=csv,noheader,nounits",
    ]
    try:
        out = subprocess.check_output(query, text=True)
    except Exception:
        # Deliberately best-effort: a missing binary, a non-zero exit status or
        # undecodable output must not abort the benchmark; report "unknown".
        return -1
    total_mem = 0
    matched = False
    for raw in out.splitlines():
        parts = [p.strip() for p in raw.split(",")]
        if len(parts) != 2:
            continue
        try:
            row_pid = int(parts[0])
            row_mem = int(parts[1])
        except ValueError:
            # Skip rows with non-numeric fields (e.g. "[N/A]" memory).
            continue
        if row_pid == pid:
            total_mem += row_mem
            matched = True
    return total_mem if matched else -1
def main() -> int:
    """Parse CLI options, benchmark every config, and print the results.

    For each config the backend is constructed (timed as ``load_ms``), the GPU
    memory attributed to this PID is sampled, and ``--repeat`` rerank calls
    over the same sampled documents are timed. One JSON line is printed per
    config, followed by a ``SUMMARY`` section sorted by average latency.

    All argument problems are reported on stderr before the backend module is
    imported or any model is loaded.

    Returns:
        0 on success; 2 when an argument is invalid, the backend name is
        unsupported, or the titles file is missing or too small.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--backend-name", type=str, default="qwen3_gguf")
    parser.add_argument("--titles-file", type=Path, default=DEFAULT_TITLES)
    parser.add_argument("--query", type=str, default="白色oversized T-shirt")
    parser.add_argument("--docs", type=int, default=160)
    parser.add_argument("--repeat", type=int, default=1)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument(
        "--configs-json",
        type=str,
        default="",
        help="JSON array of config objects; when omitted, uses built-in scan set.",
    )
    args = parser.parse_args()
    # Per-backend model location and batch-size defaults.
    default_cfg_by_backend: dict[str, dict[str, Any]] = {
        "qwen3_gguf": {
            "_backend_name": "qwen3_gguf",
            "repo_id": "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF",
            "filename": "*Q8_0.gguf",
            "local_dir": "./models/reranker/qwen3-reranker-4b-gguf",
            "infer_batch_size": 8,
        },
        "qwen3_gguf_06b": {
            "_backend_name": "qwen3_gguf_06b",
            "repo_id": "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF",
            "filename": "qwen3-reranker-0.6b-q8_0.gguf",
            "local_dir": "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf",
            "infer_batch_size": 32,
        },
    }
    # Fail fast on bad arguments instead of crashing after a model load.
    if args.backend_name not in default_cfg_by_backend:
        print(f"unsupported backend: {args.backend_name}", file=sys.stderr)
        return 2
    if args.docs < 1:
        # A negative count would make random.sample() raise ValueError.
        print(f"--docs must be >= 1, got {args.docs}", file=sys.stderr)
        return 2
    if args.repeat < 1:
        # With no runs, min()/max()/statistics.mean() fail on the empty list.
        print(f"--repeat must be >= 1, got {args.repeat}", file=sys.stderr)
        return 2
    if args.configs_json:
        try:
            configs = json.loads(args.configs_json)
        except json.JSONDecodeError as exc:
            print(f"invalid --configs-json: {exc}", file=sys.stderr)
            return 2
        if not isinstance(configs, list) or not all(isinstance(c, dict) for c in configs):
            print("--configs-json must be a JSON array of objects", file=sys.stderr)
            return 2
    elif args.backend_name == "qwen3_gguf_06b":
        configs = [
            {"name": "gguf_06b_full_256", "n_ctx": 256, "n_batch": 256, "n_ubatch": 256, "n_gpu_layers": 999},
            {"name": "gguf_06b_full_320", "n_ctx": 320, "n_batch": 320, "n_ubatch": 320, "n_gpu_layers": 999},
            {"name": "gguf_06b_full_384", "n_ctx": 384, "n_batch": 384, "n_ubatch": 384, "n_gpu_layers": 999},
            {"name": "gguf_06b_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999},
        ]
    else:
        configs = [
            {"name": "gguf_t4_24g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 24},
            {"name": "gguf_t4_40g", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 40},
            {"name": "gguf_t4_full", "n_ctx": 384, "n_batch": 384, "n_ubatch": 128, "n_gpu_layers": 999},
            {"name": "gguf_t4_full_512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 256, "n_gpu_layers": 999},
            {"name": "gguf_t4_full_512_u512", "n_ctx": 512, "n_batch": 512, "n_ubatch": 512, "n_gpu_layers": 999},
            {"name": "gguf_t4_full_768", "n_ctx": 768, "n_batch": 768, "n_ubatch": 256, "n_gpu_layers": 999},
        ]
    if not args.titles_file.is_file():
        print(f"missing titles file: {args.titles_file}", file=sys.stderr)
        return 2
    titles = load_titles(args.titles_file)
    if len(titles) < args.docs:
        print(f"not enough titles: need {args.docs}, got {len(titles)}", file=sys.stderr)
        return 2
    # Seed the global RNG so every run with the same --seed scores the same docs.
    random.seed(args.seed)
    docs = random.sample(titles, args.docs)
    # Imported lazily so argument errors surface without importing the backend.
    from reranker.backends.qwen3_gguf import Qwen3GGUFRerankerBackend
    base_cfg: dict[str, Any] = {
        **default_cfg_by_backend[args.backend_name],
        "instruction": "Rank products by query with category & style match prioritized",
        "cache_dir": "./model_cache",
        "main_gpu": 0,
        "n_threads": 2,
        "n_threads_batch": 4,
        "flash_attn": True,
        "offload_kqv": True,
        "use_mmap": True,
        "use_mlock": False,
        "sort_by_doc_length": True,
        "length_sort_mode": "char",
        "enable_warmup": True,
        "verbose": False,
        "reuse_query_state": True,
    }
    all_results: list[dict[str, Any]] = []
    for index, cfg in enumerate(configs):
        # Per-config keys override the shared base settings.
        merged = dict(base_cfg)
        merged.update(cfg)
        # "name" only labels the result and is not passed to the backend;
        # configs supplied without one get a positional label.
        name = str(merged.pop("name", f"config_{index}"))
        t0 = time.perf_counter()
        backend = Qwen3GGUFRerankerBackend(merged)
        load_ms = (time.perf_counter() - t0) * 1000.0
        gpu_mem_mib = gpu_mem_for_pid(os.getpid())
        runs: list[float] = []
        last_meta: dict[str, Any] = {}
        for _ in range(args.repeat):
            t1 = time.perf_counter()
            _scores, meta = backend.score_with_meta(args.query, docs, normalize=True)
            runs.append((time.perf_counter() - t1) * 1000.0)
            last_meta = dict(meta)
        result = {
            "name": name,
            "config": merged,
            "load_ms": round(load_ms, 2),
            "gpu_mem_mib": gpu_mem_mib,
            "latency_ms_min": round(min(runs), 2),
            "latency_ms_avg": round(statistics.mean(runs), 2),
            "latency_ms_max": round(max(runs), 2),
            "meta": last_meta,
        }
        all_results.append(result)
        print(json.dumps(result, ensure_ascii=False))
        # Drop the reference before the next config constructs its backend.
        del backend
    print("SUMMARY")
    for item in sorted(all_results, key=lambda x: x["latency_ms_avg"]):
        print(
            f'{item["name"]}: avg={item["latency_ms_avg"]}ms '
            f'gpu={item["gpu_mem_mib"]}MiB load={item["load_ms"]}ms'
        )
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())