Commit 149dad2b269c659869f90d45c48c8c619b27d72e

Authored by tangwang
1 parent 0d3e73ba

add rerank-cloud-perf-study

reranker/rerank-cloud-perf-study/rerank_dashscope_perf.py 0 → 100644
@@ -0,0 +1,472 @@ @@ -0,0 +1,472 @@
  1 +#!/usr/bin/env python3
  2 +from __future__ import annotations
  3 +
  4 +import argparse
  5 +import asyncio
  6 +import json
  7 +import math
  8 +import random
  9 +import statistics
  10 +import time
  11 +from dataclasses import dataclass
  12 +from pathlib import Path
  13 +from typing import Any, Dict, List, Optional
  14 +
  15 +import httpx
  16 +
  17 +
@dataclass
class RequestTemplate:
    """Describes one HTTP request to replay during the load test.

    In dynamic-docs mode only `method`, `url` and `headers` are used by the
    worker; the JSON body is regenerated per request.
    """

    # HTTP verb, e.g. "POST".
    method: str
    # Fully-qualified endpoint URL.
    url: str
    # Static request body sent as JSON; None when the body is built per request.
    json_body: Optional[Dict[str, Any]] = None
    # Extra headers (Authorization, Content-Type) sent with every request.
    headers: Optional[Dict[str, str]] = None
  24 +
  25 +
def percentile(sorted_values: List[float], p: float) -> float:
    """Linearly interpolated percentile of an ascending-sorted list.

    Returns 0.0 for an empty list; p is clamped to [0, 100].
    """
    if not sorted_values:
        return 0.0
    if p <= 0:
        return sorted_values[0]
    if p >= 100:
        return sorted_values[-1]
    # Fractional index into the sorted list for the requested percentile.
    idx = (len(sorted_values) - 1) * (p / 100.0)
    lo, hi = int(math.floor(idx)), int(math.ceil(idx))
    if lo == hi:
        return sorted_values[lo]
    frac = idx - lo
    return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac
  40 +
  41 +
def parse_csv_items(raw: str) -> List[str]:
    """Split a comma-separated string into stripped, non-empty tokens."""
    # `raw or ""` tolerates None; str() tolerates non-string inputs.
    stripped = (piece.strip() for piece in str(raw or "").split(","))
    return [token for token in stripped if token]
  44 +
  45 +
def parse_csv_ints(raw: str) -> List[int]:
    """Parse a CSV string into de-duplicated positive ints, order preserved.

    Raises:
        ValueError: on a non-integer token or a value <= 0.
    """
    # dict keys preserve insertion order, giving order-preserving de-dup.
    ordered: Dict[int, None] = {}
    for token in parse_csv_items(raw):
        try:
            number = int(token)
        except ValueError as exc:
            raise ValueError(f"Invalid integer in CSV list: {token}") from exc
        if number <= 0:
            raise ValueError(f"Concurrency must be > 0, got {number}")
        ordered.setdefault(number, None)
    return list(ordered)
  61 +
  62 +
def parse_args() -> argparse.Namespace:
    """Define and parse the CLI for the DashScope rerank perf test."""
    cli = argparse.ArgumentParser(description="DashScope /compatible-api/v1/reranks perf test")

    # Load-shape options.
    cli.add_argument("--duration", type=int, default=20, help="Duration seconds per concurrency; <=0 means no duration cap")
    cli.add_argument("--concurrency", type=int, default=1, help="Default concurrency if --concurrency-list is not set")
    cli.add_argument(
        "--concurrency-list",
        type=str,
        default="1,5,10,20",
        help="Comma-separated concurrency list (e.g. 1,5,10,20). If set, overrides --concurrency.",
    )
    cli.add_argument("--max-requests", type=int, default=0, help="Stop after N requests per concurrency (0 means unlimited)")
    cli.add_argument("--timeout", type=float, default=90.0, help="Request timeout seconds")
    cli.add_argument("--max-errors", type=int, default=0, help="Stop current run when accumulated errors reach this value")

    # Endpoint / auth / output options.
    cli.add_argument(
        "--base-url",
        type=str,
        default="https://dashscope.aliyuncs.com/compatible-api/v1",
        help="Base URL for DashScope compatible API",
    )
    cli.add_argument("--api-key", type=str, default="", help="DashScope API key; if omitted, read from DASHSCOPE_API_KEY env")
    cli.add_argument("--output", type=str, default="", help="Optional output JSON path")
    cli.add_argument("--pause", type=float, default=0.0, help="Pause seconds between concurrency runs")

    # Rerank payload options.
    cli.add_argument("--model", type=str, default="qwen3-rerank", help="Rerank model name")
    cli.add_argument("--rerank-dynamic-docs", action="store_true", help="Generate documents payload dynamically on every request")
    cli.add_argument("--rerank-doc-count", type=int, default=386, help="Document count per rerank request")
    cli.add_argument("--rerank-vocab-size", type=int, default=1000, help="Word pool size for synthetic document generation")
    cli.add_argument("--rerank-sentence-min-words", type=int, default=15, help="Minimum words per generated document")
    cli.add_argument("--rerank-sentence-max-words", type=int, default=40, help="Maximum words per generated document")
    cli.add_argument("--rerank-query", type=str, default="wireless mouse", help="Fixed query used for rerank dynamic docs mode")
    cli.add_argument("--rerank-seed", type=int, default=20260312, help="Base random seed for dynamic docs mode")
    cli.add_argument("--rerank-top-n", type=int, default=386, help="top_n for rerank requests; 0 means omit")
    cli.add_argument(
        "--rerank-instruct",
        type=str,
        default="Given a web search query, retrieve relevant passages that answer the query.",
        help="Instruct field for DashScope rerank",
    )
    return cli.parse_args()
  102 +
  103 +
def build_word_pool(vocab_size: int) -> List[str]:
    """Build exactly `vocab_size` synthetic two-syllable words ("alal", "alan", ...).

    Raises:
        ValueError: when the 50x50 syllable grid cannot cover `vocab_size`.
    """
    syllables = [
        "al", "an", "ar", "as", "at", "ba", "be", "bi", "bo", "ca",
        "ce", "ci", "co", "da", "de", "di", "do", "el", "en", "er",
        "fa", "fe", "fi", "fo", "ga", "ge", "gi", "go", "ha", "he",
        "hi", "ho", "ia", "ie", "il", "in", "io", "is", "ka", "ke",
        "ki", "ko", "la", "le", "li", "lo", "ma", "me", "mi", "mo",
    ]
    pool: List[str] = []
    for first in syllables:
        for second in syllables:
            pool.append(first + second)
            # Stop as soon as the pool is large enough (at most 2500 words).
            if len(pool) >= vocab_size:
                return pool
    raise ValueError(f"Unable to generate enough synthetic words: requested={vocab_size}, got={len(pool)}")
  119 +
  120 +
def build_rerank_dynamic_cfg(args: argparse.Namespace) -> Dict[str, Any]:
    """Validate dynamic-docs CLI options and materialize the generation config.

    Raises:
        ValueError: for the first invalid option, with the offending value
            embedded in the message.
    """
    wmin = int(args.rerank_sentence_min_words)
    wmax = int(args.rerank_sentence_max_words)
    n_docs = int(args.rerank_doc_count)
    n_vocab = int(args.rerank_vocab_size)
    if n_docs <= 0:
        raise ValueError(f"rerank-doc-count must be > 0, got {n_docs}")
    if n_vocab <= 0:
        raise ValueError(f"rerank-vocab-size must be > 0, got {n_vocab}")
    if wmin <= 0:
        raise ValueError(f"rerank-sentence-min-words must be > 0, got {wmin}")
    if wmax < wmin:
        raise ValueError(
            f"rerank-sentence-max-words must be >= rerank-sentence-min-words, got {wmax} < {wmin}"
        )
    if args.rerank_seed < 0:
        raise ValueError(f"rerank-seed must be >= 0, got {args.rerank_seed}")
    if int(args.rerank_top_n) < 0:
        raise ValueError(f"rerank-top-n must be >= 0, got {args.rerank_top_n}")

    # The word pool is built once here and reused for every generated request.
    return {
        "model": args.model,
        "query": args.rerank_query,
        "doc_count": n_docs,
        "min_words": wmin,
        "max_words": wmax,
        "seed": int(args.rerank_seed),
        "top_n": int(args.rerank_top_n),
        "instruct": args.rerank_instruct,
        "word_pool": build_word_pool(n_vocab),
    }
  152 +
  153 +
def build_random_rerank_payload(cfg: Dict[str, Any], rng: random.Random) -> Dict[str, Any]:
    """Build one rerank request body with freshly sampled synthetic documents.

    Document lengths are drawn uniformly from [min_words, max_words]; words are
    sampled with replacement from cfg["word_pool"]. "top_n" is included only
    when cfg["top_n"] > 0.
    """
    pool: List[str] = cfg["word_pool"]
    # randint is evaluated as the k= argument, so RNG consumption order is
    # (length, words) per document — identical to a two-statement loop.
    docs = [
        " ".join(rng.choices(pool, k=rng.randint(cfg["min_words"], cfg["max_words"])))
        for _ in range(cfg["doc_count"])
    ]
    body: Dict[str, Any] = {
        "model": cfg["model"],
        "documents": docs,
        "query": cfg["query"],
        "instruct": cfg["instruct"],
    }
    top_n = int(cfg.get("top_n", 0))
    if top_n > 0:
        body["top_n"] = top_n
    return body
  170 +
  171 +
def build_static_template(base_url: str, api_key: str, args: argparse.Namespace) -> RequestTemplate:
    """Build the fixed POST /reranks request used in static (non-dynamic) mode.

    The three demo documents and the query below are the runtime payload and
    must stay byte-identical; "top_n" is included only when it is positive.
    """
    body: Dict[str, Any] = {
        "model": args.model,
        "documents": [
            "文本排序模型广泛用于搜索引擎和推荐系统中,它们根据文本相关性对候选文本进行排序",
            "量子计算是计算科学的一个前沿领域",
            "预训练语言模型的发展给文本排序模型带来了新的进展",
        ],
        "query": "什么是文本排序模型",
        "instruct": args.rerank_instruct,
    }
    top_n = int(args.rerank_top_n)
    if top_n > 0:
        body["top_n"] = top_n

    endpoint = f"{base_url.rstrip('/')}/reranks"
    auth_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    return RequestTemplate(method="POST", url=endpoint, json_body=body, headers=auth_headers)
  195 +
  196 +
def validate_response_payload(payload: Any) -> tuple[bool, str]:
    """Check that a rerank response is a dict with a list-valued "results" key.

    Returns:
        (True, "") on success, otherwise (False, reason_code).
    """
    if not isinstance(payload, dict):
        return False, "invalid_payload_non_dict"
    try:
        results = payload["results"]
    except KeyError:
        return False, "invalid_payload_missing_results"
    if not isinstance(results, list):
        return False, "invalid_payload_results_non_list"
    return True, ""
  205 +
  206 +
async def run_single_concurrency(
    template: RequestTemplate,
    duration_sec: int,
    concurrency: int,
    max_requests: int,
    max_errors: int,
    timeout_sec: float,
    rerank_dynamic_cfg: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    """Drive `concurrency` async workers against `template` and collect stats.

    Args:
        template: Request method/URL/headers (and static JSON body) to replay.
        duration_sec: Wall-clock cap for this run; <= 0 disables the time cap.
        concurrency: Number of concurrent worker tasks sharing one client.
        max_requests: Global request cap across all workers; 0 means unlimited.
        max_errors: Stop the run once this many failures accumulate; 0 disables.
        timeout_sec: Per-request httpx timeout.
        rerank_dynamic_cfg: When set, each worker regenerates the JSON body per
            request using a per-worker RNG seeded from cfg["seed"] + worker_id.

    Returns:
        A dict with request counts, success rate, throughput (rps), latency
        percentiles in ms, a status-code histogram and an error histogram.
    """
    latencies: List[float] = []          # per-request latency in ms (failures included)
    status_counter: Dict[int, int] = {}  # HTTP status -> count (only when a response arrived)
    err_counter: Dict[str, int] = {}     # error reason -> count
    total_requests = 0
    success_requests = 0
    stop_flag = False
    # The lock serializes counter updates across the workers' await points.
    lock = asyncio.Lock()
    start = time.perf_counter()

    timeout = httpx.Timeout(timeout=timeout_sec)
    # Pool sized above the worker count so workers do not wait for connections.
    limits = httpx.Limits(max_connections=max(concurrency * 2, 20), max_keepalive_connections=max(concurrency, 10))

    async def worker(worker_id: int, client: httpx.AsyncClient) -> None:
        # One request loop per worker; shares the counters above via nonlocal.
        nonlocal total_requests, success_requests, stop_flag
        worker_rng: Optional[random.Random] = None
        if rerank_dynamic_cfg is not None:
            # Deterministic but distinct seed per worker: reproducible runs,
            # different document streams per worker.
            worker_rng = random.Random(int(rerank_dynamic_cfg["seed"]) + worker_id)

        while not stop_flag:
            elapsed = time.perf_counter() - start
            if duration_sec > 0 and elapsed >= duration_sec:
                break

            async with lock:
                # Reserve a request slot before sending so the cap is exact.
                if max_requests > 0 and total_requests >= max_requests:
                    stop_flag = True
                    break
                total_requests += 1

            payload = template.json_body
            if rerank_dynamic_cfg is not None and worker_rng is not None:
                payload = build_random_rerank_payload(rerank_dynamic_cfg, worker_rng)

            t0 = time.perf_counter()
            ok = False
            status = 0  # stays 0 when no HTTP response was received at all
            err = ""
            try:
                resp = await client.request(
                    method=template.method,
                    url=template.url,
                    headers=template.headers,
                    json=payload,
                )
                status = int(resp.status_code)
                ok = 200 <= status < 300
                if ok:
                    # A 2xx only counts as success if the body parses as JSON
                    # and contains a list-valued "results" key.
                    try:
                        body = resp.json()
                    except Exception:
                        ok = False
                        err = "invalid_json_response"
                    else:
                        valid, reason = validate_response_payload(body)
                        if not valid:
                            ok = False
                            err = reason or "invalid_payload"
                if not ok and not err:
                    err = f"http_{status}"
            except Exception as e:
                # Transport-level failure (timeout, connect error, ...):
                # record the exception class name as the reason.
                err = type(e).__name__

            # Latency covers the full request and is recorded for failures too.
            latency_ms = (time.perf_counter() - t0) * 1000.0
            async with lock:
                latencies.append(latency_ms)
                if status:
                    status_counter[status] = status_counter.get(status, 0) + 1
                if ok:
                    success_requests += 1
                else:
                    err_counter[err or "unknown"] = err_counter.get(err or "unknown", 0) + 1
                    if max_errors > 0 and sum(err_counter.values()) >= max_errors:
                        stop_flag = True

    # One shared client so all workers reuse the same connection pool.
    async with httpx.AsyncClient(timeout=timeout, limits=limits) as client:
        tasks = [asyncio.create_task(worker(i, client)) for i in range(concurrency)]
        await asyncio.gather(*tasks)

    elapsed = max(time.perf_counter() - start, 1e-9)  # guard against division by zero
    lat_sorted = sorted(latencies)
    result = {
        "scenario": "rerank_dashscope",
        "concurrency": concurrency,
        "duration_sec": round(elapsed, 3),
        "total_requests": total_requests,
        "success_requests": success_requests,
        "failed_requests": max(total_requests - success_requests, 0),
        "success_rate": round((success_requests / total_requests) * 100.0, 2) if total_requests else 0.0,
        "throughput_rps": round(total_requests / elapsed, 2),
        "latency_ms": {
            "avg": round(statistics.mean(lat_sorted), 2) if lat_sorted else 0.0,
            "p50": round(percentile(lat_sorted, 50), 2),
            "p90": round(percentile(lat_sorted, 90), 2),
            "p95": round(percentile(lat_sorted, 95), 2),
            "p99": round(percentile(lat_sorted, 99), 2),
            "max": round(max(lat_sorted), 2) if lat_sorted else 0.0,
        },
        "status_codes": dict(sorted(status_counter.items(), key=lambda x: x[0])),
        "errors": dict(sorted(err_counter.items(), key=lambda x: x[0])),
    }
    return result
  317 +
  318 +
def format_summary(result: Dict[str, Any]) -> str:
    """Render one run's result dict as a human-readable multi-line summary.

    The header starts with a newline to visually separate consecutive runs on
    stdout. An "errors:" line is appended only when failures occurred.
    """
    lat = result["latency_ms"]
    lines = [
        # FIX: newlines must be \n escapes inside the literals — the source
        # had raw line breaks inside the strings, which is a SyntaxError.
        f"\n=== Scenario: {result['scenario']} @ concurrency={result['concurrency']} ===",
        "requests={total_requests} success={success_requests} fail={failed_requests} success_rate={success_rate}% rps={throughput_rps}".format(**result),
        f"latency(ms): avg={lat['avg']} p50={lat['p50']} p90={lat['p90']} p95={lat['p95']} p99={lat['p99']} max={lat['max']}",
        f"status_codes: {result['status_codes']}",
    ]
    if result["errors"]:
        lines.append(f"errors: {result['errors']}")
    return "\n".join(lines)
  332 +
  333 +
def aggregate_results(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Combine per-concurrency run results into one overall summary.

    Returns {} when there are no results. RPS is computed over the summed
    wall-clock durations (runs execute sequentially); average latency is
    weighted by each run's request count.
    """
    if not results:
        return {}
    requests_total = sum(r["total_requests"] for r in results)
    requests_ok = sum(r["success_requests"] for r in results)
    requests_failed = sum(r["failed_requests"] for r in results)
    duration_total = sum(r["duration_sec"] for r in results)
    if requests_total > 0:
        avg_latency = sum(r["latency_ms"]["avg"] * r["total_requests"] for r in results) / requests_total
        ok_rate = round((requests_ok / requests_total) * 100.0, 2)
    else:
        avg_latency = 0.0
        ok_rate = 0.0

    return {
        "scenario": "ALL",
        "total_requests": requests_total,
        "success_requests": requests_ok,
        "failed_requests": requests_failed,
        "success_rate": ok_rate,
        "aggregate_rps": round(requests_total / max(duration_total, 1e-9), 2),
        "weighted_avg_latency_ms": round(avg_latency, 2),
    }
  354 +
  355 +
async def main_async() -> int:
    """CLI entry point: validate config, run each concurrency level, report.

    Returns:
        Process exit code: 0 on success, 2 on any configuration error
        (missing API key, bad concurrency list, bad dynamic-docs options).
    """
    import os  # local import: only needed for the API-key env fallback

    args = parse_args()
    api_key = (args.api_key or os.getenv("DASHSCOPE_API_KEY") or "").strip()
    if not api_key:
        print("Missing API key. Set --api-key or DASHSCOPE_API_KEY.")
        return 2

    try:
        concurrency_values = parse_csv_ints(args.concurrency_list) if args.concurrency_list else [args.concurrency]
    except ValueError as exc:
        print(str(exc))
        return 2
    if not concurrency_values:
        print("concurrency-list is empty after parsing.")
        return 2

    try:
        rerank_dynamic_cfg = build_rerank_dynamic_cfg(args) if args.rerank_dynamic_docs else None
    except ValueError as exc:
        print(str(exc))
        return 2

    # The static template is always built: in dynamic mode it still supplies
    # the method/URL/headers while the JSON body is regenerated per request.
    template = build_static_template(args.base_url, api_key, args)

    print("Load test config:")
    print(" scenario=rerank_dashscope")
    print(f" duration={args.duration}s")
    print(f" concurrency={args.concurrency}")
    print(f" concurrency_list={concurrency_values}")
    print(f" max_requests={args.max_requests}")
    print(f" timeout={args.timeout}s")
    print(f" max_errors={args.max_errors}")
    print(f" base_url={args.base_url}")
    print(f" model={args.model}")
    print(f" rerank_dynamic_docs={args.rerank_dynamic_docs}")
    if args.rerank_dynamic_docs:
        print(f" rerank_doc_count={args.rerank_doc_count}")
        print(f" rerank_vocab_size={args.rerank_vocab_size}")
        print(f" rerank_sentence_words=[{args.rerank_sentence_min_words},{args.rerank_sentence_max_words}]")
        print(f" rerank_query={args.rerank_query}")
        print(f" rerank_seed={args.rerank_seed}")
        print(f" rerank_top_n={args.rerank_top_n}")
        print(f" rerank_instruct={args.rerank_instruct}")
    else:
        print(" static_request_payload=demo_payload")

    results: List[Dict[str, Any]] = []
    total_jobs = len(concurrency_values)
    for idx, c in enumerate(concurrency_values, start=1):
        # FIX: the newline must be a \n escape inside the f-string — the
        # source had a raw line break inside the literal (SyntaxError).
        print(f"\n[{idx}/{total_jobs}] running rerank_dashscope @ concurrency={c} ...")
        result = await run_single_concurrency(
            template=template,
            duration_sec=args.duration,
            concurrency=c,
            max_requests=args.max_requests,
            max_errors=args.max_errors,
            timeout_sec=args.timeout,
            rerank_dynamic_cfg=rerank_dynamic_cfg,
        )
        print(format_summary(result))
        results.append(result)
        # Optional cool-down between runs (skipped after the last one).
        if args.pause > 0 and idx < total_jobs:
            await asyncio.sleep(args.pause)

    final = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        "config": {
            "scenario": "rerank_dashscope",
            "duration_sec": args.duration,
            "concurrency": args.concurrency,
            "concurrency_list": concurrency_values,
            "max_requests": args.max_requests,
            "timeout_sec": args.timeout,
            "max_errors": args.max_errors,
            "base_url": args.base_url,
            "model": args.model,
            "output": args.output or None,
            "rerank_dynamic_docs": args.rerank_dynamic_docs,
            "rerank_doc_count": args.rerank_doc_count,
            "rerank_vocab_size": args.rerank_vocab_size,
            "rerank_sentence_min_words": args.rerank_sentence_min_words,
            "rerank_sentence_max_words": args.rerank_sentence_max_words,
            "rerank_query": args.rerank_query,
            "rerank_seed": args.rerank_seed,
            "rerank_top_n": args.rerank_top_n,
            "rerank_instruct": args.rerank_instruct,
        },
        "results": results,
        "overall": aggregate_results(results),
    }

    # FIX: same raw-line-break-in-literal issue as above.
    print("\n=== Overall ===")
    print(json.dumps(final["overall"], ensure_ascii=False, indent=2))

    if args.output:
        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(json.dumps(final, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"Saved JSON report: {out_path}")

    return 0
  461 +
  462 +
def main() -> int:
    """Synchronous wrapper: run the async entry point under asyncio.

    Maps Ctrl-C to exit code 130 (128 + SIGINT), the conventional shell
    status for an interrupted process.
    """
    try:
        exit_code = asyncio.run(main_async())
    except KeyboardInterrupt:
        print("Interrupted by user")
        return 130
    return exit_code
  469 +
  470 +
if __name__ == "__main__":
    # Propagate main()'s int return value as the process exit status.
    raise SystemExit(main())
reranker/rerank-cloud-perf-study/rerank_dashscope_perf_usage.md 0 → 100644
@@ -0,0 +1,269 @@ @@ -0,0 +1,269 @@
本文档是 `rerank_dashscope_perf.py` 的使用说明,可直接放入 README 或脚本注释中。
  2 +
  3 +---
  4 +
  5 +# rerank_dashscope_perf.py 使用说明
  6 +
  7 +该脚本用于对 **DashScope `qwen3-rerank` 接口**进行并发性能测试,
  8 +测试接口:
  9 +
  10 +```
  11 +POST https://dashscope.aliyuncs.com/compatible-api/v1/reranks
  12 +```
  13 +
  14 +脚本支持:
  15 +
  16 +* 并发压测
  17 +* 固定或动态生成 documents
  18 +* 自定义 doc 数量(例如 386)
  19 +* 输出详细 latency / RPS 统计
  20 +* 多并发梯度测试(如 1,5,10,20)
  21 +
  22 +---
  23 +
  24 +# 一、环境准备
  25 +
  26 +### 1 安装依赖
  27 +
  28 +脚本依赖 `httpx`:
  29 +
  30 +```bash
  31 +pip install httpx
  32 +```
  33 +
  34 +如果使用虚拟环境:
  35 +
  36 +```bash
  37 +.venv/bin/pip install httpx
  38 +```
  39 +
  40 +---
  41 +
  42 +### 2 设置 DashScope API Key
  43 +
  44 +```bash
  45 +export DASHSCOPE_API_KEY=你的key
  46 +```
  47 +
  48 +也可以通过参数指定:
  49 +
  50 +```bash
  51 +--api-key xxx
  52 +```
  53 +
  54 +---
  55 +
  56 +# 二、基本用法
  57 +
  58 +最常见的压测方式:
  59 +
  60 +```bash
  61 +python rerank_dashscope_perf.py \
  62 + --duration 20 \
  63 + --concurrency-list 1,5,10,20 \
  64 + --timeout 90 \
  65 + --rerank-dynamic-docs \
  66 + --rerank-doc-count 386 \
  67 + --rerank-vocab-size 1000 \
  68 + --rerank-sentence-min-words 15 \
  69 + --rerank-sentence-max-words 40 \
  70 + --rerank-query "wireless mouse" \
  71 + --rerank-seed 20260312 \
  72 + --rerank-top-n 386 \
  73 + --output perf_result.json
  74 +```
  75 +
  76 +含义:
  77 +
  78 +| 参数 | 说明 |
  79 +| ------------------- | ------------- |
  80 +| duration | 每个并发测试持续时间(秒) |
  81 +| concurrency-list | 并发列表 |
  82 +| timeout | 单请求超时时间 |
  83 +| rerank-dynamic-docs | 启用动态 doc 生成 |
  84 +| rerank-doc-count | 每个请求 doc 数量 |
  85 +| rerank-top-n | 返回 top_n |
  86 +| output | 保存结果 JSON |
  87 +
  88 +---
  89 +
  90 +# 三、测试模式
  91 +
  92 +脚本有两种请求模式:
  93 +
  94 +---
  95 +
  96 +# 1 静态请求模式(默认)
  97 +
  98 +如果**不使用 `--rerank-dynamic-docs`**,请求 payload 固定为:
  99 +
  100 +```json
  101 +{
  102 + "model": "qwen3-rerank",
  103 + "documents": [
  104 + "文本排序模型广泛用于搜索引擎和推荐系统中,它们根据文本相关性对候选文本进行排序",
  105 + "量子计算是计算科学的一个前沿领域",
  106 + "预训练语言模型的发展给文本排序模型带来了新的进展"
  107 + ],
  108 + "query": "什么是文本排序模型",
  "instruct": "Given a web search query, retrieve relevant passages that answer the query.",
  "top_n": 386
  110 +}
  111 +```
  112 +
  113 +适合:
  114 +
  115 +* 验证接口
  116 +* 小规模测试
  117 +
  118 +---
  119 +
  120 +# 2 动态 documents 模式(推荐)
  121 +
  122 +启用参数:
  123 +
  124 +```
  125 +--rerank-dynamic-docs
  126 +```
  127 +
  128 +脚本会:
  129 +
  130 +* 每个请求生成 **N 条 documents**
  131 +* 每条 doc 是 **随机词拼接句子**
  132 +* 每个请求 **内容不同**
  133 +
  134 +示例 doc:
  135 +
  136 +```
  137 +alce bafi kolo dede hobe anma cigi lofi asbe erko kaci molo fadi helo
  138 +mace biro aldi kolo gace hoin doka lale cebo fafa ineri kasi hobe lomo
  139 +gifi beme koha laci anfi celi dore ioce kobo hila mefi arce enbo hega
  140 +```
  141 +
  142 +优点:
  143 +
  144 +* 不依赖真实语料
  145 +* 更接近真实 token 分布
  146 +* 压测稳定
  147 +
  148 +---
  149 +
  150 +# 四、386 documents 压测示例
  151 +
386 documents 场景的典型压测命令:
  153 +
  154 +```bash
  155 +python rerank_dashscope_perf.py \
  156 + --duration 20 \
  157 + --concurrency-list 1,5,10,20 \
  158 + --timeout 90 \
  159 + --rerank-dynamic-docs \
  160 + --rerank-doc-count 386 \
  161 + --rerank-vocab-size 1000 \
  162 + --rerank-sentence-min-words 15 \
  163 + --rerank-sentence-max-words 40 \
  164 + --rerank-query "wireless mouse" \
  165 + --rerank-seed 20260312 \
  166 + --rerank-top-n 386
  167 +```
  168 +
  169 +每个请求:
  170 +
  171 +```
  172 +query: wireless mouse
  173 +documents: 386条
  174 +每条doc长度: 15~40词
  175 +```
  176 +
  177 +---
  178 +
  179 +# 五、输出结果示例
  180 +
  181 +终端输出:
  182 +
  183 +```
  184 +[1/4] running rerank_dashscope @ concurrency=1 ...
  185 +
  186 +=== Scenario: rerank_dashscope @ concurrency=1 ===
  187 +requests=84 success=84 fail=0 success_rate=100.0% rps=4.2
  188 +latency(ms): avg=230 p50=220 p90=260 p95=280 p99=310 max=340
  189 +status_codes: {200: 84}
  190 +```
  191 +
  192 +字段说明:
  193 +
  194 +| 指标 | 说明 |
  195 +| --------------- | ---- |
  196 +| requests | 总请求数 |
  197 +| success | 成功请求 |
  198 +| fail | 失败请求 |
  199 +| success_rate | 成功率 |
  200 +| rps | 吞吐量 |
  201 +| p50/p90/p95/p99 | 延迟分位 |
  202 +| max | 最大延迟 |
  203 +
  204 +---
  205 +
  206 +# 六、JSON 报告
  207 +
  208 +如果指定:
  209 +
  210 +```
  211 +--output perf_result.json
  212 +```
  213 +
  214 +会生成报告:
  215 +
  216 +```json
  217 +{
  218 + "results": [
  219 + {
  220 + "concurrency": 1,
  221 + "throughput_rps": 4.2,
  222 + "latency_ms": {
  223 + "avg": 230,
  224 + "p95": 280
  225 + }
  226 + }
  227 + ]
  228 +}
  229 +```
  230 +
  231 +适合:
  232 +
  233 +* 性能对比
  234 +* 画图
  235 +* 压测记录
  236 +
  237 +---
  238 +
  239 +# 七、常见参数
  240 +
  241 +| 参数 | 默认值 | 说明 |
  242 +| --------------------------- | --------- | ------- |
  243 +| --duration | 20 | 单并发测试时间 |
  244 +| --concurrency-list | 1,5,10,20 | 并发梯度 |
  245 +| --timeout | 90 | 请求超时 |
  246 +| --rerank-doc-count | 386 | doc数量 |
  247 +| --rerank-vocab-size | 1000 | 词表大小 |
  248 +| --rerank-sentence-min-words | 15 | doc最小长度 |
  249 +| --rerank-sentence-max-words | 40 | doc最大长度 |
  250 +| --rerank-top-n | 386 | 返回top_n |
  251 +
  252 +---
  253 +
  254 +# 八、推荐压测方式
  255 +
  256 +推荐测试:
  257 +
  258 +```
  259 +docs = 386
  260 +query = wireless mouse
  261 +concurrency = 1,5,10,20
  262 +duration = 20~60s
  263 +```
  264 +
  265 +即可得到:
  266 +
  267 +* latency 曲线
  268 +* RPS
  269 +* 并发极限