Commit 149dad2b269c659869f90d45c48c8c619b27d72e
1 parent
0d3e73ba
add rerank-cloud-perf-study
Showing
2 changed files
with
741 additions
and
0 deletions
Show diff stats
reranker/rerank-cloud-perf-study/rerank_dashscope_perf.py
0 → 100644
| ... | ... | @@ -0,0 +1,472 @@ |
| 1 | +#!/usr/bin/env python3 | |
| 2 | +from __future__ import annotations | |
| 3 | + | |
| 4 | +import argparse | |
| 5 | +import asyncio | |
| 6 | +import json | |
| 7 | +import math | |
| 8 | +import random | |
| 9 | +import statistics | |
| 10 | +import time | |
| 11 | +from dataclasses import dataclass | |
| 12 | +from pathlib import Path | |
| 13 | +from typing import Any, Dict, List, Optional | |
| 14 | + | |
| 15 | +import httpx | |
| 16 | + | |
| 17 | + | |
@dataclass
class RequestTemplate:
    """A reusable HTTP request spec replayed by every load-test worker."""

    method: str  # HTTP verb, e.g. "POST"
    url: str  # fully-qualified endpoint URL
    json_body: Optional[Dict[str, Any]] = None  # static JSON payload; overridden per-request in dynamic-docs mode
    headers: Optional[Dict[str, str]] = None  # e.g. Authorization / Content-Type
| 24 | + | |
| 25 | + | |
def percentile(sorted_values: List[float], p: float) -> float:
    """Return the p-th percentile of a pre-sorted list via linear interpolation.

    An empty list yields 0.0; p is clamped to the [0, 100] range.
    """
    if not sorted_values:
        return 0.0
    if p <= 0:
        return sorted_values[0]
    if p >= 100:
        return sorted_values[-1]
    # Fractional 0-based rank into the sorted list.
    rank = (len(sorted_values) - 1) * (p / 100.0)
    lower = math.floor(rank)
    upper = math.ceil(rank)
    if lower == upper:
        return sorted_values[lower]
    # Interpolate between the two neighbouring samples.
    frac = rank - lower
    return sorted_values[lower] * (1.0 - frac) + sorted_values[upper] * frac
| 40 | + | |
| 41 | + | |
def parse_csv_items(raw: str) -> List[str]:
    """Split a comma-separated string into trimmed, non-empty items."""
    items: List[str] = []
    for piece in str(raw or "").split(","):
        piece = piece.strip()
        if piece:
            items.append(piece)
    return items
| 44 | + | |
| 45 | + | |
def parse_csv_ints(raw: str) -> List[int]:
    """Parse a CSV string into unique positive ints, preserving first-seen order.

    Raises:
        ValueError: for non-integer items or values <= 0.
    """
    tokens = [t.strip() for t in str(raw or "").split(",") if t.strip()]
    ordered: List[int] = []
    seen: set = set()
    for token in tokens:
        try:
            number = int(token)
        except ValueError as exc:
            raise ValueError(f"Invalid integer in CSV list: {token}") from exc
        if number <= 0:
            raise ValueError(f"Concurrency must be > 0, got {number}")
        if number not in seen:
            seen.add(number)
            ordered.append(number)
    return ordered
| 61 | + | |
| 62 | + | |
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the DashScope rerank load test."""
    parser = argparse.ArgumentParser(description="DashScope /compatible-api/v1/reranks perf test")
    # Load-shape options: how long, how many workers, and when to stop early.
    parser.add_argument("--duration", type=int, default=20, help="Duration seconds per concurrency; <=0 means no duration cap")
    parser.add_argument("--concurrency", type=int, default=1, help="Default concurrency if --concurrency-list is not set")
    parser.add_argument(
        "--concurrency-list",
        type=str,
        default="1,5,10,20",
        help="Comma-separated concurrency list (e.g. 1,5,10,20). If set, overrides --concurrency.",
    )
    parser.add_argument("--max-requests", type=int, default=0, help="Stop after N requests per concurrency (0 means unlimited)")
    parser.add_argument("--timeout", type=float, default=90.0, help="Request timeout seconds")
    parser.add_argument("--max-errors", type=int, default=0, help="Stop current run when accumulated errors reach this value")
    # Endpoint / auth / reporting options.
    parser.add_argument(
        "--base-url",
        type=str,
        default="https://dashscope.aliyuncs.com/compatible-api/v1",
        help="Base URL for DashScope compatible API",
    )
    parser.add_argument("--api-key", type=str, default="", help="DashScope API key; if omitted, read from DASHSCOPE_API_KEY env")
    parser.add_argument("--output", type=str, default="", help="Optional output JSON path")
    parser.add_argument("--pause", type=float, default=0.0, help="Pause seconds between concurrency runs")

    # Rerank payload options (shared by static and dynamic-docs modes).
    parser.add_argument("--model", type=str, default="qwen3-rerank", help="Rerank model name")
    parser.add_argument("--rerank-dynamic-docs", action="store_true", help="Generate documents payload dynamically on every request")
    parser.add_argument("--rerank-doc-count", type=int, default=386, help="Document count per rerank request")
    parser.add_argument("--rerank-vocab-size", type=int, default=1000, help="Word pool size for synthetic document generation")
    parser.add_argument("--rerank-sentence-min-words", type=int, default=15, help="Minimum words per generated document")
    parser.add_argument("--rerank-sentence-max-words", type=int, default=40, help="Maximum words per generated document")
    parser.add_argument("--rerank-query", type=str, default="wireless mouse", help="Fixed query used for rerank dynamic docs mode")
    parser.add_argument("--rerank-seed", type=int, default=20260312, help="Base random seed for dynamic docs mode")
    parser.add_argument("--rerank-top-n", type=int, default=386, help="top_n for rerank requests; 0 means omit")
    parser.add_argument(
        "--rerank-instruct",
        type=str,
        default="Given a web search query, retrieve relevant passages that answer the query.",
        help="Instruct field for DashScope rerank",
    )
    return parser.parse_args()
| 102 | + | |
| 103 | + | |
def build_word_pool(vocab_size: int) -> List[str]:
    """Return ``vocab_size`` distinct synthetic words built from syllable n-grams.

    Words are generated in a deterministic order: all two-syllable
    combinations first, then three-syllable combinations. The two-syllable
    prefix is identical to the previous implementation, so existing vocab
    sizes (<= 2500) produce exactly the same pool; larger sizes, which used
    to raise, are now supported up to len(syllables)**3 extra words.

    Raises:
        ValueError: if the requested size exceeds what the syllable set can produce.
    """
    syllables = [
        "al", "an", "ar", "as", "at", "ba", "be", "bi", "bo", "ca",
        "ce", "ci", "co", "da", "de", "di", "do", "el", "en", "er",
        "fa", "fe", "fi", "fo", "ga", "ge", "gi", "go", "ha", "he",
        "hi", "ho", "ia", "ie", "il", "in", "io", "is", "ka", "ke",
        "ki", "ko", "la", "le", "li", "lo", "ma", "me", "mi", "mo",
    ]
    word_pool: List[str] = []
    for a in syllables:
        for b in syllables:
            word_pool.append(f"{a}{b}")
            if len(word_pool) >= vocab_size:
                return word_pool
    # All syllables have length 2, so 6-char triples can never collide with
    # the 4-char pairs above, and every triple is unique.
    for a in syllables:
        for b in syllables:
            for c in syllables:
                word_pool.append(f"{a}{b}{c}")
                if len(word_pool) >= vocab_size:
                    return word_pool
    raise ValueError(f"Unable to generate enough synthetic words: requested={vocab_size}, got={len(word_pool)}")
| 119 | + | |
| 120 | + | |
def build_rerank_dynamic_cfg(args: argparse.Namespace) -> Dict[str, Any]:
    """Validate the dynamic-docs CLI options and assemble the generator config.

    Raises:
        ValueError: when any of the numeric options is out of range.
    """
    doc_count = int(args.rerank_doc_count)
    vocab_size = int(args.rerank_vocab_size)
    min_words = int(args.rerank_sentence_min_words)
    max_words = int(args.rerank_sentence_max_words)

    # Validation order is kept stable so the same error surfaces first.
    if doc_count <= 0:
        raise ValueError(f"rerank-doc-count must be > 0, got {doc_count}")
    if vocab_size <= 0:
        raise ValueError(f"rerank-vocab-size must be > 0, got {vocab_size}")
    if min_words <= 0:
        raise ValueError(f"rerank-sentence-min-words must be > 0, got {min_words}")
    if max_words < min_words:
        raise ValueError(
            f"rerank-sentence-max-words must be >= rerank-sentence-min-words, got {max_words} < {min_words}"
        )
    if args.rerank_seed < 0:
        raise ValueError(f"rerank-seed must be >= 0, got {args.rerank_seed}")
    if int(args.rerank_top_n) < 0:
        raise ValueError(f"rerank-top-n must be >= 0, got {args.rerank_top_n}")

    cfg: Dict[str, Any] = {
        "model": args.model,
        "query": args.rerank_query,
        "doc_count": doc_count,
        "min_words": min_words,
        "max_words": max_words,
        "seed": int(args.rerank_seed),
        "top_n": int(args.rerank_top_n),
        "instruct": args.rerank_instruct,
        "word_pool": build_word_pool(vocab_size),
    }
    return cfg
| 152 | + | |
| 153 | + | |
def build_random_rerank_payload(cfg: Dict[str, Any], rng: random.Random) -> Dict[str, Any]:
    """Generate one rerank request body with freshly randomized documents.

    Each document is `min_words`..`max_words` words sampled (with
    replacement) from the configured word pool; `top_n` is attached only
    when positive.
    """
    pool: List[str] = cfg["word_pool"]
    min_words, max_words = cfg["min_words"], cfg["max_words"]
    # One randint + one choices call per document, matching the rng call order.
    documents = [
        " ".join(rng.choices(pool, k=rng.randint(min_words, max_words)))
        for _ in range(cfg["doc_count"])
    ]

    body: Dict[str, Any] = {
        "model": cfg["model"],
        "documents": documents,
        "query": cfg["query"],
        "instruct": cfg["instruct"],
    }
    top_n = int(cfg.get("top_n", 0))
    if top_n > 0:
        body["top_n"] = top_n
    return body
| 170 | + | |
| 171 | + | |
def build_static_template(base_url: str, api_key: str, args: argparse.Namespace) -> RequestTemplate:
    """Build the fixed demo rerank request used when dynamic docs are disabled."""
    body: Dict[str, Any] = {
        "model": args.model,
        "documents": [
            "文本排序模型广泛用于搜索引擎和推荐系统中,它们根据文本相关性对候选文本进行排序",
            "量子计算是计算科学的一个前沿领域",
            "预训练语言模型的发展给文本排序模型带来了新的进展",
        ],
        "query": "什么是文本排序模型",
        "instruct": args.rerank_instruct,
    }
    # top_n is optional on the wire; 0 means "let the service decide".
    top_n = int(args.rerank_top_n)
    if top_n > 0:
        body["top_n"] = top_n

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    endpoint = f"{base_url.rstrip('/')}/reranks"
    return RequestTemplate(method="POST", url=endpoint, json_body=body, headers=headers)
| 195 | + | |
| 196 | + | |
def validate_response_payload(payload: Any) -> tuple[bool, str]:
    """Check that a rerank response body is a dict with a list under "results".

    Returns (True, "") on success, otherwise (False, reason_code).
    """
    if not isinstance(payload, dict):
        return False, "invalid_payload_non_dict"
    try:
        results = payload["results"]
    except KeyError:
        return False, "invalid_payload_missing_results"
    if not isinstance(results, list):
        return False, "invalid_payload_results_non_list"
    return True, ""
| 205 | + | |
| 206 | + | |
async def run_single_concurrency(
    template: RequestTemplate,
    duration_sec: int,
    concurrency: int,
    max_requests: int,
    max_errors: int,
    timeout_sec: float,
    rerank_dynamic_cfg: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    """Run one load-test pass at a fixed concurrency and return its stats.

    Spawns `concurrency` async workers sharing a single httpx client; each
    worker loops until the duration elapses, `max_requests` is reached, or
    the error budget (`max_errors`) is exhausted. Shared counters are
    guarded by an asyncio.Lock. Returns a result dict with request counts,
    throughput, latency percentiles, status-code and error histograms.
    """
    latencies: List[float] = []
    status_counter: Dict[int, int] = {}
    err_counter: Dict[str, int] = {}
    total_requests = 0
    success_requests = 0
    stop_flag = False
    lock = asyncio.Lock()
    start = time.perf_counter()

    timeout = httpx.Timeout(timeout=timeout_sec)
    # Connection pool sized generously relative to worker count.
    limits = httpx.Limits(max_connections=max(concurrency * 2, 20), max_keepalive_connections=max(concurrency, 10))

    async def worker(worker_id: int, client: httpx.AsyncClient) -> None:
        nonlocal total_requests, success_requests, stop_flag
        # Per-worker RNG (base seed + worker_id) keeps dynamic payloads
        # deterministic yet distinct across workers.
        worker_rng: Optional[random.Random] = None
        if rerank_dynamic_cfg is not None:
            worker_rng = random.Random(int(rerank_dynamic_cfg["seed"]) + worker_id)

        while not stop_flag:
            elapsed = time.perf_counter() - start
            if duration_sec > 0 and elapsed >= duration_sec:
                break

            # Reserve a request slot under the lock so max_requests is exact.
            async with lock:
                if max_requests > 0 and total_requests >= max_requests:
                    stop_flag = True
                    break
                total_requests += 1

            payload = template.json_body
            if rerank_dynamic_cfg is not None and worker_rng is not None:
                payload = build_random_rerank_payload(rerank_dynamic_cfg, worker_rng)

            t0 = time.perf_counter()
            ok = False
            status = 0
            err = ""
            try:
                resp = await client.request(
                    method=template.method,
                    url=template.url,
                    headers=template.headers,
                    json=payload,
                )
                status = int(resp.status_code)
                ok = 200 <= status < 300
                if ok:
                    # A 2xx still fails validation if the body is not JSON
                    # or lacks a list-valued "results" field.
                    try:
                        body = resp.json()
                    except Exception:
                        ok = False
                        err = "invalid_json_response"
                    else:
                        valid, reason = validate_response_payload(body)
                        if not valid:
                            ok = False
                            err = reason or "invalid_payload"
                if not ok and not err:
                    err = f"http_{status}"
            except Exception as e:
                # Transport-level failure (timeout, connect error, ...);
                # status stays 0 so it is excluded from the status histogram.
                err = type(e).__name__

            latency_ms = (time.perf_counter() - t0) * 1000.0
            async with lock:
                latencies.append(latency_ms)
                if status:
                    status_counter[status] = status_counter.get(status, 0) + 1
                if ok:
                    success_requests += 1
                else:
                    err_counter[err or "unknown"] = err_counter.get(err or "unknown", 0) + 1
                if max_errors > 0 and sum(err_counter.values()) >= max_errors:
                    stop_flag = True

    async with httpx.AsyncClient(timeout=timeout, limits=limits) as client:
        tasks = [asyncio.create_task(worker(i, client)) for i in range(concurrency)]
        await asyncio.gather(*tasks)

    # Guard against division by zero when the run ends almost instantly.
    elapsed = max(time.perf_counter() - start, 1e-9)
    lat_sorted = sorted(latencies)
    result = {
        "scenario": "rerank_dashscope",
        "concurrency": concurrency,
        "duration_sec": round(elapsed, 3),
        "total_requests": total_requests,
        "success_requests": success_requests,
        "failed_requests": max(total_requests - success_requests, 0),
        "success_rate": round((success_requests / total_requests) * 100.0, 2) if total_requests else 0.0,
        "throughput_rps": round(total_requests / elapsed, 2),
        "latency_ms": {
            "avg": round(statistics.mean(lat_sorted), 2) if lat_sorted else 0.0,
            "p50": round(percentile(lat_sorted, 50), 2),
            "p90": round(percentile(lat_sorted, 90), 2),
            "p95": round(percentile(lat_sorted, 95), 2),
            "p99": round(percentile(lat_sorted, 99), 2),
            "max": round(max(lat_sorted), 2) if lat_sorted else 0.0,
        },
        "status_codes": dict(sorted(status_counter.items(), key=lambda x: x[0])),
        "errors": dict(sorted(err_counter.items(), key=lambda x: x[0])),
    }
    return result
| 317 | + | |
| 318 | + | |
def format_summary(result: Dict[str, Any]) -> str:
    """Render one run result as a short multi-line console summary.

    FIX: the leading banner f-string and the final join string were broken
    across raw line breaks in the source (a SyntaxError as written);
    both now use explicit "\\n" escapes.
    """
    lat = result["latency_ms"]
    lines = [
        f"\n=== Scenario: {result['scenario']} @ concurrency={result['concurrency']} ===",
        "requests={total_requests} success={success_requests} fail={failed_requests} success_rate={success_rate}% rps={throughput_rps}".format(**result),
        f"latency(ms): avg={lat['avg']} p50={lat['p50']} p90={lat['p90']} p95={lat['p95']} p99={lat['p99']} max={lat['max']}",
        f"status_codes: {result['status_codes']}",
    ]
    if result["errors"]:
        lines.append(f"errors: {result['errors']}")
    return "\n".join(lines)
| 332 | + | |
| 333 | + | |
def aggregate_results(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Fold per-concurrency run results into one overall summary dict.

    Returns {} for an empty input. The average latency is weighted by each
    run's request count; RPS divides total requests by summed durations.
    """
    if not results:
        return {}

    totals = {
        key: sum(run[key] for run in results)
        for key in ("total_requests", "success_requests", "failed_requests")
    }
    duration_sum = sum(run["duration_sec"] for run in results)
    n_total = totals["total_requests"]

    if n_total > 0:
        weighted_avg = sum(run["latency_ms"]["avg"] * run["total_requests"] for run in results) / n_total
        success_pct = round((totals["success_requests"] / n_total) * 100.0, 2)
    else:
        weighted_avg = 0.0
        success_pct = 0.0

    return {
        "scenario": "ALL",
        "total_requests": n_total,
        "success_requests": totals["success_requests"],
        "failed_requests": totals["failed_requests"],
        "success_rate": success_pct,
        "aggregate_rps": round(n_total / max(duration_sum, 1e-9), 2),
        "weighted_avg_latency_ms": round(weighted_avg, 2),
    }
| 354 | + | |
| 355 | + | |
async def main_async() -> int:
    """Drive the whole load test: parse args, run each concurrency level, report.

    Returns a process exit code: 0 on success, 2 on configuration errors.

    FIX: two print strings (the per-run progress banner and the overall
    header) contained a raw line break inside the literal (a SyntaxError as
    written); both now use explicit "\\n" escapes.
    """
    import os

    args = parse_args()
    # CLI flag wins over the environment variable.
    api_key = (args.api_key or os.getenv("DASHSCOPE_API_KEY") or "").strip()
    if not api_key:
        print("Missing API key. Set --api-key or DASHSCOPE_API_KEY.")
        return 2

    try:
        concurrency_values = parse_csv_ints(args.concurrency_list) if args.concurrency_list else [args.concurrency]
    except ValueError as exc:
        print(str(exc))
        return 2
    if not concurrency_values:
        print("concurrency-list is empty after parsing.")
        return 2

    try:
        rerank_dynamic_cfg = build_rerank_dynamic_cfg(args) if args.rerank_dynamic_docs else None
    except ValueError as exc:
        print(str(exc))
        return 2

    # Static template is always built; dynamic mode overrides only the body.
    template = build_static_template(args.base_url, api_key, args)

    print("Load test config:")
    print("  scenario=rerank_dashscope")
    print(f"  duration={args.duration}s")
    print(f"  concurrency={args.concurrency}")
    print(f"  concurrency_list={concurrency_values}")
    print(f"  max_requests={args.max_requests}")
    print(f"  timeout={args.timeout}s")
    print(f"  max_errors={args.max_errors}")
    print(f"  base_url={args.base_url}")
    print(f"  model={args.model}")
    print(f"  rerank_dynamic_docs={args.rerank_dynamic_docs}")
    if args.rerank_dynamic_docs:
        print(f"  rerank_doc_count={args.rerank_doc_count}")
        print(f"  rerank_vocab_size={args.rerank_vocab_size}")
        print(f"  rerank_sentence_words=[{args.rerank_sentence_min_words},{args.rerank_sentence_max_words}]")
        print(f"  rerank_query={args.rerank_query}")
        print(f"  rerank_seed={args.rerank_seed}")
        print(f"  rerank_top_n={args.rerank_top_n}")
        print(f"  rerank_instruct={args.rerank_instruct}")
    else:
        print("  static_request_payload=demo_payload")

    results: List[Dict[str, Any]] = []
    total_jobs = len(concurrency_values)
    for idx, c in enumerate(concurrency_values, start=1):
        print(f"\n[{idx}/{total_jobs}] running rerank_dashscope @ concurrency={c} ...")
        result = await run_single_concurrency(
            template=template,
            duration_sec=args.duration,
            concurrency=c,
            max_requests=args.max_requests,
            max_errors=args.max_errors,
            timeout_sec=args.timeout,
            rerank_dynamic_cfg=rerank_dynamic_cfg,
        )
        print(format_summary(result))
        results.append(result)
        # Optional cool-down between concurrency levels.
        if args.pause > 0 and idx < total_jobs:
            await asyncio.sleep(args.pause)

    final = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        "config": {
            "scenario": "rerank_dashscope",
            "duration_sec": args.duration,
            "concurrency": args.concurrency,
            "concurrency_list": concurrency_values,
            "max_requests": args.max_requests,
            "timeout_sec": args.timeout,
            "max_errors": args.max_errors,
            "base_url": args.base_url,
            "model": args.model,
            "output": args.output or None,
            "rerank_dynamic_docs": args.rerank_dynamic_docs,
            "rerank_doc_count": args.rerank_doc_count,
            "rerank_vocab_size": args.rerank_vocab_size,
            "rerank_sentence_min_words": args.rerank_sentence_min_words,
            "rerank_sentence_max_words": args.rerank_sentence_max_words,
            "rerank_query": args.rerank_query,
            "rerank_seed": args.rerank_seed,
            "rerank_top_n": args.rerank_top_n,
            "rerank_instruct": args.rerank_instruct,
        },
        "results": results,
        "overall": aggregate_results(results),
    }

    print("\n=== Overall ===")
    print(json.dumps(final["overall"], ensure_ascii=False, indent=2))

    if args.output:
        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(json.dumps(final, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"Saved JSON report: {out_path}")

    return 0
| 461 | + | |
| 462 | + | |
def main() -> int:
    """Console entry point: run the async driver; Ctrl-C maps to exit code 130."""
    try:
        exit_code = asyncio.run(main_async())
    except KeyboardInterrupt:
        print("Interrupted by user")
        exit_code = 130
    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())
reranker/rerank-cloud-perf-study/rerank_dashscope_perf_usage.md
0 → 100644
| ... | ... | @@ -0,0 +1,269 @@ |
| 1 | +下面是一份**简洁但完整的使用说明**,适合直接放在 README 或脚本注释里。 | |
| 2 | + | |
| 3 | +--- | |
| 4 | + | |
| 5 | +# rerank_dashscope_perf.py 使用说明 | |
| 6 | + | |
| 7 | +该脚本用于对 **DashScope `qwen3-rerank` 接口**进行并发性能测试, | |
| 8 | +测试接口: | |
| 9 | + | |
| 10 | +``` | |
| 11 | +POST https://dashscope.aliyuncs.com/compatible-api/v1/reranks | |
| 12 | +``` | |
| 13 | + | |
| 14 | +脚本支持: | |
| 15 | + | |
| 16 | +* 并发压测 | |
| 17 | +* 固定或动态生成 documents | |
| 18 | +* 自定义 doc 数量(例如 386) | |
| 19 | +* 输出详细 latency / RPS 统计 | |
| 20 | +* 多并发梯度测试(如 1,5,10,20) | |
| 21 | + | |
| 22 | +--- | |
| 23 | + | |
| 24 | +# 一、环境准备 | |
| 25 | + | |
| 26 | +### 1 安装依赖 | |
| 27 | + | |
| 28 | +脚本依赖 `httpx`: | |
| 29 | + | |
| 30 | +```bash | |
| 31 | +pip install httpx | |
| 32 | +``` | |
| 33 | + | |
| 34 | +如果使用虚拟环境: | |
| 35 | + | |
| 36 | +```bash | |
| 37 | +.venv/bin/pip install httpx | |
| 38 | +``` | |
| 39 | + | |
| 40 | +--- | |
| 41 | + | |
| 42 | +### 2 设置 DashScope API Key | |
| 43 | + | |
| 44 | +```bash | |
| 45 | +export DASHSCOPE_API_KEY=你的key | |
| 46 | +``` | |
| 47 | + | |
| 48 | +也可以通过参数指定: | |
| 49 | + | |
| 50 | +```bash | |
| 51 | +--api-key xxx | |
| 52 | +``` | |
| 53 | + | |
| 54 | +--- | |
| 55 | + | |
| 56 | +# 二、基本用法 | |
| 57 | + | |
| 58 | +最常见的压测方式: | |
| 59 | + | |
| 60 | +```bash | |
| 61 | +python rerank_dashscope_perf.py \ | |
| 62 | + --duration 20 \ | |
| 63 | + --concurrency-list 1,5,10,20 \ | |
| 64 | + --timeout 90 \ | |
| 65 | + --rerank-dynamic-docs \ | |
| 66 | + --rerank-doc-count 386 \ | |
| 67 | + --rerank-vocab-size 1000 \ | |
| 68 | + --rerank-sentence-min-words 15 \ | |
| 69 | + --rerank-sentence-max-words 40 \ | |
| 70 | + --rerank-query "wireless mouse" \ | |
| 71 | + --rerank-seed 20260312 \ | |
| 72 | + --rerank-top-n 386 \ | |
| 73 | + --output perf_result.json | |
| 74 | +``` | |
| 75 | + | |
| 76 | +含义: | |
| 77 | + | |
| 78 | +| 参数 | 说明 | | |
| 79 | +| ------------------- | ------------- | | |
| 80 | +| duration | 每个并发测试持续时间(秒) | | |
| 81 | +| concurrency-list | 并发列表 | | |
| 82 | +| timeout | 单请求超时时间 | | |
| 83 | +| rerank-dynamic-docs | 启用动态 doc 生成 | | |
| 84 | +| rerank-doc-count | 每个请求 doc 数量 | | |
| 85 | +| rerank-top-n | 返回 top_n | | |
| 86 | +| output | 保存结果 JSON | | |
| 87 | + | |
| 88 | +--- | |
| 89 | + | |
| 90 | +# 三、测试模式 | |
| 91 | + | |
| 92 | +脚本有两种请求模式: | |
| 93 | + | |
| 94 | +--- | |
| 95 | + | |
| 96 | +# 1 静态请求模式(默认) | |
| 97 | + | |
| 98 | +如果**不使用 `--rerank-dynamic-docs`**,请求 payload 固定为: | |
| 99 | + | |
| 100 | +```json | |
| 101 | +{ | |
| 102 | + "model": "qwen3-rerank", | |
| 103 | + "documents": [ | |
| 104 | + "文本排序模型广泛用于搜索引擎和推荐系统中,它们根据文本相关性对候选文本进行排序", | |
| 105 | + "量子计算是计算科学的一个前沿领域", | |
| 106 | + "预训练语言模型的发展给文本排序模型带来了新的进展" | |
| 107 | + ], | |
| 108 | + "query": "什么是文本排序模型", | |
  "instruct": "Given a web search query, retrieve relevant passages that answer the query.",
  "top_n": 386
| 110 | +} | |
| 111 | +``` | |
| 112 | + | |
| 113 | +适合: | |
| 114 | + | |
| 115 | +* 验证接口 | |
| 116 | +* 小规模测试 | |
| 117 | + | |
| 118 | +--- | |
| 119 | + | |
| 120 | +# 2 动态 documents 模式(推荐) | |
| 121 | + | |
| 122 | +启用参数: | |
| 123 | + | |
| 124 | +``` | |
| 125 | +--rerank-dynamic-docs | |
| 126 | +``` | |
| 127 | + | |
| 128 | +脚本会: | |
| 129 | + | |
| 130 | +* 每个请求生成 **N 条 documents** | |
| 131 | +* 每条 doc 是 **随机词拼接句子** | |
| 132 | +* 每个请求 **内容不同** | |
| 133 | + | |
| 134 | +示例 doc: | |
| 135 | + | |
| 136 | +``` | |
| 137 | +alce bafi kolo dede hobe anma cigi lofi asbe erko kaci molo fadi helo | |
| 138 | +mace biro aldi kolo gace hoin doka lale cebo fafa ineri kasi hobe lomo | |
| 139 | +gifi beme koha laci anfi celi dore ioce kobo hila mefi arce enbo hega | |
| 140 | +``` | |
| 141 | + | |
| 142 | +优点: | |
| 143 | + | |
| 144 | +* 不依赖真实语料 | |
| 145 | +* 更接近真实 token 分布 | |
| 146 | +* 压测稳定 | |
| 147 | + | |
| 148 | +--- | |
| 149 | + | |
| 150 | +# 四、386 documents 压测示例 | |
| 151 | + | |
| 152 | +与你的测试方式一致: | |
| 153 | + | |
| 154 | +```bash | |
| 155 | +python rerank_dashscope_perf.py \ | |
| 156 | + --duration 20 \ | |
| 157 | + --concurrency-list 1,5,10,20 \ | |
| 158 | + --timeout 90 \ | |
| 159 | + --rerank-dynamic-docs \ | |
| 160 | + --rerank-doc-count 386 \ | |
| 161 | + --rerank-vocab-size 1000 \ | |
| 162 | + --rerank-sentence-min-words 15 \ | |
| 163 | + --rerank-sentence-max-words 40 \ | |
| 164 | + --rerank-query "wireless mouse" \ | |
| 165 | + --rerank-seed 20260312 \ | |
| 166 | + --rerank-top-n 386 | |
| 167 | +``` | |
| 168 | + | |
| 169 | +每个请求: | |
| 170 | + | |
| 171 | +``` | |
| 172 | +query: wireless mouse | |
| 173 | +documents: 386条 | |
| 174 | +每条doc长度: 15~40词 | |
| 175 | +``` | |
| 176 | + | |
| 177 | +--- | |
| 178 | + | |
| 179 | +# 五、输出结果示例 | |
| 180 | + | |
| 181 | +终端输出: | |
| 182 | + | |
| 183 | +``` | |
| 184 | +[1/4] running rerank_dashscope @ concurrency=1 ... | |
| 185 | + | |
| 186 | +=== Scenario: rerank_dashscope @ concurrency=1 === | |
| 187 | +requests=84 success=84 fail=0 success_rate=100.0% rps=4.2 | |
| 188 | +latency(ms): avg=230 p50=220 p90=260 p95=280 p99=310 max=340 | |
| 189 | +status_codes: {200: 84} | |
| 190 | +``` | |
| 191 | + | |
| 192 | +字段说明: | |
| 193 | + | |
| 194 | +| 指标 | 说明 | | |
| 195 | +| --------------- | ---- | | |
| 196 | +| requests | 总请求数 | | |
| 197 | +| success | 成功请求 | | |
| 198 | +| fail | 失败请求 | | |
| 199 | +| success_rate | 成功率 | | |
| 200 | +| rps | 吞吐量 | | |
| 201 | +| p50/p90/p95/p99 | 延迟分位 | | |
| 202 | +| max | 最大延迟 | | |
| 203 | + | |
| 204 | +--- | |
| 205 | + | |
| 206 | +# 六、JSON 报告 | |
| 207 | + | |
| 208 | +如果指定: | |
| 209 | + | |
| 210 | +``` | |
| 211 | +--output perf_result.json | |
| 212 | +``` | |
| 213 | + | |
| 214 | +会生成报告: | |
| 215 | + | |
| 216 | +```json | |
| 217 | +{ | |
| 218 | + "results": [ | |
| 219 | + { | |
| 220 | + "concurrency": 1, | |
| 221 | + "throughput_rps": 4.2, | |
| 222 | + "latency_ms": { | |
| 223 | + "avg": 230, | |
| 224 | + "p95": 280 | |
| 225 | + } | |
| 226 | + } | |
| 227 | + ] | |
| 228 | +} | |
| 229 | +``` | |
| 230 | + | |
| 231 | +适合: | |
| 232 | + | |
| 233 | +* 性能对比 | |
| 234 | +* 画图 | |
| 235 | +* 压测记录 | |
| 236 | + | |
| 237 | +--- | |
| 238 | + | |
| 239 | +# 七、常见参数 | |
| 240 | + | |
| 241 | +| 参数 | 默认值 | 说明 | | |
| 242 | +| --------------------------- | --------- | ------- | | |
| 243 | +| --duration | 20 | 单并发测试时间 | | |
| 244 | +| --concurrency-list | 1,5,10,20 | 并发梯度 | | |
| 245 | +| --timeout | 90 | 请求超时 | | |
| 246 | +| --rerank-doc-count | 386 | doc数量 | | |
| 247 | +| --rerank-vocab-size | 1000 | 词表大小 | | |
| 248 | +| --rerank-sentence-min-words | 15 | doc最小长度 | | |
| 249 | +| --rerank-sentence-max-words | 40 | doc最大长度 | | |
| 250 | +| --rerank-top-n | 386 | 返回top_n | | |
| 251 | + | |
| 252 | +--- | |
| 253 | + | |
| 254 | +# 八、推荐压测方式 | |
| 255 | + | |
| 256 | +推荐测试: | |
| 257 | + | |
| 258 | +``` | |
| 259 | +docs = 386 | |
| 260 | +query = wireless mouse | |
| 261 | +concurrency = 1,5,10,20 | |
| 262 | +duration = 20~60s | |
| 263 | +``` | |
| 264 | + | |
| 265 | +即可得到: | |
| 266 | + | |
| 267 | +* latency 曲线 | |
| 268 | +* RPS | |
| 269 | +* 并发极限 | ... | ... |