Commit 12a75c466878d4c014fcf92cca4f1a012878496c
1 parent
99b72698
feat(eval): 为 LLM 标注添加统一续跑能力,支持断点续传与容错重试
- 问题背景:clothing_top771 数据集构建过程在中途被外部异常终止(reranker被kill);缺乏统一的断点续跑机制,此前依赖临时脚本恢复。 - 解决方案:在 eval_framework/cli.py 的 build 命令中新增 --resume-missing、--continue-on-error、--max-retries-per-query、--retry-backoff-sec 参数,并修正默认参数逻辑(有 dataset_id 时不再强塞 legacy queries_file)。 - 脚本统一:更新 start_eval.sh 和 start_eval_web.sh,增加 batch-rebuild-resume 入口,统一使用 dataset 模式,REPO_EVAL_QUERIES 改为可选覆盖。 - 文档补充:在 scripts/evaluation/README.md 中添加中断续跑说明和新命令用法。 - 验证:eval-web 多数据集接口(/api/datasets、/api/history?dataset_id=...)正常返回 core_queries 与 clothing_top771 分域结果;当前进程已越过第 48 条,query_builds 计数增至 54,正在处理第 55/771。 把流程做成可持续的“统一续跑”能力,避免再靠临时脚本: - 在 [scripts/evaluation/eval_framework/cli.py](/data/saas-search/scripts/evaluation/eval_framework/cli.py) 新增 `build --resume-missing --continue-on-error --max-retries-per-query --retry-backoff-sec`,并修正默认参数逻辑(有 `dataset_id` 时不再强塞 legacy `queries_file`)。 - 在 [scripts/evaluation/start_eval.sh](/data/saas-search/scripts/evaluation/start_eval.sh) 新增 `batch-rebuild-resume` 入口,统一用 dataset 模式,`REPO_EVAL_QUERIES` 仅作可选覆盖。 - 在 [scripts/start_eval_web.sh](/data/saas-search/scripts/start_eval_web.sh) 做同样的 dataset/queries 统一化。 - 在 [scripts/evaluation/README.md](/data/saas-search/scripts/evaluation/README.md) 补了中断续跑说明和新命令。 - 已验证 `eval-web` 多数据集接口正常(`/api/datasets`、`/api/history?dataset_id=...` 均返回 `core_queries` 与 `clothing_top771` 分域结果)。 当前在线进程: - LLM 标注:`PID 2062901`(`build ... --dataset-id clothing_top771 --resume-missing ...`) - reranker:`PID 2065235`(6007,`/health` 返回 `ok`) 盯进度: ```bash tail -f logs/eval.log ls -1 artifacts/search_evaluation/datasets/clothing_top771/query_builds | wc -l curl -sS http://127.0.0.1:6007/health ``` 影响范围:scripts/evaluation/eval_framework/cli.py, scripts/evaluation/start_eval.sh, scripts/start_eval_web.sh, scripts/evaluation/README.md
Showing
12 changed files
with
267 additions
and
37 deletions
Show diff stats
artifacts/search_evaluation/build_launches/clothing_top771_batch_rebuild_resume_20260420T055157Z.pid
0 → 100644
| ... | ... | @@ -0,0 +1 @@ |
| 1 | +2061952 | ... | ... |
artifacts/search_evaluation/build_launches/clothing_top771_build_resume_direct_20260420T055302Z.pid
0 → 100644
| ... | ... | @@ -0,0 +1 @@ |
| 1 | +2062709 | ... | ... |
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.cmd
0 → 100644
| ... | ... | @@ -0,0 +1,44 @@ |
./.venv/bin/python - <<'PY'
"""Resume LLM annotation builds for clothing_top771 queries that have no per-query JSON yet.

Scans the dataset's query_builds directory to find already-built queries,
then rebuilds only the remaining ones (full refresh of rerank + labels).
"""
import json

from scripts.evaluation.eval_framework.datasets import resolve_dataset, query_builds_dir
from scripts.evaluation.eval_framework.framework import SearchEvaluationFramework

dataset = resolve_dataset(dataset_id='clothing_top771', tenant_id='163', language='en', require_enabled=True)
fw = SearchEvaluationFramework(tenant_id=dataset.tenant_id)
qdir = query_builds_dir(fw.artifact_root, dataset.dataset_id)

# Queries that already have a readable build JSON count as "built" and are skipped.
seen = set()
for p in qdir.glob('*.json'):
    try:
        obj = json.loads(p.read_text(encoding='utf-8'))
        q = str(obj.get('query') or '').strip()
        if q:
            seen.add(q)
    except Exception:
        # Best-effort scan: an unreadable or partially written file (e.g. cut
        # off by the crash we are resuming from) simply does not count as built.
        continue

queries = list(dataset.queries)
remaining = [q for q in queries if q not in seen]
print(f"[resume] dataset={dataset.dataset_id} total={len(queries)} built={len(seen)} remaining={len(remaining)}")
if remaining:
    print(f"[resume] first_remaining={remaining[0]!r} line={queries.index(remaining[0])+1}")

# Iterate in dataset order so idx matches the query file's line numbers.
for idx, q in enumerate(queries, start=1):
    if q in seen:
        continue
    print(f"[resume] ({idx}/{len(queries)}) start query={q!r}")
    result = fw.build_query_annotation_set(
        query=q,
        dataset=dataset,
        search_depth=500,
        rerank_depth=10000,
        language='en',
        force_refresh_rerank=True,
        force_refresh_labels=True,
    )
    print(
        f"[resume] done query={q!r} search_total={result.search_total} "
        f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
        f"annotated={result.annotated_count} output={result.output_json_path}"
    )
print("[resume] all remaining queries completed")
PY
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.pid
0 → 100644
| ... | ... | @@ -0,0 +1 @@ |
| 1 | +2053402 | ... | ... |
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.cmd
0 → 100644
| ... | ... | @@ -0,0 +1,45 @@ |
PYTHONUNBUFFERED=1 ./.venv/bin/python - <<'PY'
"""Resume LLM annotation builds for clothing_top771 (unbuffered-stdout variant).

PYTHONUNBUFFERED=1 makes progress lines appear in the log immediately,
which matters when tailing logs during a multi-hour run.
"""
import json

from scripts.evaluation.eval_framework.datasets import resolve_dataset, query_builds_dir
from scripts.evaluation.eval_framework.framework import SearchEvaluationFramework

dataset = resolve_dataset(dataset_id='clothing_top771', tenant_id='163', language='en', require_enabled=True)
fw = SearchEvaluationFramework(tenant_id=dataset.tenant_id)
qdir = query_builds_dir(fw.artifact_root, dataset.dataset_id)

# Queries that already have a readable build JSON count as "built" and are skipped.
seen = set()
for p in qdir.glob('*.json'):
    try:
        obj = json.loads(p.read_text(encoding='utf-8'))
        q = str(obj.get('query') or '').strip()
        if q:
            seen.add(q)
    except Exception:
        # Best-effort scan: an unreadable or partially written file (e.g. cut
        # off by the crash we are resuming from) simply does not count as built.
        continue

queries = list(dataset.queries)
remaining = [q for q in queries if q not in seen]
print(f"[resume] dataset={dataset.dataset_id} total={len(queries)} built={len(seen)} remaining={len(remaining)}")
if not remaining:
    print('[resume] nothing to do')
else:
    print(f"[resume] first_remaining={remaining[0]!r} line={queries.index(remaining[0])+1}")

# Iterate in dataset order so idx matches the query file's line numbers.
for idx, q in enumerate(queries, start=1):
    if q in seen:
        continue
    print(f"[resume] ({idx}/{len(queries)}) start query={q!r}")
    result = fw.build_query_annotation_set(
        query=q,
        dataset=dataset,
        search_depth=500,
        rerank_depth=10000,
        language='en',
        force_refresh_rerank=True,
        force_refresh_labels=True,
    )
    print(
        f"[resume] done query={q!r} search_total={result.search_total} "
        f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
        f"annotated={result.annotated_count} output={result.output_json_path}"
    )
print('[resume] all remaining queries completed')
PY
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.pid
0 → 100644
| ... | ... | @@ -0,0 +1 @@ |
| 1 | +2054946 | ... | ... |
artifacts/search_evaluation/build_launches/reranker-resume.pid
0 → 100644
| ... | ... | @@ -0,0 +1 @@ |
| 1 | +2064765 | ... | ... |
scripts/evaluation/README.md
| ... | ... | @@ -24,7 +24,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, |
| 24 | 24 | | `queries/queries.txt` | Legacy core query set (`dataset_id=core_queries`) | |
| 25 | 25 | | `queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered` | Expanded clothing dataset (`dataset_id=clothing_top771`) | |
| 26 | 26 | | `README_Requirement.md` | Product/requirements reference | |
| 27 | -| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | | |
| 27 | +| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild`, `batch-rebuild-resume` (resume from existing per-query outputs), or `serve` | | |
| 28 | 28 | | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. | |
| 29 | 29 | |
| 30 | 30 | ## Quick start (repo root) |
| ... | ... | @@ -41,6 +41,9 @@ REPO_EVAL_DATASET_ID=clothing_top771 ./scripts/evaluation/start_eval.sh batch |
| 41 | 41 | # Deep rebuild: per-query full corpus rerank (outside search recall pool) + LLM in batches along global sort order (early stop; expensive) |
| 42 | 42 | ./scripts/evaluation/start_eval.sh batch-rebuild |
| 43 | 43 | |
| 44 | +# Resume deep rebuild from existing query_builds (recommended for long 771-query runs) | |
| 45 | +REPO_EVAL_DATASET_ID=clothing_top771 ./scripts/evaluation/start_eval.sh batch-rebuild-resume | |
| 46 | + | |
| 44 | 47 | # UI: http://127.0.0.1:6010/ |
| 45 | 48 | ./scripts/evaluation/start_eval.sh serve |
| 46 | 49 | # or: ./scripts/service_ctl.sh start eval-web |
| ... | ... | @@ -79,7 +82,7 @@ Each `batch` run walks the full queries file and writes a **batch report** under |
| 79 | 82 | |
| 80 | 83 | This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`. |
| 81 | 84 | |
| 82 | -For **each** query in `queries.txt`, in order: | |
| 85 | +For **each** query in the selected dataset query file (`--dataset-id` / `config.yaml search_evaluation.datasets[*].query_file`), in order: | |
| 83 | 86 | |
| 84 | 87 | 1. **Search recall** — Call the live search API with `size = max(--search-depth, --search-recall-top-k)` (the wrapper uses `--search-depth 500`). The first **`--search-recall-top-k`** hits (default **200**, see `eval_framework.constants.DEFAULT_SEARCH_RECALL_TOP_K`) form the **recall pool**; they are treated as rerank score **1** and are **not** sent to the reranker. |
| 85 | 88 | 2. **Full corpus** — Load the tenant’s product corpus from Elasticsearch (same tenant as `TENANT_ID` / `--tenant-id`, default **163**), via `corpus_docs()` (cached in SQLite after the first load). |
| ... | ... | @@ -104,6 +107,8 @@ For **each** query in `queries.txt`, in order: |
| 104 | 107 | |
| 105 | 108 | **Tuning the rebuild path:** `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-llm-batch-size`, `--rebuild-min-batches`, `--rebuild-max-batches`, `--rebuild-irrelevant-stop-ratio`, `--rebuild-irrel-low-combined-stop-ratio`, `--rebuild-irrelevant-stop-streak` on `build` (see `eval_framework/cli.py`). Rerank API chunk size is **80** docs per request in code (`full_corpus_rerank_outside_exclude`). |
| 106 | 109 | |
| 110 | +**Resuming interrupted runs:** for long jobs (for example `clothing_top771`), use `batch-rebuild-resume` or pass `build --resume-missing --continue-on-error --max-retries-per-query N`. Resume mode skips queries that already have per-query JSON under `datasets/<dataset_id>/query_builds/`. | |
| 111 | + | |
| 107 | 112 | ## Artifacts |
| 108 | 113 | |
| 109 | 114 | Default root: `artifacts/search_evaluation/` | ... | ... |
scripts/evaluation/eval_framework/cli.py
| ... | ... | @@ -6,15 +6,16 @@ import argparse |
| 6 | 6 | import json |
| 7 | 7 | import logging |
| 8 | 8 | import shutil |
| 9 | +import time | |
| 9 | 10 | from pathlib import Path |
| 10 | -from typing import Any, Dict | |
| 11 | +from typing import Any, Dict, List, Set | |
| 11 | 12 | |
| 12 | 13 | from config.loader import get_app_config |
| 13 | 14 | |
| 14 | 15 | from .datasets import audits_dir, query_builds_dir, resolve_dataset |
| 15 | 16 | from .framework import SearchEvaluationFramework |
| 16 | 17 | from .logging_setup import setup_eval_logging |
| 17 | -from .utils import ensure_dir, utc_now_iso, utc_timestamp | |
| 18 | +from .utils import utc_now_iso, utc_timestamp | |
| 18 | 19 | from .web_app import create_web_app |
| 19 | 20 | |
| 20 | 21 | _cli_log = logging.getLogger("search_eval.cli") |
| ... | ... | @@ -95,7 +96,8 @@ def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None: |
| 95 | 96 | args.dataset_id = se.default_dataset_id |
| 96 | 97 | if getattr(args, "tenant_id", None) in (None, ""): |
| 97 | 98 | args.tenant_id = se.default_tenant_id |
| 98 | - if getattr(args, "queries_file", None) in (None, ""): | |
| 99 | + # Keep legacy queries_file fallback only when dataset_id is not specified. | |
| 100 | + if getattr(args, "queries_file", None) in (None, "") and getattr(args, "dataset_id", None) in (None, ""): | |
| 99 | 101 | args.queries_file = str(se.queries_file) |
| 100 | 102 | if getattr(args, "language", None) in (None, ""): |
| 101 | 103 | args.language = se.default_language |
| ... | ... | @@ -162,6 +164,23 @@ def _resolve_dataset_from_args(args: argparse.Namespace, *, require_enabled: boo |
| 162 | 164 | return dataset |
| 163 | 165 | |
| 164 | 166 | |
| 167 | +def _list_built_queries(artifact_root: Path, dataset_id: str) -> Set[str]: | |
| 168 | + built: Set[str] = set() | |
| 169 | + root = query_builds_dir(artifact_root, dataset_id) | |
| 170 | + for path in root.glob("*.json"): | |
| 171 | + name = path.name | |
| 172 | + if name.startswith("build_summary_") or name.startswith("build_failures_"): | |
| 173 | + continue | |
| 174 | + try: | |
| 175 | + payload = json.loads(path.read_text(encoding="utf-8")) | |
| 176 | + except Exception: | |
| 177 | + continue | |
| 178 | + query = str(payload.get("query") or "").strip() | |
| 179 | + if query: | |
| 180 | + built.add(query) | |
| 181 | + return built | |
| 182 | + | |
| 183 | + | |
| 165 | 184 | def build_cli_parser() -> argparse.ArgumentParser: |
| 166 | 185 | parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI") |
| 167 | 186 | sub = parser.add_subparsers(dest="command", required=True) |
| ... | ... | @@ -251,6 +270,28 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 251 | 270 | action="store_true", |
| 252 | 271 | help="Delete dataset-specific query_builds/audits before starting. Shared SQLite cache is preserved.", |
| 253 | 272 | ) |
| 273 | + build.add_argument( | |
| 274 | + "--resume-missing", | |
| 275 | + action="store_true", | |
| 276 | + help="Skip queries that already have per-query build JSONs in this dataset's query_builds directory.", | |
| 277 | + ) | |
| 278 | + build.add_argument( | |
| 279 | + "--continue-on-error", | |
| 280 | + action="store_true", | |
| 281 | + help="Continue with remaining queries when one query fails after retries.", | |
| 282 | + ) | |
| 283 | + build.add_argument( | |
| 284 | + "--max-retries-per-query", | |
| 285 | + type=int, | |
| 286 | + default=0, | |
| 287 | + help="Retry count per failed query before giving up (default: 0).", | |
| 288 | + ) | |
| 289 | + build.add_argument( | |
| 290 | + "--retry-backoff-sec", | |
| 291 | + type=float, | |
| 292 | + default=5.0, | |
| 293 | + help="Base backoff seconds between retries (actual sleep = base * attempt_no).", | |
| 294 | + ) | |
| 254 | 295 | build.add_argument("--force-refresh-rerank", action="store_true") |
| 255 | 296 | build.add_argument("--force-refresh-labels", action="store_true") |
| 256 | 297 | add_judge_llm_args(build) |
| ... | ... | @@ -300,7 +341,19 @@ def run_build(args: argparse.Namespace) -> None: |
| 300 | 341 | _reset_build_artifacts(dataset.dataset_id) |
| 301 | 342 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) |
| 302 | 343 | queries = list(dataset.queries) |
| 303 | - summary = [] | |
| 344 | + summary: List[Dict[str, Any]] = [] | |
| 345 | + failures: List[Dict[str, Any]] = [] | |
| 346 | + completed_queries: Set[str] = set() | |
| 347 | + if args.resume_missing: | |
| 348 | + completed_queries = _list_built_queries(framework.artifact_root, dataset.dataset_id) | |
| 349 | + _cli_log.info( | |
| 350 | + "[build] resume mode: dataset=%s total=%s already_built=%s remaining=%s", | |
| 351 | + dataset.dataset_id, | |
| 352 | + len(queries), | |
| 353 | + len(completed_queries), | |
| 354 | + max(0, len(queries) - len(completed_queries)), | |
| 355 | + ) | |
| 356 | + skipped_queries = 0 | |
| 304 | 357 | rebuild_kwargs = {} |
| 305 | 358 | if args.force_refresh_labels: |
| 306 | 359 | rebuild_kwargs = { |
| ... | ... | @@ -316,23 +369,69 @@ def run_build(args: argparse.Namespace) -> None: |
| 316 | 369 | } |
| 317 | 370 | total_q = len(queries) |
| 318 | 371 | for q_index, query in enumerate(queries, start=1): |
| 319 | - _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query) | |
| 320 | - try: | |
| 321 | - result = framework.build_query_annotation_set( | |
| 322 | - query=query, | |
| 323 | - dataset=dataset, | |
| 324 | - search_depth=args.search_depth, | |
| 325 | - rerank_depth=args.rerank_depth, | |
| 326 | - annotate_search_top_k=args.annotate_search_top_k, | |
| 327 | - annotate_rerank_top_k=args.annotate_rerank_top_k, | |
| 328 | - language=args.language, | |
| 329 | - force_refresh_rerank=args.force_refresh_rerank, | |
| 330 | - force_refresh_labels=args.force_refresh_labels, | |
| 331 | - **rebuild_kwargs, | |
| 372 | + if query in completed_queries: | |
| 373 | + skipped_queries += 1 | |
| 374 | + _cli_log.info("[build] (%s/%s) skip query=%r (already built)", q_index, total_q, query) | |
| 375 | + continue | |
| 376 | + | |
| 377 | + attempt = 0 | |
| 378 | + while True: | |
| 379 | + max_attempts = max(1, int(args.max_retries_per_query) + 1) | |
| 380 | + _cli_log.info( | |
| 381 | + "[build] (%s/%s) starting query=%r attempt=%s/%s", | |
| 382 | + q_index, | |
| 383 | + total_q, | |
| 384 | + query, | |
| 385 | + attempt + 1, | |
| 386 | + max_attempts, | |
| 332 | 387 | ) |
| 333 | - except Exception: | |
| 334 | - _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q) | |
| 335 | - raise | |
| 388 | + try: | |
| 389 | + result = framework.build_query_annotation_set( | |
| 390 | + query=query, | |
| 391 | + dataset=dataset, | |
| 392 | + search_depth=args.search_depth, | |
| 393 | + rerank_depth=args.rerank_depth, | |
| 394 | + annotate_search_top_k=args.annotate_search_top_k, | |
| 395 | + annotate_rerank_top_k=args.annotate_rerank_top_k, | |
| 396 | + language=args.language, | |
| 397 | + force_refresh_rerank=args.force_refresh_rerank, | |
| 398 | + force_refresh_labels=args.force_refresh_labels, | |
| 399 | + **rebuild_kwargs, | |
| 400 | + ) | |
| 401 | + break | |
| 402 | + except Exception as exc: | |
| 403 | + attempt += 1 | |
| 404 | + if attempt <= int(args.max_retries_per_query): | |
| 405 | + sleep_seconds = max(0.0, float(args.retry_backoff_sec)) * attempt | |
| 406 | + _cli_log.warning( | |
| 407 | + "[build] query=%r failed attempt=%s/%s; retry in %.1fs: %s", | |
| 408 | + query, | |
| 409 | + attempt, | |
| 410 | + max_attempts, | |
| 411 | + sleep_seconds, | |
| 412 | + exc, | |
| 413 | + ) | |
| 414 | + if sleep_seconds > 0: | |
| 415 | + time.sleep(sleep_seconds) | |
| 416 | + continue | |
| 417 | + | |
| 418 | + _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q) | |
| 419 | + failures.append( | |
| 420 | + { | |
| 421 | + "query": query, | |
| 422 | + "index": q_index, | |
| 423 | + "error": repr(exc), | |
| 424 | + } | |
| 425 | + ) | |
| 426 | + if not args.continue_on_error: | |
| 427 | + raise | |
| 428 | + _cli_log.error("[build] continue_on_error=true; skip failed query=%r", query) | |
| 429 | + result = None | |
| 430 | + break | |
| 431 | + | |
| 432 | + if result is None: | |
| 433 | + continue | |
| 434 | + | |
| 336 | 435 | summary.append( |
| 337 | 436 | { |
| 338 | 437 | "query": result.query, |
| ... | ... | @@ -352,10 +451,19 @@ def run_build(args: argparse.Namespace) -> None: |
| 352 | 451 | result.annotated_count, |
| 353 | 452 | result.output_json_path, |
| 354 | 453 | ) |
| 355 | - out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json" | |
| 356 | 454 | out_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_summary_{utc_timestamp()}.json" |
| 357 | 455 | out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") |
| 358 | - _cli_log.info("[done] summary=%s", out_path) | |
| 456 | + _cli_log.info( | |
| 457 | + "[done] summary=%s success=%s skipped=%s failed=%s", | |
| 458 | + out_path, | |
| 459 | + len(summary), | |
| 460 | + skipped_queries, | |
| 461 | + len(failures), | |
| 462 | + ) | |
| 463 | + if failures: | |
| 464 | + failed_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_failures_{utc_timestamp()}.json" | |
| 465 | + failed_path.write_text(json.dumps(failures, ensure_ascii=False, indent=2), encoding="utf-8") | |
| 466 | + _cli_log.warning("[done] failures=%s", failed_path) | |
| 359 | 467 | |
| 360 | 468 | |
| 361 | 469 | def run_batch(args: argparse.Namespace) -> None: | ... | ... |
scripts/evaluation/start_eval.sh
| ... | ... | @@ -7,14 +7,19 @@ cd "$ROOT" |
| 7 | 7 | PY="${ROOT}/.venv/bin/python" |
| 8 | 8 | TENANT_ID="${TENANT_ID:-163}" |
| 9 | 9 | DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}" |
| 10 | -QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" | |
| 10 | +RETRY_COUNT="${REPO_EVAL_RETRY_COUNT:-2}" | |
| 11 | +EXTRA_QUERY_ARGS=() | |
| 12 | +if [[ -n "${REPO_EVAL_QUERIES:-}" ]]; then | |
| 13 | + EXTRA_QUERY_ARGS=(--queries-file "${REPO_EVAL_QUERIES}") | |
| 14 | +fi | |
| 11 | 15 | |
| 12 | 16 | usage() { |
| 13 | - echo "Usage: $0 batch|batch-rebuild|serve" | |
| 17 | + echo "Usage: $0 batch|batch-rebuild|batch-rebuild-resume|serve" | |
| 14 | 18 | echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" |
| 15 | 19 | echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" |
| 20 | + echo " batch-rebuild-resume — resume missing queries from dataset query_builds with retry/continue-on-error" | |
| 16 | 21 | echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" |
| 17 | - echo "Env: TENANT_ID (default 163), REPO_EVAL_DATASET_ID (default core_queries), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" | |
| 22 | + echo "Env: TENANT_ID (default 163), REPO_EVAL_DATASET_ID (default core_queries), REPO_EVAL_QUERIES (optional override), REPO_EVAL_RETRY_COUNT (default 2), EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" | |
| 18 | 23 | } |
| 19 | 24 | |
| 20 | 25 | case "${1:-}" in |
| ... | ... | @@ -22,21 +27,36 @@ case "${1:-}" in |
| 22 | 27 | exec "$PY" scripts/evaluation/build_annotation_set.py batch \ |
| 23 | 28 | --tenant-id "$TENANT_ID" \ |
| 24 | 29 | --dataset-id "$DATASET_ID" \ |
| 25 | - --queries-file "$QUERIES" \ | |
| 26 | 30 | --top-k 50 \ |
| 27 | - --language en | |
| 31 | + --language en \ | |
| 32 | + "${EXTRA_QUERY_ARGS[@]}" | |
| 28 | 33 | ;; |
| 29 | 34 | batch-rebuild) |
| 30 | 35 | exec "$PY" scripts/evaluation/build_annotation_set.py build \ |
| 31 | 36 | --tenant-id "$TENANT_ID" \ |
| 32 | 37 | --dataset-id "$DATASET_ID" \ |
| 33 | - --queries-file "$QUERIES" \ | |
| 34 | 38 | --search-depth 500 \ |
| 35 | 39 | --rerank-depth 10000 \ |
| 36 | 40 | --reset-artifacts \ |
| 37 | 41 | --force-refresh-rerank \ |
| 38 | 42 | --force-refresh-labels \ |
| 39 | - --language en | |
| 43 | + --language en \ | |
| 44 | + "${EXTRA_QUERY_ARGS[@]}" | |
| 45 | + ;; | |
| 46 | + batch-rebuild-resume) | |
| 47 | + exec "$PY" scripts/evaluation/build_annotation_set.py build \ | |
| 48 | + --tenant-id "$TENANT_ID" \ | |
| 49 | + --dataset-id "$DATASET_ID" \ | |
| 50 | + --search-depth 500 \ | |
| 51 | + --rerank-depth 10000 \ | |
| 52 | + --force-refresh-rerank \ | |
| 53 | + --force-refresh-labels \ | |
| 54 | + --resume-missing \ | |
| 55 | + --continue-on-error \ | |
| 56 | + --max-retries-per-query "$RETRY_COUNT" \ | |
| 57 | + --retry-backoff-sec 10 \ | |
| 58 | + --language en \ | |
| 59 | + "${EXTRA_QUERY_ARGS[@]}" | |
| 40 | 60 | ;; |
| 41 | 61 | serve) |
| 42 | 62 | EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" |
| ... | ... | @@ -44,9 +64,9 @@ case "${1:-}" in |
| 44 | 64 | exec "$PY" scripts/evaluation/serve_eval_web.py serve \ |
| 45 | 65 | --tenant-id "$TENANT_ID" \ |
| 46 | 66 | --dataset-id "$DATASET_ID" \ |
| 47 | - --queries-file "$QUERIES" \ | |
| 48 | 67 | --host "$EVAL_WEB_HOST" \ |
| 49 | - --port "$EVAL_WEB_PORT" | |
| 68 | + --port "$EVAL_WEB_PORT" \ | |
| 69 | + "${EXTRA_QUERY_ARGS[@]}" | |
| 50 | 70 | ;; |
| 51 | 71 | *) |
| 52 | 72 | usage | ... | ... |
scripts/service_ctl.sh
| ... | ... | @@ -19,7 +19,7 @@ CORE_SERVICES=("backend" "indexer" "frontend" "eval-web") |
| 19 | 19 | # reranker-fine 暂时不用,因此暂时从OPTIONAL_SERVICES中删除 |
| 20 | 20 | OPTIONAL_SERVICES=("tei" "cnclip" "embedding" "embedding-image" "translator" "reranker") |
| 21 | 21 | FULL_SERVICES=("${OPTIONAL_SERVICES[@]}" "${CORE_SERVICES[@]}") |
| 22 | -STOP_ORDER_SERVICES=("frontend" "eval-web" "indexer" "backend" "reranker" "translator" "embedding-image" "embedding" "cnclip" "tei") | |
| 22 | +STOP_ORDER_SERVICES=("frontend" "eval-web" "indexer" "reranker" "translator" "embedding-image" "embedding" "cnclip" "tei" "backend") | |
| 23 | 23 | declare -Ag SERVICE_ENABLED_CACHE=() |
| 24 | 24 | |
| 25 | 25 | all_services() { | ... | ... |
scripts/start_eval_web.sh
| ... | ... | @@ -10,7 +10,10 @@ EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" |
| 10 | 10 | EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}" |
| 11 | 11 | TENANT_ID="${TENANT_ID:-163}" |
| 12 | 12 | DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}" |
| 13 | -QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" | |
| 13 | +EXTRA_QUERY_ARGS=() | |
| 14 | +if [[ -n "${REPO_EVAL_QUERIES:-}" ]]; then | |
| 15 | + EXTRA_QUERY_ARGS=(--queries-file "${REPO_EVAL_QUERIES}") | |
| 16 | +fi | |
| 14 | 17 | |
| 15 | 18 | GREEN='\033[0;32m' |
| 16 | 19 | YELLOW='\033[1;33m' |
| ... | ... | @@ -27,6 +30,6 @@ export EVAL_WEB_PORT EVAL_WEB_HOST TENANT_ID REPO_EVAL_DATASET_ID REPO_EVAL_QUER |
| 27 | 30 | exec python scripts/evaluation/serve_eval_web.py serve \ |
| 28 | 31 | --tenant-id "${TENANT_ID}" \ |
| 29 | 32 | --dataset-id "${DATASET_ID}" \ |
| 30 | - --queries-file "${QUERIES}" \ | |
| 31 | 33 | --host "${EVAL_WEB_HOST}" \ |
| 32 | - --port "${EVAL_WEB_PORT}" | |
| 34 | + --port "${EVAL_WEB_PORT}" \ | |
| 35 | + "${EXTRA_QUERY_ARGS[@]}" | ... | ... |