Commit 12a75c466878d4c014fcf92cca4f1a012878496c
1 parent
99b72698
feat(eval): 为 LLM 标注添加统一续跑能力,支持断点续传与容错重试
- 问题背景:clothing_top771 数据集在被外部异常终止(reranker被kill);缺乏统一的断点续跑机制,此前依赖临时脚本恢复。 - 解决方案:在 eval_framework/cli.py 的 build 命令中新增 --resume-missing、--continue-on-error、--max-retries-per-query、--retry-backoff-sec 参数,并修正默认参数逻辑(有 dataset_id 时不再强塞 legacy queries_file)。 - 脚本统一:更新 start_eval.sh 和 start_eval_web.sh,增加 batch-rebuild-resume 入口,统一使用 dataset 模式,REPO_EVAL_QUERIES 改为可选覆盖。 - 文档补充:在 scripts/evaluation/README.md 中添加中断续跑说明和新命令用法。 - 验证:eval-web 多数据集接口(/api/datasets、/api/history?dataset_id=...)正常返回 core_queries 与 clothing_top771 分域结果;当前进程已越过第 48 条,query_builds 计数增至 54,正在处理第 55/771。 把流程做成可持续的“统一续跑”能力,避免再靠临时脚本: - 在 [scripts/evaluation/eval_framework/cli.py](/data/saas-search/scripts/evaluation/eval_framework/cli.py) 新增 `build --resume-missing --continue-on-error --max-retries-per-query --retry-backoff-sec`,并修正默认参数逻辑(有 `dataset_id` 时不再强塞 legacy `queries_file`)。 - 在 [scripts/evaluation/start_eval.sh](/data/saas-search/scripts/evaluation/start_eval.sh) 新增 `batch-rebuild-resume` 入口,统一用 dataset 模式,`REPO_EVAL_QUERIES` 仅作可选覆盖。 - 在 [scripts/start_eval_web.sh](/data/saas-search/scripts/start_eval_web.sh) 做同样的 dataset/queries 统一化。 - 在 [scripts/evaluation/README.md](/data/saas-search/scripts/evaluation/README.md) 补了中断续跑说明和新命令。 - 已验证 `eval-web` 多数据集接口正常(`/api/datasets`、`/api/history?dataset_id=...` 均返回 `core_queries` 与 `clothing_top771` 分域结果)。 当前在线进程: - LLM 标注:`PID 2062901`(`build ... --dataset-id clothing_top771 --resume-missing ...`) - reranker:`PID 2065235`(6007,`/health` 返回 `ok`) 盯进度: ```bash tail -f logs/eval.log ls -1 artifacts/search_evaluation/datasets/clothing_top771/query_builds | wc -l curl -sS http://127.0.0.1:6007/health ``` 影响范围:scripts/evaluation/eval_framework/cli.py, scripts/evaluation/start_eval.sh, scripts/start_eval_web.sh, scripts/evaluation/README.md
Showing
12 changed files
with
267 additions
and
37 deletions
Show diff stats
artifacts/search_evaluation/build_launches/clothing_top771_batch_rebuild_resume_20260420T055157Z.pid
0 → 100644
| @@ -0,0 +1 @@ | @@ -0,0 +1 @@ | ||
| 1 | +2061952 |
artifacts/search_evaluation/build_launches/clothing_top771_build_resume_direct_20260420T055302Z.pid
0 → 100644
| @@ -0,0 +1 @@ | @@ -0,0 +1 @@ | ||
| 1 | +2062709 |
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.cmd
0 → 100644
| @@ -0,0 +1,44 @@ | @@ -0,0 +1,44 @@ | ||
./.venv/bin/python - <<'PY'
# One-off resume helper: rebuild only the clothing_top771 queries that do not
# yet have a per-query build JSON under the dataset's query_builds directory.
import json

from scripts.evaluation.eval_framework.datasets import resolve_dataset, query_builds_dir
from scripts.evaluation.eval_framework.framework import SearchEvaluationFramework

dataset = resolve_dataset(dataset_id='clothing_top771', tenant_id='163', language='en', require_enabled=True)
fw = SearchEvaluationFramework(tenant_id=dataset.tenant_id)
qdir = query_builds_dir(fw.artifact_root, dataset.dataset_id)

# Queries that already have a per-query build artifact; skipped in the loop below.
seen = set()
for p in qdir.glob('*.json'):
    # Aggregate reports share this directory but are not per-query builds.
    if p.name.startswith(('build_summary_', 'build_failures_')):
        continue
    try:
        obj = json.loads(p.read_text(encoding='utf-8'))
    except (OSError, ValueError):
        # Unreadable or corrupt artifact: treat the query as not yet built.
        continue
    if not isinstance(obj, dict):
        continue
    q = str(obj.get('query') or '').strip()
    if q:
        seen.add(q)

queries = list(dataset.queries)
remaining = [q for q in queries if q not in seen]
print(f"[resume] dataset={dataset.dataset_id} total={len(queries)} built={len(seen)} remaining={len(remaining)}")
if remaining:
    print(f"[resume] first_remaining={remaining[0]!r} line={queries.index(remaining[0])+1}")

# Walk the full dataset in order so progress indices match the query file lines.
for idx, q in enumerate(queries, start=1):
    if q in seen:
        continue
    print(f"[resume] ({idx}/{len(queries)}) start query={q!r}")
    result = fw.build_query_annotation_set(
        query=q,
        dataset=dataset,
        search_depth=500,
        rerank_depth=10000,
        language='en',
        force_refresh_rerank=True,
        force_refresh_labels=True,
    )
    print(
        f"[resume] done query={q!r} search_total={result.search_total} "
        f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
        f"annotated={result.annotated_count} output={result.output_json_path}"
    )
print("[resume] all remaining queries completed")
PY
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.pid
0 → 100644
| @@ -0,0 +1 @@ | @@ -0,0 +1 @@ | ||
| 1 | +2053402 |
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.cmd
0 → 100644
| @@ -0,0 +1,45 @@ | @@ -0,0 +1,45 @@ | ||
PYTHONUNBUFFERED=1 ./.venv/bin/python - <<'PY'
# One-off resume helper (unbuffered variant): rebuild only the clothing_top771
# queries that do not yet have a per-query build JSON under query_builds/.
import json

from scripts.evaluation.eval_framework.datasets import resolve_dataset, query_builds_dir
from scripts.evaluation.eval_framework.framework import SearchEvaluationFramework

dataset = resolve_dataset(dataset_id='clothing_top771', tenant_id='163', language='en', require_enabled=True)
fw = SearchEvaluationFramework(tenant_id=dataset.tenant_id)
qdir = query_builds_dir(fw.artifact_root, dataset.dataset_id)

# Queries that already have a per-query build artifact; skipped in the loop below.
seen = set()
for p in qdir.glob('*.json'):
    # Aggregate reports share this directory but are not per-query builds.
    if p.name.startswith(('build_summary_', 'build_failures_')):
        continue
    try:
        obj = json.loads(p.read_text(encoding='utf-8'))
    except (OSError, ValueError):
        # Unreadable or corrupt artifact: treat the query as not yet built.
        continue
    if not isinstance(obj, dict):
        continue
    q = str(obj.get('query') or '').strip()
    if q:
        seen.add(q)

queries = list(dataset.queries)
remaining = [q for q in queries if q not in seen]
print(f"[resume] dataset={dataset.dataset_id} total={len(queries)} built={len(seen)} remaining={len(remaining)}")
if not remaining:
    print('[resume] nothing to do')
else:
    print(f"[resume] first_remaining={remaining[0]!r} line={queries.index(remaining[0])+1}")

# Walk the full dataset in order so progress indices match the query file lines.
for idx, q in enumerate(queries, start=1):
    if q in seen:
        continue
    print(f"[resume] ({idx}/{len(queries)}) start query={q!r}")
    result = fw.build_query_annotation_set(
        query=q,
        dataset=dataset,
        search_depth=500,
        rerank_depth=10000,
        language='en',
        force_refresh_rerank=True,
        force_refresh_labels=True,
    )
    print(
        f"[resume] done query={q!r} search_total={result.search_total} "
        f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
        f"annotated={result.annotated_count} output={result.output_json_path}"
    )
print('[resume] all remaining queries completed')
PY
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.pid
0 → 100644
| @@ -0,0 +1 @@ | @@ -0,0 +1 @@ | ||
| 1 | +2054946 |
artifacts/search_evaluation/build_launches/reranker-resume.pid
0 → 100644
| @@ -0,0 +1 @@ | @@ -0,0 +1 @@ | ||
| 1 | +2064765 |
scripts/evaluation/README.md
| @@ -24,7 +24,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, | @@ -24,7 +24,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, | ||
| 24 | | `queries/queries.txt` | Legacy core query set (`dataset_id=core_queries`) | | 24 | | `queries/queries.txt` | Legacy core query set (`dataset_id=core_queries`) | |
| 25 | | `queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered` | Expanded clothing dataset (`dataset_id=clothing_top771`) | | 25 | | `queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered` | Expanded clothing dataset (`dataset_id=clothing_top771`) | |
| 26 | | `README_Requirement.md` | Product/requirements reference | | 26 | | `README_Requirement.md` | Product/requirements reference | |
| 27 | -| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | | 27 | +| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild`, `batch-rebuild-resume` (resume from existing per-query outputs), or `serve` | |
| 28 | | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. | | 28 | | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. | |
| 29 | 29 | ||
| 30 | ## Quick start (repo root) | 30 | ## Quick start (repo root) |
| @@ -41,6 +41,9 @@ REPO_EVAL_DATASET_ID=clothing_top771 ./scripts/evaluation/start_eval.sh batch | @@ -41,6 +41,9 @@ REPO_EVAL_DATASET_ID=clothing_top771 ./scripts/evaluation/start_eval.sh batch | ||
| 41 | # Deep rebuild: per-query full corpus rerank (outside search recall pool) + LLM in batches along global sort order (early stop; expensive) | 41 | # Deep rebuild: per-query full corpus rerank (outside search recall pool) + LLM in batches along global sort order (early stop; expensive) |
| 42 | ./scripts/evaluation/start_eval.sh batch-rebuild | 42 | ./scripts/evaluation/start_eval.sh batch-rebuild |
| 43 | 43 | ||
| 44 | +# Resume deep rebuild from existing query_builds (recommended for long 771-query runs) | ||
| 45 | +REPO_EVAL_DATASET_ID=clothing_top771 ./scripts/evaluation/start_eval.sh batch-rebuild-resume | ||
| 46 | + | ||
| 44 | # UI: http://127.0.0.1:6010/ | 47 | # UI: http://127.0.0.1:6010/ |
| 45 | ./scripts/evaluation/start_eval.sh serve | 48 | ./scripts/evaluation/start_eval.sh serve |
| 46 | # or: ./scripts/service_ctl.sh start eval-web | 49 | # or: ./scripts/service_ctl.sh start eval-web |
| @@ -79,7 +82,7 @@ Each `batch` run walks the full queries file and writes a **batch report** under | @@ -79,7 +82,7 @@ Each `batch` run walks the full queries file and writes a **batch report** under | ||
| 79 | 82 | ||
| 80 | This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`. | 83 | This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`. |
| 81 | 84 | ||
| 82 | -For **each** query in `queries.txt`, in order: | 85 | +For **each** query in the selected dataset query file (`--dataset-id` / `config.yaml search_evaluation.datasets[*].query_file`), in order: |
| 83 | 86 | ||
| 84 | 1. **Search recall** — Call the live search API with `size = max(--search-depth, --search-recall-top-k)` (the wrapper uses `--search-depth 500`). The first **`--search-recall-top-k`** hits (default **200**, see `eval_framework.constants.DEFAULT_SEARCH_RECALL_TOP_K`) form the **recall pool**; they are treated as rerank score **1** and are **not** sent to the reranker. | 87 | 1. **Search recall** — Call the live search API with `size = max(--search-depth, --search-recall-top-k)` (the wrapper uses `--search-depth 500`). The first **`--search-recall-top-k`** hits (default **200**, see `eval_framework.constants.DEFAULT_SEARCH_RECALL_TOP_K`) form the **recall pool**; they are treated as rerank score **1** and are **not** sent to the reranker. |
| 85 | 2. **Full corpus** — Load the tenant’s product corpus from Elasticsearch (same tenant as `TENANT_ID` / `--tenant-id`, default **163**), via `corpus_docs()` (cached in SQLite after the first load). | 88 | 2. **Full corpus** — Load the tenant’s product corpus from Elasticsearch (same tenant as `TENANT_ID` / `--tenant-id`, default **163**), via `corpus_docs()` (cached in SQLite after the first load). |
| @@ -104,6 +107,8 @@ For **each** query in `queries.txt`, in order: | @@ -104,6 +107,8 @@ For **each** query in `queries.txt`, in order: | ||
| 104 | 107 | ||
| 105 | **Tuning the rebuild path:** `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-llm-batch-size`, `--rebuild-min-batches`, `--rebuild-max-batches`, `--rebuild-irrelevant-stop-ratio`, `--rebuild-irrel-low-combined-stop-ratio`, `--rebuild-irrelevant-stop-streak` on `build` (see `eval_framework/cli.py`). Rerank API chunk size is **80** docs per request in code (`full_corpus_rerank_outside_exclude`). | 108 | **Tuning the rebuild path:** `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-llm-batch-size`, `--rebuild-min-batches`, `--rebuild-max-batches`, `--rebuild-irrelevant-stop-ratio`, `--rebuild-irrel-low-combined-stop-ratio`, `--rebuild-irrelevant-stop-streak` on `build` (see `eval_framework/cli.py`). Rerank API chunk size is **80** docs per request in code (`full_corpus_rerank_outside_exclude`). |
| 106 | 109 | ||
| 110 | +**Resuming interrupted runs:** for long jobs (for example `clothing_top771`), use `batch-rebuild-resume` or pass `build --resume-missing --continue-on-error --max-retries-per-query N`. Resume mode skips queries that already have per-query JSON under `datasets/<dataset_id>/query_builds/`. | ||
| 111 | + | ||
| 107 | ## Artifacts | 112 | ## Artifacts |
| 108 | 113 | ||
| 109 | Default root: `artifacts/search_evaluation/` | 114 | Default root: `artifacts/search_evaluation/` |
scripts/evaluation/eval_framework/cli.py
| @@ -6,15 +6,16 @@ import argparse | @@ -6,15 +6,16 @@ import argparse | ||
| 6 | import json | 6 | import json |
| 7 | import logging | 7 | import logging |
| 8 | import shutil | 8 | import shutil |
| 9 | +import time | ||
| 9 | from pathlib import Path | 10 | from pathlib import Path |
| 10 | -from typing import Any, Dict | 11 | +from typing import Any, Dict, List, Set |
| 11 | 12 | ||
| 12 | from config.loader import get_app_config | 13 | from config.loader import get_app_config |
| 13 | 14 | ||
| 14 | from .datasets import audits_dir, query_builds_dir, resolve_dataset | 15 | from .datasets import audits_dir, query_builds_dir, resolve_dataset |
| 15 | from .framework import SearchEvaluationFramework | 16 | from .framework import SearchEvaluationFramework |
| 16 | from .logging_setup import setup_eval_logging | 17 | from .logging_setup import setup_eval_logging |
| 17 | -from .utils import ensure_dir, utc_now_iso, utc_timestamp | 18 | +from .utils import utc_now_iso, utc_timestamp |
| 18 | from .web_app import create_web_app | 19 | from .web_app import create_web_app |
| 19 | 20 | ||
| 20 | _cli_log = logging.getLogger("search_eval.cli") | 21 | _cli_log = logging.getLogger("search_eval.cli") |
| @@ -95,7 +96,8 @@ def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None: | @@ -95,7 +96,8 @@ def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None: | ||
| 95 | args.dataset_id = se.default_dataset_id | 96 | args.dataset_id = se.default_dataset_id |
| 96 | if getattr(args, "tenant_id", None) in (None, ""): | 97 | if getattr(args, "tenant_id", None) in (None, ""): |
| 97 | args.tenant_id = se.default_tenant_id | 98 | args.tenant_id = se.default_tenant_id |
| 98 | - if getattr(args, "queries_file", None) in (None, ""): | 99 | + # Keep legacy queries_file fallback only when dataset_id is not specified. |
| 100 | + if getattr(args, "queries_file", None) in (None, "") and getattr(args, "dataset_id", None) in (None, ""): | ||
| 99 | args.queries_file = str(se.queries_file) | 101 | args.queries_file = str(se.queries_file) |
| 100 | if getattr(args, "language", None) in (None, ""): | 102 | if getattr(args, "language", None) in (None, ""): |
| 101 | args.language = se.default_language | 103 | args.language = se.default_language |
| @@ -162,6 +164,23 @@ def _resolve_dataset_from_args(args: argparse.Namespace, *, require_enabled: boo | @@ -162,6 +164,23 @@ def _resolve_dataset_from_args(args: argparse.Namespace, *, require_enabled: boo | ||
| 162 | return dataset | 164 | return dataset |
| 163 | 165 | ||
| 164 | 166 | ||
| 167 | +def _list_built_queries(artifact_root: Path, dataset_id: str) -> Set[str]: | ||
| 168 | + built: Set[str] = set() | ||
| 169 | + root = query_builds_dir(artifact_root, dataset_id) | ||
| 170 | + for path in root.glob("*.json"): | ||
| 171 | + name = path.name | ||
| 172 | + if name.startswith("build_summary_") or name.startswith("build_failures_"): | ||
| 173 | + continue | ||
| 174 | + try: | ||
| 175 | + payload = json.loads(path.read_text(encoding="utf-8")) | ||
| 176 | + except Exception: | ||
| 177 | + continue | ||
| 178 | + query = str(payload.get("query") or "").strip() | ||
| 179 | + if query: | ||
| 180 | + built.add(query) | ||
| 181 | + return built | ||
| 182 | + | ||
| 183 | + | ||
| 165 | def build_cli_parser() -> argparse.ArgumentParser: | 184 | def build_cli_parser() -> argparse.ArgumentParser: |
| 166 | parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI") | 185 | parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI") |
| 167 | sub = parser.add_subparsers(dest="command", required=True) | 186 | sub = parser.add_subparsers(dest="command", required=True) |
| @@ -251,6 +270,28 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -251,6 +270,28 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 251 | action="store_true", | 270 | action="store_true", |
| 252 | help="Delete dataset-specific query_builds/audits before starting. Shared SQLite cache is preserved.", | 271 | help="Delete dataset-specific query_builds/audits before starting. Shared SQLite cache is preserved.", |
| 253 | ) | 272 | ) |
| 273 | + build.add_argument( | ||
| 274 | + "--resume-missing", | ||
| 275 | + action="store_true", | ||
| 276 | + help="Skip queries that already have per-query build JSONs in this dataset's query_builds directory.", | ||
| 277 | + ) | ||
| 278 | + build.add_argument( | ||
| 279 | + "--continue-on-error", | ||
| 280 | + action="store_true", | ||
| 281 | + help="Continue with remaining queries when one query fails after retries.", | ||
| 282 | + ) | ||
| 283 | + build.add_argument( | ||
| 284 | + "--max-retries-per-query", | ||
| 285 | + type=int, | ||
| 286 | + default=0, | ||
| 287 | + help="Retry count per failed query before giving up (default: 0).", | ||
| 288 | + ) | ||
| 289 | + build.add_argument( | ||
| 290 | + "--retry-backoff-sec", | ||
| 291 | + type=float, | ||
| 292 | + default=5.0, | ||
| 293 | + help="Base backoff seconds between retries (actual sleep = base * attempt_no).", | ||
| 294 | + ) | ||
| 254 | build.add_argument("--force-refresh-rerank", action="store_true") | 295 | build.add_argument("--force-refresh-rerank", action="store_true") |
| 255 | build.add_argument("--force-refresh-labels", action="store_true") | 296 | build.add_argument("--force-refresh-labels", action="store_true") |
| 256 | add_judge_llm_args(build) | 297 | add_judge_llm_args(build) |
| @@ -300,7 +341,19 @@ def run_build(args: argparse.Namespace) -> None: | @@ -300,7 +341,19 @@ def run_build(args: argparse.Namespace) -> None: | ||
| 300 | _reset_build_artifacts(dataset.dataset_id) | 341 | _reset_build_artifacts(dataset.dataset_id) |
| 301 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) | 342 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) |
| 302 | queries = list(dataset.queries) | 343 | queries = list(dataset.queries) |
| 303 | - summary = [] | 344 | + summary: List[Dict[str, Any]] = [] |
| 345 | + failures: List[Dict[str, Any]] = [] | ||
| 346 | + completed_queries: Set[str] = set() | ||
| 347 | + if args.resume_missing: | ||
| 348 | + completed_queries = _list_built_queries(framework.artifact_root, dataset.dataset_id) | ||
| 349 | + _cli_log.info( | ||
| 350 | + "[build] resume mode: dataset=%s total=%s already_built=%s remaining=%s", | ||
| 351 | + dataset.dataset_id, | ||
| 352 | + len(queries), | ||
| 353 | + len(completed_queries), | ||
| 354 | + max(0, len(queries) - len(completed_queries)), | ||
| 355 | + ) | ||
| 356 | + skipped_queries = 0 | ||
| 304 | rebuild_kwargs = {} | 357 | rebuild_kwargs = {} |
| 305 | if args.force_refresh_labels: | 358 | if args.force_refresh_labels: |
| 306 | rebuild_kwargs = { | 359 | rebuild_kwargs = { |
| @@ -316,23 +369,69 @@ def run_build(args: argparse.Namespace) -> None: | @@ -316,23 +369,69 @@ def run_build(args: argparse.Namespace) -> None: | ||
| 316 | } | 369 | } |
| 317 | total_q = len(queries) | 370 | total_q = len(queries) |
| 318 | for q_index, query in enumerate(queries, start=1): | 371 | for q_index, query in enumerate(queries, start=1): |
| 319 | - _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query) | ||
| 320 | - try: | ||
| 321 | - result = framework.build_query_annotation_set( | ||
| 322 | - query=query, | ||
| 323 | - dataset=dataset, | ||
| 324 | - search_depth=args.search_depth, | ||
| 325 | - rerank_depth=args.rerank_depth, | ||
| 326 | - annotate_search_top_k=args.annotate_search_top_k, | ||
| 327 | - annotate_rerank_top_k=args.annotate_rerank_top_k, | ||
| 328 | - language=args.language, | ||
| 329 | - force_refresh_rerank=args.force_refresh_rerank, | ||
| 330 | - force_refresh_labels=args.force_refresh_labels, | ||
| 331 | - **rebuild_kwargs, | 372 | + if query in completed_queries: |
| 373 | + skipped_queries += 1 | ||
| 374 | + _cli_log.info("[build] (%s/%s) skip query=%r (already built)", q_index, total_q, query) | ||
| 375 | + continue | ||
| 376 | + | ||
| 377 | + attempt = 0 | ||
| 378 | + while True: | ||
| 379 | + max_attempts = max(1, int(args.max_retries_per_query) + 1) | ||
| 380 | + _cli_log.info( | ||
| 381 | + "[build] (%s/%s) starting query=%r attempt=%s/%s", | ||
| 382 | + q_index, | ||
| 383 | + total_q, | ||
| 384 | + query, | ||
| 385 | + attempt + 1, | ||
| 386 | + max_attempts, | ||
| 332 | ) | 387 | ) |
| 333 | - except Exception: | ||
| 334 | - _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q) | ||
| 335 | - raise | 388 | + try: |
| 389 | + result = framework.build_query_annotation_set( | ||
| 390 | + query=query, | ||
| 391 | + dataset=dataset, | ||
| 392 | + search_depth=args.search_depth, | ||
| 393 | + rerank_depth=args.rerank_depth, | ||
| 394 | + annotate_search_top_k=args.annotate_search_top_k, | ||
| 395 | + annotate_rerank_top_k=args.annotate_rerank_top_k, | ||
| 396 | + language=args.language, | ||
| 397 | + force_refresh_rerank=args.force_refresh_rerank, | ||
| 398 | + force_refresh_labels=args.force_refresh_labels, | ||
| 399 | + **rebuild_kwargs, | ||
| 400 | + ) | ||
| 401 | + break | ||
| 402 | + except Exception as exc: | ||
| 403 | + attempt += 1 | ||
| 404 | + if attempt <= int(args.max_retries_per_query): | ||
| 405 | + sleep_seconds = max(0.0, float(args.retry_backoff_sec)) * attempt | ||
| 406 | + _cli_log.warning( | ||
| 407 | + "[build] query=%r failed attempt=%s/%s; retry in %.1fs: %s", | ||
| 408 | + query, | ||
| 409 | + attempt, | ||
| 410 | + max_attempts, | ||
| 411 | + sleep_seconds, | ||
| 412 | + exc, | ||
| 413 | + ) | ||
| 414 | + if sleep_seconds > 0: | ||
| 415 | + time.sleep(sleep_seconds) | ||
| 416 | + continue | ||
| 417 | + | ||
| 418 | + _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q) | ||
| 419 | + failures.append( | ||
| 420 | + { | ||
| 421 | + "query": query, | ||
| 422 | + "index": q_index, | ||
| 423 | + "error": repr(exc), | ||
| 424 | + } | ||
| 425 | + ) | ||
| 426 | + if not args.continue_on_error: | ||
| 427 | + raise | ||
| 428 | + _cli_log.error("[build] continue_on_error=true; skip failed query=%r", query) | ||
| 429 | + result = None | ||
| 430 | + break | ||
| 431 | + | ||
| 432 | + if result is None: | ||
| 433 | + continue | ||
| 434 | + | ||
| 336 | summary.append( | 435 | summary.append( |
| 337 | { | 436 | { |
| 338 | "query": result.query, | 437 | "query": result.query, |
| @@ -352,10 +451,19 @@ def run_build(args: argparse.Namespace) -> None: | @@ -352,10 +451,19 @@ def run_build(args: argparse.Namespace) -> None: | ||
| 352 | result.annotated_count, | 451 | result.annotated_count, |
| 353 | result.output_json_path, | 452 | result.output_json_path, |
| 354 | ) | 453 | ) |
| 355 | - out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json" | ||
| 356 | out_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_summary_{utc_timestamp()}.json" | 454 | out_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_summary_{utc_timestamp()}.json" |
| 357 | out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") | 455 | out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") |
| 358 | - _cli_log.info("[done] summary=%s", out_path) | 456 | + _cli_log.info( |
| 457 | + "[done] summary=%s success=%s skipped=%s failed=%s", | ||
| 458 | + out_path, | ||
| 459 | + len(summary), | ||
| 460 | + skipped_queries, | ||
| 461 | + len(failures), | ||
| 462 | + ) | ||
| 463 | + if failures: | ||
| 464 | + failed_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_failures_{utc_timestamp()}.json" | ||
| 465 | + failed_path.write_text(json.dumps(failures, ensure_ascii=False, indent=2), encoding="utf-8") | ||
| 466 | + _cli_log.warning("[done] failures=%s", failed_path) | ||
| 359 | 467 | ||
| 360 | 468 | ||
| 361 | def run_batch(args: argparse.Namespace) -> None: | 469 | def run_batch(args: argparse.Namespace) -> None: |
scripts/evaluation/start_eval.sh
| @@ -7,14 +7,19 @@ cd "$ROOT" | @@ -7,14 +7,19 @@ cd "$ROOT" | ||
| 7 | PY="${ROOT}/.venv/bin/python" | 7 | PY="${ROOT}/.venv/bin/python" |
| 8 | TENANT_ID="${TENANT_ID:-163}" | 8 | TENANT_ID="${TENANT_ID:-163}" |
| 9 | DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}" | 9 | DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}" |
| 10 | -QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" | 10 | +RETRY_COUNT="${REPO_EVAL_RETRY_COUNT:-2}" |
| 11 | +EXTRA_QUERY_ARGS=() | ||
| 12 | +if [[ -n "${REPO_EVAL_QUERIES:-}" ]]; then | ||
| 13 | + EXTRA_QUERY_ARGS=(--queries-file "${REPO_EVAL_QUERIES}") | ||
| 14 | +fi | ||
| 11 | 15 | ||
| 12 | usage() { | 16 | usage() { |
| 13 | - echo "Usage: $0 batch|batch-rebuild|serve" | 17 | + echo "Usage: $0 batch|batch-rebuild|batch-rebuild-resume|serve" |
| 14 | echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" | 18 | echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" |
| 15 | echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" | 19 | echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" |
| 20 | + echo " batch-rebuild-resume — resume missing queries from dataset query_builds with retry/continue-on-error" | ||
| 16 | echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" | 21 | echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" |
| 17 | - echo "Env: TENANT_ID (default 163), REPO_EVAL_DATASET_ID (default core_queries), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" | 22 | + echo "Env: TENANT_ID (default 163), REPO_EVAL_DATASET_ID (default core_queries), REPO_EVAL_QUERIES (optional override), REPO_EVAL_RETRY_COUNT (default 2), EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" |
| 18 | } | 23 | } |
| 19 | 24 | ||
| 20 | case "${1:-}" in | 25 | case "${1:-}" in |
| @@ -22,21 +27,36 @@ case "${1:-}" in | @@ -22,21 +27,36 @@ case "${1:-}" in | ||
| 22 | exec "$PY" scripts/evaluation/build_annotation_set.py batch \ | 27 | exec "$PY" scripts/evaluation/build_annotation_set.py batch \ |
| 23 | --tenant-id "$TENANT_ID" \ | 28 | --tenant-id "$TENANT_ID" \ |
| 24 | --dataset-id "$DATASET_ID" \ | 29 | --dataset-id "$DATASET_ID" \ |
| 25 | - --queries-file "$QUERIES" \ | ||
| 26 | --top-k 50 \ | 30 | --top-k 50 \ |
| 27 | - --language en | 31 | + --language en \ |
| 32 | + "${EXTRA_QUERY_ARGS[@]}" | ||
| 28 | ;; | 33 | ;; |
| 29 | batch-rebuild) | 34 | batch-rebuild) |
| 30 | exec "$PY" scripts/evaluation/build_annotation_set.py build \ | 35 | exec "$PY" scripts/evaluation/build_annotation_set.py build \ |
| 31 | --tenant-id "$TENANT_ID" \ | 36 | --tenant-id "$TENANT_ID" \ |
| 32 | --dataset-id "$DATASET_ID" \ | 37 | --dataset-id "$DATASET_ID" \ |
| 33 | - --queries-file "$QUERIES" \ | ||
| 34 | --search-depth 500 \ | 38 | --search-depth 500 \ |
| 35 | --rerank-depth 10000 \ | 39 | --rerank-depth 10000 \ |
| 36 | --reset-artifacts \ | 40 | --reset-artifacts \ |
| 37 | --force-refresh-rerank \ | 41 | --force-refresh-rerank \ |
| 38 | --force-refresh-labels \ | 42 | --force-refresh-labels \ |
| 39 | - --language en | 43 | + --language en \ |
| 44 | + "${EXTRA_QUERY_ARGS[@]}" | ||
| 45 | + ;; | ||
| 46 | + batch-rebuild-resume) | ||
| 47 | + exec "$PY" scripts/evaluation/build_annotation_set.py build \ | ||
| 48 | + --tenant-id "$TENANT_ID" \ | ||
| 49 | + --dataset-id "$DATASET_ID" \ | ||
| 50 | + --search-depth 500 \ | ||
| 51 | + --rerank-depth 10000 \ | ||
| 52 | + --force-refresh-rerank \ | ||
| 53 | + --force-refresh-labels \ | ||
| 54 | + --resume-missing \ | ||
| 55 | + --continue-on-error \ | ||
| 56 | + --max-retries-per-query "$RETRY_COUNT" \ | ||
| 57 | + --retry-backoff-sec 10 \ | ||
| 58 | + --language en \ | ||
| 59 | + "${EXTRA_QUERY_ARGS[@]}" | ||
| 40 | ;; | 60 | ;; |
| 41 | serve) | 61 | serve) |
| 42 | EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" | 62 | EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" |
| @@ -44,9 +64,9 @@ case "${1:-}" in | @@ -44,9 +64,9 @@ case "${1:-}" in | ||
| 44 | exec "$PY" scripts/evaluation/serve_eval_web.py serve \ | 64 | exec "$PY" scripts/evaluation/serve_eval_web.py serve \ |
| 45 | --tenant-id "$TENANT_ID" \ | 65 | --tenant-id "$TENANT_ID" \ |
| 46 | --dataset-id "$DATASET_ID" \ | 66 | --dataset-id "$DATASET_ID" \ |
| 47 | - --queries-file "$QUERIES" \ | ||
| 48 | --host "$EVAL_WEB_HOST" \ | 67 | --host "$EVAL_WEB_HOST" \ |
| 49 | - --port "$EVAL_WEB_PORT" | 68 | + --port "$EVAL_WEB_PORT" \ |
| 69 | + "${EXTRA_QUERY_ARGS[@]}" | ||
| 50 | ;; | 70 | ;; |
| 51 | *) | 71 | *) |
| 52 | usage | 72 | usage |
scripts/service_ctl.sh
| @@ -19,7 +19,7 @@ CORE_SERVICES=("backend" "indexer" "frontend" "eval-web") | @@ -19,7 +19,7 @@ CORE_SERVICES=("backend" "indexer" "frontend" "eval-web") | ||
| 19 | # reranker-fine 暂时不用,因此暂时从OPTIONAL_SERVICES中删除 | 19 | # reranker-fine 暂时不用,因此暂时从OPTIONAL_SERVICES中删除 |
| 20 | OPTIONAL_SERVICES=("tei" "cnclip" "embedding" "embedding-image" "translator" "reranker") | 20 | OPTIONAL_SERVICES=("tei" "cnclip" "embedding" "embedding-image" "translator" "reranker") |
| 21 | FULL_SERVICES=("${OPTIONAL_SERVICES[@]}" "${CORE_SERVICES[@]}") | 21 | FULL_SERVICES=("${OPTIONAL_SERVICES[@]}" "${CORE_SERVICES[@]}") |
| 22 | -STOP_ORDER_SERVICES=("frontend" "eval-web" "indexer" "backend" "reranker" "translator" "embedding-image" "embedding" "cnclip" "tei") | 22 | +STOP_ORDER_SERVICES=("frontend" "eval-web" "indexer" "reranker" "translator" "embedding-image" "embedding" "cnclip" "tei" "backend") |
| 23 | declare -Ag SERVICE_ENABLED_CACHE=() | 23 | declare -Ag SERVICE_ENABLED_CACHE=() |
| 24 | 24 | ||
| 25 | all_services() { | 25 | all_services() { |
scripts/start_eval_web.sh
| @@ -10,7 +10,10 @@ EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" | @@ -10,7 +10,10 @@ EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" | ||
| 10 | EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}" | 10 | EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}" |
| 11 | TENANT_ID="${TENANT_ID:-163}" | 11 | TENANT_ID="${TENANT_ID:-163}" |
| 12 | DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}" | 12 | DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}" |
| 13 | -QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" | 13 | +EXTRA_QUERY_ARGS=() |
| 14 | +if [[ -n "${REPO_EVAL_QUERIES:-}" ]]; then | ||
| 15 | + EXTRA_QUERY_ARGS=(--queries-file "${REPO_EVAL_QUERIES}") | ||
| 16 | +fi | ||
| 14 | 17 | ||
| 15 | GREEN='\033[0;32m' | 18 | GREEN='\033[0;32m' |
| 16 | YELLOW='\033[1;33m' | 19 | YELLOW='\033[1;33m' |
| @@ -27,6 +30,6 @@ export EVAL_WEB_PORT EVAL_WEB_HOST TENANT_ID REPO_EVAL_DATASET_ID REPO_EVAL_QUER | @@ -27,6 +30,6 @@ export EVAL_WEB_PORT EVAL_WEB_HOST TENANT_ID REPO_EVAL_DATASET_ID REPO_EVAL_QUER | ||
| 27 | exec python scripts/evaluation/serve_eval_web.py serve \ | 30 | exec python scripts/evaluation/serve_eval_web.py serve \ |
| 28 | --tenant-id "${TENANT_ID}" \ | 31 | --tenant-id "${TENANT_ID}" \ |
| 29 | --dataset-id "${DATASET_ID}" \ | 32 | --dataset-id "${DATASET_ID}" \ |
| 30 | - --queries-file "${QUERIES}" \ | ||
| 31 | --host "${EVAL_WEB_HOST}" \ | 33 | --host "${EVAL_WEB_HOST}" \ |
| 32 | - --port "${EVAL_WEB_PORT}" | 34 | + --port "${EVAL_WEB_PORT}" \ |
| 35 | + "${EXTRA_QUERY_ARGS[@]}" |