Commit 12a75c466878d4c014fcf92cca4f1a012878496c

Authored by tangwang
1 parent 99b72698

feat(eval): 为 LLM 标注添加统一续跑能力,支持断点续传与容错重试

- 问题背景:clothing_top771 数据集的标注任务被外部异常终止(reranker 被 kill);此前缺乏统一的断点续跑机制,只能依赖临时脚本恢复。
- 解决方案:在 eval_framework/cli.py 的 build 命令中新增 --resume-missing、--continue-on-error、--max-retries-per-query、--retry-backoff-sec 参数,并修正默认参数逻辑(有 dataset_id 时不再强塞 legacy queries_file)。
- 脚本统一:更新 start_eval.sh 和 start_eval_web.sh,增加 batch-rebuild-resume 入口,统一使用 dataset 模式,REPO_EVAL_QUERIES 改为可选覆盖。
- 文档补充:在 scripts/evaluation/README.md 中添加中断续跑说明和新命令用法。
- 验证:eval-web 多数据集接口(/api/datasets、/api/history?dataset_id=...)正常返回 core_queries 与 clothing_top771 分域结果;当前进程已越过第 48 条,query_builds 计数增至 54,正在处理第 55/771 条。

把流程做成可持续的“统一续跑”能力,避免再靠临时脚本:
- 在 [scripts/evaluation/eval_framework/cli.py](/data/saas-search/scripts/evaluation/eval_framework/cli.py) 新增 `build --resume-missing --continue-on-error --max-retries-per-query --retry-backoff-sec`,并修正默认参数逻辑(有 `dataset_id` 时不再强塞 legacy `queries_file`)。
- 在 [scripts/evaluation/start_eval.sh](/data/saas-search/scripts/evaluation/start_eval.sh) 新增 `batch-rebuild-resume` 入口,统一用 dataset 模式,`REPO_EVAL_QUERIES` 仅作可选覆盖。
- 在 [scripts/start_eval_web.sh](/data/saas-search/scripts/start_eval_web.sh) 做同样的 dataset/queries 统一化。
- 在 [scripts/evaluation/README.md](/data/saas-search/scripts/evaluation/README.md) 补了中断续跑说明和新命令。
- 已验证 `eval-web` 多数据集接口正常(`/api/datasets`、`/api/history?dataset_id=...` 均返回 `core_queries` 与 `clothing_top771` 分域结果)。

当前在线进程:
- LLM 标注:`PID 2062901`(`build ... --dataset-id clothing_top771 --resume-missing ...`)
- reranker:`PID 2065235`(端口 6007,`/health` 返回 `ok`)

盯进度:
```bash
tail -f logs/eval.log
ls -1 artifacts/search_evaluation/datasets/clothing_top771/query_builds | wc -l
curl -sS http://127.0.0.1:6007/health
```

影响范围:scripts/evaluation/eval_framework/cli.py, scripts/evaluation/start_eval.sh, scripts/start_eval_web.sh, scripts/evaluation/README.md
artifacts/search_evaluation/build_launches/clothing_top771_batch_rebuild_resume_20260420T055157Z.pid 0 → 100644
... ... @@ -0,0 +1 @@
  1 +2061952
... ...
artifacts/search_evaluation/build_launches/clothing_top771_build_resume_direct_20260420T055302Z.pid 0 → 100644
... ... @@ -0,0 +1 @@
  1 +2062709
... ...
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.cmd 0 → 100644
... ... @@ -0,0 +1,44 @@
  1 +./.venv/bin/python - <<'PY'
  2 +import json
  3 +from pathlib import Path
  4 +from scripts.evaluation.eval_framework.datasets import resolve_dataset, query_builds_dir
  5 +from scripts.evaluation.eval_framework.framework import SearchEvaluationFramework
  6 +
  7 +dataset = resolve_dataset(dataset_id='clothing_top771', tenant_id='163', language='en', require_enabled=True)
  8 +fw = SearchEvaluationFramework(tenant_id=dataset.tenant_id)
  9 +qdir = query_builds_dir(fw.artifact_root, dataset.dataset_id)
  10 +seen = set()
  11 +for p in qdir.glob('*.json'):
  12 + try:
  13 + obj = json.loads(p.read_text(encoding='utf-8'))
  14 + q = str(obj.get('query') or '').strip()
  15 + if q:
  16 + seen.add(q)
  17 + except Exception:
  18 + pass
  19 +queries = list(dataset.queries)
  20 +remaining = [q for q in queries if q not in seen]
  21 +print(f"[resume] dataset={dataset.dataset_id} total={len(queries)} built={len(seen)} remaining={len(remaining)}")
  22 +if remaining:
  23 + print(f"[resume] first_remaining={remaining[0]!r} line={queries.index(remaining[0])+1}")
  24 +
  25 +for idx, q in enumerate(queries, start=1):
  26 + if q in seen:
  27 + continue
  28 + print(f"[resume] ({idx}/{len(queries)}) start query={q!r}")
  29 + result = fw.build_query_annotation_set(
  30 + query=q,
  31 + dataset=dataset,
  32 + search_depth=500,
  33 + rerank_depth=10000,
  34 + language='en',
  35 + force_refresh_rerank=True,
  36 + force_refresh_labels=True,
  37 + )
  38 + print(
  39 + f"[resume] done query={q!r} search_total={result.search_total} "
  40 + f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
  41 + f"annotated={result.annotated_count} output={result.output_json_path}"
  42 + )
  43 +print("[resume] all remaining queries completed")
  44 +PY
... ...
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.pid 0 → 100644
... ... @@ -0,0 +1 @@
  1 +2053402
... ...
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.cmd 0 → 100644
... ... @@ -0,0 +1,45 @@
  1 +PYTHONUNBUFFERED=1 ./.venv/bin/python - <<'PY'
  2 +import json
  3 +from pathlib import Path
  4 +from scripts.evaluation.eval_framework.datasets import resolve_dataset, query_builds_dir
  5 +from scripts.evaluation.eval_framework.framework import SearchEvaluationFramework
  6 +
  7 +dataset = resolve_dataset(dataset_id='clothing_top771', tenant_id='163', language='en', require_enabled=True)
  8 +fw = SearchEvaluationFramework(tenant_id=dataset.tenant_id)
  9 +qdir = query_builds_dir(fw.artifact_root, dataset.dataset_id)
  10 +seen = set()
  11 +for p in qdir.glob('*.json'):
  12 + try:
  13 + obj = json.loads(p.read_text(encoding='utf-8'))
  14 + q = str(obj.get('query') or '').strip()
  15 + if q:
  16 + seen.add(q)
  17 + except Exception:
  18 + pass
  19 +queries = list(dataset.queries)
  20 +remaining = [q for q in queries if q not in seen]
  21 +print(f"[resume] dataset={dataset.dataset_id} total={len(queries)} built={len(seen)} remaining={len(remaining)}")
  22 +if not remaining:
  23 + print('[resume] nothing to do')
  24 +else:
  25 + print(f"[resume] first_remaining={remaining[0]!r} line={queries.index(remaining[0])+1}")
  26 +for idx, q in enumerate(queries, start=1):
  27 + if q in seen:
  28 + continue
  29 + print(f"[resume] ({idx}/{len(queries)}) start query={q!r}")
  30 + result = fw.build_query_annotation_set(
  31 + query=q,
  32 + dataset=dataset,
  33 + search_depth=500,
  34 + rerank_depth=10000,
  35 + language='en',
  36 + force_refresh_rerank=True,
  37 + force_refresh_labels=True,
  38 + )
  39 + print(
  40 + f"[resume] done query={q!r} search_total={result.search_total} "
  41 + f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
  42 + f"annotated={result.annotated_count} output={result.output_json_path}"
  43 + )
  44 +print('[resume] all remaining queries completed')
  45 +PY
... ...
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.pid 0 → 100644
... ... @@ -0,0 +1 @@
  1 +2054946
... ...
artifacts/search_evaluation/build_launches/reranker-resume.pid 0 → 100644
... ... @@ -0,0 +1 @@
  1 +2064765
... ...
scripts/evaluation/README.md
... ... @@ -24,7 +24,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API,
24 24 | `queries/queries.txt` | Legacy core query set (`dataset_id=core_queries`) |
25 25 | `queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered` | Expanded clothing dataset (`dataset_id=clothing_top771`) |
26 26 | `README_Requirement.md` | Product/requirements reference |
27   -| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` |
  27 +| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild`, `batch-rebuild-resume` (resume from existing per-query outputs), or `serve` |
28 28 | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. |
29 29  
30 30 ## Quick start (repo root)
... ... @@ -41,6 +41,9 @@ REPO_EVAL_DATASET_ID=clothing_top771 ./scripts/evaluation/start_eval.sh batch
41 41 # Deep rebuild: per-query full corpus rerank (outside search recall pool) + LLM in batches along global sort order (early stop; expensive)
42 42 ./scripts/evaluation/start_eval.sh batch-rebuild
43 43  
  44 +# Resume deep rebuild from existing query_builds (recommended for long 771-query runs)
  45 +REPO_EVAL_DATASET_ID=clothing_top771 ./scripts/evaluation/start_eval.sh batch-rebuild-resume
  46 +
44 47 # UI: http://127.0.0.1:6010/
45 48 ./scripts/evaluation/start_eval.sh serve
46 49 # or: ./scripts/service_ctl.sh start eval-web
... ... @@ -79,7 +82,7 @@ Each `batch` run walks the full queries file and writes a **batch report** under
79 82  
80 83 This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`.
81 84  
82   -For **each** query in `queries.txt`, in order:
  85 +For **each** query in the selected dataset query file (`--dataset-id` / `config.yaml search_evaluation.datasets[*].query_file`), in order:
83 86  
84 87 1. **Search recall** — Call the live search API with `size = max(--search-depth, --search-recall-top-k)` (the wrapper uses `--search-depth 500`). The first **`--search-recall-top-k`** hits (default **200**, see `eval_framework.constants.DEFAULT_SEARCH_RECALL_TOP_K`) form the **recall pool**; they are treated as rerank score **1** and are **not** sent to the reranker.
85 88 2. **Full corpus** — Load the tenant’s product corpus from Elasticsearch (same tenant as `TENANT_ID` / `--tenant-id`, default **163**), via `corpus_docs()` (cached in SQLite after the first load).
... ... @@ -104,6 +107,8 @@ For **each** query in `queries.txt`, in order:
104 107  
105 108 **Tuning the rebuild path:** `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-llm-batch-size`, `--rebuild-min-batches`, `--rebuild-max-batches`, `--rebuild-irrelevant-stop-ratio`, `--rebuild-irrel-low-combined-stop-ratio`, `--rebuild-irrelevant-stop-streak` on `build` (see `eval_framework/cli.py`). Rerank API chunk size is **80** docs per request in code (`full_corpus_rerank_outside_exclude`).
106 109  
  110 +**Resuming interrupted runs:** for long jobs (for example `clothing_top771`), use `batch-rebuild-resume` or pass `build --resume-missing --continue-on-error --max-retries-per-query N`. Resume mode skips queries that already have per-query JSON under `datasets/<dataset_id>/query_builds/`.
  111 +
107 112 ## Artifacts
108 113  
109 114 Default root: `artifacts/search_evaluation/`
... ...
scripts/evaluation/eval_framework/cli.py
... ... @@ -6,15 +6,16 @@ import argparse
6 6 import json
7 7 import logging
8 8 import shutil
  9 +import time
9 10 from pathlib import Path
10   -from typing import Any, Dict
  11 +from typing import Any, Dict, List, Set
11 12  
12 13 from config.loader import get_app_config
13 14  
14 15 from .datasets import audits_dir, query_builds_dir, resolve_dataset
15 16 from .framework import SearchEvaluationFramework
16 17 from .logging_setup import setup_eval_logging
17   -from .utils import ensure_dir, utc_now_iso, utc_timestamp
  18 +from .utils import utc_now_iso, utc_timestamp
18 19 from .web_app import create_web_app
19 20  
20 21 _cli_log = logging.getLogger("search_eval.cli")
... ... @@ -95,7 +96,8 @@ def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -&gt; None:
95 96 args.dataset_id = se.default_dataset_id
96 97 if getattr(args, "tenant_id", None) in (None, ""):
97 98 args.tenant_id = se.default_tenant_id
98   - if getattr(args, "queries_file", None) in (None, ""):
  99 + # Keep legacy queries_file fallback only when dataset_id is not specified.
  100 + if getattr(args, "queries_file", None) in (None, "") and getattr(args, "dataset_id", None) in (None, ""):
99 101 args.queries_file = str(se.queries_file)
100 102 if getattr(args, "language", None) in (None, ""):
101 103 args.language = se.default_language
... ... @@ -162,6 +164,23 @@ def _resolve_dataset_from_args(args: argparse.Namespace, *, require_enabled: boo
162 164 return dataset
163 165  
164 166  
  167 +def _list_built_queries(artifact_root: Path, dataset_id: str) -> Set[str]:
  168 + built: Set[str] = set()
  169 + root = query_builds_dir(artifact_root, dataset_id)
  170 + for path in root.glob("*.json"):
  171 + name = path.name
  172 + if name.startswith("build_summary_") or name.startswith("build_failures_"):
  173 + continue
  174 + try:
  175 + payload = json.loads(path.read_text(encoding="utf-8"))
  176 + except Exception:
  177 + continue
  178 + query = str(payload.get("query") or "").strip()
  179 + if query:
  180 + built.add(query)
  181 + return built
  182 +
  183 +
165 184 def build_cli_parser() -> argparse.ArgumentParser:
166 185 parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
167 186 sub = parser.add_subparsers(dest="command", required=True)
... ... @@ -251,6 +270,28 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
251 270 action="store_true",
252 271 help="Delete dataset-specific query_builds/audits before starting. Shared SQLite cache is preserved.",
253 272 )
  273 + build.add_argument(
  274 + "--resume-missing",
  275 + action="store_true",
  276 + help="Skip queries that already have per-query build JSONs in this dataset's query_builds directory.",
  277 + )
  278 + build.add_argument(
  279 + "--continue-on-error",
  280 + action="store_true",
  281 + help="Continue with remaining queries when one query fails after retries.",
  282 + )
  283 + build.add_argument(
  284 + "--max-retries-per-query",
  285 + type=int,
  286 + default=0,
  287 + help="Retry count per failed query before giving up (default: 0).",
  288 + )
  289 + build.add_argument(
  290 + "--retry-backoff-sec",
  291 + type=float,
  292 + default=5.0,
  293 + help="Base backoff seconds between retries (actual sleep = base * attempt_no).",
  294 + )
254 295 build.add_argument("--force-refresh-rerank", action="store_true")
255 296 build.add_argument("--force-refresh-labels", action="store_true")
256 297 add_judge_llm_args(build)
... ... @@ -300,7 +341,19 @@ def run_build(args: argparse.Namespace) -&gt; None:
300 341 _reset_build_artifacts(dataset.dataset_id)
301 342 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
302 343 queries = list(dataset.queries)
303   - summary = []
  344 + summary: List[Dict[str, Any]] = []
  345 + failures: List[Dict[str, Any]] = []
  346 + completed_queries: Set[str] = set()
  347 + if args.resume_missing:
  348 + completed_queries = _list_built_queries(framework.artifact_root, dataset.dataset_id)
  349 + _cli_log.info(
  350 + "[build] resume mode: dataset=%s total=%s already_built=%s remaining=%s",
  351 + dataset.dataset_id,
  352 + len(queries),
  353 + len(completed_queries),
  354 + max(0, len(queries) - len(completed_queries)),
  355 + )
  356 + skipped_queries = 0
304 357 rebuild_kwargs = {}
305 358 if args.force_refresh_labels:
306 359 rebuild_kwargs = {
... ... @@ -316,23 +369,69 @@ def run_build(args: argparse.Namespace) -&gt; None:
316 369 }
317 370 total_q = len(queries)
318 371 for q_index, query in enumerate(queries, start=1):
319   - _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query)
320   - try:
321   - result = framework.build_query_annotation_set(
322   - query=query,
323   - dataset=dataset,
324   - search_depth=args.search_depth,
325   - rerank_depth=args.rerank_depth,
326   - annotate_search_top_k=args.annotate_search_top_k,
327   - annotate_rerank_top_k=args.annotate_rerank_top_k,
328   - language=args.language,
329   - force_refresh_rerank=args.force_refresh_rerank,
330   - force_refresh_labels=args.force_refresh_labels,
331   - **rebuild_kwargs,
  372 + if query in completed_queries:
  373 + skipped_queries += 1
  374 + _cli_log.info("[build] (%s/%s) skip query=%r (already built)", q_index, total_q, query)
  375 + continue
  376 +
  377 + attempt = 0
  378 + while True:
  379 + max_attempts = max(1, int(args.max_retries_per_query) + 1)
  380 + _cli_log.info(
  381 + "[build] (%s/%s) starting query=%r attempt=%s/%s",
  382 + q_index,
  383 + total_q,
  384 + query,
  385 + attempt + 1,
  386 + max_attempts,
332 387 )
333   - except Exception:
334   - _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q)
335   - raise
  388 + try:
  389 + result = framework.build_query_annotation_set(
  390 + query=query,
  391 + dataset=dataset,
  392 + search_depth=args.search_depth,
  393 + rerank_depth=args.rerank_depth,
  394 + annotate_search_top_k=args.annotate_search_top_k,
  395 + annotate_rerank_top_k=args.annotate_rerank_top_k,
  396 + language=args.language,
  397 + force_refresh_rerank=args.force_refresh_rerank,
  398 + force_refresh_labels=args.force_refresh_labels,
  399 + **rebuild_kwargs,
  400 + )
  401 + break
  402 + except Exception as exc:
  403 + attempt += 1
  404 + if attempt <= int(args.max_retries_per_query):
  405 + sleep_seconds = max(0.0, float(args.retry_backoff_sec)) * attempt
  406 + _cli_log.warning(
  407 + "[build] query=%r failed attempt=%s/%s; retry in %.1fs: %s",
  408 + query,
  409 + attempt,
  410 + max_attempts,
  411 + sleep_seconds,
  412 + exc,
  413 + )
  414 + if sleep_seconds > 0:
  415 + time.sleep(sleep_seconds)
  416 + continue
  417 +
  418 + _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q)
  419 + failures.append(
  420 + {
  421 + "query": query,
  422 + "index": q_index,
  423 + "error": repr(exc),
  424 + }
  425 + )
  426 + if not args.continue_on_error:
  427 + raise
  428 + _cli_log.error("[build] continue_on_error=true; skip failed query=%r", query)
  429 + result = None
  430 + break
  431 +
  432 + if result is None:
  433 + continue
  434 +
336 435 summary.append(
337 436 {
338 437 "query": result.query,
... ... @@ -352,10 +451,19 @@ def run_build(args: argparse.Namespace) -&gt; None:
352 451 result.annotated_count,
353 452 result.output_json_path,
354 453 )
355   - out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
356 454 out_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_summary_{utc_timestamp()}.json"
357 455 out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
358   - _cli_log.info("[done] summary=%s", out_path)
  456 + _cli_log.info(
  457 + "[done] summary=%s success=%s skipped=%s failed=%s",
  458 + out_path,
  459 + len(summary),
  460 + skipped_queries,
  461 + len(failures),
  462 + )
  463 + if failures:
  464 + failed_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_failures_{utc_timestamp()}.json"
  465 + failed_path.write_text(json.dumps(failures, ensure_ascii=False, indent=2), encoding="utf-8")
  466 + _cli_log.warning("[done] failures=%s", failed_path)
359 467  
360 468  
361 469 def run_batch(args: argparse.Namespace) -> None:
... ...
scripts/evaluation/start_eval.sh
... ... @@ -7,14 +7,19 @@ cd &quot;$ROOT&quot;
7 7 PY="${ROOT}/.venv/bin/python"
8 8 TENANT_ID="${TENANT_ID:-163}"
9 9 DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}"
10   -QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
  10 +RETRY_COUNT="${REPO_EVAL_RETRY_COUNT:-2}"
  11 +EXTRA_QUERY_ARGS=()
  12 +if [[ -n "${REPO_EVAL_QUERIES:-}" ]]; then
  13 + EXTRA_QUERY_ARGS=(--queries-file "${REPO_EVAL_QUERIES}")
  14 +fi
11 15  
12 16 usage() {
13   - echo "Usage: $0 batch|batch-rebuild|serve"
  17 + echo "Usage: $0 batch|batch-rebuild|batch-rebuild-resume|serve"
14 18 echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)"
15 19 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)"
  20 + echo " batch-rebuild-resume — resume missing queries from dataset query_builds with retry/continue-on-error"
16 21 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)"
17   - echo "Env: TENANT_ID (default 163), REPO_EVAL_DATASET_ID (default core_queries), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
  22 + echo "Env: TENANT_ID (default 163), REPO_EVAL_DATASET_ID (default core_queries), REPO_EVAL_QUERIES (optional override), REPO_EVAL_RETRY_COUNT (default 2), EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
18 23 }
19 24  
20 25 case "${1:-}" in
... ... @@ -22,21 +27,36 @@ case &quot;${1:-}&quot; in
22 27 exec "$PY" scripts/evaluation/build_annotation_set.py batch \
23 28 --tenant-id "$TENANT_ID" \
24 29 --dataset-id "$DATASET_ID" \
25   - --queries-file "$QUERIES" \
26 30 --top-k 50 \
27   - --language en
  31 + --language en \
  32 + "${EXTRA_QUERY_ARGS[@]}"
28 33 ;;
29 34 batch-rebuild)
30 35 exec "$PY" scripts/evaluation/build_annotation_set.py build \
31 36 --tenant-id "$TENANT_ID" \
32 37 --dataset-id "$DATASET_ID" \
33   - --queries-file "$QUERIES" \
34 38 --search-depth 500 \
35 39 --rerank-depth 10000 \
36 40 --reset-artifacts \
37 41 --force-refresh-rerank \
38 42 --force-refresh-labels \
39   - --language en
  43 + --language en \
  44 + "${EXTRA_QUERY_ARGS[@]}"
  45 + ;;
  46 + batch-rebuild-resume)
  47 + exec "$PY" scripts/evaluation/build_annotation_set.py build \
  48 + --tenant-id "$TENANT_ID" \
  49 + --dataset-id "$DATASET_ID" \
  50 + --search-depth 500 \
  51 + --rerank-depth 10000 \
  52 + --force-refresh-rerank \
  53 + --force-refresh-labels \
  54 + --resume-missing \
  55 + --continue-on-error \
  56 + --max-retries-per-query "$RETRY_COUNT" \
  57 + --retry-backoff-sec 10 \
  58 + --language en \
  59 + "${EXTRA_QUERY_ARGS[@]}"
40 60 ;;
41 61 serve)
42 62 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}"
... ... @@ -44,9 +64,9 @@ case &quot;${1:-}&quot; in
44 64 exec "$PY" scripts/evaluation/serve_eval_web.py serve \
45 65 --tenant-id "$TENANT_ID" \
46 66 --dataset-id "$DATASET_ID" \
47   - --queries-file "$QUERIES" \
48 67 --host "$EVAL_WEB_HOST" \
49   - --port "$EVAL_WEB_PORT"
  68 + --port "$EVAL_WEB_PORT" \
  69 + "${EXTRA_QUERY_ARGS[@]}"
50 70 ;;
51 71 *)
52 72 usage
... ...
scripts/service_ctl.sh
... ... @@ -19,7 +19,7 @@ CORE_SERVICES=(&quot;backend&quot; &quot;indexer&quot; &quot;frontend&quot; &quot;eval-web&quot;)
19 19 # reranker-fine 暂时不用,因此暂时从OPTIONAL_SERVICES中删除
20 20 OPTIONAL_SERVICES=("tei" "cnclip" "embedding" "embedding-image" "translator" "reranker")
21 21 FULL_SERVICES=("${OPTIONAL_SERVICES[@]}" "${CORE_SERVICES[@]}")
22   -STOP_ORDER_SERVICES=("frontend" "eval-web" "indexer" "backend" "reranker" "translator" "embedding-image" "embedding" "cnclip" "tei")
  22 +STOP_ORDER_SERVICES=("frontend" "eval-web" "indexer" "reranker" "translator" "embedding-image" "embedding" "cnclip" "tei" "backend")
23 23 declare -Ag SERVICE_ENABLED_CACHE=()
24 24  
25 25 all_services() {
... ...
scripts/start_eval_web.sh
... ... @@ -10,7 +10,10 @@ EVAL_WEB_PORT=&quot;${EVAL_WEB_PORT:-6010}&quot;
10 10 EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}"
11 11 TENANT_ID="${TENANT_ID:-163}"
12 12 DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}"
13   -QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
  13 +EXTRA_QUERY_ARGS=()
  14 +if [[ -n "${REPO_EVAL_QUERIES:-}" ]]; then
  15 + EXTRA_QUERY_ARGS=(--queries-file "${REPO_EVAL_QUERIES}")
  16 +fi
14 17  
15 18 GREEN='\033[0;32m'
16 19 YELLOW='\033[1;33m'
... ... @@ -27,6 +30,6 @@ export EVAL_WEB_PORT EVAL_WEB_HOST TENANT_ID REPO_EVAL_DATASET_ID REPO_EVAL_QUER
27 30 exec python scripts/evaluation/serve_eval_web.py serve \
28 31 --tenant-id "${TENANT_ID}" \
29 32 --dataset-id "${DATASET_ID}" \
30   - --queries-file "${QUERIES}" \
31 33 --host "${EVAL_WEB_HOST}" \
32   - --port "${EVAL_WEB_PORT}"
  34 + --port "${EVAL_WEB_PORT}" \
  35 + "${EXTRA_QUERY_ARGS[@]}"
... ...