From 12a75c466878d4c014fcf92cca4f1a012878496c Mon Sep 17 00:00:00 2001 From: tangwang Date: Mon, 20 Apr 2026 14:16:15 +0800 Subject: [PATCH] feat(eval): 为 LLM 标注添加统一续跑能力,支持断点续传与容错重试 --- artifacts/search_evaluation/build_launches/clothing_top771_batch_rebuild_resume_20260420T055157Z.pid | 1 + artifacts/search_evaluation/build_launches/clothing_top771_build_resume_direct_20260420T055302Z.pid | 1 + artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.cmd | 44 ++++++++++++++++++++++++++++++++++++++++++++ artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.pid | 1 + artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.cmd | 45 +++++++++++++++++++++++++++++++++++++++++++++ artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.pid | 1 + artifacts/search_evaluation/build_launches/reranker-resume.pid | 1 + scripts/evaluation/README.md | 9 +++++++-- scripts/evaluation/eval_framework/cli.py | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------- scripts/evaluation/start_eval.sh | 38 +++++++++++++++++++++++++++++--------- scripts/service_ctl.sh | 2 +- scripts/start_eval_web.sh | 9 ++++++--- 12 files changed, 267 insertions(+), 37 deletions(-) create mode 100644 artifacts/search_evaluation/build_launches/clothing_top771_batch_rebuild_resume_20260420T055157Z.pid create mode 100644 artifacts/search_evaluation/build_launches/clothing_top771_build_resume_direct_20260420T055302Z.pid create mode 100644 artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.cmd create mode 100644 artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.pid create mode 100644 
artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.cmd create mode 100644 artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.pid create mode 100644 artifacts/search_evaluation/build_launches/reranker-resume.pid diff --git a/artifacts/search_evaluation/build_launches/clothing_top771_batch_rebuild_resume_20260420T055157Z.pid b/artifacts/search_evaluation/build_launches/clothing_top771_batch_rebuild_resume_20260420T055157Z.pid new file mode 100644 index 0000000..28370b2 --- /dev/null +++ b/artifacts/search_evaluation/build_launches/clothing_top771_batch_rebuild_resume_20260420T055157Z.pid @@ -0,0 +1 @@ +2061952 diff --git a/artifacts/search_evaluation/build_launches/clothing_top771_build_resume_direct_20260420T055302Z.pid b/artifacts/search_evaluation/build_launches/clothing_top771_build_resume_direct_20260420T055302Z.pid new file mode 100644 index 0000000..77aca32 --- /dev/null +++ b/artifacts/search_evaluation/build_launches/clothing_top771_build_resume_direct_20260420T055302Z.pid @@ -0,0 +1 @@ +2062709 diff --git a/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.cmd b/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.cmd new file mode 100644 index 0000000..259f655 --- /dev/null +++ b/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.cmd @@ -0,0 +1,44 @@ +./.venv/bin/python - <<'PY' +import json +from pathlib import Path +from scripts.evaluation.eval_framework.datasets import resolve_dataset, query_builds_dir +from scripts.evaluation.eval_framework.framework import SearchEvaluationFramework + +dataset = resolve_dataset(dataset_id='clothing_top771', tenant_id='163', language='en', require_enabled=True) +fw = SearchEvaluationFramework(tenant_id=dataset.tenant_id) +qdir = query_builds_dir(fw.artifact_root, dataset.dataset_id) +seen = 
set() +for p in qdir.glob('*.json'): + try: + obj = json.loads(p.read_text(encoding='utf-8')) + q = str(obj.get('query') or '').strip() + if q: + seen.add(q) + except Exception: + pass +queries = list(dataset.queries) +remaining = [q for q in queries if q not in seen] +print(f"[resume] dataset={dataset.dataset_id} total={len(queries)} built={len(seen)} remaining={len(remaining)}") +if remaining: + print(f"[resume] first_remaining={remaining[0]!r} line={queries.index(remaining[0])+1}") + +for idx, q in enumerate(queries, start=1): + if q in seen: + continue + print(f"[resume] ({idx}/{len(queries)}) start query={q!r}") + result = fw.build_query_annotation_set( + query=q, + dataset=dataset, + search_depth=500, + rerank_depth=10000, + language='en', + force_refresh_rerank=True, + force_refresh_labels=True, + ) + print( + f"[resume] done query={q!r} search_total={result.search_total} " + f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} " + f"annotated={result.annotated_count} output={result.output_json_path}" + ) +print("[resume] all remaining queries completed") +PY diff --git a/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.pid b/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.pid new file mode 100644 index 0000000..e76c345 --- /dev/null +++ b/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054051Z.pid @@ -0,0 +1 @@ +2053402 diff --git a/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.cmd b/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.cmd new file mode 100644 index 0000000..a6c8dff --- /dev/null +++ b/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.cmd @@ -0,0 +1,45 @@ +PYTHONUNBUFFERED=1 ./.venv/bin/python - <<'PY' +import json +from pathlib import Path 
+from scripts.evaluation.eval_framework.datasets import resolve_dataset, query_builds_dir +from scripts.evaluation.eval_framework.framework import SearchEvaluationFramework + +dataset = resolve_dataset(dataset_id='clothing_top771', tenant_id='163', language='en', require_enabled=True) +fw = SearchEvaluationFramework(tenant_id=dataset.tenant_id) +qdir = query_builds_dir(fw.artifact_root, dataset.dataset_id) +seen = set() +for p in qdir.glob('*.json'): + try: + obj = json.loads(p.read_text(encoding='utf-8')) + q = str(obj.get('query') or '').strip() + if q: + seen.add(q) + except Exception: + pass +queries = list(dataset.queries) +remaining = [q for q in queries if q not in seen] +print(f"[resume] dataset={dataset.dataset_id} total={len(queries)} built={len(seen)} remaining={len(remaining)}") +if not remaining: + print('[resume] nothing to do') +else: + print(f"[resume] first_remaining={remaining[0]!r} line={queries.index(remaining[0])+1}") +for idx, q in enumerate(queries, start=1): + if q in seen: + continue + print(f"[resume] ({idx}/{len(queries)}) start query={q!r}") + result = fw.build_query_annotation_set( + query=q, + dataset=dataset, + search_depth=500, + rerank_depth=10000, + language='en', + force_refresh_rerank=True, + force_refresh_labels=True, + ) + print( + f"[resume] done query={q!r} search_total={result.search_total} " + f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} " + f"annotated={result.annotated_count} output={result.output_json_path}" + ) +print('[resume] all remaining queries completed') +PY diff --git a/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.pid b/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.pid new file mode 100644 index 0000000..d29088f --- /dev/null +++ b/artifacts/search_evaluation/build_launches/clothing_top771_resume_from_missing_20260420T054253Z_ub.pid @@ -0,0 +1 @@ +2054946 diff --git 
a/artifacts/search_evaluation/build_launches/reranker-resume.pid b/artifacts/search_evaluation/build_launches/reranker-resume.pid new file mode 100644 index 0000000..0bd0b3e --- /dev/null +++ b/artifacts/search_evaluation/build_launches/reranker-resume.pid @@ -0,0 +1 @@ +2064765 diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index 23c60b0..b053f3f 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -24,7 +24,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, | `queries/queries.txt` | Legacy core query set (`dataset_id=core_queries`) | | `queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered` | Expanded clothing dataset (`dataset_id=clothing_top771`) | | `README_Requirement.md` | Product/requirements reference | -| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | +| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild`, `batch-rebuild-resume` (resume from existing per-query outputs), or `serve` | | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. 
| ## Quick start (repo root) @@ -41,6 +41,9 @@ REPO_EVAL_DATASET_ID=clothing_top771 ./scripts/evaluation/start_eval.sh batch # Deep rebuild: per-query full corpus rerank (outside search recall pool) + LLM in batches along global sort order (early stop; expensive) ./scripts/evaluation/start_eval.sh batch-rebuild +# Resume deep rebuild from existing query_builds (recommended for long 771-query runs) +REPO_EVAL_DATASET_ID=clothing_top771 ./scripts/evaluation/start_eval.sh batch-rebuild-resume + # UI: http://127.0.0.1:6010/ ./scripts/evaluation/start_eval.sh serve # or: ./scripts/service_ctl.sh start eval-web @@ -79,7 +82,7 @@ Each `batch` run walks the full queries file and writes a **batch report** under This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`. -For **each** query in `queries.txt`, in order: +For **each** query in the selected dataset query file (`--dataset-id` / `config.yaml search_evaluation.datasets[*].query_file`), in order: 1. **Search recall** — Call the live search API with `size = max(--search-depth, --search-recall-top-k)` (the wrapper uses `--search-depth 500`). The first **`--search-recall-top-k`** hits (default **200**, see `eval_framework.constants.DEFAULT_SEARCH_RECALL_TOP_K`) form the **recall pool**; they are treated as rerank score **1** and are **not** sent to the reranker. 2. **Full corpus** — Load the tenant’s product corpus from Elasticsearch (same tenant as `TENANT_ID` / `--tenant-id`, default **163**), via `corpus_docs()` (cached in SQLite after the first load). 
@@ -104,6 +107,8 @@ For **each** query in `queries.txt`, in order: **Tuning the rebuild path:** `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-llm-batch-size`, `--rebuild-min-batches`, `--rebuild-max-batches`, `--rebuild-irrelevant-stop-ratio`, `--rebuild-irrel-low-combined-stop-ratio`, `--rebuild-irrelevant-stop-streak` on `build` (see `eval_framework/cli.py`). Rerank API chunk size is **80** docs per request in code (`full_corpus_rerank_outside_exclude`). +**Resuming interrupted runs:** for long jobs (for example `clothing_top771`), use `batch-rebuild-resume` or pass `build --resume-missing --continue-on-error --max-retries-per-query N`. Resume mode skips queries that already have per-query JSON under `datasets/<dataset_id>/query_builds/`. + ## Artifacts Default root: `artifacts/search_evaluation/` diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py index 90120d2..530d122 100644 --- a/scripts/evaluation/eval_framework/cli.py +++ b/scripts/evaluation/eval_framework/cli.py @@ -6,15 +6,16 @@ import argparse import json import logging import shutil +import time from pathlib import Path -from typing import Any, Dict +from typing import Any, Dict, List, Set from config.loader import get_app_config from .datasets import audits_dir, query_builds_dir, resolve_dataset from .framework import SearchEvaluationFramework from .logging_setup import setup_eval_logging -from .utils import ensure_dir, utc_now_iso, utc_timestamp +from .utils import utc_now_iso, utc_timestamp from .web_app import create_web_app _cli_log = logging.getLogger("search_eval.cli") @@ -95,7 +96,8 @@ def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None: args.dataset_id = se.default_dataset_id if getattr(args, "tenant_id", None) in (None, ""): args.tenant_id = se.default_tenant_id - if getattr(args, "queries_file", None) in (None, ""): + # Keep legacy queries_file fallback only when dataset_id is not specified. 
+ if getattr(args, "queries_file", None) in (None, "") and getattr(args, "dataset_id", None) in (None, ""): args.queries_file = str(se.queries_file) if getattr(args, "language", None) in (None, ""): args.language = se.default_language @@ -162,6 +164,23 @@ def _resolve_dataset_from_args(args: argparse.Namespace, *, require_enabled: boo return dataset +def _list_built_queries(artifact_root: Path, dataset_id: str) -> Set[str]: + built: Set[str] = set() + root = query_builds_dir(artifact_root, dataset_id) + for path in root.glob("*.json"): + name = path.name + if name.startswith("build_summary_") or name.startswith("build_failures_"): + continue + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + query = str(payload.get("query") or "").strip() + if query: + built.add(query) + return built + + def build_cli_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI") sub = parser.add_subparsers(dest="command", required=True) @@ -251,6 +270,28 @@ def build_cli_parser() -> argparse.ArgumentParser: action="store_true", help="Delete dataset-specific query_builds/audits before starting. 
Shared SQLite cache is preserved.", ) + build.add_argument( + "--resume-missing", + action="store_true", + help="Skip queries that already have per-query build JSONs in this dataset's query_builds directory.", + ) + build.add_argument( + "--continue-on-error", + action="store_true", + help="Continue with remaining queries when one query fails after retries.", + ) + build.add_argument( + "--max-retries-per-query", + type=int, + default=0, + help="Retry count per failed query before giving up (default: 0).", + ) + build.add_argument( + "--retry-backoff-sec", + type=float, + default=5.0, + help="Base backoff seconds between retries (actual sleep = base * attempt_no).", + ) build.add_argument("--force-refresh-rerank", action="store_true") build.add_argument("--force-refresh-labels", action="store_true") add_judge_llm_args(build) @@ -300,7 +341,19 @@ def run_build(args: argparse.Namespace) -> None: _reset_build_artifacts(dataset.dataset_id) framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = list(dataset.queries) - summary = [] + summary: List[Dict[str, Any]] = [] + failures: List[Dict[str, Any]] = [] + completed_queries: Set[str] = set() + if args.resume_missing: + completed_queries = _list_built_queries(framework.artifact_root, dataset.dataset_id) + _cli_log.info( + "[build] resume mode: dataset=%s total=%s already_built=%s remaining=%s", + dataset.dataset_id, + len(queries), + len(completed_queries), + max(0, len(queries) - len(completed_queries)), + ) + skipped_queries = 0 rebuild_kwargs = {} if args.force_refresh_labels: rebuild_kwargs = { @@ -316,23 +369,69 @@ def run_build(args: argparse.Namespace) -> None: } total_q = len(queries) for q_index, query in enumerate(queries, start=1): - _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query) - try: - result = framework.build_query_annotation_set( - query=query, - dataset=dataset, - search_depth=args.search_depth, - 
rerank_depth=args.rerank_depth, - annotate_search_top_k=args.annotate_search_top_k, - annotate_rerank_top_k=args.annotate_rerank_top_k, - language=args.language, - force_refresh_rerank=args.force_refresh_rerank, - force_refresh_labels=args.force_refresh_labels, - **rebuild_kwargs, + if query in completed_queries: + skipped_queries += 1 + _cli_log.info("[build] (%s/%s) skip query=%r (already built)", q_index, total_q, query) + continue + + attempt = 0 + while True: + max_attempts = max(1, int(args.max_retries_per_query) + 1) + _cli_log.info( + "[build] (%s/%s) starting query=%r attempt=%s/%s", + q_index, + total_q, + query, + attempt + 1, + max_attempts, ) - except Exception: - _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q) - raise + try: + result = framework.build_query_annotation_set( + query=query, + dataset=dataset, + search_depth=args.search_depth, + rerank_depth=args.rerank_depth, + annotate_search_top_k=args.annotate_search_top_k, + annotate_rerank_top_k=args.annotate_rerank_top_k, + language=args.language, + force_refresh_rerank=args.force_refresh_rerank, + force_refresh_labels=args.force_refresh_labels, + **rebuild_kwargs, + ) + break + except Exception as exc: + attempt += 1 + if attempt <= int(args.max_retries_per_query): + sleep_seconds = max(0.0, float(args.retry_backoff_sec)) * attempt + _cli_log.warning( + "[build] query=%r failed attempt=%s/%s; retry in %.1fs: %s", + query, + attempt, + max_attempts, + sleep_seconds, + exc, + ) + if sleep_seconds > 0: + time.sleep(sleep_seconds) + continue + + _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q) + failures.append( + { + "query": query, + "index": q_index, + "error": repr(exc), + } + ) + if not args.continue_on_error: + raise + _cli_log.error("[build] continue_on_error=true; skip failed query=%r", query) + result = None + break + + if result is None: + continue + summary.append( { "query": result.query, @@ -352,10 +451,19 @@ def 
run_build(args: argparse.Namespace) -> None: result.annotated_count, result.output_json_path, ) - out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json" out_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_summary_{utc_timestamp()}.json" out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") - _cli_log.info("[done] summary=%s", out_path) + _cli_log.info( + "[done] summary=%s success=%s skipped=%s failed=%s", + out_path, + len(summary), + skipped_queries, + len(failures), + ) + if failures: + failed_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_failures_{utc_timestamp()}.json" + failed_path.write_text(json.dumps(failures, ensure_ascii=False, indent=2), encoding="utf-8") + _cli_log.warning("[done] failures=%s", failed_path) def run_batch(args: argparse.Namespace) -> None: diff --git a/scripts/evaluation/start_eval.sh b/scripts/evaluation/start_eval.sh index 6f6e71a..43f33a3 100755 --- a/scripts/evaluation/start_eval.sh +++ b/scripts/evaluation/start_eval.sh @@ -7,14 +7,19 @@ cd "$ROOT" PY="${ROOT}/.venv/bin/python" TENANT_ID="${TENANT_ID:-163}" DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}" -QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" +RETRY_COUNT="${REPO_EVAL_RETRY_COUNT:-2}" +EXTRA_QUERY_ARGS=() +if [[ -n "${REPO_EVAL_QUERIES:-}" ]]; then + EXTRA_QUERY_ARGS=(--queries-file "${REPO_EVAL_QUERIES}") +fi usage() { - echo "Usage: $0 batch|batch-rebuild|serve" + echo "Usage: $0 batch|batch-rebuild|batch-rebuild-resume|serve" echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" + echo " batch-rebuild-resume — resume missing queries from dataset query_builds with retry/continue-on-error" echo " serve — eval UI (default 
http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" - echo "Env: TENANT_ID (default 163), REPO_EVAL_DATASET_ID (default core_queries), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" + echo "Env: TENANT_ID (default 163), REPO_EVAL_DATASET_ID (default core_queries), REPO_EVAL_QUERIES (optional override), REPO_EVAL_RETRY_COUNT (default 2), EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" } case "${1:-}" in @@ -22,21 +27,36 @@ case "${1:-}" in exec "$PY" scripts/evaluation/build_annotation_set.py batch \ --tenant-id "$TENANT_ID" \ --dataset-id "$DATASET_ID" \ - --queries-file "$QUERIES" \ --top-k 50 \ - --language en + --language en \ + "${EXTRA_QUERY_ARGS[@]}" ;; batch-rebuild) exec "$PY" scripts/evaluation/build_annotation_set.py build \ --tenant-id "$TENANT_ID" \ --dataset-id "$DATASET_ID" \ - --queries-file "$QUERIES" \ --search-depth 500 \ --rerank-depth 10000 \ --reset-artifacts \ --force-refresh-rerank \ --force-refresh-labels \ - --language en + --language en \ + "${EXTRA_QUERY_ARGS[@]}" + ;; + batch-rebuild-resume) + exec "$PY" scripts/evaluation/build_annotation_set.py build \ + --tenant-id "$TENANT_ID" \ + --dataset-id "$DATASET_ID" \ + --search-depth 500 \ + --rerank-depth 10000 \ + --force-refresh-rerank \ + --force-refresh-labels \ + --resume-missing \ + --continue-on-error \ + --max-retries-per-query "$RETRY_COUNT" \ + --retry-backoff-sec 10 \ + --language en \ + "${EXTRA_QUERY_ARGS[@]}" ;; serve) EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" @@ -44,9 +64,9 @@ case "${1:-}" in exec "$PY" scripts/evaluation/serve_eval_web.py serve \ --tenant-id "$TENANT_ID" \ --dataset-id "$DATASET_ID" \ - --queries-file "$QUERIES" \ --host "$EVAL_WEB_HOST" \ - --port "$EVAL_WEB_PORT" + --port "$EVAL_WEB_PORT" \ + "${EXTRA_QUERY_ARGS[@]}" ;; *) usage diff --git a/scripts/service_ctl.sh b/scripts/service_ctl.sh index 9d89cc6..14a0b79 100755 --- a/scripts/service_ctl.sh +++ b/scripts/service_ctl.sh @@ -19,7 +19,7 @@ CORE_SERVICES=("backend" 
"indexer" "frontend" "eval-web") # reranker-fine 暂时不用,因此暂时从OPTIONAL_SERVICES中删除 OPTIONAL_SERVICES=("tei" "cnclip" "embedding" "embedding-image" "translator" "reranker") FULL_SERVICES=("${OPTIONAL_SERVICES[@]}" "${CORE_SERVICES[@]}") -STOP_ORDER_SERVICES=("frontend" "eval-web" "indexer" "backend" "reranker" "translator" "embedding-image" "embedding" "cnclip" "tei") +STOP_ORDER_SERVICES=("frontend" "eval-web" "indexer" "reranker" "translator" "embedding-image" "embedding" "cnclip" "tei" "backend") declare -Ag SERVICE_ENABLED_CACHE=() all_services() { diff --git a/scripts/start_eval_web.sh b/scripts/start_eval_web.sh index e96b0a0..20c9d41 100755 --- a/scripts/start_eval_web.sh +++ b/scripts/start_eval_web.sh @@ -10,7 +10,10 @@ EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}" TENANT_ID="${TENANT_ID:-163}" DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}" -QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" +EXTRA_QUERY_ARGS=() +if [[ -n "${REPO_EVAL_QUERIES:-}" ]]; then + EXTRA_QUERY_ARGS=(--queries-file "${REPO_EVAL_QUERIES}") +fi GREEN='\033[0;32m' YELLOW='\033[1;33m' @@ -27,6 +30,6 @@ export EVAL_WEB_PORT EVAL_WEB_HOST TENANT_ID REPO_EVAL_DATASET_ID REPO_EVAL_QUER exec python scripts/evaluation/serve_eval_web.py serve \ --tenant-id "${TENANT_ID}" \ --dataset-id "${DATASET_ID}" \ - --queries-file "${QUERIES}" \ --host "${EVAL_WEB_HOST}" \ - --port "${EVAL_WEB_PORT}" + --port "${EVAL_WEB_PORT}" \ + "${EXTRA_QUERY_ARGS[@]}" -- libgit2 0.21.2