#!/bin/bash
#
# Resilient wrapper around scripts/evaluation/tune_fusion.py (--mode optimize).
#
# Runs the optimizer repeatedly until trials.jsonl inside the run directory
# contains at least <max_evals> records with status "ok" that are not seed
# trials. After every attempt that falls short, the script sleeps and then
# relaunches the optimizer with --resume-run.
#
# Positional arguments:
#   $1 run_name             run name; run dir is
#                           artifacts/search_evaluation/tuning_runs/<run_name>
#   $2 dataset_id           passed as --dataset-id
#   $3 max_evals            passed as --max-evals; also the completion target
#   $4 batch_size           passed as --batch-size
#   $5 candidate_pool_size  passed as --candidate-pool-size
#   $6 random_seed          passed as --random-seed
#   $7 search_space         passed as --search-space
#   $8 seed_report          passed as --seed-report
#   $9 resume_run_dir       optional; passed as --resume-run on the first
#                           attempt (replaced by the run dir once it exists)
#
# Environment overrides (defaults in parentheses):
#   BATCH_EVAL_TIMEOUT_SEC (0), RESTART_SLEEP_SEC (30),
#   SEARCH_BASE_URL (http://127.0.0.1:6002),
#   EVAL_WEB_BASE_URL (http://127.0.0.1:6010)

set -euo pipefail

# Work from the directory two levels above this script, where activate.sh
# and the relative scripts/ and artifacts/ paths are resolved.
cd "$(dirname "$0")/../.."
source ./activate.sh

# Print the command-line synopsis to stderr and exit with status 1.
usage() {
  echo "usage: $0 <run_name> <dataset_id> <max_evals> <batch_size>" \
    "<candidate_pool_size> <random_seed> <search_space> <seed_report>" \
    "[resume_run_dir]" >&2
  exit 1
}

if [ "$#" -lt 8 ]; then
  usage
fi

RUN_NAME="$1"
DATASET_ID="$2"
MAX_EVALS="$3"
BATCH_SIZE="$4"
CANDIDATE_POOL_SIZE="$5"
RANDOM_SEED="$6"
SEARCH_SPACE="$7"
SEED_REPORT="$8"
RESUME_RUN_DIR="${9:-}"

# MAX_EVALS is compared numerically below. A non-integer would make the
# `[ ... -ge ... ]` test error out, which inside an `if` is treated as
# "false" and would turn the retry loop into an endless one.
if ! [[ "$MAX_EVALS" =~ ^[0-9]+$ ]]; then
  echo "error: max_evals must be a non-negative integer, got '$MAX_EVALS'" >&2
  usage
fi

BATCH_EVAL_TIMEOUT_SEC="${BATCH_EVAL_TIMEOUT_SEC:-0}"
RESTART_SLEEP_SEC="${RESTART_SLEEP_SEC:-30}"
SEARCH_BASE_URL="${SEARCH_BASE_URL:-http://127.0.0.1:6002}"
EVAL_WEB_BASE_URL="${EVAL_WEB_BASE_URL:-http://127.0.0.1:6010}"

RUN_DIR="artifacts/search_evaluation/tuning_runs/${RUN_NAME}"

# Only the parent is created here; the existence of RUN_DIR itself is used
# below to decide whether to resume.
mkdir -p "$(dirname "$RUN_DIR")"

#######################################
# Count completed non-seed trials recorded in $RUN_DIR/trials.jsonl.
# A record counts when its "status" is "ok" and "is_seed" is falsy/absent.
# Lines that are not valid JSON objects (e.g. a record truncated when the
# optimizer was killed mid-write) are skipped with a warning on stderr
# instead of aborting the wrapper.
# Globals:  RUN_DIR (read)
# Outputs:  the count on stdout (0 when the file does not exist)
#######################################
count_live_successes() {
  python3 - "$RUN_DIR" <<'PY'
import json
import sys
from pathlib import Path

trials_path = Path(sys.argv[1]) / "trials.jsonl"
count = 0
if trials_path.is_file():
    # errors="replace" keeps a multi-byte character cut off at EOF from
    # raising; the damaged line then fails JSON parsing and is skipped.
    with trials_path.open(encoding="utf-8", errors="replace") as handle:
        for lineno, raw in enumerate(handle, 1):
            raw = raw.strip()
            if not raw:
                continue
            try:
                record = json.loads(raw)
            except json.JSONDecodeError:
                print(
                    f"[resilient] skipping malformed trials.jsonl line {lineno}",
                    file=sys.stderr,
                )
                continue
            if not isinstance(record, dict):
                continue
            if record.get("status") == "ok" and not record.get("is_seed"):
                count += 1
print(count)
PY
}

#######################################
# Emit the optimizer command line as a single shell-quoted string.
# Uses --resume-run when RESUME_RUN_DIR is non-empty, --run-name otherwise.
# Globals:  SEARCH_SPACE, SEED_REPORT, DATASET_ID, SEARCH_BASE_URL,
#           EVAL_WEB_BASE_URL, MAX_EVALS, BATCH_SIZE, CANDIDATE_POOL_SIZE,
#           RANDOM_SEED, BATCH_EVAL_TIMEOUT_SEC, RESUME_RUN_DIR, RUN_NAME
#           (all read)
# Outputs:  the %q-quoted command followed by a newline on stdout
#######################################
build_cmd() {
  local cmd=(
    python scripts/evaluation/tune_fusion.py
    --mode optimize
    --search-space "$SEARCH_SPACE"
    --seed-report "$SEED_REPORT"
    --tenant-id 163
    --dataset-id "$DATASET_ID"
    --queries-file scripts/evaluation/queries/queries.txt
    --top-k 100
    --language en
    --search-base-url "$SEARCH_BASE_URL"
    --eval-web-base-url "$EVAL_WEB_BASE_URL"
    --max-evals "$MAX_EVALS"
    --batch-size "$BATCH_SIZE"
    --candidate-pool-size "$CANDIDATE_POOL_SIZE"
    --random-seed "$RANDOM_SEED"
    --batch-eval-timeout-sec "$BATCH_EVAL_TIMEOUT_SEC"
  )
  if [ -n "$RESUME_RUN_DIR" ]; then
    cmd+=(--resume-run "$RESUME_RUN_DIR")
  else
    cmd+=(--run-name "$RUN_NAME")
  fi
  printf '%q ' "${cmd[@]}"
  printf '\n'
}

attempt=0
while true; do
  live_successes="$(count_live_successes)"
  # `[` (not `[[`) is deliberate: it compares operands as plain decimal
  # integers, so a value such as "08" is not misread as octal.
  if [ "$live_successes" -ge "$MAX_EVALS" ]; then
    echo "[resilient] complete run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
    exit 0
  fi

  attempt=$((attempt + 1))

  # Once the run directory exists, every further attempt resumes from it.
  if [ -d "$RUN_DIR" ]; then
    RESUME_RUN_DIR="$RUN_DIR"
  fi

  echo "[resilient] attempt=$attempt run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
  CMD_STR="$(build_cmd)"
  echo "[resilient] cmd=$CMD_STR"

  # Capture the optimizer's status without letting `set -e` end the wrapper:
  # a failed attempt is expected and is handled by retrying.
  exit_code=0
  bash -lc "$CMD_STR" || exit_code=$?

  live_successes="$(count_live_successes)"
  echo "[resilient] exit_code=$exit_code live_successes=$live_successes"
  if [ "$live_successes" -ge "$MAX_EVALS" ]; then
    echo "[resilient] finished after attempt=$attempt"
    exit 0
  fi

  echo "[resilient] sleeping ${RESTART_SLEEP_SEC}s before resume"
  sleep "$RESTART_SLEEP_SEC"
done