#!/bin/bash
# Resilient wrapper around scripts/evaluation/tune_fusion.py.
#
# Repeatedly (re)launches the tuning run until the target number of
# successful live trials is reached, restarting unhealthy services
# (backend / eval-web) between and during attempts. A lock directory
# under the run dir guarantees a single wrapper instance per run.
#
# Usage:
#   resilient_tune_fusion.sh RUN_NAME DATASET_ID MAX_EVALS BATCH_SIZE \
#     CANDIDATE_POOL_SIZE RANDOM_SEED SEARCH_SPACE SEED_REPORT [resume_run_dir]
#
# Tunable environment variables (all optional):
#   BATCH_EVAL_TIMEOUT_SEC  per-batch eval timeout passed through (default 0)
#   RESTART_SLEEP_SEC       pause between restart attempts (default 30)
#   SEARCH_BASE_URL         backend base URL (default http://127.0.0.1:6002)
#   EVAL_WEB_BASE_URL       eval-web base URL (default http://127.0.0.1:6010)
#   HEALTH_POLL_SEC         child health-poll interval (default 15)
set -euo pipefail

cd "$(dirname "$0")/../.."
source ./activate.sh

usage() {
  # FIX: the previous message only showed "[resume_run_dir]" even though
  # eight positional arguments are mandatory.
  echo "usage: $0 RUN_NAME DATASET_ID MAX_EVALS BATCH_SIZE CANDIDATE_POOL_SIZE RANDOM_SEED SEARCH_SPACE SEED_REPORT [resume_run_dir]" >&2
  exit 1
}

if [ "$#" -lt 8 ]; then
  usage
fi

# --- required positional arguments ---
RUN_NAME="$1"
DATASET_ID="$2"
MAX_EVALS="$3"
BATCH_SIZE="$4"
CANDIDATE_POOL_SIZE="$5"
RANDOM_SEED="$6"
SEARCH_SPACE="$7"
SEED_REPORT="$8"            # may be empty: skip --seed-report
RESUME_RUN_DIR="${9:-}"     # may be empty: start fresh unless RUN_DIR exists

# --- environment-tunable settings with defaults ---
BATCH_EVAL_TIMEOUT_SEC="${BATCH_EVAL_TIMEOUT_SEC:-0}"
RESTART_SLEEP_SEC="${RESTART_SLEEP_SEC:-30}"
SEARCH_BASE_URL="${SEARCH_BASE_URL:-http://127.0.0.1:6002}"
EVAL_WEB_BASE_URL="${EVAL_WEB_BASE_URL:-http://127.0.0.1:6010}"
RUN_DIR="artifacts/search_evaluation/tuning_runs/${RUN_NAME}"
LOCK_DIR="${RUN_DIR}/.resilient_lock"
HEALTH_POLL_SEC="${HEALTH_POLL_SEC:-15}"

mkdir -p "$(dirname "$RUN_DIR")"

# Remove the lock directory, but only if this process owns it (pid file
# matches $$) — never steal another live wrapper's lock on exit.
release_lock() {
  if [ -d "$LOCK_DIR" ] && [ -f "$LOCK_DIR/pid" ] \
    && [ "$(cat "$LOCK_DIR/pid" 2>/dev/null || true)" = "$$" ]; then
    rm -rf "$LOCK_DIR"
  fi
}

# Take the per-run lock via atomic `mkdir`. If a live process holds it,
# exit 0 (another wrapper is already driving this run). A lock whose
# recorded pid is dead is considered stale and is reclaimed.
acquire_lock() {
  mkdir -p "$RUN_DIR"
  if mkdir "$LOCK_DIR" 2>/dev/null; then
    echo "$$" > "$LOCK_DIR/pid"
    date -u +%Y-%m-%dT%H:%M:%SZ > "$LOCK_DIR/started_at"
    return 0
  fi
  local owner_pid=""
  if [ -f "$LOCK_DIR/pid" ]; then
    owner_pid="$(cat "$LOCK_DIR/pid" 2>/dev/null || true)"
  fi
  if [ -n "$owner_pid" ] && kill -0 "$owner_pid" 2>/dev/null; then
    echo "[resilient] lock already held by pid=${owner_pid}, exiting"
    exit 0
  fi
  echo "[resilient] removing stale lock at ${LOCK_DIR}"
  rm -rf "$LOCK_DIR"
  if mkdir "$LOCK_DIR" 2>/dev/null; then
    echo "$$" > "$LOCK_DIR/pid"
    date -u +%Y-%m-%dT%H:%M:%SZ > "$LOCK_DIR/started_at"
    return 0
  fi
  echo "[resilient] failed to acquire lock at ${LOCK_DIR}"
  exit 1
}

# FIX: the original trapped release_lock directly on INT/TERM, which
# released the lock and then *resumed* the script (a trap without `exit`
# does not terminate bash) — leaving the loop running unlocked. Cleanup
# stays on EXIT; INT/TERM now exit explicitly, which fires the EXIT trap.
trap release_lock EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Print the number of successful non-seed trials recorded in
# RUN_DIR/trials.jsonl (0 if the file does not exist yet).
count_live_successes() {
  python3 - "$RUN_DIR" <<'PY'
import json
import sys
from pathlib import Path

run_dir = Path(sys.argv[1])
path = run_dir / "trials.jsonl"
count = 0
if path.is_file():
    for line in path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        if obj.get("status") == "ok" and not obj.get("is_seed"):
            count += 1
print(count)
PY
}

# Poll a URL with curl every 2s until it responds or timeout_sec elapses.
# Returns 0 on success, 1 on timeout.
wait_for_health() {
  local url="$1"
  local timeout_sec="$2"
  local deadline=$(( $(date +%s) + timeout_sec ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    if curl -fsS "$url" >/dev/null 2>&1; then
      return 0
    fi
    sleep 2
  done
  return 1
}

# Preflight: make sure backend and eval-web answer their health endpoints,
# restarting each once if needed. Returns non-zero if either stays down.
ensure_services() {
  if ! wait_for_health "${SEARCH_BASE_URL}/health" 20; then
    echo "[resilient] backend unhealthy, restarting backend"
    ./restart.sh backend || true
    sleep 5
  fi
  if ! wait_for_health "${SEARCH_BASE_URL}/health" 180; then
    echo "[resilient] backend still unhealthy after restart"
    return 1
  fi
  if ! wait_for_health "${EVAL_WEB_BASE_URL}/api/history" 20; then
    echo "[resilient] eval-web unhealthy, restarting eval-web"
    ./restart.sh eval-web || true
    sleep 5
  fi
  if ! wait_for_health "${EVAL_WEB_BASE_URL}/api/history" 180; then
    echo "[resilient] eval-web still unhealthy after restart"
    return 1
  fi
  return 0
}

# Single-shot health probe used while the child runs: kick a restart if a
# service is down, but never block waiting for it to come back.
heal_services_nonblocking() {
  if ! curl -fsS "${SEARCH_BASE_URL}/health" >/dev/null 2>&1; then
    echo "[resilient] backend became unhealthy during run, restarting backend"
    ./restart.sh backend || true
    sleep 5
  fi
  if ! curl -fsS "${EVAL_WEB_BASE_URL}/api/history" >/dev/null 2>&1; then
    echo "[resilient] eval-web became unhealthy during run, restarting eval-web"
    ./restart.sh eval-web || true
    sleep 5
  fi
}

# Emit the tuner command line, shell-quoted via printf %q so it can be
# safely re-parsed by `bash -lc` below.
build_cmd() {
  local cmd=(
    python scripts/evaluation/tune_fusion.py
    --mode optimize
    --search-space "$SEARCH_SPACE"
    --tenant-id 163
    --dataset-id "$DATASET_ID"
    --queries-file scripts/evaluation/queries/queries.txt
    --top-k 100
    --language en
    --search-base-url "$SEARCH_BASE_URL"
    --eval-web-base-url "$EVAL_WEB_BASE_URL"
    --max-evals "$MAX_EVALS"
    --batch-size "$BATCH_SIZE"
    --candidate-pool-size "$CANDIDATE_POOL_SIZE"
    --random-seed "$RANDOM_SEED"
    --batch-eval-timeout-sec "$BATCH_EVAL_TIMEOUT_SEC"
  )
  if [ -n "$SEED_REPORT" ]; then
    cmd+=(--seed-report "$SEED_REPORT")
  fi
  # Resume an existing run dir when known; otherwise name a fresh run.
  if [ -n "$RESUME_RUN_DIR" ]; then
    cmd+=(--resume-run "$RESUME_RUN_DIR")
  else
    cmd+=(--run-name "$RUN_NAME")
  fi
  printf '%q ' "${cmd[@]}"
  printf '\n'
}

# --- main supervision loop ---
attempt=0
acquire_lock
while true; do
  # Already done? (e.g. a previous wrapper finished the run.)
  live_successes="$(count_live_successes)"
  if [ "$live_successes" -ge "$MAX_EVALS" ]; then
    echo "[resilient] complete run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
    exit 0
  fi
  attempt=$((attempt + 1))
  # After the first attempt the run dir exists, so later attempts resume it.
  if [ -d "$RUN_DIR" ]; then
    RESUME_RUN_DIR="$RUN_DIR"
  fi
  echo "[resilient] attempt=$attempt run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
  if ! ensure_services; then
    echo "[resilient] service preflight failed, sleeping ${RESTART_SLEEP_SEC}s before retry"
    sleep "$RESTART_SLEEP_SEC"
    continue
  fi
  CMD_STR="$(build_cmd)"
  echo "[resilient] cmd=$CMD_STR"
  # Run the tuner in the background so we can keep healing services while
  # it works; disable -e because a crashing child is expected and handled.
  set +e
  bash -lc "$CMD_STR" &
  child_pid=$!
  echo "[resilient] child_pid=${child_pid}"
  while kill -0 "$child_pid" 2>/dev/null; do
    heal_services_nonblocking
    sleep "$HEALTH_POLL_SEC"
  done
  wait "$child_pid"
  exit_code=$?
  set -e
  live_successes="$(count_live_successes)"
  echo "[resilient] exit_code=$exit_code live_successes=$live_successes"
  if [ "$live_successes" -ge "$MAX_EVALS" ]; then
    echo "[resilient] finished after attempt=$attempt"
    exit 0
  fi
  # Best effort: recover services before the next attempt, but retry anyway.
  if ! ensure_services; then
    echo "[resilient] service recovery failed after exit_code=$exit_code"
  fi
  echo "[resilient] sleeping ${RESTART_SLEEP_SEC}s before resume"
  sleep "$RESTART_SLEEP_SEC"
done