#!/bin/bash
# run_coarse_fusion_tuning_resilient.sh — resilient wrapper for coarse fusion tuning.

set -euo pipefail

cd "$(dirname "$0")/../.."
source ./activate.sh

usage() {
  # Print invocation help to stderr and abort with a usage error.
  printf '%s\n' "usage: $0 <run_name> <dataset_id> <max_evals> <batch_size> <candidate_pool_size> <random_seed> <search_space> <seed_report> [resume_run_dir]" >&2
  exit 1
}

# ---- Positional arguments -------------------------------------------------
(( $# >= 8 )) || usage

RUN_NAME="$1"
DATASET_ID="$2"
MAX_EVALS="$3"
BATCH_SIZE="$4"
CANDIDATE_POOL_SIZE="$5"
RANDOM_SEED="$6"
SEARCH_SPACE="$7"
SEED_REPORT="$8"
RESUME_RUN_DIR="${9:-}"

# ---- Tunables (overridable via environment) -------------------------------
: "${BATCH_EVAL_TIMEOUT_SEC:=0}"
: "${RESTART_SLEEP_SEC:=30}"
: "${SEARCH_BASE_URL:=http://127.0.0.1:6002}"
: "${EVAL_WEB_BASE_URL:=http://127.0.0.1:6010}"
: "${HEALTH_POLL_SEC:=15}"

# ---- Derived paths --------------------------------------------------------
RUN_DIR="artifacts/search_evaluation/tuning_runs/${RUN_NAME}"
LOCK_DIR="${RUN_DIR}/.resilient_lock"

mkdir -p "$(dirname "$RUN_DIR")"

release_lock() {
  # Remove the lock directory only when this process is the recorded owner;
  # a lock held by another (still live) wrapper must be left alone.
  [ -d "$LOCK_DIR" ] || return 0
  [ -f "$LOCK_DIR/pid" ] || return 0
  local owner
  owner="$(cat "$LOCK_DIR/pid" 2>/dev/null || true)"
  if [ "$owner" = "$$" ]; then
    rm -rf "$LOCK_DIR"
  fi
}

acquire_lock() {
  # Take the run lock via atomic `mkdir`. Two attempts: the first directly,
  # the second after clearing a stale lock (owner pid missing or dead).
  # Exits 0 if another live wrapper already holds the lock.
  mkdir -p "$RUN_DIR"
  local try owner_pid
  for try in first second; do
    if mkdir "$LOCK_DIR" 2>/dev/null; then
      printf '%s\n' "$$" > "$LOCK_DIR/pid"
      date -u +%Y-%m-%dT%H:%M:%SZ > "$LOCK_DIR/started_at"
      return 0
    fi
    if [ "$try" = "second" ]; then
      break
    fi
    owner_pid=""
    if [ -f "$LOCK_DIR/pid" ]; then
      owner_pid="$(cat "$LOCK_DIR/pid" 2>/dev/null || true)"
    fi
    if [ -n "$owner_pid" ] && kill -0 "$owner_pid" 2>/dev/null; then
      echo "[resilient] lock already held by pid=${owner_pid}, exiting"
      exit 0
    fi
    echo "[resilient] removing stale lock at ${LOCK_DIR}"
    rm -rf "$LOCK_DIR"
  done
  echo "[resilient] failed to acquire lock at ${LOCK_DIR}"
  exit 1
}

# Release the lock on every exit path (normal exit, Ctrl-C, or TERM);
# release_lock itself only removes the lock when this process owns it.
trap release_lock EXIT INT TERM

count_live_successes() {
  # Print the number of non-seed trials recorded with status "ok" in
  # ${RUN_DIR}/trials.jsonl (0 if the file is absent).
  #
  # Malformed lines — e.g. a trailing line that was only partially written
  # when a previous attempt crashed — are skipped: an uncaught
  # JSONDecodeError would make this command substitution fail and, under
  # `set -euo pipefail`, kill the whole resilient wrapper.
  python3 - "$RUN_DIR" <<'PY'
import json
import sys
from pathlib import Path

run_dir = Path(sys.argv[1])
path = run_dir / "trials.jsonl"
count = 0
if path.is_file():
    # Stream line-by-line instead of slurping the whole log into memory.
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue  # torn/corrupt line: ignore rather than crash
            if obj.get("status") == "ok" and not obj.get("is_seed"):
                count += 1
print(count)
PY
}

wait_for_health() {
  # Poll `url` with curl until it answers or roughly `timeout_sec` seconds
  # elapse. Returns 0 on first success, 1 on timeout. A timeout of 0 fails
  # immediately without issuing any request.
  local url="$1"
  local timeout_sec="$2"
  local deadline
  deadline=$(( $(date +%s) + timeout_sec ))
  while (( $(date +%s) < deadline )); do
    if curl -fsS "$url" >/dev/null 2>&1; then
      return 0
    fi
    sleep 2
  done
  return 1
}

ensure_services() {
  # Block until the search backend and eval-web both answer their health
  # endpoints, restarting each at most once. Returns non-zero when a
  # service is still down after its restart.
  local entry name url
  for entry in \
    "backend|${SEARCH_BASE_URL}/health" \
    "eval-web|${EVAL_WEB_BASE_URL}/api/history"
  do
    name="${entry%%|*}"
    url="${entry#*|}"
    if ! wait_for_health "$url" 20; then
      echo "[resilient] ${name} unhealthy, restarting ${name}"
      ./restart.sh "$name" || true
      sleep 5
    fi
    if ! wait_for_health "$url" 180; then
      echo "[resilient] ${name} still unhealthy after restart"
      return 1
    fi
  done
  return 0
}

heal_services_nonblocking() {
  # One-shot health probe of both services; kick off a restart for any that
  # fails, without waiting for recovery (the tuner child keeps running).
  local entry name url
  for entry in \
    "backend|${SEARCH_BASE_URL}/health" \
    "eval-web|${EVAL_WEB_BASE_URL}/api/history"
  do
    name="${entry%%|*}"
    url="${entry#*|}"
    if ! curl -fsS "$url" >/dev/null 2>&1; then
      echo "[resilient] ${name} became unhealthy during run, restarting ${name}"
      ./restart.sh "$name" || true
      sleep 5
    fi
  done
}

build_cmd() {
  # Emit the tuner invocation as one shell-quoted line (consumed by
  # `bash -lc` in the main loop). Resuming an existing run directory and
  # naming a fresh run are mutually exclusive.
  local -a argv
  argv=(
    python scripts/evaluation/tune_fusion.py
    --mode optimize
    --search-space "$SEARCH_SPACE"
    --tenant-id 163
    --dataset-id "$DATASET_ID"
    --queries-file scripts/evaluation/queries/queries.txt
    --top-k 100
    --language en
    --search-base-url "$SEARCH_BASE_URL"
    --eval-web-base-url "$EVAL_WEB_BASE_URL"
    --max-evals "$MAX_EVALS"
    --batch-size "$BATCH_SIZE"
    --candidate-pool-size "$CANDIDATE_POOL_SIZE"
    --random-seed "$RANDOM_SEED"
    --batch-eval-timeout-sec "$BATCH_EVAL_TIMEOUT_SEC"
  )
  [ -z "$SEED_REPORT" ] || argv+=(--seed-report "$SEED_REPORT")
  if [ -n "$RESUME_RUN_DIR" ]; then
    argv+=(--resume-run "$RESUME_RUN_DIR")
  else
    argv+=(--run-name "$RUN_NAME")
  fi
  printf '%q ' "${argv[@]}"
  printf '\n'
}

# ---------------------------------------------------------------------------
# Supervision loop: run the tuner as a background child, probe service health
# while it runs, and restart the run (resuming in place) until the trial log
# contains MAX_EVALS successful non-seed evaluations.
# ---------------------------------------------------------------------------
attempt=0
acquire_lock
while true; do
  # Idempotence: if the log already holds enough successes, do nothing.
  live_successes="$(count_live_successes)"
  if [ "$live_successes" -ge "$MAX_EVALS" ]; then
    echo "[resilient] complete run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
    exit 0
  fi

  attempt=$((attempt + 1))
  # Once the run directory exists, later attempts resume it in place
  # instead of starting a fresh run under --run-name.
  if [ -d "$RUN_DIR" ]; then
    RESUME_RUN_DIR="$RUN_DIR"
  fi

  echo "[resilient] attempt=$attempt run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
  if ! ensure_services; then
    echo "[resilient] service preflight failed, sleeping ${RESTART_SLEEP_SEC}s before retry"
    sleep "$RESTART_SLEEP_SEC"
    continue
  fi
  CMD_STR="$(build_cmd)"
  echo "[resilient] cmd=$CMD_STR"

  # Run the tuner in the background so services can be healed while it
  # works; disable -e so a crashing child cannot kill this wrapper.
  set +e
  bash -lc "$CMD_STR" &
  child_pid=$!
  echo "[resilient] child_pid=${child_pid}"
  while kill -0 "$child_pid" 2>/dev/null; do
    heal_services_nonblocking
    sleep "$HEALTH_POLL_SEC"
  done
  # `wait` still reports the exit status of an already-finished job.
  wait "$child_pid"
  exit_code=$?
  set -e

  live_successes="$(count_live_successes)"
  echo "[resilient] exit_code=$exit_code live_successes=$live_successes"

  if [ "$live_successes" -ge "$MAX_EVALS" ]; then
    echo "[resilient] finished after attempt=$attempt"
    exit 0
  fi

  # Best-effort recovery before the next attempt; failure here only logs —
  # the next iteration's preflight will retry.
  if ! ensure_services; then
    echo "[resilient] service recovery failed after exit_code=$exit_code"
  fi
  echo "[resilient] sleeping ${RESTART_SLEEP_SEC}s before resume"
  sleep "$RESTART_SLEEP_SEC"
done