#!/bin/bash
#
# run_coarse_fusion_tuning_resilient.sh
#
# Runs scripts/evaluation/tune_fusion.py in "optimize" mode and keeps
# restarting it until trials.jsonl inside the run directory records at least
# <max_evals> trials whose status is "ok" and which are not seed trials.
#
# Usage:
#   run_coarse_fusion_tuning_resilient.sh <run_name> <dataset_id> <max_evals> \
#       <batch_size> <candidate_pool_size> <random_seed> <search_space> \
#       <seed_report> [resume_run_dir]
#
# Optional environment overrides (defaults in parentheses):
#   BATCH_EVAL_TIMEOUT_SEC  (0)
#   RESTART_SLEEP_SEC       (30)
#   SEARCH_BASE_URL         (http://127.0.0.1:6002)
#   EVAL_WEB_BASE_URL       (http://127.0.0.1:6010)

# Abort on errors, on unset variables, and on failures inside pipelines.
set -euo pipefail

# Every relative path used below is resolved from the directory two levels
# above the one containing this script.
cd "$(dirname "$0")/../.."
# NOTE(review): presumably prepares the project's Python environment — confirm
# against activate.sh.
source ./activate.sh

# Print the command-line synopsis on stderr and terminate the script.
# Arguments: none
# Outputs:   one usage line on stderr (nothing on stdout)
# Returns:   never returns; exits with status 1
usage() {
  printf 'usage: %s <run_name> <dataset_id> <max_evals> <batch_size> <candidate_pool_size> <random_seed> <search_space> <seed_report> [resume_run_dir]\n' "$0" >&2
  exit 1
}

# Eight positional arguments are mandatory; the ninth is optional.
if [ "$#" -lt 8 ]; then
  usage
fi

RUN_NAME="$1"
DATASET_ID="$2"
MAX_EVALS="$3"
BATCH_SIZE="$4"
CANDIDATE_POOL_SIZE="$5"
RANDOM_SEED="$6"
SEARCH_SPACE="$7"
SEED_REPORT="$8"
# Empty means "start a fresh run named RUN_NAME" rather than resuming one.
RESUME_RUN_DIR="${9:-}"

# MAX_EVALS is compared numerically by the retry loop. A non-integer value
# would make that `[ ... -ge ... ]` test error out, which an `if` treats as
# false, so the loop could never reach its completion branch. Reject it here.
if ! [[ "$MAX_EVALS" =~ ^[0-9]+$ ]]; then
  echo "error: <max_evals> must be a non-negative integer, got '$MAX_EVALS'" >&2
  usage
fi

# Tunables that can be overridden from the environment.
BATCH_EVAL_TIMEOUT_SEC="${BATCH_EVAL_TIMEOUT_SEC:-0}"   # forwarded as --batch-eval-timeout-sec
RESTART_SLEEP_SEC="${RESTART_SLEEP_SEC:-30}"            # pause between restart attempts
SEARCH_BASE_URL="${SEARCH_BASE_URL:-http://127.0.0.1:6002}"
EVAL_WEB_BASE_URL="${EVAL_WEB_BASE_URL:-http://127.0.0.1:6010}"
# Directory whose trials.jsonl is inspected to measure progress.
RUN_DIR="artifacts/search_evaluation/tuning_runs/${RUN_NAME}"

# Only the parent directory is created here; RUN_DIR itself is not.
mkdir -p "$(dirname "$RUN_DIR")"

# Print how many successful, non-seed trials are recorded for the current run.
#
# A trial counts when its JSON object has "status" equal to "ok" and a falsy
# or absent "is_seed". Lines that are not valid JSON objects (for example a
# record cut short when the writer was killed) are skipped rather than
# aborting, so this wrapper can keep restarting the job after a crash.
#
# Globals:   RUN_DIR (read) - directory expected to contain trials.jsonl
# Arguments: none
# Outputs:   one integer on stdout (0 when trials.jsonl is missing);
#            a one-line warning on stderr if any line was skipped
# Returns:   exit status of the embedded python3 program
count_live_successes() {
  python3 - "$RUN_DIR" <<'PY'
import json
import sys
from pathlib import Path

path = Path(sys.argv[1]) / "trials.jsonl"
count = 0
skipped = 0
if path.is_file():
    # errors="replace" keeps a multi-byte character cut off by a crash from
    # raising UnicodeDecodeError; the damaged line then fails JSON parsing.
    with path.open(encoding="utf-8", errors="replace") as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if not isinstance(obj, dict):
                skipped += 1
                continue
            if obj.get("status") == "ok" and not obj.get("is_seed"):
                count += 1
if skipped:
    print(
        f"[resilient] ignored {skipped} malformed line(s) in {path}",
        file=sys.stderr,
    )
print(count)
PY
}

# Render the tune_fusion.py invocation as one shell-quoted line.
#
# Globals (read): SEARCH_SPACE, SEED_REPORT, DATASET_ID, SEARCH_BASE_URL,
#                 EVAL_WEB_BASE_URL, MAX_EVALS, BATCH_SIZE,
#                 CANDIDATE_POOL_SIZE, RANDOM_SEED, BATCH_EVAL_TIMEOUT_SEC,
#                 RESUME_RUN_DIR, RUN_NAME
# Arguments: none
# Outputs:   every word %q-quoted and followed by a space, then a newline
build_cmd() {
  local -a argv=(python scripts/evaluation/tune_fusion.py --mode optimize)

  argv+=(--search-space "$SEARCH_SPACE" --seed-report "$SEED_REPORT")
  argv+=(--tenant-id 163 --dataset-id "$DATASET_ID")
  argv+=(--queries-file scripts/evaluation/queries/queries.txt)
  argv+=(--top-k 100 --language en)
  argv+=(--search-base-url "$SEARCH_BASE_URL")
  argv+=(--eval-web-base-url "$EVAL_WEB_BASE_URL")
  argv+=(--max-evals "$MAX_EVALS" --batch-size "$BATCH_SIZE")
  argv+=(--candidate-pool-size "$CANDIDATE_POOL_SIZE")
  argv+=(--random-seed "$RANDOM_SEED")
  argv+=(--batch-eval-timeout-sec "$BATCH_EVAL_TIMEOUT_SEC")

  # A fresh run is named; an existing one is resumed by directory.
  if [[ -z "$RESUME_RUN_DIR" ]]; then
    argv+=(--run-name "$RUN_NAME")
  else
    argv+=(--resume-run "$RESUME_RUN_DIR")
  fi

  local rendered
  printf -v rendered '%q ' "${argv[@]}"
  printf '%s\n' "$rendered"
}

# Retry driver: launch the tuning command, and relaunch it after a pause,
# until trials.jsonl in RUN_DIR shows at least MAX_EVALS live successes.
#
# MAX_ATTEMPTS (environment, optional) caps the number of launches. The
# default of 0 means "no cap", i.e. retry indefinitely. Without a cap, a
# command that fails the same way every time would be restarted forever.
MAX_ATTEMPTS="${MAX_ATTEMPTS:-0}"
if ! [[ "$MAX_ATTEMPTS" =~ ^[0-9]+$ ]]; then
  echo "[resilient] MAX_ATTEMPTS must be a non-negative integer, got '$MAX_ATTEMPTS'" >&2
  exit 1
fi

attempt=0
while true; do
  # Progress is re-measured before every launch, so a target that is already
  # met results in no launch at all.
  live_successes="$(count_live_successes)"
  if [ "$live_successes" -ge "$MAX_EVALS" ]; then
    echo "[resilient] complete run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
    exit 0
  fi

  attempt=$((attempt + 1))
  # Once the run directory exists, every launch resumes from it; this also
  # replaces any resume directory supplied on the command line.
  if [ -d "$RUN_DIR" ]; then
    RESUME_RUN_DIR="$RUN_DIR"
  fi

  echo "[resilient] attempt=$attempt run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
  CMD_STR="$(build_cmd)"
  echo "[resilient] cmd=$CMD_STR"

  # Capture the status without letting `set -e` abort the wrapper.
  exit_code=0
  bash -lc "$CMD_STR" || exit_code=$?

  live_successes="$(count_live_successes)"
  echo "[resilient] exit_code=$exit_code live_successes=$live_successes"

  if [ "$live_successes" -ge "$MAX_EVALS" ]; then
    echo "[resilient] finished after attempt=$attempt"
    exit 0
  fi

  # Give up before sleeping when the optional attempt cap has been reached.
  if [ "$MAX_ATTEMPTS" -gt 0 ] && [ "$attempt" -ge "$MAX_ATTEMPTS" ]; then
    echo "[resilient] giving up after attempt=$attempt live_successes=$live_successes target=$MAX_EVALS" >&2
    # Propagate the last failure status; use 1 if the command itself exited 0.
    exit "$(( exit_code != 0 ? exit_code : 1 ))"
  fi

  echo "[resilient] sleeping ${RESTART_SLEEP_SEC}s before resume"
  sleep "$RESTART_SLEEP_SEC"
done