run_coarse_fusion_tuning_resilient.sh
2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/bin/bash
# Resilient wrapper around scripts/evaluation/tune_fusion.py: the tuner is
# re-launched until trials.jsonl in the run directory holds at least
# <max_evals> non-seed trials with status "ok".
#
# Abort on unhandled errors, unset variables, and failures inside pipelines.
set -euo pipefail
# Work from two directories above this script so the relative paths used
# below (scripts/evaluation/..., artifacts/...) resolve.
cd "$(dirname "$0")/../.."
# NOTE(review): presumably activates the project's Python environment --
# confirm against activate.sh.
source ./activate.sh
#######################################
# Print the command-line synopsis and terminate the script.
# Outputs: one usage line on stderr (nothing on stdout)
# Returns: never returns; exits with status 1
#######################################
usage() {
  printf '%s\n' "usage: $0 <run_name> <dataset_id> <max_evals> <batch_size> <candidate_pool_size> <random_seed> <search_space> <seed_report> [resume_run_dir]" 1>&2
  exit 1
}
# Eight positional arguments are mandatory; the ninth (resume_run_dir) is optional.
if [ "$#" -lt 8 ]; then
  usage
fi

RUN_NAME="$1"
DATASET_ID="$2"
MAX_EVALS="$3"
BATCH_SIZE="$4"
CANDIDATE_POOL_SIZE="$5"
RANDOM_SEED="$6"
SEARCH_SPACE="$7"
SEED_REPORT="$8"
RESUME_RUN_DIR="${9:-}"

# MAX_EVALS is compared numerically with `[ ... -ge ... ]` in the retry loop.
# A non-integer value makes that test error out (status 2) on every pass, so
# completion would never be detected and the tuner would be relaunched
# forever. Reject such values before doing any work.
if ! [[ "$MAX_EVALS" =~ ^[0-9]+$ ]]; then
  echo "error: <max_evals> must be a non-negative integer, got '${MAX_EVALS}'" >&2
  exit 1
fi

# Tunables that may be overridden through the environment.
BATCH_EVAL_TIMEOUT_SEC="${BATCH_EVAL_TIMEOUT_SEC:-0}"
RESTART_SLEEP_SEC="${RESTART_SLEEP_SEC:-30}"
SEARCH_BASE_URL="${SEARCH_BASE_URL:-http://127.0.0.1:6002}"
EVAL_WEB_BASE_URL="${EVAL_WEB_BASE_URL:-http://127.0.0.1:6010}"

RUN_DIR="artifacts/search_evaluation/tuning_runs/${RUN_NAME}"
# Only the parent directory is created here: the retry loop treats the
# existence of RUN_DIR itself as the signal to resume an earlier run.
mkdir -p "$(dirname "$RUN_DIR")"
#######################################
# Count completed non-seed trials recorded for the current run.
# Globals:   RUN_DIR (read) - directory that may contain trials.jsonl
# Outputs:   the number of records in trials.jsonl whose "status" is "ok"
#            and whose "is_seed" is missing or falsy; 0 when the file does
#            not exist
# Notes:     malformed lines (for example a record cut short when the
#            writer was killed mid-write) and non-object records are
#            skipped, so a partially written file does not make this
#            function fail and abort the script under `set -e`.
#######################################
count_live_successes() {
  python3 - "$RUN_DIR" <<'PY'
import json
import sys
from pathlib import Path

trials_path = Path(sys.argv[1]) / "trials.jsonl"
count = 0
if trials_path.is_file():
    # errors="replace" keeps a multi-byte character cut by an interrupted
    # write from raising UnicodeDecodeError; the damaged line then simply
    # fails JSON parsing and is skipped below.
    with trials_path.open(encoding="utf-8", errors="replace") as handle:
        # Iterating the file splits on real line endings only, unlike
        # str.splitlines(), which also breaks on separators such as U+2028
        # that may appear unescaped inside JSON strings.
        for raw in handle:
            raw = raw.strip()
            if not raw:
                continue
            try:
                record = json.loads(raw)
            except ValueError:
                # json.JSONDecodeError subclasses ValueError.
                continue
            if not isinstance(record, dict):
                continue
            if record.get("status") == "ok" and not record.get("is_seed"):
                count += 1
print(count)
PY
}
#######################################
# Emit the tune_fusion.py invocation as one shell-quoted line.
# Globals:   SEARCH_SPACE, SEED_REPORT, DATASET_ID, SEARCH_BASE_URL,
#            EVAL_WEB_BASE_URL, MAX_EVALS, BATCH_SIZE, CANDIDATE_POOL_SIZE,
#            RANDOM_SEED, BATCH_EVAL_TIMEOUT_SEC, RUN_NAME, RESUME_RUN_DIR
#            (all read)
# Outputs:   every argument formatted with printf %q and followed by a
#            space, then a newline
#######################################
build_cmd() {
  local -a argv
  argv=(python scripts/evaluation/tune_fusion.py)
  argv+=(--mode optimize)
  argv+=(--search-space "$SEARCH_SPACE")
  argv+=(--seed-report "$SEED_REPORT")
  argv+=(--tenant-id 163)
  argv+=(--dataset-id "$DATASET_ID")
  argv+=(--queries-file scripts/evaluation/queries/queries.txt)
  argv+=(--top-k 100)
  argv+=(--language en)
  argv+=(--search-base-url "$SEARCH_BASE_URL")
  argv+=(--eval-web-base-url "$EVAL_WEB_BASE_URL")
  argv+=(--max-evals "$MAX_EVALS")
  argv+=(--batch-size "$BATCH_SIZE")
  argv+=(--candidate-pool-size "$CANDIDATE_POOL_SIZE")
  argv+=(--random-seed "$RANDOM_SEED")
  argv+=(--batch-eval-timeout-sec "$BATCH_EVAL_TIMEOUT_SEC")

  # Resume an existing run when a directory is known; otherwise name a new one.
  if [[ -z "$RESUME_RUN_DIR" ]]; then
    argv+=(--run-name "$RUN_NAME")
  else
    argv+=(--resume-run "$RESUME_RUN_DIR")
  fi

  local token
  for token in "${argv[@]}"; do
    printf '%q ' "$token"
  done
  printf '\n'
}
# Supervisor loop: launch the tuner, and keep relaunching it until the trial
# log shows at least MAX_EVALS successful non-seed trials.
attempt=0
while :; do
  # Nothing to do when the target was already reached (e.g. by an earlier run).
  live_successes="$(count_live_successes)"
  if [ "$live_successes" -ge "$MAX_EVALS" ]; then
    printf '%s\n' "[resilient] complete run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
    exit 0
  fi

  attempt=$((attempt + 1))

  # As soon as the run directory exists, every launch resumes from it.
  if [[ -d "$RUN_DIR" ]]; then
    RESUME_RUN_DIR="$RUN_DIR"
  fi

  printf '%s\n' "[resilient] attempt=$attempt run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
  CMD_STR="$(build_cmd)"
  printf '%s\n' "[resilient] cmd=$CMD_STR"

  # Capture the tuner's exit status without letting `set -e` end the script.
  # NOTE(review): `bash -lc` starts a login shell, whose profile files may
  # alter the environment prepared by activate.sh -- confirm this is intended.
  exit_code=0
  bash -lc "$CMD_STR" || exit_code=$?

  live_successes="$(count_live_successes)"
  printf '%s\n' "[resilient] exit_code=$exit_code live_successes=$live_successes"
  if [ "$live_successes" -ge "$MAX_EVALS" ]; then
    printf '%s\n' "[resilient] finished after attempt=$attempt"
    exit 0
  fi

  printf '%s\n' "[resilient] sleeping ${RESTART_SLEEP_SEC}s before resume"
  sleep "$RESTART_SLEEP_SEC"
done