From 935f6e1b1df162953f7ae96de7ee27702eb8f2da Mon Sep 17 00:00:00 2001
From: tangwang <tangwang@essa.top>
Date: Fri, 24 Apr 2026 17:40:02 +0800
Subject: [PATCH] coarse_rank 搜参结果 **参数列表（4 套）** - `baseline`（top771 最优，`seed_baseline`）   - `es_bias: 10.0`, `es_exponent: 0.05`   - `text_bias: 0.1`, `text_exponent: 0.35`, `text_translation_weight: 1.0`   - `knn_text_weight: 1.0`, `knn_image_weight: 2.0`, `knn_tie_breaker: 0.3`   - `knn_bias: 0.2`, `knn_exponent: 5.6`   - `knn_text_bias: 0.2`, `knn_text_exponent: 0.0`   - `knn_image_bias: 0.2`, `knn_image_exponent: 0.0` - `54 条上得到的极端解`（`seed_legacy_bo234`）   - `es_bias: 7.214`, `es_exponent: 0.2025`   - `text_bias: 4.0`, `text_exponent: 1.584`, `text_translation_weight: 1.4441`   - `knn_text_weight: 0.1`, `knn_image_weight: 5.6232`, `knn_tie_breaker: 0.021`   - `knn_bias: 0.0019`, `knn_exponent: 11.8477`   - `knn_text_bias: 2.3125`, `knn_text_exponent: 1.1547`   - `knn_image_bias: 0.9641`, `knn_image_exponent: 5.8671` - `bo_012`（`Primary_Metric_Score=0.485027`）   - `es_bias: 6.6233`, `es_exponent: 0.2377`   - `text_bias: 0.049`, `text_exponent: 0.4446`, `text_translation_weight: 1.6236`   - `knn_text_weight: 1.0344`, `knn_image_weight: 1.3565`, `knn_tie_breaker: 0.212`   - `knn_bias: 0.0052`, `knn_exponent: 4.4639`   - `knn_text_bias: 0.1148`, `knn_text_exponent: 1.0926`   - `knn_image_bias: 0.0114`, `knn_image_exponent: 5.2496` - `bo_018`（`Primary_Metric_Score=0.484691`）   - `es_bias: 8.8861`, `es_exponent: 0.2794`   - `text_bias: 0.0189`, `text_exponent: 0.2`, `text_translation_weight: 1.7178`   - `knn_text_weight: 1.7459`, `knn_image_weight: 4.2658`, `knn_tie_breaker: 0.2814`   - `knn_bias: 0.001`, `knn_exponent: 1.4923`   - `knn_text_bias: 4.0`, `knn_text_exponent: 0.9309`   - `knn_image_bias: 0.01`, `knn_image_exponent: 5.8289`

---
 README.md                                                                                                      |   5 +++--
 artifacts/search_evaluation/tuning_launches/coarse_fusion_clothing_top771_knn_tail_20260424T093837Z.daemon.cmd |   1 +
 artifacts/search_evaluation/tuning_launches/coarse_fusion_clothing_top771_knn_tail_20260424T093837Z.daemon.pid |   1 +
 config/config.yaml                                                                                             |  10 +++++-----
 docs/issues/issue-2026-04-16-bayes寻参-clothing_top771数据集上寻参.md                                  | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scripts/evaluation/resume_coarse_fusion_tuning_knn_tail.sh                                                     |  72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scripts/evaluation/run_coarse_fusion_tuning_resilient.sh                                                       | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 scripts/evaluation/start_coarse_fusion_tuning_knn_tail.sh                                                      |  53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 scripts/evaluation/tuning/coarse_rank_fusion_space_clothing_top771_knn_tail.yaml                               |  82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 497 insertions(+), 9 deletions(-)
 create mode 100644 artifacts/search_evaluation/tuning_launches/coarse_fusion_clothing_top771_knn_tail_20260424T093837Z.daemon.cmd
 create mode 100644 artifacts/search_evaluation/tuning_launches/coarse_fusion_clothing_top771_knn_tail_20260424T093837Z.daemon.pid
 create mode 100755 scripts/evaluation/resume_coarse_fusion_tuning_knn_tail.sh
 create mode 100755 scripts/evaluation/start_coarse_fusion_tuning_knn_tail.sh
 create mode 100644 scripts/evaluation/tuning/coarse_rank_fusion_space_clothing_top771_knn_tail.yaml

diff --git a/README.md b/README.md
index 391e135..f796ef0 100644
--- a/README.md
+++ b/README.md
@@ -46,10 +46,11 @@ source activate.sh
 - `6002` backend（`/search/*`, `/admin/*`）
 - `6003` frontend
 - `6004` indexer（`/indexer/*`）
+- `6006` translator
 - `6005` embedding-text（可选，`POST /embed/text`；常见后端为 TEI，默认 `8080`）
-- `6006` translator（可选）
-- `6007` reranker（可选，`POST /rerank`；精排可与主重排分 `service_profile`，见 `config.yaml` → `fine_rank` / `services.rerank`）
 - `6008` embedding-image（可选，`POST /embed/image` 等）
+- `6007` reranker
+- `6009` fine_rank
 - `6010` eval-web（搜索评估 UI，`./scripts/service_ctl.sh` 服务名 `eval-web`）
 
 更完整示例见 `docs/QUICKSTART.md`。
diff --git a/artifacts/search_evaluation/tuning_launches/coarse_fusion_clothing_top771_knn_tail_20260424T093837Z.daemon.cmd b/artifacts/search_evaluation/tuning_launches/coarse_fusion_clothing_top771_knn_tail_20260424T093837Z.daemon.cmd
new file mode 100644
index 0000000..f755d66
--- /dev/null
+++ b/artifacts/search_evaluation/tuning_launches/coarse_fusion_clothing_top771_knn_tail_20260424T093837Z.daemon.cmd
@@ -0,0 +1 @@
+bash scripts/evaluation/run_coarse_fusion_tuning_resilient.sh coarse_fusion_clothing_top771_knn_tail_20260424T093837Z clothing_top771 20 2 96 20260424 scripts/evaluation/tuning/coarse_rank_fusion_space_clothing_top771_knn_tail.yaml '' 
diff --git a/artifacts/search_evaluation/tuning_launches/coarse_fusion_clothing_top771_knn_tail_20260424T093837Z.daemon.pid b/artifacts/search_evaluation/tuning_launches/coarse_fusion_clothing_top771_knn_tail_20260424T093837Z.daemon.pid
new file mode 100644
index 0000000..a9d2142
--- /dev/null
+++ b/artifacts/search_evaluation/tuning_launches/coarse_fusion_clothing_top771_knn_tail_20260424T093837Z.daemon.pid
@@ -0,0 +1 @@
+1355252
diff --git a/config/config.yaml b/config/config.yaml
index e280f2b..ebc54ca 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -247,11 +247,11 @@ coarse_rank:
     knn_image_weight: 2.0
     knn_tie_breaker: 0.3
     knn_bias: 0.6
-    knn_exponent: 0.4
-    knn_text_bias: 0.2
-    knn_text_exponent: 0.0
-    knn_image_bias: 0.2
-    knn_image_exponent: 0.0
+    knn_exponent: 4.4639
+    knn_text_bias: 0.1148
+    knn_text_exponent: 1.0926
+    knn_image_bias: 0.0114
+    knn_image_exponent: 5.2496
 fine_rank:
   enabled: false
   input_window: 160
diff --git a/docs/issues/issue-2026-04-16-bayes寻参-clothing_top771数据集上寻参.md b/docs/issues/issue-2026-04-16-bayes寻参-clothing_top771数据集上寻参.md
index a55c958..701666a 100644
--- a/docs/issues/issue-2026-04-16-bayes寻参-clothing_top771数据集上寻参.md
+++ b/docs/issues/issue-2026-04-16-bayes寻参-clothing_top771数据集上寻参.md
@@ -87,3 +87,175 @@ Prompt - 1
 
 
 Response - 1
+跑完了，这个 `run_dir` 已经完成，没有在继续运行的调参进程。
+
+结果目录：
+- [summary.md](/data/saas-search/artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/summary.md)
+- [leaderboard.csv](/data/saas-search/artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/leaderboard.csv)
+- [trials.jsonl](/data/saas-search/artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/trials.jsonl)
+
+**结论**
+1. 这轮 18 个 live eval 全部跑满了。  
+   见 [summary.md](/data/saas-search/artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/summary.md:11)
+
+2. 大集 `clothing_top771` 上，最优仍然是 fusion-only baseline，`Primary_Metric_Score=0.487476`。  
+   见 [summary.md](/data/saas-search/artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/summary.md:17)
+
+3. 这轮优化器已经很接近 baseline，但没有超过。  
+   最接近的是：
+   - `bo_012 = 0.485027`
+   - `bo_018 = 0.484691`
+   - `bo_013 = 0.483849`  
+   与 baseline 的差距分别是 `-0.002449`、`-0.002785`、`-0.003627`
+
+4. 小集上那种“极端参数”没有迁移成功。  
+   典型例子：
+   - `seed_low_knn_global = 0.447485`
+   - `seed_legacy_bo340 = 0.477537`
+   - `seed_bigset_text_stable = 0.449802`  
+   说明大集不支持此前那类明显更激进的形态。
+
+**怎么解读这轮**
+这轮最重要的收获不是“找到新最优”，而是把大集上的有效区间摸清了。
+
+大集上表现较好的候选，虽然具体值不同，但有几个共同点：
+
+1. `text_bias` 普遍很低。  
+   排名前几的候选大多在 `0.02 ~ 0.10` 附近，baseline 的 `0.1` 并不吃亏。  
+   这和小集里 `text_bias=4.0` 的方向明显相反。
+
+2. `es_exponent` 往往比 baseline 更高。  
+   baseline 是 `0.05`，而靠前候选很多在 `0.15 ~ 0.28`。  
+   这说明大集上 lexical 这一路适度增强是有价值的，但提升幅度还不足以抵消其它项的副作用。
+
+3. `text_translation_weight` 往往高于 baseline。  
+   靠前候选多数在 `1.3 ~ 1.7`，比 baseline 的 `1.0` 明显更高。  
+   这是这轮最值得保留的信号之一。
+
+4. `knn_image_weight` 可以比 baseline 更高，但不能只靠“强图像+极端非线性”硬推。  
+   很多接近最优的点把 `knn_image_weight` 提到了 `4+`，但最终仍没超过 baseline。  
+   说明 image 通路能帮忙，但必须和其它项配平。
+
+5. `knn_bias` 低值是一个信号，但不是充分条件。  
+   不少高分候选把 `knn_bias` 压得很低，甚至接近 `0`。  
+   但单独走“低 knn_bias + 低 knn_exponent”的路线并不好，`seed_low_knn_global` 已经证伪了这一点。
+
+6. `knn_text_exponent` 和 `knn_image_exponent` 从 `0` 放开后，能接近 baseline，但还没形成稳定优势。  
+   靠前候选里这两个参数经常被拉高，说明它们不是完全无用；但从结果看，它们更像是“补偿项”，不是决定性增益项。
+
+**对上次担心的过拟合，这轮能下什么判断**
+可以比较明确地说：
+
+- 小集 54 条上得到的极端解，确实存在明显过拟合迹象。
+- 到 771 条大集后，最稳的仍是原 baseline 一类的“温和融合”方案。
+- 但是 baseline 也不是绝对最优形态，至少有两类方向值得继续缩圈：
+  - 更高的 `es_exponent`
+  - 更高的 `text_translation_weight`
+
+**下一轮缩圈建议**
+我建议不要再做同样宽度的 14 维长跑了，信息已经够了。下一轮可以做“baseline 附近精修”。
+
+建议缩圈到这组范围：
+
+- `es_bias: 7.5 ~ 11.5`
+- `es_exponent: 0.12 ~ 0.28`
+- `text_bias: 0.01 ~ 0.15`
+- `text_exponent: 0.20 ~ 0.55`
+- `text_translation_weight: 1.20 ~ 1.75`
+- `knn_text_weight: 0.75 ~ 1.35`
+- `knn_image_weight: 1.2 ~ 4.8`
+- `knn_tie_breaker: 0.18 ~ 0.32`
+- `knn_bias: 0.001 ~ 0.25`
+- `knn_exponent: 3.5 ~ 8.0`
+- `knn_text_bias: 0.001 ~ 0.3`
+- `knn_text_exponent: 0.4 ~ 1.3`
+- `knn_image_bias: 0.001 ~ 0.35`
+- `knn_image_exponent: 2.0 ~ 5.8`
+
+如果要更激进地降成本，我建议直接冻结或大幅收窄这几个：
+- `text_bias=0.1`
+- `knn_tie_breaker=0.24~0.30` 小范围
+- `knn_text_bias` 缩很小
+- `knn_image_bias` 缩很小
+
+然后重点打这 6 个：
+- `es_exponent`
+- `text_translation_weight`
+- `knn_image_weight`
+- `knn_bias`
+- `knn_exponent`
+- `knn_text_exponent`
+
+**额外发现**
+流程上还有一个问题，这轮后半段 `bo_018` 在成功前重复失败了很多次，主要是：
+- `build_annotation_set.py batch` 非零退出
+- 少量 `restart.sh backend` 非零退出
+
+不过守护逻辑最终还是把任务跑完了。这个从 [summary.md](/data/saas-search/artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/summary.md:30) 的 failures 和 daemon log 都能看到。说明“自动续跑”已起作用，但评测链路本身还不够干净。
+
+如果你愿意，我下一步可以直接继续两件事里的一个：
+1. 按这轮结论，给你生成一个 `clothing_top771_v2` 缩圈搜索空间并启动下一轮
+2. 先处理 `build_annotation_set.py batch` 偶发失败的问题，减少下一轮长跑中的无效重试
+
+
+
+**参数列表（4 套）**
+- `baseline`（top771 最优，`seed_baseline`）  
+  - `es_bias: 10.0`, `es_exponent: 0.05`  
+  - `text_bias: 0.1`, `text_exponent: 0.35`, `text_translation_weight: 1.0`  
+  - `knn_text_weight: 1.0`, `knn_image_weight: 2.0`, `knn_tie_breaker: 0.3`  
+  - `knn_bias: 0.2`, `knn_exponent: 5.6`  
+  - `knn_text_bias: 0.2`, `knn_text_exponent: 0.0`  
+  - `knn_image_bias: 0.2`, `knn_image_exponent: 0.0`
+- `54 条上得到的极端解`（`seed_legacy_bo234`）  
+  - `es_bias: 7.214`, `es_exponent: 0.2025`  
+  - `text_bias: 4.0`, `text_exponent: 1.584`, `text_translation_weight: 1.4441`  
+  - `knn_text_weight: 0.1`, `knn_image_weight: 5.6232`, `knn_tie_breaker: 0.021`  
+  - `knn_bias: 0.0019`, `knn_exponent: 11.8477`  
+  - `knn_text_bias: 2.3125`, `knn_text_exponent: 1.1547`  
+  - `knn_image_bias: 0.9641`, `knn_image_exponent: 5.8671`
+- `bo_012`（`Primary_Metric_Score=0.485027`）  
+  - `es_bias: 6.6233`, `es_exponent: 0.2377`  
+  - `text_bias: 0.049`, `text_exponent: 0.4446`, `text_translation_weight: 1.6236`  
+  - `knn_text_weight: 1.0344`, `knn_image_weight: 1.3565`, `knn_tie_breaker: 0.212`  
+  - `knn_bias: 0.0052`, `knn_exponent: 4.4639`  
+  - `knn_text_bias: 0.1148`, `knn_text_exponent: 1.0926`  
+  - `knn_image_bias: 0.0114`, `knn_image_exponent: 5.2496`
+- `bo_018`（`Primary_Metric_Score=0.484691`）  
+  - `es_bias: 8.8861`, `es_exponent: 0.2794`  
+  - `text_bias: 0.0189`, `text_exponent: 0.2`, `text_translation_weight: 1.7178`  
+  - `knn_text_weight: 1.7459`, `knn_image_weight: 4.2658`, `knn_tie_breaker: 0.2814`  
+  - `knn_bias: 0.001`, `knn_exponent: 1.4923`  
+  - `knn_text_bias: 4.0`, `knn_text_exponent: 0.9309`  
+  - `knn_image_bias: 0.01`, `knn_image_exponent: 5.8289`
+
+**怎么找（可复现）**
+- 从 `leaderboard.csv` 找（含分数+参数一行全）：`artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/leaderboard.csv`
+  - 例：`rg '^2,bo_012,' artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/leaderboard.csv`
+- 从 `trials.jsonl` 找（最权威，调参器实际写入的 params）：`artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/trials.jsonl`
+  - 例：`rg '"name": "bo_012"' artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/trials.jsonl`
+  - 例：`rg '"name": "seed_legacy_bo234"' artifacts/search_evaluation/tuning_runs/coarse_fusion_clothing_top771_resilient_20260422T091650Z/trials.jsonl`
+
+**已补到 `config.yaml`**
+- 我已把这 4 套参数作为“注释 presets”补在 `coarse_rank.fusion` 旁边：`config/config.yaml:236`
+- 注意：你当前 `config/config.yaml` 里 `coarse_rank.fusion` 的生效值是 `knn_bias=0.6 / knn_exponent=0.4`，更像 `seed_low_knn_global`，不是本次大集最优的 baseline。
+
+
+
+
+
+
+# request 2
+
+
+固定这些参数（不再参与寻参）：
+es_bias: 6.62, es_exponent: 0.24
+text_bias: 0.05, text_exponent: 0.445, text_translation_weight: 1.0
+knn_text_weight: 1.0, knn_image_weight: 1.35, knn_tie_breaker: 0.212
+knn_bias: 0.0052
+
+然后对以下参数进行寻参：
+knn_exponent (0.3~6.0)
+knn_text_bias (0.0~0.3), knn_text_exponent (0.2~3.0)
+knn_image_bias (0.0~0.3), knn_image_exponent (1.0~7.0)
+设计好搜参脚本后跑起来，注意程序启动起来之后要检测是否运行稳定了，确保可以长时间运行直到全部跑完
\ No newline at end of file
diff --git a/scripts/evaluation/resume_coarse_fusion_tuning_knn_tail.sh b/scripts/evaluation/resume_coarse_fusion_tuning_knn_tail.sh
new file mode 100755
index 0000000..9b30bf9
--- /dev/null
+++ b/scripts/evaluation/resume_coarse_fusion_tuning_knn_tail.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Resume a detached coarse-fusion tuning run: usage `resume_coarse_fusion_tuning_knn_tail.sh <run_dir_or_name>`.
+set -euo pipefail
+
+if [ "$#" -lt 1 ]; then
+  echo "usage: $0 <run_dir_or_name>" >&2
+  exit 1
+fi
+
+cd "$(dirname "$0")/../.."  # work from the directory two levels above this script
+source ./activate.sh
+
+TARGET="$1"
+# TARGET is either an existing directory or a run name; the -d test runs after the cd above, so relative paths resolve from there.
+if [ -d "${TARGET}" ]; then
+  RUN_DIR="${TARGET}"
+  RUN_NAME="$(basename "${RUN_DIR}")"
+else
+  RUN_NAME="${TARGET}"
+  RUN_DIR="artifacts/search_evaluation/tuning_runs/${RUN_NAME}"
+fi
+
+if [ ! -d "${RUN_DIR}" ]; then
+  echo "run dir not found: ${RUN_DIR}" >&2
+  exit 1
+fi
+# Settings come from the environment with these defaults. NOTE(review): they are not read back from the original launch — confirm they match.
+DATASET_ID="${REPO_EVAL_DATASET_ID:-clothing_top771}"
+MAX_EVALS="${MAX_EVALS:-20}"
+BATCH_SIZE="${BATCH_SIZE:-2}"
+CANDIDATE_POOL_SIZE="${CANDIDATE_POOL_SIZE:-96}"
+RANDOM_SEED="${RANDOM_SEED:-20260424}"
+BATCH_EVAL_TIMEOUT_SEC="${BATCH_EVAL_TIMEOUT_SEC:-0}"
+
+LAUNCH_DIR="artifacts/search_evaluation/tuning_launches"
+mkdir -p "${LAUNCH_DIR}"
+LOG_PATH="${LAUNCH_DIR}/${RUN_NAME}.daemon.log"
+PID_PATH="${LAUNCH_DIR}/${RUN_NAME}.daemon.pid"
+CMD_PATH="${LAUNCH_DIR}/${RUN_NAME}.daemon.cmd"
+
+CMD=(
+  bash
+  scripts/evaluation/run_coarse_fusion_tuning_resilient.sh
+  "${RUN_NAME}"
+  "${DATASET_ID}"
+  "${MAX_EVALS}"
+  "${BATCH_SIZE}"
+  "${CANDIDATE_POOL_SIZE}"
+  "${RANDOM_SEED}"
+  "${RUN_DIR}/search_space.yaml"
+  ""
+  "${RUN_DIR}"
+)
+
+export BATCH_EVAL_TIMEOUT_SEC
+
+printf '%q ' "${CMD[@]}" > "${CMD_PATH}"  # %q shell-quotes each argument so the saved line can be pasted back into a shell
+printf '\n' >> "${CMD_PATH}"
+# Append (>>) instead of truncating: the daemon log written by earlier attempts of this run must survive a resume.
+setsid "${CMD[@]}" >> "${LOG_PATH}" 2>&1 < /dev/null &
+PID=$!
+echo "${PID}" > "${PID_PATH}"
+
+echo "run_name=${RUN_NAME}"
+echo "pid=${PID}"
+echo "log=${LOG_PATH}"
+echo "pid_file=${PID_PATH}"
+echo "cmd_file=${CMD_PATH}"
+echo "run_dir=${RUN_DIR}"
+echo
+echo "tail -f ${LOG_PATH}"
+echo "cat ${RUN_DIR}/leaderboard.csv"
diff --git a/scripts/evaluation/run_coarse_fusion_tuning_resilient.sh b/scripts/evaluation/run_coarse_fusion_tuning_resilient.sh
index 406face..fc2e464 100755
--- a/scripts/evaluation/run_coarse_fusion_tuning_resilient.sh
+++ b/scripts/evaluation/run_coarse_fusion_tuning_resilient.sh
@@ -29,9 +29,48 @@ RESTART_SLEEP_SEC="${RESTART_SLEEP_SEC:-30}"
 SEARCH_BASE_URL="${SEARCH_BASE_URL:-http://127.0.0.1:6002}"
 EVAL_WEB_BASE_URL="${EVAL_WEB_BASE_URL:-http://127.0.0.1:6010}"
 RUN_DIR="artifacts/search_evaluation/tuning_runs/${RUN_NAME}"
+LOCK_DIR="${RUN_DIR}/.resilient_lock"
+HEALTH_POLL_SEC="${HEALTH_POLL_SEC:-15}"
 
 mkdir -p "$(dirname "$RUN_DIR")"
 
+release_lock() {  # Remove the lock directory, but only when its pid file names this process ($$).
+  if [ -d "$LOCK_DIR" ] && [ -f "$LOCK_DIR/pid" ] && [ "$(cat "$LOCK_DIR/pid" 2>/dev/null || true)" = "$$" ]; then
+    rm -rf "$LOCK_DIR"
+  fi
+}
+# Take the per-run lock: returns 0 on success, exits 0 if a live pid already owns it, exits 1 if it cannot be taken.
+acquire_lock() {
+  mkdir -p "$RUN_DIR"
+  if mkdir "$LOCK_DIR" 2>/dev/null; then  # plain mkdir fails when the directory already exists
+    echo "$$" > "$LOCK_DIR/pid"
+    date -u +%Y-%m-%dT%H:%M:%SZ > "$LOCK_DIR/started_at"
+    return 0
+  fi
+  # The lock directory exists: check whether the pid recorded in it is still running.
+  local owner_pid=""
+  if [ -f "$LOCK_DIR/pid" ]; then
+    owner_pid="$(cat "$LOCK_DIR/pid" 2>/dev/null || true)"
+  fi
+  if [ -n "$owner_pid" ] && kill -0 "$owner_pid" 2>/dev/null; then
+    echo "[resilient] lock already held by pid=${owner_pid}, exiting"
+    exit 0
+  fi
+  # No pid recorded, or kill -0 failed: treat the lock as stale, remove it and try once more.
+  echo "[resilient] removing stale lock at ${LOCK_DIR}"
+  rm -rf "$LOCK_DIR"
+  if mkdir "$LOCK_DIR" 2>/dev/null; then
+    echo "$$" > "$LOCK_DIR/pid"
+    date -u +%Y-%m-%dT%H:%M:%SZ > "$LOCK_DIR/started_at"
+    return 0
+  fi
+  # Second mkdir failed as well (for example another process re-created the lock first): give up.
+  echo "[resilient] failed to acquire lock at ${LOCK_DIR}"
+  exit 1
+}
+# Release only on EXIT. INT/TERM must call exit themselves: a handler that returns lets bash resume the loop without the lock.
+trap release_lock EXIT; trap 'exit 130' INT; trap 'exit 143' TERM
+
 count_live_successes() {
   python3 - "$RUN_DIR" <<'PY'
 import json
@@ -53,13 +92,61 @@ print(count)
 PY
 }
 
+wait_for_health() {  # Poll URL $1 every 2s; return 0 on the first successful curl, 1 once $2 seconds have elapsed.
+  local url="$1"
+  local timeout_sec="$2"
+  local deadline=$(( $(date +%s) + timeout_sec ))
+  while [ "$(date +%s)" -lt "$deadline" ]; do
+    if curl -fsS --max-time "${HEALTH_CURL_TIMEOUT_SEC:-10}" "$url" >/dev/null 2>&1; then  # bound each probe so a hung connection cannot block past the deadline
+      return 0
+    fi
+    sleep 2
+  done
+  return 1
+}
+
+ensure_services() {  # Preflight for backend and eval-web: restart whichever is down; return 1 if either stays down, else 0.
+  if ! wait_for_health "${SEARCH_BASE_URL}/health" 20; then  # 20s grace period before a restart is attempted
+    echo "[resilient] backend unhealthy, restarting backend"
+    ./restart.sh backend || true  # a failed restart is tolerated here; the 180s wait below decides the outcome
+    sleep 5
+  fi
+  if ! wait_for_health "${SEARCH_BASE_URL}/health" 180; then
+    echo "[resilient] backend still unhealthy after restart"
+    return 1
+  fi
+
+  if ! wait_for_health "${EVAL_WEB_BASE_URL}/api/history" 20; then  # eval-web is probed through its /api/history endpoint
+    echo "[resilient] eval-web unhealthy, restarting eval-web"
+    ./restart.sh eval-web || true  # same best-effort restart as for the backend
+    sleep 5
+  fi
+  if ! wait_for_health "${EVAL_WEB_BASE_URL}/api/history" 180; then
+    echo "[resilient] eval-web still unhealthy after restart"
+    return 1
+  fi
+  return 0
+}
+
+heal_services_nonblocking() {  # Probe each service once; restart whichever fails, without waiting for it to recover.
+  if ! curl -fsS --max-time "${HEALTH_CURL_TIMEOUT_SEC:-10}" "${SEARCH_BASE_URL}/health" >/dev/null 2>&1; then  # bounded probe: a hung request must not stall the caller
+    echo "[resilient] backend became unhealthy during run, restarting backend"
+    ./restart.sh backend || true  # NOTE(review): may race with a backend restart issued by the tuning child itself — confirm
+    sleep 5
+  fi
+  if ! curl -fsS --max-time "${HEALTH_CURL_TIMEOUT_SEC:-10}" "${EVAL_WEB_BASE_URL}/api/history" >/dev/null 2>&1; then  # same bound for eval-web
+    echo "[resilient] eval-web became unhealthy during run, restarting eval-web"
+    ./restart.sh eval-web || true
+    sleep 5
+  fi
+}
+
 build_cmd() {
   local cmd=(
     python
     scripts/evaluation/tune_fusion.py
     --mode optimize
     --search-space "$SEARCH_SPACE"
-    --seed-report "$SEED_REPORT"
     --tenant-id 163
     --dataset-id "$DATASET_ID"
     --queries-file scripts/evaluation/queries/queries.txt
@@ -73,6 +160,9 @@ build_cmd() {
     --random-seed "$RANDOM_SEED"
     --batch-eval-timeout-sec "$BATCH_EVAL_TIMEOUT_SEC"
   )
+  if [ -n "$SEED_REPORT" ]; then
+    cmd+=(--seed-report "$SEED_REPORT")
+  fi
   if [ -n "$RESUME_RUN_DIR" ]; then
     cmd+=(--resume-run "$RESUME_RUN_DIR")
   else
@@ -83,6 +173,7 @@ build_cmd() {
 }
 
 attempt=0
+acquire_lock
 while true; do
   live_successes="$(count_live_successes)"
   if [ "$live_successes" -ge "$MAX_EVALS" ]; then
@@ -96,11 +187,23 @@ while true; do
   fi
 
   echo "[resilient] attempt=$attempt run_name=$RUN_NAME live_successes=$live_successes target=$MAX_EVALS"
+  if ! ensure_services; then
+    echo "[resilient] service preflight failed, sleeping ${RESTART_SLEEP_SEC}s before retry"
+    sleep "$RESTART_SLEEP_SEC"
+    continue
+  fi
   CMD_STR="$(build_cmd)"
   echo "[resilient] cmd=$CMD_STR"
 
   set +e
-  bash -lc "$CMD_STR"
+  bash -lc "$CMD_STR" &
+  child_pid=$!
+  echo "[resilient] child_pid=${child_pid}"
+  while kill -0 "$child_pid" 2>/dev/null; do
+    heal_services_nonblocking
+    sleep "$HEALTH_POLL_SEC"
+  done
+  wait "$child_pid"
   exit_code=$?
   set -e
 
@@ -112,6 +215,9 @@ while true; do
     exit 0
   fi
 
+  if ! ensure_services; then
+    echo "[resilient] service recovery failed after exit_code=$exit_code"
+  fi
   echo "[resilient] sleeping ${RESTART_SLEEP_SEC}s before resume"
   sleep "$RESTART_SLEEP_SEC"
 done
diff --git a/scripts/evaluation/start_coarse_fusion_tuning_knn_tail.sh b/scripts/evaluation/start_coarse_fusion_tuning_knn_tail.sh
new file mode 100755
index 0000000..c80805f
--- /dev/null
+++ b/scripts/evaluation/start_coarse_fusion_tuning_knn_tail.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Launch a new detached coarse-fusion tuning run through run_coarse_fusion_tuning_resilient.sh and print where to follow it.
+set -euo pipefail
+
+cd "$(dirname "$0")/../.."  # work from the directory two levels above this script
+source ./activate.sh
+
+RUN_NAME="${RUN_NAME:-coarse_fusion_clothing_top771_knn_tail_$(date -u +%Y%m%dT%H%M%SZ)}"  # default name carries a UTC timestamp
+DATASET_ID="${REPO_EVAL_DATASET_ID:-clothing_top771}"
+SEARCH_SPACE="${SEARCH_SPACE:-scripts/evaluation/tuning/coarse_rank_fusion_space_clothing_top771_knn_tail.yaml}"
+MAX_EVALS="${MAX_EVALS:-20}"
+BATCH_SIZE="${BATCH_SIZE:-2}"
+CANDIDATE_POOL_SIZE="${CANDIDATE_POOL_SIZE:-96}"
+RANDOM_SEED="${RANDOM_SEED:-20260424}"
+BATCH_EVAL_TIMEOUT_SEC="${BATCH_EVAL_TIMEOUT_SEC:-0}"
+# Per-run bookkeeping files: daemon log, pid, and the exact command line that was started.
+LAUNCH_DIR="artifacts/search_evaluation/tuning_launches"
+mkdir -p "${LAUNCH_DIR}"
+LOG_PATH="${LAUNCH_DIR}/${RUN_NAME}.daemon.log"
+PID_PATH="${LAUNCH_DIR}/${RUN_NAME}.daemon.pid"
+CMD_PATH="${LAUNCH_DIR}/${RUN_NAME}.daemon.cmd"
+# Positional arguments for the runner; the trailing "" is presumably the seed-report slot — TODO confirm against the runner's argument parsing.
+CMD=(
+  bash
+  scripts/evaluation/run_coarse_fusion_tuning_resilient.sh
+  "${RUN_NAME}"
+  "${DATASET_ID}"
+  "${MAX_EVALS}"
+  "${BATCH_SIZE}"
+  "${CANDIDATE_POOL_SIZE}"
+  "${RANDOM_SEED}"
+  "${SEARCH_SPACE}"
+  ""
+)
+
+export BATCH_EVAL_TIMEOUT_SEC  # exported rather than passed as an argument, so the runner can read it from the environment
+
+printf '%q ' "${CMD[@]}" > "${CMD_PATH}"  # %q shell-quotes each argument so the saved line can be pasted back into a shell
+printf '\n' >> "${CMD_PATH}"
+# Start the runner in its own session in the background, with stdin closed and all output sent to the log.
+setsid "${CMD[@]}" > "${LOG_PATH}" 2>&1 < /dev/null &
+PID=$!
+echo "${PID}" > "${PID_PATH}"
+
+echo "run_name=${RUN_NAME}"
+echo "pid=${PID}"
+echo "log=${LOG_PATH}"
+echo "pid_file=${PID_PATH}"
+echo "cmd_file=${CMD_PATH}"
+echo "run_dir=artifacts/search_evaluation/tuning_runs/${RUN_NAME}"
+echo
+echo "tail -f ${LOG_PATH}"
+echo "cat artifacts/search_evaluation/tuning_runs/${RUN_NAME}/leaderboard.csv"
diff --git a/scripts/evaluation/tuning/coarse_rank_fusion_space_clothing_top771_knn_tail.yaml b/scripts/evaluation/tuning/coarse_rank_fusion_space_clothing_top771_knn_tail.yaml
new file mode 100644
index 0000000..bf3357e
--- /dev/null
+++ b/scripts/evaluation/tuning/coarse_rank_fusion_space_clothing_top771_knn_tail.yaml
@@ -0,0 +1,82 @@
+target_path: coarse_rank.fusion
+# Search space for the "knn tail" run: nine fusion parameters stay fixed, the five knn_* tail parameters are searched.
+baseline:
+  es_bias: 6.62  # es_bias through knn_bias are held fixed: they are not listed under `parameters` below
+  es_exponent: 0.24
+  text_bias: 0.05
+  text_exponent: 0.445
+  text_translation_weight: 1.0
+  knn_text_weight: 1.0
+  knn_image_weight: 1.35
+  knn_tie_breaker: 0.212
+  knn_bias: 0.0052
+  knn_exponent: 4.4639  # the five values from here down equal the seed_fixed_anchor params and are the ones being searched
+  knn_text_bias: 0.1148
+  knn_text_exponent: 1.0926
+  knn_image_bias: 0.0114
+  knn_image_exponent: 5.2496
+
+parameters:  # searched dimensions; `round` presumably sets decimal places of sampled values — TODO confirm in tune_fusion.py
+  knn_exponent: {min: 0.3, max: 6.0, scale: linear, round: 4}
+  knn_text_bias: {min: 0.0, max: 0.3, scale: linear, round: 4}
+  knn_text_exponent: {min: 0.2, max: 3.0, scale: linear, round: 4}
+  knn_image_bias: {min: 0.0, max: 0.3, scale: linear, round: 4}
+  knn_image_exponent: {min: 1.0, max: 7.0, scale: linear, round: 4}
+
+seed_experiments:  # hand-picked starting points; every value lies inside the ranges declared under `parameters`
+  - name: seed_fixed_anchor
+    description: 以 bo_012 的 5 维 knn 子项为锚点，在新固定参数下验证迁移。
+    params:
+      knn_exponent: 4.4639
+      knn_text_bias: 0.1148
+      knn_text_exponent: 1.0926
+      knn_image_bias: 0.0114
+      knn_image_exponent: 5.2496
+  - name: seed_knn_soft
+    description: 更平滑的全局 knn 指数，保留较强 image 子项指数。
+    params:
+      knn_exponent: 1.2
+      knn_text_bias: 0.06
+      knn_text_exponent: 0.9
+      knn_image_bias: 0.02
+      knn_image_exponent: 5.4
+  - name: seed_knn_balanced
+    description: 中等 knn 指数和中等子项非线性，作为稳健中心点。
+    params:
+      knn_exponent: 2.8
+      knn_text_bias: 0.12
+      knn_text_exponent: 1.4
+      knn_image_bias: 0.05
+      knn_image_exponent: 4.2
+  - name: seed_knn_high
+    description: 更高的全局 knn 指数，检查大集是否仍偏好更陡的 top-rank 强化。
+    params:
+      knn_exponent: 5.6
+      knn_text_bias: 0.04
+      knn_text_exponent: 0.8
+      knn_image_bias: 0.03
+      knn_image_exponent: 5.0
+  - name: seed_text_branch_heavier
+    description: 提高 knn_text 子项偏置和指数，观察 text/image 子项间的平衡点。
+    params:
+      knn_exponent: 3.6
+      knn_text_bias: 0.22
+      knn_text_exponent: 2.2
+      knn_image_bias: 0.01
+      knn_image_exponent: 3.2
+  - name: seed_image_branch_heavier
+    description: 提高 knn_image 子项偏置和指数，检查 image 通路在当前固定主参数下的上限。
+    params:
+      knn_exponent: 3.4
+      knn_text_bias: 0.03
+      knn_text_exponent: 0.6
+      knn_image_bias: 0.16
+      knn_image_exponent: 6.2
+
+optimizer:  # optimizer settings; their exact meaning is defined by the tuning script, which is not part of this file
+  init_random: 2
+  candidate_pool_size: 96  # NOTE(review): the launcher scripts also pass a pool size (default 96) — confirm which one takes effect
+  explore_probability: 0.14
+  local_jitter_probability: 0.62
+  elite_fraction: 0.3
+  min_normalized_distance: 0.06
--
libgit2 0.21.2