Commit dba5764289827a82201dafa605d531411cb5b24f

Authored by tangwang
1 parent 47452e1d

bayes调参计划

artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.cmd 0 → 100644
@@ -0,0 +1 @@ @@ -0,0 +1 @@
  1 +python scripts/evaluation/tune_fusion.py --mode optimize --run-name coarse_fusion_long_001 --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md --tenant-id 163 --queries-file scripts/evaluation/queries/queries.txt --top-k 100 --language en --search-base-url http://127.0.0.1:6002 --eval-web-base-url http://127.0.0.1:6010 --max-evals 400 --batch-size 3 --candidate-pool-size 512 --random-seed 20260416
artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.pid 0 → 100644
@@ -0,0 +1 @@ @@ -0,0 +1 @@
  1 +2218620
config/config.yaml
@@ -256,9 +256,11 @@ coarse_rank: @@ -256,9 +256,11 @@ coarse_rank:
256 knn_text_weight: 1.0 256 knn_text_weight: 1.0
257 knn_image_weight: 2.0 257 knn_image_weight: 2.0
258 knn_tie_breaker: 0.3 258 knn_tie_breaker: 0.3
259 - knn_bias: 0.0 259 + knn_bias: 0.2
260 knn_exponent: 5.6 260 knn_exponent: 5.6
  261 + knn_text_bias: 0.2
261 knn_text_exponent: 0.0 262 knn_text_exponent: 0.0
  263 + knn_image_bias: 0.2
262 knn_image_exponent: 0.0 264 knn_image_exponent: 0.0
263 fine_rank: 265 fine_rank:
264 enabled: false # false 时保序透传 266 enabled: false # false 时保序透传
@@ -649,4 +651,4 @@ tenant_config: @@ -649,4 +651,4 @@ tenant_config:
649 primary_language: en 651 primary_language: en
650 index_languages: 652 index_languages:
651 - en 653 - en
652 - - zh 654 - - zh
  655 + - zh
653 \ No newline at end of file 656 \ No newline at end of file
docs/caches-inventory.md
@@ -96,9 +96,22 @@ @@ -96,9 +96,22 @@
96 | `scripts/redis/redis_cache_prefix_stats.py` | 按前缀统计 key 数量与 **MEMORY USAGE**(可多 DB) | 96 | `scripts/redis/redis_cache_prefix_stats.py` | 按前缀统计 key 数量与 **MEMORY USAGE**(可多 DB) |
97 | `scripts/redis/redis_memory_heavy_keys.py` | 扫描占用内存最大的 key,辅助排查「统计与总内存不一致」 | 97 | `scripts/redis/redis_memory_heavy_keys.py` | 扫描占用内存最大的 key,辅助排查「统计与总内存不一致」 |
98 | `scripts/redis/monitor_eviction.py` | 实时监控 **eviction** 相关事件,用于容量与驱逐策略排查 | 98 | `scripts/redis/monitor_eviction.py` | 实时监控 **eviction** 相关事件,用于容量与驱逐策略排查 |
  99 +| `scripts/redis/purge_caches.py` | 一键清空业务缓存:embedding(含 `:image:` / `:clip_text:`)、anchors、translation;**默认跳过 `trans:deepl*`**(可 dry-run 预览) |
99 100
100 使用前需加载项目配置(如 `source activate.sh`)以保证 `REDIS_CONFIG` 与生产一致。脚本注释中给出了 **`redis-cli` 手工统计**示例(按前缀 `wc -l`、`MEMORY STATS` 等)。 101 使用前需加载项目配置(如 `source activate.sh`)以保证 `REDIS_CONFIG` 与生产一致。脚本注释中给出了 **`redis-cli` 手工统计**示例(按前缀 `wc -l`、`MEMORY STATS` 等)。
101 102
  103 +### 快速清空(排除 `trans:deepl*`)
  104 +
  105 +```bash
  106 +source activate.sh
  107 +
  108 +# 先预览会删多少 key(推荐)
  109 +python scripts/redis/purge_caches.py --dry-run
  110 +
  111 +# 真正删除(默认 db=0)
  112 +python scripts/redis/purge_caches.py
  113 +```
  114 +
102 --- 115 ---
103 116
104 ## 六、总表(Redis 与各层缓存) 117 ## 六、总表(Redis 与各层缓存)
@@ -106,8 +119,8 @@ @@ -106,8 +119,8 @@
106 | 缓存名称 | 业务模块 | 存储 | Key 前缀 / 命名模式 | 过期时间 | 过期策略 | 值摘要 | 配置键 / 环境变量 | 119 | 缓存名称 | 业务模块 | 存储 | Key 前缀 / 命名模式 | 过期时间 | 过期策略 | 值摘要 | 配置键 / 环境变量 |
107 |----------|----------|------|---------------------|----------|----------|--------|-------------------| 120 |----------|----------|------|---------------------|----------|----------|--------|-------------------|
108 | 文本向量 | 检索 / 索引 / Embedding 服务 | Redis db≈0 | `{embedding_cache_prefix}:*`(逻辑键以 `embed:norm…` 开头) | `cache_expire_days`(默认 720 天) | 写入 TTL + 命中滑动续期 | BF16 字节向量 | `infrastructure.redis.*`;`REDIS_EMBEDDING_CACHE_PREFIX`、`REDIS_CACHE_EXPIRE_DAYS` | 121 | 文本向量 | 检索 / 索引 / Embedding 服务 | Redis db≈0 | `{embedding_cache_prefix}:*`(逻辑键以 `embed:norm…` 开头) | `cache_expire_days`(默认 720 天) | 写入 TTL + 命中滑动续期 | BF16 字节向量 | `infrastructure.redis.*`;`REDIS_EMBEDDING_CACHE_PREFIX`、`REDIS_CACHE_EXPIRE_DAYS` |
109 -| 图像向量(CLIP 图) | 图搜 / 多模态 | 同上 | `{prefix}:image:*` | 同上 | 同上 | BF16 字节 | 同上 |  
110 -| CLIP 文本塔向量 | 图搜文本侧 | 同上 | `{prefix}:clip_text:*` | 同上 | 同上 | BF16 字节 | 同上 | 122 +| 图像向量(CLIP 图) | 图搜 / 多模态 | 同上 | `{embedding_cache_prefix}:image:*`(其中 `{embedding_cache_prefix}` 默认 `embedding`) | 同上 | 同上 | BF16 字节 | 同上 |
  123 +| CLIP 文本塔向量 | 图搜文本侧 | 同上 | `{embedding_cache_prefix}:clip_text:*`(其中 `{embedding_cache_prefix}` 默认 `embedding`) | 同上 | 同上 | BF16 字节 | 同上 |
111 | 翻译译文 | 查询翻译、翻译服务 | 同上 | `trans:{model}:{lang}:*` | `services.translation.cache.ttl_seconds`(默认 720 天) | 可配置滑动(`sliding_expiration`) | UTF-8 字符串 | `services.translation.cache.*`;各能力 `use_cache` | 124 | 翻译译文 | 查询翻译、翻译服务 | 同上 | `trans:{model}:{lang}:*` | `services.translation.cache.ttl_seconds`(默认 720 天) | 可配置滑动(`sliding_expiration`) | UTF-8 字符串 | `services.translation.cache.*`;各能力 `use_cache` |
112 | 商品分析 / Anchors | 索引富化、LLM 内容理解 | 同上 | `{anchor_cache_prefix}:{kind}:{hash}:{lang}:*` | `anchor_cache_expire_days`(默认 30 天) | 固定 TTL,不滑动 | JSON 字符串 | `anchor_cache_prefix`、`anchor_cache_expire_days`;`REDIS_ANCHOR_*` | 125 | 商品分析 / Anchors | 索引富化、LLM 内容理解 | 同上 | `{anchor_cache_prefix}:{kind}:{hash}:{lang}:*` | `anchor_cache_expire_days`(默认 30 天) | 固定 TTL,不滑动 | JSON 字符串 | `anchor_cache_prefix`、`anchor_cache_expire_days`;`REDIS_ANCHOR_*` |
113 | 应用配置 | 全栈 | 进程内存 | N/A(单例) | 进程生命周期 | `reload_app_config` 清除 | `AppConfig` 对象 | `config/loader.py` | 126 | 应用配置 | 全栈 | 进程内存 | N/A(单例) | 进程生命周期 | `reload_app_config` 清除 | `AppConfig` 对象 | `config/loader.py` |
docs/issues/issue-2026-04-14-粗排流程放入ES-TODO-env renamed to docs/issues/issue-2026-04-14-粗排流程放入ES-TODO-env.md
docs/issues/issue-2026-04-16-bayes寻参-TODO.md 0 → 100644
@@ -0,0 +1,136 @@ @@ -0,0 +1,136 @@
  1 +
  2 +我以前进行过一轮调参,是基于54个评测样本(queries.txt),过程中发现的最优的参数是这一组:
  3 +0.641241 {'es_bias': '7.214', 'es_exponent': '0.2025', 'text_bias': '4.0', 'text_exponent': '1.584', 'text_translation_weight': '1.4441', 'knn_text_weight': '0.1', 'knn_image_weight': '5.6232', 'knn_tie_breaker':
  4 + '0.021', 'knn_bias': '0.0019', 'knn_exponent': '11.8477', 'knn_text_bias': '2.3125', 'knn_text_exponent': '1.1547', 'knn_image_bias': '0.9641', 'knn_image_exponent': '5.8671'}
  5 +
  6 +这一组参数分布比较极端,text_bias太大(文本项得分是0~1的,加上4后被稀释得很大),图片的exponent太大,不过在这个数据集上面确实是最好的,我觉得有过拟合的可能,因此要扩大数据集,先扩展标注集,然后使用扩展的标注集,继续进行寻参。
  7 +因为标注任务和寻参任务耗时都比较长,请你写好一个脚本,内部先启动标注任务,然后再启动寻参任务,把任务跑起来,程序已经正常跑起来了、运转正常,你才可以退出,以后等程序跑完了应该能拿到寻参结果,下次你可以结合程序执行的结果进行结论分析。
  8 +
  9 +
  10 +以前的一轮调参:
  11 +我当时的调参需求:
  12 +
  13 +请对coarse_rank fusion公式进行调参:
  14 + 目前的baseline是这一组,Primary_Metric_Score: 0.637642:
  15 + coarse_rank:
  16 + ...
  17 + fusion:
  18 + es_bias: 10.0
  19 + es_exponent: 0.05
  20 + text_bias: 0.1
  21 + text_exponent: 0.35
  22 + text_translation_weight: 1.0
  23 + knn_text_weight: 1.0
  24 + knn_image_weight: 2.0
  25 + knn_tie_breaker: 0.3
  26 + knn_bias: 0.2
  27 + knn_exponent: 5.6
  28 + knn_text_bias: 0.2
  29 + knn_text_exponent: 0.0
  30 + knn_image_bias: 0.2
  31 + knn_image_exponent: 0.0
  32 + 评估指标在/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md
  33 + 请以这个为基准,发散思维,进行宽一点的范围调参。因为每次重启、评测都需要几分钟,请写一个调参的框架,基于框架、设定好多组参数、写好脚本,每轮收集效果自动的调整参数分布(因为笛卡尔积、多种参数调参成本太高,因此考虑贝叶斯调参等方法通
  34 + 过多轮迭代通过脚本自动收敛)
  35 + 每次调整参数后需要重启backend (有时候重启backend后eval-web好像也挂了,如果有可以追查原因并修复)
  36 + ./restart.sh backend
  37 + 注意:请你调试脚本、进行一轮分析,最终要沉淀一套调参脚本,我下次可以重新跑(还是针对这组参数调参),能自动迭代(调整参数分布)、收集每组参数的指标、调优到最优的参数组合。
  38 +
  39 +
  40 +
  41 +你给出的调参脚本:
  42 +
  43 + 长时间自动调参脚本已经配好,核心文件是:
  44 +
  45 + - 调参主程序:scripts/evaluation/tune_fusion.py
  46 + - 宽范围搜索空间:scripts/evaluation/tuning/coarse_rank_fusion_space.yaml
  47 + - 后台长跑启动脚本:scripts/evaluation/start_coarse_fusion_tuning_long.sh
  48 + - 续跑脚本:scripts/evaluation/resume_coarse_fusion_tuning_long.sh
  49 + - 使用说明:scripts/evaluation/tuning/README.md
  50 +
  51 + 这套方案不是笛卡尔积穷举,而是“种子实验 + 随机探索 + 基于历史最优分布的自适应采样”反复迭代。搜索范围已经放宽,新增了多组偏离 baseline 很远的 seed,也保留了 baseline 附近抖动,适合长时间慢慢收敛。
  52 +
  53 + 直接启动一轮:
  54 +
  55 + ./scripts/evaluation/start_coarse_fusion_tuning_long.sh
  56 +
  57 + 如果想跑更久一点:
  58 +
  59 + MAX_EVALS=48 BATCH_SIZE=3 CANDIDATE_POOL_SIZE=512 RUN_NAME=coarse_fusion_long_001 \
  60 + ./scripts/evaluation/start_coarse_fusion_tuning_long.sh
  61 +
  62 + 看日志:
  63 +
  64 + tail -f artifacts/search_evaluation/tuning_launches/<run_name>.log
  65 +
  66 + 看结果目录:
  67 +
  68 + ls artifacts/search_evaluation/tuning_runs/<run_name>/
  69 +
  70 + 续跑:
  71 +
  72 + ./scripts/evaluation/resume_coarse_fusion_tuning_long.sh <run_name>
  73 +
  74 + 每轮会自动做这些事:
  75 +
  76 + - 生成一批候选 coarse_rank.fusion 参数
  77 + - 写入配置并重启 backend
  78 + - 检查 eval-web,必要时自愈拉起
  79 + - 跑评测并收集 Primary_Metric_Score
  80 + - 更新 trials.jsonl、leaderboard.csv、summary.md
  81 + - 基于历史结果调整下一轮采样分布
  82 +
  83 + 你先跑一轮长任务。跑完后把 run_name 或 scripts/evaluation/tuning/README.md 里对应实验目录发我,我再基于结果做下一步分析和收缩搜索空间。
  84 +
  85 +
  86 +
  87 +
  88 +› 我运行了:MAX_EVALS=400 BATCH_SIZE=3 CANDIDATE_POOL_SIZE=512 RUN_NAME=coarse_fusion_long_001 ./scripts/evaluation/start_coarse_fusion_tuning_long.sh
  89 +
  90 + run_name=coarse_fusion_long_001
  91 + pid=2218620
  92 + log=artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.log
  93 + pid_file=artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.pid
  94 + cmd_file=artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.cmd
  95 + run_dir=artifacts/search_evaluation/tuning_runs/coarse_fusion_long_001
  96 +
  97 + tail -f artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.log
  98 + cat artifacts/search_evaluation/tuning_runs/coarse_fusion_long_001/leaderboard.csv
  99 +
  100 +
  101 +• 我先看这次长跑的日志和当前排行榜,确认任务是否正常推进、是否有异常中断,以及目前最优参数落在哪个方向。
  102 +
  103 +• Explored
  104 + └ Read coarse_fusion_long_001.log, leaderboard.csv
  105 + List coarse_fusion_long_001
  106 + Read summary.md
  107 +
  108 +───
  109 +
  110 +• 结果已经足够看出明显趋势了。我再补一层聚合分析,看看 top 区间里哪些参数方向反复出现,避免只盯着单个最优点。
  111 +
  112 +• Ran ps -p 2218620 -o pid=,etimes=,cmd=
  113 + └ (no output)
  114 +
  115 +• Ran python - <<'PY'
  116 + │ import csv, statistics, math
  117 + │ from pathlib import Path
  118 + │ … +15 lines
  119 + └ rows 385
  120 +
  121 + … +61 lines (ctrl + t to view transcript)
  122 + knn_image_bias: mean=0.5631 median=0.2000 min=0.0010 max=4.0000
  123 + knn_image_exponent: mean=4.6316 median=5.8680 min=0.0000 max=6.0000
  124 +
  125 +• Ran python - <<'PY'
  126 + │ import csv
  127 + │ from pathlib import Path
  128 + │ … +19 lines
  129 + └ baseline_score 0.637642
  130 +
  131 + … +30 lines (ctrl + t to view transcript)
  132 + 10 bo_234 0.641241 {'es_bias': '7.214', 'es_exponent': '0.2025', 'text_bias': '4.0', 'text_exponent': '1.584', 'text_translation_weight': '1.4441', 'knn_text_weight': '0.1', 'knn_image_weight': '5.6232', 'knn_tie_breaker':
  133 + '0.021', 'knn_bias': '0.0019', 'knn_exponent': '11.8477', 'knn_text_bias': '2.3125', 'knn_text_exponent': '1.1547', 'knn_image_bias': '0.9641', 'knn_image_exponent': '5.8671'}
  134 +
  135 +这一次因为外部原因(磁盘满)终止了,以上是最好的一组参数。
  136 +
scripts/evaluation/resume_coarse_fusion_tuning_long.sh 0 → 100755
@@ -0,0 +1,76 @@ @@ -0,0 +1,76 @@
  1 +#!/bin/bash
  2 +
  3 +set -euo pipefail
  4 +
  5 +if [ "$#" -lt 1 ]; then
  6 + echo "usage: $0 <run_dir_or_name> [extra tune_fusion args...]" >&2
  7 + exit 1
  8 +fi
  9 +
  10 +cd "$(dirname "$0")/../.."
  11 +source ./activate.sh
  12 +
  13 +TARGET="$1"
  14 +shift
  15 +
  16 +if [ -d "${TARGET}" ]; then
  17 + RUN_DIR="${TARGET}"
  18 + RUN_NAME="$(basename "${RUN_DIR}")"
  19 +else
  20 + RUN_NAME="${TARGET}"
  21 + RUN_DIR="artifacts/search_evaluation/tuning_runs/${RUN_NAME}"
  22 +fi
  23 +
  24 +if [ ! -d "${RUN_DIR}" ]; then
  25 + echo "run dir not found: ${RUN_DIR}" >&2
  26 + exit 1
  27 +fi
  28 +
  29 +MAX_EVALS="${MAX_EVALS:-36}"
  30 +BATCH_SIZE="${BATCH_SIZE:-3}"
  31 +CANDIDATE_POOL_SIZE="${CANDIDATE_POOL_SIZE:-512}"
  32 +
  33 +LAUNCH_DIR="artifacts/search_evaluation/tuning_launches"
  34 +mkdir -p "${LAUNCH_DIR}"
  35 +LOG_PATH="${LAUNCH_DIR}/${RUN_NAME}.resume.log"
  36 +PID_PATH="${LAUNCH_DIR}/${RUN_NAME}.resume.pid"
  37 +CMD_PATH="${LAUNCH_DIR}/${RUN_NAME}.resume.cmd"
  38 +
  39 +CMD=(
  40 + python
  41 + scripts/evaluation/tune_fusion.py
  42 + --mode optimize
  43 + --resume-run "${RUN_DIR}"
  44 + --search-space "${RUN_DIR}/search_space.yaml"
  45 + --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md
  46 + --tenant-id 163
  47 + --queries-file scripts/evaluation/queries/queries.txt
  48 + --top-k 100
  49 + --language en
  50 + --search-base-url http://127.0.0.1:6002
  51 + --eval-web-base-url http://127.0.0.1:6010
  52 + --max-evals "${MAX_EVALS}"
  53 + --batch-size "${BATCH_SIZE}"
  54 + --candidate-pool-size "${CANDIDATE_POOL_SIZE}"
  55 +)
  56 +
  57 +if [ "$#" -gt 0 ]; then
  58 + CMD+=("$@")
  59 +fi
  60 +
  61 +printf '%q ' "${CMD[@]}" > "${CMD_PATH}"
  62 +printf '\n' >> "${CMD_PATH}"
  63 +
  64 +nohup "${CMD[@]}" > "${LOG_PATH}" 2>&1 &
  65 +PID=$!
  66 +echo "${PID}" > "${PID_PATH}"
  67 +
  68 +echo "run_name=${RUN_NAME}"
  69 +echo "pid=${PID}"
  70 +echo "log=${LOG_PATH}"
  71 +echo "pid_file=${PID_PATH}"
  72 +echo "cmd_file=${CMD_PATH}"
  73 +echo "run_dir=${RUN_DIR}"
  74 +echo
  75 +echo "tail -f ${LOG_PATH}"
  76 +echo "cat ${RUN_DIR}/leaderboard.csv"
scripts/evaluation/run_coarse_fusion_tuning.sh 0 → 100755
@@ -0,0 +1,18 @@ @@ -0,0 +1,18 @@
  1 +#!/bin/bash
  2 +
  3 +set -euo pipefail
  4 +
  5 +cd "$(dirname "$0")/../.."
  6 +source ./activate.sh
  7 +
  8 +python scripts/evaluation/tune_fusion.py \
  9 + --mode optimize \
  10 + --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml \
  11 + --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md \
  12 + --tenant-id 163 \
  13 + --queries-file scripts/evaluation/queries/queries.txt \
  14 + --top-k 100 \
  15 + --language en \
  16 + --search-base-url http://127.0.0.1:6002 \
  17 + --eval-web-base-url http://127.0.0.1:6010 \
  18 + "$@"
scripts/evaluation/start_coarse_fusion_tuning_long.sh 0 → 100755
@@ -0,0 +1,58 @@ @@ -0,0 +1,58 @@
  1 +#!/bin/bash
  2 +
  3 +set -euo pipefail
  4 +
  5 +cd "$(dirname "$0")/../.."
  6 +source ./activate.sh
  7 +
  8 +RUN_NAME="${RUN_NAME:-coarse_fusion_long_$(date -u +%Y%m%dT%H%M%SZ)}"
  9 +MAX_EVALS="${MAX_EVALS:-36}"
  10 +BATCH_SIZE="${BATCH_SIZE:-3}"
  11 +CANDIDATE_POOL_SIZE="${CANDIDATE_POOL_SIZE:-512}"
  12 +RANDOM_SEED="${RANDOM_SEED:-20260416}"
  13 +
  14 +LAUNCH_DIR="artifacts/search_evaluation/tuning_launches"
  15 +mkdir -p "${LAUNCH_DIR}"
  16 +LOG_PATH="${LAUNCH_DIR}/${RUN_NAME}.log"
  17 +PID_PATH="${LAUNCH_DIR}/${RUN_NAME}.pid"
  18 +CMD_PATH="${LAUNCH_DIR}/${RUN_NAME}.cmd"
  19 +
  20 +CMD=(
  21 + python
  22 + scripts/evaluation/tune_fusion.py
  23 + --mode optimize
  24 + --run-name "${RUN_NAME}"
  25 + --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml
  26 + --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md
  27 + --tenant-id 163
  28 + --queries-file scripts/evaluation/queries/queries.txt
  29 + --top-k 100
  30 + --language en
  31 + --search-base-url http://127.0.0.1:6002
  32 + --eval-web-base-url http://127.0.0.1:6010
  33 + --max-evals "${MAX_EVALS}"
  34 + --batch-size "${BATCH_SIZE}"
  35 + --candidate-pool-size "${CANDIDATE_POOL_SIZE}"
  36 + --random-seed "${RANDOM_SEED}"
  37 +)
  38 +
  39 +if [ "$#" -gt 0 ]; then
  40 + CMD+=("$@")
  41 +fi
  42 +
  43 +printf '%q ' "${CMD[@]}" > "${CMD_PATH}"
  44 +printf '\n' >> "${CMD_PATH}"
  45 +
  46 +nohup "${CMD[@]}" > "${LOG_PATH}" 2>&1 &
  47 +PID=$!
  48 +echo "${PID}" > "${PID_PATH}"
  49 +
  50 +echo "run_name=${RUN_NAME}"
  51 +echo "pid=${PID}"
  52 +echo "log=${LOG_PATH}"
  53 +echo "pid_file=${PID_PATH}"
  54 +echo "cmd_file=${CMD_PATH}"
  55 +echo "run_dir=artifacts/search_evaluation/tuning_runs/${RUN_NAME}"
  56 +echo
  57 +echo "tail -f ${LOG_PATH}"
  58 +echo "cat artifacts/search_evaluation/tuning_runs/${RUN_NAME}/leaderboard.csv"
scripts/evaluation/tune_fusion.py
@@ -4,23 +4,37 @@ from __future__ import annotations @@ -4,23 +4,37 @@ from __future__ import annotations
4 4
5 import argparse 5 import argparse
6 import copy 6 import copy
  7 +import csv
7 import json 8 import json
  9 +import math
  10 +import random
8 import re 11 import re
  12 +import shutil
9 import subprocess 13 import subprocess
10 import sys 14 import sys
11 import time 15 import time
12 from dataclasses import dataclass 16 from dataclasses import dataclass
13 from pathlib import Path 17 from pathlib import Path
14 -from typing import Any, Dict, List 18 +from typing import Any, Dict, List, Sequence
15 19
  20 +import numpy as np
16 import requests 21 import requests
17 import yaml 22 import yaml
18 23
  24 +try:
  25 + from sklearn.gaussian_process import GaussianProcessRegressor
  26 + from sklearn.gaussian_process.kernels import ConstantKernel, Matern, WhiteKernel
  27 +except Exception: # noqa: BLE001
  28 + GaussianProcessRegressor = None # type: ignore[assignment]
  29 + ConstantKernel = None # type: ignore[assignment]
  30 + Matern = None # type: ignore[assignment]
  31 + WhiteKernel = None # type: ignore[assignment]
  32 +
19 PROJECT_ROOT = Path(__file__).resolve().parents[2] 33 PROJECT_ROOT = Path(__file__).resolve().parents[2]
20 if str(PROJECT_ROOT) not in sys.path: 34 if str(PROJECT_ROOT) not in sys.path:
21 sys.path.insert(0, str(PROJECT_ROOT)) 35 sys.path.insert(0, str(PROJECT_ROOT))
22 36
23 -from scripts.evaluation.eval_framework import ( 37 +from scripts.evaluation.eval_framework import ( # noqa: E402
24 DEFAULT_ARTIFACT_ROOT, 38 DEFAULT_ARTIFACT_ROOT,
25 DEFAULT_QUERY_FILE, 39 DEFAULT_QUERY_FILE,
26 ensure_dir, 40 ensure_dir,
@@ -30,6 +44,7 @@ from scripts.evaluation.eval_framework import ( @@ -30,6 +44,7 @@ from scripts.evaluation.eval_framework import (
30 44
31 45
32 CONFIG_PATH = PROJECT_ROOT / "config" / "config.yaml" 46 CONFIG_PATH = PROJECT_ROOT / "config" / "config.yaml"
  47 +LOG_DIR = PROJECT_ROOT / "logs"
33 48
34 49
35 @dataclass 50 @dataclass
@@ -39,6 +54,108 @@ class ExperimentSpec: @@ -39,6 +54,108 @@ class ExperimentSpec:
39 params: Dict[str, Any] 54 params: Dict[str, Any]
40 55
41 56
  57 +@dataclass
  58 +class ParameterSpec:
  59 + name: str
  60 + lower: float
  61 + upper: float
  62 + scale: str = "linear"
  63 + round_digits: int = 6
  64 +
  65 + def __post_init__(self) -> None:
  66 + if self.lower >= self.upper:
  67 + raise ValueError(f"invalid bounds for {self.name}: {self.lower} >= {self.upper}")
  68 + if self.scale not in {"linear", "log"}:
  69 + raise ValueError(f"unsupported scale={self.scale!r} for {self.name}")
  70 + if self.scale == "log" and (self.lower <= 0 or self.upper <= 0):
  71 + raise ValueError(f"log-scaled parameter {self.name} must have positive bounds")
  72 +
  73 + @property
  74 + def transformed_lower(self) -> float:
  75 + return math.log10(self.lower) if self.scale == "log" else self.lower
  76 +
  77 + @property
  78 + def transformed_upper(self) -> float:
  79 + return math.log10(self.upper) if self.scale == "log" else self.upper
  80 +
  81 + @property
  82 + def transformed_span(self) -> float:
  83 + return self.transformed_upper - self.transformed_lower
  84 +
  85 + def transform(self, value: float) -> float:
  86 + clipped = min(max(float(value), self.lower), self.upper)
  87 + return math.log10(clipped) if self.scale == "log" else clipped
  88 +
  89 + def inverse_transform(self, value: float) -> float:
  90 + raw = (10 ** value) if self.scale == "log" else value
  91 + raw = min(max(float(raw), self.lower), self.upper)
  92 + return round(raw, self.round_digits)
  93 +
  94 + def sample_uniform(self, rng: random.Random) -> float:
  95 + draw = rng.uniform(self.transformed_lower, self.transformed_upper)
  96 + return self.inverse_transform(draw)
  97 +
  98 +
  99 +@dataclass
  100 +class SearchSpace:
  101 + target_path: str
  102 + baseline: Dict[str, float]
  103 + parameters: List[ParameterSpec]
  104 + seed_experiments: List[ExperimentSpec]
  105 + init_random: int = 6
  106 + candidate_pool_size: int = 256
  107 + explore_probability: float = 0.25
  108 + local_jitter_probability: float = 0.45
  109 + elite_fraction: float = 0.35
  110 + min_normalized_distance: float = 0.14
  111 +
  112 + @property
  113 + def parameter_names(self) -> List[str]:
  114 + return [item.name for item in self.parameters]
  115 +
  116 + def fill_params(self, params: Dict[str, Any]) -> Dict[str, float]:
  117 + merged = {name: float(self.baseline[name]) for name in self.parameter_names}
  118 + for name, value in params.items():
  119 + if name not in merged:
  120 + raise KeyError(f"unknown parameter in search space: {name}")
  121 + merged[name] = float(value)
  122 + return {
  123 + spec.name: spec.inverse_transform(spec.transform(float(merged[spec.name])))
  124 + for spec in self.parameters
  125 + }
  126 +
  127 + def sample_random(self, rng: random.Random) -> Dict[str, float]:
  128 + return {spec.name: spec.sample_uniform(rng) for spec in self.parameters}
  129 +
  130 + def vectorize(self, params: Dict[str, Any]) -> np.ndarray:
  131 + merged = self.fill_params(params)
  132 + return np.array([spec.transform(float(merged[spec.name])) for spec in self.parameters], dtype=float)
  133 +
  134 + def from_vector(self, vector: Sequence[float]) -> Dict[str, float]:
  135 + return {
  136 + spec.name: spec.inverse_transform(float(vector[idx]))
  137 + for idx, spec in enumerate(self.parameters)
  138 + }
  139 +
  140 + def normalized_vector(self, params: Dict[str, Any]) -> np.ndarray:
  141 + vector = self.vectorize(params)
  142 + parts: List[float] = []
  143 + for idx, spec in enumerate(self.parameters):
  144 + parts.append((vector[idx] - spec.transformed_lower) / max(spec.transformed_span, 1e-9))
  145 + return np.array(parts, dtype=float)
  146 +
  147 + def canonical_key(self, params: Dict[str, Any]) -> str:
  148 + return json.dumps(self.fill_params(params), ensure_ascii=False, sort_keys=True)
  149 +
  150 +
  151 +@dataclass
  152 +class CandidateProposal:
  153 + name: str
  154 + description: str
  155 + params: Dict[str, float]
  156 + source: str
  157 +
  158 +
42 def load_yaml(path: Path) -> Dict[str, Any]: 159 def load_yaml(path: Path) -> Dict[str, Any]:
43 return yaml.safe_load(path.read_text(encoding="utf-8")) 160 return yaml.safe_load(path.read_text(encoding="utf-8"))
44 161
@@ -50,6 +167,13 @@ def write_yaml(path: Path, payload: Dict[str, Any]) -&gt; None: @@ -50,6 +167,13 @@ def write_yaml(path: Path, payload: Dict[str, Any]) -&gt; None:
50 ) 167 )
51 168
52 169
  170 +def get_nested_value(payload: Dict[str, Any], dotted_path: str) -> Any:
  171 + current: Any = payload
  172 + for part in dotted_path.split("."):
  173 + current = current[part]
  174 + return current
  175 +
  176 +
53 def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> None: 177 def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> None:
54 current = payload 178 current = payload
55 parts = dotted_path.split(".") 179 parts = dotted_path.split(".")
@@ -58,16 +182,115 @@ def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -&gt; N @@ -58,16 +182,115 @@ def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -&gt; N
58 current[parts[-1]] = value 182 current[parts[-1]] = value
59 183
60 184
61 -def apply_params(base_config: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]: 185 +def apply_target_params(base_config: Dict[str, Any], target_path: str, params: Dict[str, Any]) -> Dict[str, Any]:
62 candidate = copy.deepcopy(base_config) 186 candidate = copy.deepcopy(base_config)
63 - for dotted_path, value in params.items():  
64 - set_nested_value(candidate, dotted_path, value) 187 + for key, value in params.items():
  188 + set_nested_value(candidate, f"{target_path}.{key}", value)
65 return candidate 189 return candidate
66 190
67 191
  192 +def read_queries(path: Path) -> List[str]:
  193 + return [
  194 + line.strip()
  195 + for line in path.read_text(encoding="utf-8").splitlines()
  196 + if line.strip() and not line.strip().startswith("#")
  197 + ]
  198 +
  199 +
  200 +def run_restart(targets: Sequence[str]) -> None:
  201 + cmd = ["./restart.sh", *targets]
  202 + subprocess.run(cmd, cwd=PROJECT_ROOT, check=True, timeout=900)
  203 +
  204 +
  205 +def bytes_to_gib(value: int) -> float:
  206 + return float(value) / float(1024 ** 3)
  207 +
  208 +
  209 +def get_free_disk_bytes(path: Path) -> int:
  210 + return int(shutil.disk_usage(path).free)
  211 +
  212 +
  213 +def iter_log_cleanup_candidates() -> List[Path]:
  214 + if not LOG_DIR.is_dir():
  215 + return []
  216 + items: List[Path] = []
  217 + seen: set[str] = set()
  218 + for path in LOG_DIR.rglob("*"):
  219 + try:
  220 + if not path.is_file():
  221 + continue
  222 + resolved = path.resolve()
  223 + key = str(resolved)
  224 + if key in seen:
  225 + continue
  226 + seen.add(key)
  227 + items.append(resolved)
  228 + except FileNotFoundError:
  229 + continue
  230 + items.sort(key=lambda item: item.stat().st_size if item.exists() else 0, reverse=True)
  231 + return items
  232 +
  233 +
  234 +def truncate_file(path: Path) -> int:
  235 + if not path.exists() or not path.is_file():
  236 + return 0
  237 + size = int(path.stat().st_size)
  238 + if size <= 0:
  239 + return 0
  240 + with path.open("w", encoding="utf-8"):
  241 + pass
  242 + return size
  243 +
  244 +
  245 +def ensure_disk_headroom(
  246 + *,
  247 + min_free_gb: float,
  248 + auto_truncate_logs: bool,
  249 + context: str,
  250 +) -> None:
  251 + required_bytes = int(min_free_gb * (1024 ** 3))
  252 + free_bytes = get_free_disk_bytes(PROJECT_ROOT)
  253 + if free_bytes >= required_bytes:
  254 + return
  255 +
  256 + print(
  257 + f"[disk] low free space before {context}: "
  258 + f"free={bytes_to_gib(free_bytes):.2f}GiB required={min_free_gb:.2f}GiB"
  259 + )
  260 + if not auto_truncate_logs:
  261 + raise RuntimeError(
  262 + f"insufficient disk headroom before {context}: "
  263 + f"free={bytes_to_gib(free_bytes):.2f}GiB required={min_free_gb:.2f}GiB"
  264 + )
  265 +
  266 + reclaimed_bytes = 0
  267 + for candidate in iter_log_cleanup_candidates():
  268 + try:
  269 + reclaimed = truncate_file(candidate)
  270 + except Exception as exc: # noqa: BLE001
  271 + print(f"[disk] skip truncate {candidate}: {exc}")
  272 + continue
  273 + if reclaimed <= 0:
  274 + continue
  275 + reclaimed_bytes += reclaimed
  276 + free_bytes = get_free_disk_bytes(PROJECT_ROOT)
  277 + print(
  278 + f"[disk] truncated {candidate} reclaimed={bytes_to_gib(reclaimed):.2f}GiB "
  279 + f"free_now={bytes_to_gib(free_bytes):.2f}GiB"
  280 + )
  281 + if free_bytes >= required_bytes:
  282 + return
  283 +
  284 + raise RuntimeError(
  285 + f"insufficient disk headroom after log truncation before {context}: "
  286 + f"free={bytes_to_gib(free_bytes):.2f}GiB required={min_free_gb:.2f}GiB "
  287 + f"reclaimed={bytes_to_gib(reclaimed_bytes):.2f}GiB"
  288 + )
  289 +
  290 +
68 def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any]: 291 def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any]:
69 deadline = time.time() + timeout_sec 292 deadline = time.time() + timeout_sec
70 - last_error = None 293 + last_error: Any = None
71 while time.time() < deadline: 294 while time.time() < deadline:
72 try: 295 try:
73 response = requests.get(f"{base_url.rstrip('/')}/health", timeout=10) 296 response = requests.get(f"{base_url.rstrip('/')}/health", timeout=10)
@@ -82,16 +305,69 @@ def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -&gt; Dict[str, Any @@ -82,16 +305,69 @@ def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -&gt; Dict[str, Any
82 raise RuntimeError(f"backend did not become healthy: {last_error}") 305 raise RuntimeError(f"backend did not become healthy: {last_error}")
83 306
84 307
85 -def run_restart() -> None:  
86 - subprocess.run(["./restart.sh", "backend"], cwd=PROJECT_ROOT, check=True, timeout=600) 308 +def wait_for_eval_web(base_url: str, timeout_sec: float = 90.0) -> Dict[str, Any]:
  309 + url = f"{base_url.rstrip('/')}/api/history"
  310 + deadline = time.time() + timeout_sec
  311 + last_error: Any = None
  312 + while time.time() < deadline:
  313 + try:
  314 + response = requests.get(url, timeout=10)
  315 + response.raise_for_status()
  316 + payload = response.json()
  317 + if isinstance(payload, dict) and "history" in payload:
  318 + return payload
  319 + last_error = payload
  320 + except Exception as exc: # noqa: BLE001
  321 + last_error = str(exc)
  322 + time.sleep(2.0)
  323 + raise RuntimeError(f"eval-web did not become healthy: {last_error}")
  324 +
  325 +
  326 +def ensure_eval_web(eval_web_base_url: str) -> Dict[str, Any]:
  327 + try:
  328 + return wait_for_eval_web(eval_web_base_url, timeout_sec=20.0)
  329 + except Exception: # noqa: BLE001
  330 + run_restart(["eval-web"])
  331 + return wait_for_eval_web(eval_web_base_url, timeout_sec=120.0)
87 332
88 333
89 -def read_queries(path: Path) -> List[str]:  
90 - return [  
91 - line.strip()  
92 - for line in path.read_text(encoding="utf-8").splitlines()  
93 - if line.strip() and not line.strip().startswith("#")  
94 - ] 334 +def verify_backend_config(base_url: str, target_path: str, expected: Dict[str, Any], tol: float = 1e-6) -> bool:
  335 + response = requests.get(f"{base_url.rstrip('/')}/admin/config", timeout=20)
  336 + response.raise_for_status()
  337 + payload = response.json()
  338 + candidate_paths = [target_path]
  339 + if not target_path.startswith("search."):
  340 + candidate_paths.append(f"search.{target_path}")
  341 + if target_path.startswith("search."):
  342 + candidate_paths.append(target_path[len("search."):])
  343 +
  344 + live_block = None
  345 + for path in candidate_paths:
  346 + try:
  347 + maybe_block = get_nested_value(payload, path)
  348 + except Exception: # noqa: BLE001
  349 + continue
  350 + if isinstance(maybe_block, dict):
  351 + live_block = maybe_block
  352 + break
  353 + if live_block is None:
  354 + raise RuntimeError(
  355 + f"unable to resolve backend config path {target_path!r}; "
  356 + f"tried={candidate_paths!r} top_level_keys={sorted(payload.keys())[:20]!r}"
  357 + )
  358 + for key, expected_value in expected.items():
  359 + live_value = live_block[key]
  360 + if isinstance(expected_value, (int, float)):
  361 + if abs(float(live_value) - float(expected_value)) > tol:
  362 + raise RuntimeError(
  363 + f"backend config mismatch for {target_path}.{key}: "
  364 + f"expected={expected_value} live={live_value}"
  365 + )
  366 + elif live_value != expected_value:
  367 + raise RuntimeError(
  368 + f"backend config mismatch for {target_path}.{key}: expected={expected_value!r} live={live_value!r}"
  369 + )
  370 + return True
95 371
96 372
97 def run_batch_eval( 373 def run_batch_eval(
@@ -126,95 +402,580 @@ def run_batch_eval( @@ -126,95 +402,580 @@ def run_batch_eval(
126 timeout=7200, 402 timeout=7200,
127 ) 403 )
128 output = (completed.stdout or "") + "\n" + (completed.stderr or "") 404 output = (completed.stdout or "") + "\n" + (completed.stderr or "")
129 - match = re.search(r"batch_id=([A-Za-z0-9_]+)\s+aggregate_metrics=(\{.*\})", output)  
130 - if not match: 405 + batch_ids = re.findall(r"batch_id=([A-Za-z0-9_]+)", output)
  406 + if not batch_ids:
131 raise RuntimeError(f"failed to parse batch output: {output[-2000:]}") 407 raise RuntimeError(f"failed to parse batch output: {output[-2000:]}")
132 - batch_id = match.group(1)  
133 - aggregate_metrics = json.loads(match.group(2).replace("'", '"')) 408 + batch_id = batch_ids[-1]
  409 + batch_json_path = DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_id}.json"
  410 + if not batch_json_path.is_file():
  411 + raise RuntimeError(f"batch json not found after eval: {batch_json_path}")
  412 + payload = json.loads(batch_json_path.read_text(encoding="utf-8"))
134 return { 413 return {
135 "batch_id": batch_id, 414 "batch_id": batch_id,
136 - "aggregate_metrics": aggregate_metrics, 415 + "payload": payload,
137 "raw_output": output, 416 "raw_output": output,
  417 + "batch_json_path": str(batch_json_path),
  418 + "batch_report_path": str(DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_id}.md"),
  419 + }
  420 +
  421 +
def resolve_batch_json_path(path_like: str) -> Path:
    """Locate the batch-report JSON for a possibly-relative / md / bare path.

    Accepts an absolute or project-relative path that may point at the
    ``.json`` report directly, at its ``.md`` sibling, or omit the suffix
    entirely, and returns the JSON file path.

    Fix vs. previous behavior: a ``.md`` input whose ``.json`` sibling is
    missing used to fall through and return the ``.md`` file itself, which
    then failed later with a confusing JSON decode error in the caller.
    Now it raises FileNotFoundError immediately with a clear message.

    Args:
        path_like: Path string as stored in a trial record or CLI argument.

    Returns:
        Path to an existing (or explicitly ``.json``-suffixed) report file.

    Raises:
        FileNotFoundError: when no JSON report can be resolved.
    """
    path = Path(path_like)
    if not path.is_absolute():
        path = (PROJECT_ROOT / path).resolve()
    if path.suffix == ".json":
        # Trust an explicit .json path even if it does not exist yet;
        # the caller surfaces the read error.
        return path
    if path.suffix == ".md":
        candidate = path.with_suffix(".json")
        if candidate.is_file():
            return candidate
        raise FileNotFoundError(
            f"markdown report {path} has no JSON sibling: {candidate}"
        )
    if path.is_file():
        return path
    candidate = path.parent / f"{path.name}.json"
    if candidate.is_file():
        return candidate
    raise FileNotFoundError(f"cannot resolve batch json from: {path_like}")
  438 +
  439 +
def load_batch_payload(path_like: str) -> Dict[str, Any]:
    """Resolve *path_like* to a batch JSON report and parse its contents."""
    batch_path = resolve_batch_json_path(path_like)
    with batch_path.open(encoding="utf-8") as handle:
        return json.load(handle)
  443 +
  444 +
def load_experiments(path: Path) -> List[ExperimentSpec]:
    """Parse an experiments file: either a bare list or ``{"experiments": [...]}``."""
    raw = json.loads(path.read_text(encoding="utf-8"))
    entries = raw["experiments"] if isinstance(raw, dict) else raw
    return [
        ExperimentSpec(
            name=str(entry["name"]),
            description=str(entry.get("description") or ""),
            params=dict(entry.get("params") or {}),
        )
        for entry in entries
    ]
  458 +
  459 +
def load_search_space(path: Path) -> SearchSpace:
    """Build a SearchSpace from a tuning YAML file.

    Expected YAML keys (see e.g. ``coarse_rank_fusion_space.yaml``):
    ``target_path`` (dotted config path), ``parameters`` (name -> min/max/
    scale/round), ``baseline`` (name -> value), optional ``seed_experiments``
    and optional ``optimizer`` overrides for the BO knobs.

    Raises:
        KeyError: when a required key (``target_path``, ``parameters``,
            ``baseline``, or a parameter's ``min``/``max``) is missing.
    """
    payload = load_yaml(path)
    parameters = [
        ParameterSpec(
            name=str(name),
            lower=float(spec["min"]),
            upper=float(spec["max"]),
            scale=str(spec.get("scale", "linear")),  # "linear" unless the YAML says otherwise
            round_digits=int(spec.get("round", 6)),
        )
        for name, spec in dict(payload["parameters"]).items()
    ]
    baseline = {str(key): float(value) for key, value in dict(payload["baseline"]).items()}
    seed_experiments = [
        ExperimentSpec(
            name=str(item["name"]),
            description=str(item.get("description") or ""),
            params={str(k): float(v) for k, v in dict(item.get("params") or {}).items()},
        )
        for item in list(payload.get("seed_experiments") or [])
    ]
    # Optimizer knobs are all optional; defaults below define the stock BO behavior.
    optimizer = dict(payload.get("optimizer") or {})
    return SearchSpace(
        target_path=str(payload["target_path"]),
        baseline=baseline,
        parameters=parameters,
        seed_experiments=seed_experiments,
        init_random=int(optimizer.get("init_random", 6)),
        candidate_pool_size=int(optimizer.get("candidate_pool_size", 256)),
        explore_probability=float(optimizer.get("explore_probability", 0.25)),
        local_jitter_probability=float(optimizer.get("local_jitter_probability", 0.45)),
        elite_fraction=float(optimizer.get("elite_fraction", 0.35)),
        min_normalized_distance=float(optimizer.get("min_normalized_distance", 0.14)),
    )
  494 +
  495 +
def load_existing_trials(run_dir: Path) -> List[Dict[str, Any]]:
    """Read prior trial records from ``run_dir/trials.jsonl`` (empty if absent)."""
    trials_path = run_dir / "trials.jsonl"
    if not trials_path.is_file():
        return []
    return [
        json.loads(raw)
        for raw in trials_path.read_text(encoding="utf-8").splitlines()
        if raw.strip()
    ]
  506 +
  507 +
def append_trial(run_dir: Path, trial: Dict[str, Any]) -> None:
    """Append one trial record as a single JSON line to ``run_dir/trials.jsonl``."""
    record = json.dumps(trial, ensure_ascii=False)
    with (run_dir / "trials.jsonl").open("a", encoding="utf-8") as handle:
        handle.write(record + "\n")
  512 +
  513 +
def live_success_trials(trials: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Successful trials produced by the optimizer itself (seed trials excluded)."""
    selected: List[Dict[str, Any]] = []
    for trial in trials:
        if trial.get("status") == "ok" and not bool(trial.get("is_seed")):
            selected.append(trial)
    return selected
  520 +
  521 +
def all_success_trials(trials: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """All trials that finished with status 'ok', seeds included."""
    return [trial for trial in trials if trial.get("status") == "ok"]
  524 +
  525 +
def score_of(trial: Dict[str, Any], metric: str) -> float:
    """Score of a trial for *metric*; falls back to trial['score'], then 0.0."""
    metrics = trial.get("aggregate_metrics") or {}
    value = metrics.get(metric, trial.get("score", 0.0))
    # `or 0.0` also maps an explicit None metric value to 0.0.
    return float(value or 0.0)
  528 +
  529 +
def next_trial_name(trials: Sequence[Dict[str, Any]], prefix: str) -> str:
    """Sequential, zero-padded trial name: prefix_001, prefix_002, ..."""
    return "{}_{:03d}".format(prefix, len(trials) + 1)
  532 +
  533 +
def normal_pdf(x: float) -> float:
    """Standard normal probability density at x."""
    return math.exp(-(x * x) / 2.0) / math.sqrt(2.0 * math.pi)
  536 +
  537 +
def normal_cdf(x: float) -> float:
    """Standard normal cumulative distribution, via the error function."""
    return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
  540 +
  541 +
def expected_improvement(mu: float, sigma: float, best: float, xi: float = 0.002) -> float:
    """Expected-improvement acquisition (maximization) over *best* with margin *xi*."""
    improvement = mu - best - xi
    if sigma <= 1e-12:
        # Degenerate posterior: prediction is effectively deterministic.
        return max(improvement, 0.0)
    z = improvement / sigma
    return improvement * normal_cdf(z) + sigma * normal_pdf(z)
  547 +
  548 +
  549 +def normalized_distance(space: SearchSpace, left: Dict[str, Any], right: Dict[str, Any]) -> float:
  550 + lv = space.normalized_vector(left)
  551 + rv = space.normalized_vector(right)
  552 + return float(np.linalg.norm(lv - rv) / math.sqrt(len(space.parameters)))
  553 +
  554 +
def fit_surrogate(space: SearchSpace, trials: Sequence[Dict[str, Any]], metric: str, seed: int) -> Any:
    """Fit a Gaussian-process surrogate over completed trials, or return None.

    Returns None (caller falls back to random ranking) when sklearn is not
    installed (GaussianProcessRegressor imported as None), when fewer than 4
    trials exist, when all scores are (near-)identical so the GP cannot
    learn anything, or when fitting itself fails for any reason.
    """
    if GaussianProcessRegressor is None or len(trials) < 4:
        return None
    X = np.array([space.vectorize(item["params"]) for item in trials], dtype=float)
    y = np.array([score_of(item, metric) for item in trials], dtype=float)
    # Constant targets (to 8 decimals) make the GP degenerate — skip fitting.
    if len(np.unique(np.round(y, 8))) < 2:
        return None
    try:
        # Matern(nu=2.5) with per-dimension length scales + white noise;
        # a common default for low-dimensional BO.
        kernel = (
            ConstantKernel(1.0, (1e-3, 1e3))
            * Matern(length_scale=np.ones(len(space.parameters)), length_scale_bounds=(1e-2, 1e2), nu=2.5)
            + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-8, 1e-1))
        )
        gp = GaussianProcessRegressor(
            kernel=kernel,
            normalize_y=True,
            n_restarts_optimizer=2,
            random_state=seed,
        )
        gp.fit(X, y)
        return gp
    except Exception:  # noqa: BLE001 — surrogate is best-effort; None disables it
        return None
  578 +
  579 +
  580 +def build_sampling_spread(space: SearchSpace, elite_vectors: np.ndarray) -> np.ndarray:
  581 + spans = np.array([spec.transformed_span for spec in space.parameters], dtype=float)
  582 + floor = np.maximum(spans * 0.05, 0.015)
  583 + ceiling = np.maximum(spans * 0.5, floor)
  584 + if elite_vectors.shape[0] <= 1:
  585 + return np.minimum(np.maximum(spans * 0.18, floor), ceiling)
  586 + elite_std = elite_vectors.std(axis=0)
  587 + elite_range = elite_vectors.max(axis=0) - elite_vectors.min(axis=0)
  588 + spread = np.maximum(elite_std * 1.8, elite_range * 0.5)
  589 + return np.minimum(np.maximum(spread, floor), ceiling)
  590 +
  591 +
  592 +def sample_local_candidate(
  593 + space: SearchSpace,
  594 + rng: random.Random,
  595 + center: np.ndarray,
  596 + spread: np.ndarray,
  597 +) -> Dict[str, float]:
  598 + draw = []
  599 + for idx, spec in enumerate(space.parameters):
  600 + value = rng.gauss(float(center[idx]), float(spread[idx]))
  601 + value = min(max(value, spec.transformed_lower), spec.transformed_upper)
  602 + draw.append(value)
  603 + return space.from_vector(draw)
  604 +
  605 +
  606 +def sample_crossover_candidate(
  607 + space: SearchSpace,
  608 + rng: random.Random,
  609 + left: np.ndarray,
  610 + right: np.ndarray,
  611 +) -> Dict[str, float]:
  612 + draw = []
  613 + for idx, spec in enumerate(space.parameters):
  614 + mix = rng.random()
  615 + value = float(left[idx]) * mix + float(right[idx]) * (1.0 - mix)
  616 + jitter = spec.transformed_span * 0.04
  617 + value += rng.uniform(-jitter, jitter)
  618 + value = min(max(value, spec.transformed_lower), spec.transformed_upper)
  619 + draw.append(value)
  620 + return space.from_vector(draw)
  621 +
  622 +
def propose_candidates(
    *,
    space: SearchSpace,
    trials: Sequence[Dict[str, Any]],
    metric: str,
    batch_size: int,
    rng: random.Random,
    init_random: int,
    candidate_pool_size: int,
) -> List[CandidateProposal]:
    """Propose up to *batch_size* new parameter configurations to evaluate.

    Proposal strategy, in priority order:
      1. Un-evaluated seed experiments from the search space.
      2. Pure random exploration until *init_random* live successes exist.
      3. Bayesian-style selection: build a candidate pool from global random
         draws, Gaussian jitter around elites, and elite crossover; rank it
         by GP expected improvement when a surrogate is available (random
         order otherwise); then pick candidates subject to a minimum
         normalized distance from already-chosen ones.
      4. Random fallback to fill any remainder.

    Duplicate configurations are suppressed via ``space.canonical_key``.
    NOTE(review): the random-fill loops have no attempt cap — for a tiny or
    exhausted space they could spin for a long time; confirm acceptable.
    """
    existing_keys = {space.canonical_key(item["params"]) for item in trials if item.get("params")}
    proposals: List[CandidateProposal] = []

    # Stage 1: seed experiments that have not been evaluated yet.
    for seed in space.seed_experiments:
        params = space.fill_params(seed.params)
        key = space.canonical_key(params)
        if key not in existing_keys:
            proposals.append(
                CandidateProposal(
                    name=seed.name,
                    description=seed.description,
                    params=params,
                    source="seed_experiment",
                )
            )
            existing_keys.add(key)
        if len(proposals) >= batch_size:
            return proposals

    # Stage 2: bootstrap with random draws until enough live data exists.
    successes = live_success_trials(trials)
    if len(successes) < init_random:
        while len(proposals) < batch_size:
            params = space.sample_random(rng)
            key = space.canonical_key(params)
            if key in existing_keys:
                continue
            proposals.append(
                CandidateProposal(
                    name=f"random_{len(successes) + len(proposals) + 1:03d}",
                    description="global random exploration",
                    params=params,
                    source="random",
                )
            )
            existing_keys.add(key)
        return proposals

    # Stage 3: rank past successes, take the elite fraction (at least 2).
    ranked = sorted(successes, key=lambda item: score_of(item, metric), reverse=True)
    elite_count = max(2, min(len(ranked), int(math.ceil(len(ranked) * space.elite_fraction))))
    elites = ranked[:elite_count]
    elite_vectors = np.array([space.vectorize(item["params"]) for item in elites], dtype=float)
    spread = build_sampling_spread(space, elite_vectors)
    gp = fit_surrogate(space, successes, metric, seed=rng.randint(1, 10_000_000))
    best_score = score_of(ranked[0], metric)
    best_vector = space.vectorize(ranked[0]["params"])

    # Build a de-duplicated candidate pool from three sampling modes.
    pool: List[Dict[str, Any]] = []
    pool_keys = set(existing_keys)
    attempts = 0
    max_attempts = max(candidate_pool_size * 12, 200)
    while len(pool) < candidate_pool_size and attempts < max_attempts:
        attempts += 1
        roll = rng.random()
        if roll < space.explore_probability:
            params = space.sample_random(rng)
            source = "global_explore"
        elif roll < space.explore_probability + space.local_jitter_probability:
            center = elite_vectors[rng.randrange(len(elite_vectors))]
            params = sample_local_candidate(space, rng, center=center, spread=spread)
            source = "elite_jitter"
        else:
            if len(elite_vectors) >= 2:
                left = elite_vectors[rng.randrange(len(elite_vectors))]
                right = elite_vectors[rng.randrange(len(elite_vectors))]
                params = sample_crossover_candidate(space, rng, left=left, right=right)
                source = "elite_crossover"
            else:
                params = sample_local_candidate(space, rng, center=best_vector, spread=spread)
                source = "best_jitter"
        key = space.canonical_key(params)
        if key in pool_keys:
            continue
        pool_keys.add(key)
        pool.append({"params": params, "source": source})

    if not pool:
        return proposals

    # Rank the pool: EI first, then uncertainty, then predicted score.
    if gp is not None:
        X = np.array([space.vectorize(item["params"]) for item in pool], dtype=float)
        mu, sigma = gp.predict(X, return_std=True)
        for idx, item in enumerate(pool):
            item["acquisition"] = expected_improvement(float(mu[idx]), float(sigma[idx]), best_score)
            item["uncertainty"] = float(sigma[idx])
            item["predicted_score"] = float(mu[idx])
        pool.sort(
            key=lambda item: (
                float(item.get("acquisition") or 0.0),
                float(item.get("uncertainty") or 0.0),
                float(item.get("predicted_score") or 0.0),
            ),
            reverse=True,
        )
    else:
        # No surrogate: pick in random order.
        rng.shuffle(pool)

    # Greedy pick with a minimum pairwise distance to keep the batch diverse.
    chosen_params = [item.params for item in proposals]
    chosen: List[CandidateProposal] = []
    for item in pool:
        params = item["params"]
        if any(normalized_distance(space, params, other) < space.min_normalized_distance for other in chosen_params):
            continue
        chosen_params.append(params)
        chosen.append(
            CandidateProposal(
                name=f"bo_{len(successes) + len(proposals) + len(chosen) + 1:03d}",
                description=(
                    f"{item['source']} predicted={item.get('predicted_score', 'n/a')} "
                    f"ei={item.get('acquisition', 'n/a')}"
                ),
                params=params,
                source=str(item["source"]),
            )
        )
        if len(proposals) + len(chosen) >= batch_size:
            break

    proposals.extend(chosen)
    # Stage 4: fill the remainder with random draws if diversity pruning left gaps.
    if len(proposals) < batch_size:
        while len(proposals) < batch_size:
            params = space.sample_random(rng)
            key = space.canonical_key(params)
            if key in existing_keys:
                continue
            proposals.append(
                CandidateProposal(
                    name=f"fallback_{len(successes) + len(proposals) + 1:03d}",
                    description="fallback random exploration",
                    params=params,
                    source="fallback_random",
                )
            )
            existing_keys.add(key)
    return proposals
  767 +
  768 +
  769 +def compare_query_deltas(
  770 + baseline_payload: Dict[str, Any] | None,
  771 + best_payload: Dict[str, Any] | None,
  772 + metric: str,
  773 + limit: int = 8,
  774 +) -> Dict[str, List[Dict[str, Any]]]:
  775 + if not baseline_payload or not best_payload:
  776 + return {"gains": [], "losses": []}
  777 + base = {
  778 + str(item["query"]): float(item["metrics"].get(metric, 0.0))
  779 + for item in baseline_payload.get("per_query") or []
  780 + }
  781 + cur = {
  782 + str(item["query"]): float(item["metrics"].get(metric, 0.0))
  783 + for item in best_payload.get("per_query") or []
138 } 784 }
  785 + rows: List[Dict[str, Any]] = []
  786 + for query, score in cur.items():
  787 + if query not in base:
  788 + continue
  789 + rows.append(
  790 + {
  791 + "query": query,
  792 + "baseline": round(base[query], 6),
  793 + "current": round(score, 6),
  794 + "delta": round(score - base[query], 6),
  795 + }
  796 + )
  797 + rows.sort(key=lambda item: item["delta"], reverse=True)
  798 + gains = [item for item in rows[:limit] if item["delta"] > 0]
  799 + losses = [item for item in rows[-limit:] if item["delta"] < 0]
  800 + losses.sort(key=lambda item: item["delta"])
  801 + return {"gains": gains, "losses": losses}
  802 +
139 803
def render_markdown(
    *,
    run_id: str,
    created_at: str,
    tenant_id: str,
    query_count: int,
    top_k: int,
    metric: str,
    trials: Sequence[Dict[str, Any]],
) -> str:
    """Render the tuning run summary as a Markdown report.

    Sections: run header, a leaderboard of all successful trials sorted by
    *metric* descending, the best trial's parameters, per-query gains/losses
    of the best trial vs. the first seed trial (when both have batch JSON
    paths), and a list of failed trials.

    Returns the full Markdown document, newline-terminated.
    """
    successes = sorted(all_success_trials(trials), key=lambda item: score_of(item, metric), reverse=True)
    live_successes = sorted(live_success_trials(trials), key=lambda item: score_of(item, metric), reverse=True)
    best = successes[0] if successes else None
    # Baseline = best-scoring seed trial (successes are already score-sorted).
    baseline = next((item for item in successes if item.get("is_seed")), None)
    best_payload = load_batch_payload(best["batch_json_path"]) if best and best.get("batch_json_path") else None
    baseline_payload = (
        load_batch_payload(baseline["batch_json_path"])
        if baseline and baseline.get("batch_json_path")
        else None
    )
    delta_summary = compare_query_deltas(baseline_payload, best_payload, metric) if best else {"gains": [], "losses": []}

    lines = [
        "# Fusion Tuning Report",
        "",
        f"- Run ID: {run_id}",
        f"- Created at: {created_at}",
        f"- Tenant ID: {tenant_id}",
        f"- Query count: {query_count}",
        f"- Top K: {top_k}",
        f"- Score metric: {metric}",
        f"- Successful live evals: {len(live_successes)}",
        "",
        "## Leaderboard",
        "",
        "| Rank | Name | Source | Score | Primary | NDCG@20 | ERR@10 | Gain Recall@20 | Batch |",
        "|---|---|---|---:|---:|---:|---:|---:|---|",
    ]
    for idx, item in enumerate(successes, start=1):
        metrics = item.get("aggregate_metrics") or {}
        lines.append(
            "| "
            + " | ".join(
                [
                    str(idx),
                    str(item.get("name") or ""),
                    str(item.get("source") or ""),
                    f"{score_of(item, metric):.6f}",
                    str(metrics.get("Primary_Metric_Score", "")),
                    str(metrics.get("NDCG@20", "")),
                    str(metrics.get("ERR@10", "")),
                    str(metrics.get("Gain_Recall@20", "")),
                    str(item.get("batch_id") or ""),
                ]
            )
            + " |"
        )

    if best:
        lines.extend(
            [
                "",
                "## Best Params",
                "",
                f"- Name: {best['name']}",
                f"- Source: {best['source']}",
                f"- Score: {score_of(best, metric):.6f}",
                f"- Params: `{json.dumps(best['params'], ensure_ascii=False, sort_keys=True)}`",
                f"- Batch report: {best.get('batch_report_path') or ''}",
            ]
        )

    # Per-query comparison only renders when there is something to show.
    if delta_summary["gains"] or delta_summary["losses"]:
        lines.extend(["", "## Best vs Baseline", ""])
        if delta_summary["gains"]:
            lines.append("### Top Gains")
            lines.append("")
            for item in delta_summary["gains"]:
                lines.append(
                    f"- {item['query']}: {item['baseline']:.6f} -> {item['current']:.6f} ({item['delta']:+.6f})"
                )
        if delta_summary["losses"]:
            lines.append("")
            lines.append("### Top Losses")
            lines.append("")
            for item in delta_summary["losses"]:
                lines.append(
                    f"- {item['query']}: {item['baseline']:.6f} -> {item['current']:.6f} ({item['delta']:+.6f})"
                )

    failures = [item for item in trials if item.get("status") != "ok"]
    if failures:
        lines.extend(["", "## Failures", ""])
        for item in failures:
            lines.append(f"- {item.get('name')}: {item.get('error')}")

    return "\n".join(lines) + "\n"
  901 +
  902 +
def write_leaderboard_csv(run_dir: Path, metric: str, trials: Sequence[Dict[str, Any]], parameter_names: Sequence[str]) -> None:
    """Write ``run_dir/leaderboard.csv``: successful trials sorted by score.

    Columns: rank, name, source, score, the fixed report metrics, batch_id,
    then one column per tuned parameter (in *parameter_names* order; blank
    when a trial lacks that parameter). Overwrites any existing file.
    """
    path = run_dir / "leaderboard.csv"
    successes = sorted(all_success_trials(trials), key=lambda item: score_of(item, metric), reverse=True)
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(
            [
                "rank",
                "name",
                "source",
                "score",
                "Primary_Metric_Score",
                "NDCG@20",
                "ERR@10",
                "Gain_Recall@20",
                "batch_id",
                *parameter_names,
            ]
        )
        for idx, item in enumerate(successes, start=1):
            metrics = item.get("aggregate_metrics") or {}
            row = [
                idx,
                item.get("name") or "",
                item.get("source") or "",
                f"{score_of(item, metric):.6f}",
                metrics.get("Primary_Metric_Score", ""),
                metrics.get("NDCG@20", ""),
                metrics.get("ERR@10", ""),
                metrics.get("Gain_Recall@20", ""),
                item.get("batch_id") or "",
            ]
            row.extend(item.get("params", {}).get(name, "") for name in parameter_names)
            writer.writerow(row)
200 937
201 938
202 -def build_parser() -> argparse.ArgumentParser:  
203 - parser = argparse.ArgumentParser(description="Run fusion tuning experiments against the live backend")  
204 - parser.add_argument("--tenant-id", default="163")  
205 - parser.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))  
206 - parser.add_argument("--top-k", type=int, default=100)  
207 - parser.add_argument("--language", default="en")  
208 - parser.add_argument("--experiments-file", required=True)  
209 - parser.add_argument("--search-base-url", default="http://127.0.0.1:6002")  
210 - parser.add_argument("--score-metric", default="Primary_Metric_Score")  
211 - parser.add_argument("--apply-best", action="store_true")  
212 - parser.add_argument("--force-refresh-labels-first-pass", action="store_true")  
def persist_run_summary(
    *,
    run_dir: Path,
    run_id: str,
    tenant_id: str,
    query_count: int,
    top_k: int,
    metric: str,
    trials: Sequence[Dict[str, Any]],
    parameter_names: Sequence[str],
) -> None:
    """Write all run artifacts: summary.json, summary.md, leaderboard.csv.

    Safe to call repeatedly (e.g. after every trial) — each file is fully
    rewritten from the current *trials* list.
    """
    summary = {
        "run_id": run_id,
        "created_at": utc_now_iso(),
        "tenant_id": tenant_id,
        "query_count": query_count,
        "top_k": top_k,
        "score_metric": metric,
        "trials": list(trials),
    }
    (run_dir / "summary.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    (run_dir / "summary.md").write_text(
        render_markdown(
            run_id=run_id,
            created_at=summary["created_at"],
            tenant_id=tenant_id,
            query_count=query_count,
            top_k=top_k,
            metric=metric,
            trials=trials,
        ),
        encoding="utf-8",
    )
    write_leaderboard_csv(run_dir, metric, trials, parameter_names)
214 976
215 977
216 -def main() -> None:  
217 - args = build_parser().parse_args() 978 +def run_experiment_mode(args: argparse.Namespace) -> None:
218 queries_file = Path(args.queries_file) 979 queries_file = Path(args.queries_file)
219 queries = read_queries(queries_file) 980 queries = read_queries(queries_file)
220 base_config_text = CONFIG_PATH.read_text(encoding="utf-8") 981 base_config_text = CONFIG_PATH.read_text(encoding="utf-8")
@@ -222,19 +983,33 @@ def main() -&gt; None: @@ -222,19 +983,33 @@ def main() -&gt; None:
222 experiments = load_experiments(Path(args.experiments_file)) 983 experiments = load_experiments(Path(args.experiments_file))
223 984
224 tuning_dir = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs") 985 tuning_dir = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs")
225 - run_id = f"tuning_{utc_timestamp()}" 986 + run_id = args.run_name or f"tuning_{utc_timestamp()}"
226 run_dir = ensure_dir(tuning_dir / run_id) 987 run_dir = ensure_dir(tuning_dir / run_id)
227 results: List[Dict[str, Any]] = [] 988 results: List[Dict[str, Any]] = []
228 989
229 try: 990 try:
230 for experiment in experiments: 991 for experiment in experiments:
231 - candidate = apply_params(base_config, experiment.params) 992 + params = dict(experiment.params)
  993 + target_path = args.target_path or "coarse_rank.fusion"
  994 + candidate = apply_target_params(base_config, target_path, params)
232 write_yaml(CONFIG_PATH, candidate) 995 write_yaml(CONFIG_PATH, candidate)
233 - candidate_config_path = run_dir / f"{experiment.name}_config.yaml" 996 + candidate_config_path = ensure_dir(run_dir / "configs") / f"{experiment.name}_config.yaml"
234 write_yaml(candidate_config_path, candidate) 997 write_yaml(candidate_config_path, candidate)
235 998
236 - run_restart() 999 + ensure_disk_headroom(
  1000 + min_free_gb=args.min_free_gb,
  1001 + auto_truncate_logs=args.auto_truncate_logs,
  1002 + context=f"restart {experiment.name}",
  1003 + )
  1004 + run_restart(args.restart_targets)
237 health = wait_for_backend(args.search_base_url) 1005 health = wait_for_backend(args.search_base_url)
  1006 + if args.heal_eval_web:
  1007 + ensure_eval_web(args.eval_web_base_url)
  1008 + ensure_disk_headroom(
  1009 + min_free_gb=args.min_free_gb,
  1010 + auto_truncate_logs=args.auto_truncate_logs,
  1011 + context=f"batch eval {experiment.name}",
  1012 + )
238 batch_result = run_batch_eval( 1013 batch_result = run_batch_eval(
239 tenant_id=args.tenant_id, 1014 tenant_id=args.tenant_id,
240 queries_file=queries_file, 1015 queries_file=queries_file,
@@ -242,21 +1017,27 @@ def main() -&gt; None: @@ -242,21 +1017,27 @@ def main() -&gt; None:
242 language=args.language, 1017 language=args.language,
243 force_refresh_labels=bool(args.force_refresh_labels_first_pass and not results), 1018 force_refresh_labels=bool(args.force_refresh_labels_first_pass and not results),
244 ) 1019 )
245 - aggregate_metrics = dict(batch_result["aggregate_metrics"]) 1020 + ensure_disk_headroom(
  1021 + min_free_gb=args.min_free_gb,
  1022 + auto_truncate_logs=args.auto_truncate_logs,
  1023 + context=f"persist {experiment.name}",
  1024 + )
  1025 + payload = batch_result["payload"]
  1026 + aggregate_metrics = dict(payload["aggregate_metrics"])
246 results.append( 1027 results.append(
247 { 1028 {
248 "name": experiment.name, 1029 "name": experiment.name,
249 "description": experiment.description, 1030 "description": experiment.description,
250 - "params": experiment.params, 1031 + "params": params,
251 "aggregate_metrics": aggregate_metrics, 1032 "aggregate_metrics": aggregate_metrics,
252 "score": float(aggregate_metrics.get(args.score_metric, 0.0)), 1033 "score": float(aggregate_metrics.get(args.score_metric, 0.0)),
253 "batch_id": batch_result["batch_id"], 1034 "batch_id": batch_result["batch_id"],
254 - "batch_report_path": str(  
255 - DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_result['batch_id']}.md"  
256 - ),  
257 - "config_snapshot_path": str(candidate_config_path), 1035 + "batch_json_path": batch_result["batch_json_path"],
  1036 + "batch_report_path": batch_result["batch_report_path"],
  1037 + "candidate_config_path": str(candidate_config_path),
258 "backend_health": health, 1038 "backend_health": health,
259 - "batch_stdout": batch_result["raw_output"], 1039 + "status": "ok",
  1040 + "source": "experiments_file",
260 } 1041 }
261 ) 1042 )
262 print( 1043 print(
@@ -265,32 +1046,285 @@ def main() -> None: @@ -265,32 +1046,285 @@ def main() -> None:
265 ) 1046 )
266 finally: 1047 finally:
267 if args.apply_best and results: 1048 if args.apply_best and results:
268 - best = max(results, key=lambda item: item["score"])  
269 - best_config = apply_params(base_config, best["params"]) 1049 + best = max(results, key=lambda item: score_of(item, args.score_metric))
  1050 + best_config = apply_target_params(base_config, args.target_path or "coarse_rank.fusion", best["params"])
270 write_yaml(CONFIG_PATH, best_config) 1051 write_yaml(CONFIG_PATH, best_config)
271 - run_restart() 1052 + run_restart(args.restart_targets)
272 wait_for_backend(args.search_base_url) 1053 wait_for_backend(args.search_base_url)
  1054 + if args.heal_eval_web:
  1055 + ensure_eval_web(args.eval_web_base_url)
273 else: 1056 else:
274 CONFIG_PATH.write_text(base_config_text, encoding="utf-8") 1057 CONFIG_PATH.write_text(base_config_text, encoding="utf-8")
275 - run_restart() 1058 + run_restart(args.restart_targets)
276 wait_for_backend(args.search_base_url) 1059 wait_for_backend(args.search_base_url)
  1060 + if args.heal_eval_web:
  1061 + ensure_eval_web(args.eval_web_base_url)
277 1062
278 - results.sort(key=lambda item: item["score"], reverse=True)  
279 - summary = {  
280 - "run_id": run_id,  
281 - "created_at": utc_now_iso(),  
282 - "tenant_id": args.tenant_id,  
283 - "query_count": len(queries),  
284 - "top_k": args.top_k,  
285 - "score_metric": args.score_metric,  
286 - "experiments": results,  
287 - }  
288 - summary_json_path = run_dir / "summary.json"  
289 - summary_md_path = run_dir / "summary.md"  
290 - summary_json_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")  
291 - summary_md_path.write_text(render_markdown(summary), encoding="utf-8")  
292 - print(f"[done] summary_json={summary_json_path}")  
293 - print(f"[done] summary_md={summary_md_path}") 1063 + persist_run_summary(
  1064 + run_dir=run_dir,
  1065 + run_id=run_id,
  1066 + tenant_id=str(args.tenant_id),
  1067 + query_count=len(queries),
  1068 + top_k=args.top_k,
  1069 + metric=args.score_metric,
  1070 + trials=results,
  1071 + parameter_names=list(results[0]["params"].keys()) if results else [],
  1072 + )
  1073 + print(f"[done] summary_json={run_dir / 'summary.json'}")
  1074 + print(f"[done] summary_md={run_dir / 'summary.md'}")
  1075 +
  1076 +
  1077 +def run_optimize_mode(args: argparse.Namespace) -> None:
  1078 + queries_file = Path(args.queries_file)
  1079 + queries = read_queries(queries_file)
  1080 + base_config_text = CONFIG_PATH.read_text(encoding="utf-8")
  1081 + base_config = load_yaml(CONFIG_PATH)
  1082 + search_space_path = Path(args.search_space)
  1083 + space = load_search_space(search_space_path)
  1084 + rng = random.Random(args.random_seed)
  1085 +
  1086 + tuning_dir = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs")
  1087 + run_dir = (
  1088 + Path(args.resume_run).resolve()
  1089 + if args.resume_run
  1090 + else ensure_dir(tuning_dir / (args.run_name or f"coarse_fusion_bo_{utc_timestamp()}"))
  1091 + )
  1092 + run_id = run_dir.name
  1093 + ensure_dir(run_dir / "configs")
  1094 + ensure_dir(run_dir / "logs")
  1095 + if not (run_dir / "search_space.yaml").exists():
  1096 + (run_dir / "search_space.yaml").write_text(search_space_path.read_text(encoding="utf-8"), encoding="utf-8")
  1097 +
  1098 + trials = load_existing_trials(run_dir)
  1099 + if args.seed_report:
  1100 + baseline_params = space.fill_params(space.baseline)
  1101 + baseline_key = space.canonical_key(baseline_params)
  1102 + if baseline_key not in {space.canonical_key(item["params"]) for item in trials if item.get("params")}:
  1103 + payload = load_batch_payload(args.seed_report)
  1104 + trial = {
  1105 + "trial_id": next_trial_name(trials, "trial"),
  1106 + "name": "seed_baseline",
  1107 + "description": f"seeded from {args.seed_report}",
  1108 + "source": "seed_report",
  1109 + "is_seed": True,
  1110 + "status": "ok",
  1111 + "created_at": utc_now_iso(),
  1112 + "params": baseline_params,
  1113 + "score": float(payload["aggregate_metrics"].get(args.score_metric, 0.0)),
  1114 + "aggregate_metrics": dict(payload["aggregate_metrics"]),
  1115 + "batch_id": payload["batch_id"],
  1116 + "batch_json_path": str(resolve_batch_json_path(args.seed_report)),
  1117 + "batch_report_path": str(resolve_batch_json_path(args.seed_report).with_suffix(".md")),
  1118 + }
  1119 + append_trial(run_dir, trial)
  1120 + trials.append(trial)
  1121 +
  1122 + init_random = args.init_random if args.init_random is not None else space.init_random
  1123 + candidate_pool_size = args.candidate_pool_size if args.candidate_pool_size is not None else space.candidate_pool_size
  1124 +
  1125 + try:
  1126 + live_done = len(live_success_trials(trials))
  1127 + while live_done < args.max_evals:
  1128 + remaining = args.max_evals - live_done
  1129 + current_batch_size = min(args.batch_size, remaining)
  1130 + proposals = propose_candidates(
  1131 + space=space,
  1132 + trials=trials,
  1133 + metric=args.score_metric,
  1134 + batch_size=current_batch_size,
  1135 + rng=rng,
  1136 + init_random=init_random,
  1137 + candidate_pool_size=candidate_pool_size,
  1138 + )
  1139 + if not proposals:
  1140 + raise RuntimeError("optimizer failed to produce new candidate proposals")
  1141 +
  1142 + for proposal in proposals:
  1143 + force_refresh_labels = bool(args.force_refresh_labels_first_pass and live_done == 0 and not any(t.get("is_seed") for t in trials))
  1144 + trial_id = next_trial_name(trials, "trial")
  1145 + candidate_config = apply_target_params(base_config, space.target_path, proposal.params)
  1146 + candidate_config_path = run_dir / "configs" / f"{trial_id}_{proposal.name}.yaml"
  1147 + trial_log_path = run_dir / "logs" / f"{trial_id}_{proposal.name}.log"
  1148 + write_yaml(CONFIG_PATH, candidate_config)
  1149 + write_yaml(candidate_config_path, candidate_config)
  1150 + print(
  1151 + f"[tune] start {proposal.name} source={proposal.source} "
  1152 + f"params={json.dumps(proposal.params, ensure_ascii=False, sort_keys=True)}"
  1153 + )
  1154 + try:
  1155 + ensure_disk_headroom(
  1156 + min_free_gb=args.min_free_gb,
  1157 + auto_truncate_logs=args.auto_truncate_logs,
  1158 + context=f"restart {proposal.name}",
  1159 + )
  1160 + run_restart(args.restart_targets)
  1161 + backend_health = wait_for_backend(args.search_base_url)
  1162 + verify_backend_config(args.search_base_url, space.target_path, proposal.params)
  1163 + if args.heal_eval_web:
  1164 + ensure_eval_web(args.eval_web_base_url)
  1165 + ensure_disk_headroom(
  1166 + min_free_gb=args.min_free_gb,
  1167 + auto_truncate_logs=args.auto_truncate_logs,
  1168 + context=f"batch eval {proposal.name}",
  1169 + )
  1170 + batch_result = run_batch_eval(
  1171 + tenant_id=args.tenant_id,
  1172 + queries_file=queries_file,
  1173 + top_k=args.top_k,
  1174 + language=args.language,
  1175 + force_refresh_labels=force_refresh_labels,
  1176 + )
  1177 + ensure_disk_headroom(
  1178 + min_free_gb=args.min_free_gb,
  1179 + auto_truncate_logs=args.auto_truncate_logs,
  1180 + context=f"persist {proposal.name}",
  1181 + )
  1182 + payload = batch_result["payload"]
  1183 + trial_log_path.write_text(batch_result["raw_output"], encoding="utf-8")
  1184 + aggregate_metrics = dict(payload["aggregate_metrics"])
  1185 + trial = {
  1186 + "trial_id": trial_id,
  1187 + "name": proposal.name,
  1188 + "description": proposal.description,
  1189 + "source": proposal.source,
  1190 + "is_seed": False,
  1191 + "status": "ok",
  1192 + "created_at": utc_now_iso(),
  1193 + "params": proposal.params,
  1194 + "score": float(aggregate_metrics.get(args.score_metric, 0.0)),
  1195 + "aggregate_metrics": aggregate_metrics,
  1196 + "batch_id": batch_result["batch_id"],
  1197 + "batch_json_path": batch_result["batch_json_path"],
  1198 + "batch_report_path": batch_result["batch_report_path"],
  1199 + "candidate_config_path": str(candidate_config_path),
  1200 + "trial_log_path": str(trial_log_path),
  1201 + "backend_health": backend_health,
  1202 + }
  1203 + print(
  1204 + f"[tune] done {proposal.name} "
  1205 + f"{args.score_metric}={trial['score']:.6f} "
  1206 + f"Primary={aggregate_metrics.get('Primary_Metric_Score')}"
  1207 + )
  1208 + except Exception as exc: # noqa: BLE001
  1209 + trial = {
  1210 + "trial_id": trial_id,
  1211 + "name": proposal.name,
  1212 + "description": proposal.description,
  1213 + "source": proposal.source,
  1214 + "is_seed": False,
  1215 + "status": "error",
  1216 + "created_at": utc_now_iso(),
  1217 + "params": proposal.params,
  1218 + "error": str(exc),
  1219 + "candidate_config_path": str(candidate_config_path),
  1220 + "trial_log_path": str(trial_log_path),
  1221 + }
  1222 + print(f"[tune] error {proposal.name}: {exc}")
  1223 + ensure_disk_headroom(
  1224 + min_free_gb=args.min_free_gb,
  1225 + auto_truncate_logs=args.auto_truncate_logs,
  1226 + context=f"error-persist {proposal.name}",
  1227 + )
  1228 + append_trial(run_dir, trial)
  1229 + trials.append(trial)
  1230 + ensure_disk_headroom(
  1231 + min_free_gb=args.min_free_gb,
  1232 + auto_truncate_logs=args.auto_truncate_logs,
  1233 + context=f"summary {proposal.name}",
  1234 + )
  1235 + persist_run_summary(
  1236 + run_dir=run_dir,
  1237 + run_id=run_id,
  1238 + tenant_id=str(args.tenant_id),
  1239 + query_count=len(queries),
  1240 + top_k=args.top_k,
  1241 + metric=args.score_metric,
  1242 + trials=trials,
  1243 + parameter_names=space.parameter_names,
  1244 + )
  1245 + if trial.get("status") == "ok":
  1246 + live_done += 1
  1247 + if live_done >= args.max_evals:
  1248 + break
  1249 + finally:
  1250 + if args.apply_best:
  1251 + successes = all_success_trials(trials)
  1252 + best_live = max(successes, key=lambda item: score_of(item, args.score_metric)) if successes else None
  1253 + if best_live:
  1254 + best_config = apply_target_params(base_config, space.target_path, best_live["params"])
  1255 + write_yaml(CONFIG_PATH, best_config)
  1256 + run_restart(args.restart_targets)
  1257 + wait_for_backend(args.search_base_url)
  1258 + if args.heal_eval_web:
  1259 + ensure_eval_web(args.eval_web_base_url)
  1260 + else:
  1261 + CONFIG_PATH.write_text(base_config_text, encoding="utf-8")
  1262 + run_restart(args.restart_targets)
  1263 + wait_for_backend(args.search_base_url)
  1264 + if args.heal_eval_web:
  1265 + ensure_eval_web(args.eval_web_base_url)
  1266 +
  1267 + persist_run_summary(
  1268 + run_dir=run_dir,
  1269 + run_id=run_id,
  1270 + tenant_id=str(args.tenant_id),
  1271 + query_count=len(queries),
  1272 + top_k=args.top_k,
  1273 + metric=args.score_metric,
  1274 + trials=trials,
  1275 + parameter_names=space.parameter_names,
  1276 + )
  1277 + print(f"[done] run_dir={run_dir}")
  1278 + print(f"[done] summary_json={run_dir / 'summary.json'}")
  1279 + print(f"[done] summary_md={run_dir / 'summary.md'}")
  1280 + print(f"[done] leaderboard_csv={run_dir / 'leaderboard.csv'}")
  1281 +
  1282 +
  1283 +def build_parser() -> argparse.ArgumentParser:
  1284 + parser = argparse.ArgumentParser(
  1285 + description="Tune coarse/fusion params against the live backend with adaptive Bayesian-style search."
  1286 + )
  1287 + parser.add_argument("--mode", choices=["optimize", "experiments"], default="optimize")
  1288 + parser.add_argument("--tenant-id", default="163")
  1289 + parser.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
  1290 + parser.add_argument("--top-k", type=int, default=100)
  1291 + parser.add_argument("--language", default="en")
  1292 + parser.add_argument("--search-base-url", default="http://127.0.0.1:6002")
  1293 + parser.add_argument("--eval-web-base-url", default="http://127.0.0.1:6010")
  1294 + parser.add_argument("--score-metric", default="Primary_Metric_Score")
  1295 + parser.add_argument("--restart-targets", nargs="+", default=["backend"])
  1296 + parser.add_argument("--heal-eval-web", action=argparse.BooleanOptionalAction, default=True)
  1297 + parser.add_argument("--force-refresh-labels-first-pass", action="store_true")
  1298 + parser.add_argument("--apply-best", action="store_true")
  1299 + parser.add_argument("--run-name", default=None)
  1300 +
  1301 + parser.add_argument("--experiments-file")
  1302 + parser.add_argument("--target-path", default="coarse_rank.fusion")
  1303 +
  1304 + parser.add_argument(
  1305 + "--search-space",
  1306 + default=str(PROJECT_ROOT / "scripts" / "evaluation" / "tuning" / "coarse_rank_fusion_space.yaml"),
  1307 + )
  1308 + parser.add_argument("--seed-report", default=None)
  1309 + parser.add_argument("--resume-run", default=None)
  1310 + parser.add_argument("--max-evals", type=int, default=12)
  1311 + parser.add_argument("--batch-size", type=int, default=3)
  1312 + parser.add_argument("--init-random", type=int, default=None)
  1313 + parser.add_argument("--candidate-pool-size", type=int, default=None)
  1314 + parser.add_argument("--random-seed", type=int, default=20260415)
  1315 + parser.add_argument("--min-free-gb", type=float, default=5.0)
  1316 + parser.add_argument("--auto-truncate-logs", action=argparse.BooleanOptionalAction, default=True)
  1317 + return parser
  1318 +
  1319 +
  1320 +def main() -> None:
  1321 + args = build_parser().parse_args()
  1322 + if args.mode == "experiments":
  1323 + if not args.experiments_file:
  1324 + raise SystemExit("--experiments-file is required when --mode=experiments")
  1325 + run_experiment_mode(args)
  1326 + return
  1327 + run_optimize_mode(args)
294 1328
295 1329
296 if __name__ == "__main__": 1330 if __name__ == "__main__":
scripts/evaluation/tuning/README.md 0 → 100644
@@ -0,0 +1,71 @@ @@ -0,0 +1,71 @@
  1 +# Coarse Fusion 长跑调参
  2 +
  3 +## 启动一轮长跑
  4 +
  5 +```bash
  6 +./scripts/evaluation/start_coarse_fusion_tuning_long.sh
  7 +```
  8 +
  9 +可用环境变量:
  10 +
  11 +```bash
  12 +MAX_EVALS=48 BATCH_SIZE=3 CANDIDATE_POOL_SIZE=512 \
  13 +RUN_NAME=coarse_fusion_long_001 \
  14 +./scripts/evaluation/start_coarse_fusion_tuning_long.sh
  15 +```
  16 +
  17 +启动后会打印:
  18 +
  19 +- `run_name`
  20 +- `pid`
  21 +- `log`
  22 +- `run_dir`
  23 +
  24 +默认搜索空间:
  25 +
  26 +- `scripts/evaluation/tuning/coarse_rank_fusion_space.yaml`
  27 +
  28 +默认 baseline seed:
  29 +
  30 +- `artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md`
  31 +
  32 +## 查看进度
  33 +
  34 +```bash
  35 +tail -f artifacts/search_evaluation/tuning_launches/<run_name>.log
  36 +cat artifacts/search_evaluation/tuning_runs/<run_name>/leaderboard.csv
  37 +sed -n '1,200p' artifacts/search_evaluation/tuning_runs/<run_name>/summary.md
  38 +```
  39 +
  40 +实时记录文件:
  41 +
  42 +- `trials.jsonl`
  43 +- `leaderboard.csv`
  44 +- `summary.json`
  45 +- `summary.md`
  46 +
  47 +## 续跑
  48 +
  49 +```bash
  50 +./scripts/evaluation/resume_coarse_fusion_tuning_long.sh <run_name>
  51 +```
  52 +
  53 +也可直接传完整目录:
  54 +
  55 +```bash
  56 +./scripts/evaluation/resume_coarse_fusion_tuning_long.sh \
  57 + artifacts/search_evaluation/tuning_runs/<run_name>
  58 +```
  59 +
  60 +## 停止
  61 +
  62 +```bash
  63 +kill "$(cat artifacts/search_evaluation/tuning_launches/<run_name>.pid)"
  64 +```
  65 +
  66 +## 说明
  67 +
  68 +- 每轮会自动写入 `config/config.yaml`
  69 +- 每轮会自动执行 `./restart.sh backend`
  70 +- 如果 `eval-web` 因 backend 重启不可用,调参器会尝试补拉起 `eval-web`
  71 +- 默认不 `apply-best`,跑完后会恢复 baseline 配置
scripts/evaluation/tuning/coarse_rank_fusion_space.yaml 0 → 100644
@@ -0,0 +1,153 @@ @@ -0,0 +1,153 @@
  1 +target_path: coarse_rank.fusion
  2 +
  3 +baseline:
  4 + es_bias: 10.0
  5 + es_exponent: 0.05
  6 + text_bias: 0.1
  7 + text_exponent: 0.35
  8 + text_translation_weight: 1.0
  9 + knn_text_weight: 1.0
  10 + knn_image_weight: 2.0
  11 + knn_tie_breaker: 0.3
  12 + knn_bias: 0.2
  13 + knn_exponent: 5.6
  14 + knn_text_bias: 0.2
  15 + knn_text_exponent: 0.0
  16 + knn_image_bias: 0.2
  17 + knn_image_exponent: 0.0
  18 +
  19 +parameters:
  20 + es_bias: {min: 0.3, max: 80.0, scale: log, round: 4}
  21 + es_exponent: {min: 0.0, max: 0.4, scale: linear, round: 4}
  22 + text_bias: {min: 0.001, max: 4.0, scale: log, round: 4}
  23 + text_exponent: {min: 0.02, max: 1.6, scale: linear, round: 4}
  24 + text_translation_weight: {min: 0.1, max: 2.5, scale: linear, round: 4}
  25 + knn_text_weight: {min: 0.1, max: 4.0, scale: linear, round: 4}
  26 + knn_image_weight: {min: 0.1, max: 6.0, scale: linear, round: 4}
  27 + knn_tie_breaker: {min: 0.0, max: 1.0, scale: linear, round: 4}
  28 + knn_bias: {min: 0.001, max: 4.0, scale: log, round: 4}
  29 + knn_exponent: {min: 0.05, max: 12.0, scale: log, round: 4}
  30 + knn_text_bias: {min: 0.001, max: 4.0, scale: log, round: 4}
  31 + knn_text_exponent: {min: 0.0, max: 6.0, scale: linear, round: 4}
  32 + knn_image_bias: {min: 0.001, max: 4.0, scale: log, round: 4}
  33 + knn_image_exponent: {min: 0.0, max: 6.0, scale: linear, round: 4}
  34 +
  35 +seed_experiments:
  36 + - name: seed_knn_soften
  37 + description: 压低 knn 全局指数,先验证当前 5.6 是否过猛
  38 + params:
  39 + text_exponent: 0.42
  40 + knn_image_weight: 1.2
  41 + knn_bias: 0.35
  42 + knn_exponent: 1.4
  43 + - name: seed_text_guard
  44 + description: 提升 lexical 稳定性,抑制翻译与 image knn 过度主导
  45 + params:
  46 + text_exponent: 0.62
  47 + text_translation_weight: 0.75
  48 + knn_image_weight: 1.0
  49 + knn_tie_breaker: 0.15
  50 + knn_exponent: 2.2
  51 + - name: seed_semantic_balanced
  52 + description: 让 text/image knn 都参与,但降低 image 偏置和总指数
  53 + params:
  54 + text_exponent: 0.32
  55 + knn_text_weight: 1.4
  56 + knn_image_weight: 1.8
  57 + knn_tie_breaker: 0.45
  58 + knn_bias: 0.18
  59 + knn_exponent: 3.0
  60 + - name: seed_component_exp
  61 + description: 打开 knn_text/image 子项指数,观察全局 knn_exponent 是否可下放
  62 + params:
  63 + knn_bias: 0.15
  64 + knn_exponent: 1.6
  65 + knn_text_exponent: 0.8
  66 + knn_image_exponent: 0.4
  67 + - name: seed_es_relax
  68 + description: 增强 es 因子的区分度,验证 coarse 是否过分压平 lexical 分数
  69 + params:
  70 + es_bias: 3.0
  71 + es_exponent: 0.11
  72 + text_exponent: 0.48
  73 + knn_exponent: 2.6
  74 + - name: seed_image_heavy
  75 + description: 刻意放大 image knn 做对照,看哪些 query 会明显受损
  76 + params:
  77 + text_exponent: 0.22
  78 + knn_text_weight: 0.9
  79 + knn_image_weight: 3.4
  80 + knn_tie_breaker: 0.55
  81 + knn_bias: 0.12
  82 + knn_exponent: 3.8
  83 + - name: seed_high_knn_global
  84 + description: 沿着 baseline 继续上探更强 knn 全局指数,验证 5.6 是否仍偏保守
  85 + params:
  86 + text_exponent: 0.28
  87 + knn_text_weight: 1.1
  88 + knn_image_weight: 2.6
  89 + knn_tie_breaker: 0.4
  90 + knn_bias: 0.12
  91 + knn_exponent: 7.2
  92 + - name: seed_text_knn_split
  93 + description: 提高 text knn,压低 image knn,同时打开 text/image 子项指数
  94 + params:
  95 + text_exponent: 0.38
  96 + knn_text_weight: 2.0
  97 + knn_image_weight: 0.8
  98 + knn_tie_breaker: 0.2
  99 + knn_bias: 0.08
  100 + knn_exponent: 4.8
  101 + knn_text_exponent: 1.1
  102 + knn_image_exponent: 0.15
  103 + - name: seed_image_split
  104 + description: 保持较高 image 权重,但把非线性拆到 image 子项而不是全局 knn
  105 + params:
  106 + text_exponent: 0.26
  107 + knn_text_weight: 0.9
  108 + knn_image_weight: 3.0
  109 + knn_tie_breaker: 0.35
  110 + knn_bias: 0.08
  111 + knn_exponent: 3.4
  112 + knn_text_exponent: 0.2
  113 + knn_image_exponent: 1.0
  114 + - name: seed_es_text_sharpen
  115 + description: 提升 es 与 lexical 区分度,测试 coarse 是否需要更强文本排序稳定性
  116 + params:
  117 + es_bias: 2.0
  118 + es_exponent: 0.16
  119 + text_bias: 0.03
  120 + text_exponent: 0.78
  121 + text_translation_weight: 0.9
  122 + knn_bias: 0.1
  123 + knn_exponent: 5.0
  124 + - name: seed_translation_discount
  125 + description: 明显削弱 translation 命中,验证抽象 query 是否过度依赖翻译通路
  126 + params:
  127 + text_exponent: 0.44
  128 + text_translation_weight: 0.45
  129 + knn_text_weight: 1.2
  130 + knn_image_weight: 1.7
  131 + knn_tie_breaker: 0.25
  132 + knn_exponent: 5.4
  133 + - name: seed_near_baseline_jitter
  134 + description: 贴近 baseline 做小扰动,优先寻找可行增益而不是只测极端方向
  135 + params:
  136 + es_bias: 8.0
  137 + es_exponent: 0.06
  138 + text_bias: 0.06
  139 + text_exponent: 0.31
  140 + text_translation_weight: 1.1
  141 + knn_text_weight: 1.1
  142 + knn_image_weight: 2.2
  143 + knn_tie_breaker: 0.34
  144 + knn_bias: 0.16
  145 + knn_exponent: 5.9
  146 +
  147 +optimizer:
  148 + init_random: 8
  149 + candidate_pool_size: 512
  150 + explore_probability: 0.28
  151 + local_jitter_probability: 0.42
  152 + elite_fraction: 0.35
  153 + min_normalized_distance: 0.12
scripts/service_ctl.sh
@@ -213,6 +213,7 @@ health_path_for_service() { @@ -213,6 +213,7 @@ health_path_for_service() {
213 local service="$1" 213 local service="$1"
214 case "${service}" in 214 case "${service}" in
215 backend|indexer|embedding|embedding-image|translator|reranker|reranker-fine|tei) echo "/health" ;; 215 backend|indexer|embedding|embedding-image|translator|reranker|reranker-fine|tei) echo "/health" ;;
  216 + eval-web) echo "/api/history" ;;
216 *) echo "" ;; 217 *) echo "" ;;
217 esac 218 esac
218 } 219 }
@@ -469,7 +470,7 @@ monitor_services() { @@ -469,7 +470,7 @@ monitor_services() {
469 if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then 470 if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then
470 monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)" 471 monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)"
471 if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then 472 if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
472 - python "${wechat_alert_py}" \ 473 + "$(config_python_bin)" "${wechat_alert_py}" \
473 --service "${svc}" \ 474 --service "${svc}" \
474 --level "error" \ 475 --level "error" \
475 --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。" 476 --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。"
@@ -479,7 +480,7 @@ monitor_services() { @@ -479,7 +480,7 @@ monitor_services() {
479 480
480 monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures" 481 monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures"
481 if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then 482 if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
482 - python "${wechat_alert_py}" \ 483 + "$(config_python_bin)" "${wechat_alert_py}" \
483 --service "${svc}" \ 484 --service "${svc}" \
484 --level "error" \ 485 --level "error" \
485 --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。" 486 --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。"
@@ -494,7 +495,7 @@ monitor_services() { @@ -494,7 +495,7 @@ monitor_services() {
494 restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" 495 restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}"
495 monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")" 496 monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")"
496 if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then 497 if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
497 - python "${wechat_alert_py}" \ 498 + "$(config_python_bin)" "${wechat_alert_py}" \
498 --service "${svc}" \ 499 --service "${svc}" \
499 --level "error" \ 500 --level "error" \
500 --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")." 501 --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")."
@@ -609,7 +610,13 @@ is_running_by_port() { @@ -609,7 +610,13 @@ is_running_by_port() {
609 local service="$1" 610 local service="$1"
610 local port 611 local port
611 port="$(get_port "${service}")" 612 port="$(get_port "${service}")"
612 - [ -n "${port}" ] && lsof -ti:"${port}" >/dev/null 2>&1 613 + [ -n "${port}" ] && lsof -nP -iTCP:"${port}" -sTCP:LISTEN -t >/dev/null 2>&1
  614 +}
  615 +
  616 +list_listen_pids_by_port() {
  617 + local port="$1"
  618 + [ -n "${port}" ] || return 0
  619 + lsof -nP -iTCP:"${port}" -sTCP:LISTEN -t 2>/dev/null || true
613 } 620 }
614 621
615 is_running_tei_container() { 622 is_running_tei_container() {
@@ -794,14 +801,14 @@ stop_one() { @@ -794,14 +801,14 @@ stop_one() {
794 port="$(get_port "${service}")" 801 port="$(get_port "${service}")"
795 if [ -n "${port}" ]; then 802 if [ -n "${port}" ]; then
796 local pids 803 local pids
797 - pids="$(lsof -ti:${port} 2>/dev/null || true)" 804 + pids="$(list_listen_pids_by_port "${port}")"
798 if [ -n "${pids}" ]; then 805 if [ -n "${pids}" ]; then
799 echo "[stop] ${service} port=${port} pids=${pids}" 806 echo "[stop] ${service} port=${port} pids=${pids}"
800 for pid in ${pids}; do 807 for pid in ${pids}; do
801 kill -TERM "${pid}" 2>/dev/null || true 808 kill -TERM "${pid}" 2>/dev/null || true
802 done 809 done
803 sleep 1 810 sleep 1
804 - pids="$(lsof -ti:${port} 2>/dev/null || true)" 811 + pids="$(list_listen_pids_by_port "${port}")"
805 for pid in ${pids}; do 812 for pid in ${pids}; do
806 kill -KILL "${pid}" 2>/dev/null || true 813 kill -KILL "${pid}" 2>/dev/null || true
807 done 814 done
@@ -854,7 +861,7 @@ status_one() { @@ -854,7 +861,7 @@ status_one() {
854 pid_info="$(cat "$(pid_file "${service}")" 2>/dev/null || echo "-")" 861 pid_info="$(cat "$(pid_file "${service}")" 2>/dev/null || echo "-")"
855 elif is_running_by_port "${service}"; then 862 elif is_running_by_port "${service}"; then
856 running="yes" 863 running="yes"
857 - pid_info="$(lsof -ti:${port} 2>/dev/null | tr '\n' ',' | sed 's/,$//' || echo "-")" 864 + pid_info="$(list_listen_pids_by_port "${port}" | tr '\n' ',' | sed 's/,$//' || echo "-")"
858 fi 865 fi
859 866
860 if [ "${running}" = "yes" ]; then 867 if [ "${running}" = "yes" ]; then