Commit dba5764289827a82201dafa605d531411cb5b24f
1 parent
47452e1d
bayes调参计划
Showing
13 changed files
with
1682 additions
and
112 deletions
Show diff stats
artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.cmd
0 → 100644
| ... | ... | @@ -0,0 +1 @@ |
| 1 | +python scripts/evaluation/tune_fusion.py --mode optimize --run-name coarse_fusion_long_001 --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md --tenant-id 163 --queries-file scripts/evaluation/queries/queries.txt --top-k 100 --language en --search-base-url http://127.0.0.1:6002 --eval-web-base-url http://127.0.0.1:6010 --max-evals 400 --batch-size 3 --candidate-pool-size 512 --random-seed 20260416 | ... | ... |
artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.pid
0 → 100644
| ... | ... | @@ -0,0 +1 @@ |
| 1 | +2218620 | ... | ... |
config/config.yaml
| ... | ... | @@ -256,9 +256,11 @@ coarse_rank: |
| 256 | 256 | knn_text_weight: 1.0 |
| 257 | 257 | knn_image_weight: 2.0 |
| 258 | 258 | knn_tie_breaker: 0.3 |
| 259 | - knn_bias: 0.0 | |
| 259 | + knn_bias: 0.2 | |
| 260 | 260 | knn_exponent: 5.6 |
| 261 | + knn_text_bias: 0.2 | |
| 261 | 262 | knn_text_exponent: 0.0 |
| 263 | + knn_image_bias: 0.2 | |
| 262 | 264 | knn_image_exponent: 0.0 |
| 263 | 265 | fine_rank: |
| 264 | 266 | enabled: false # false 时保序透传 |
| ... | ... | @@ -649,4 +651,4 @@ tenant_config: |
| 649 | 651 | primary_language: en |
| 650 | 652 | index_languages: |
| 651 | 653 | - en |
| 652 | 654 | - - zh |
| 655 | + - zh | |
| 653 | 656 | \ No newline at end of file | ... | ... |
docs/caches-inventory.md
| ... | ... | @@ -96,9 +96,22 @@ |
| 96 | 96 | | `scripts/redis/redis_cache_prefix_stats.py` | 按前缀统计 key 数量与 **MEMORY USAGE**(可多 DB) | |
| 97 | 97 | | `scripts/redis/redis_memory_heavy_keys.py` | 扫描占用内存最大的 key,辅助排查「统计与总内存不一致」 | |
| 98 | 98 | | `scripts/redis/monitor_eviction.py` | 实时监控 **eviction** 相关事件,用于容量与驱逐策略排查 | |
| 99 | +| `scripts/redis/purge_caches.py` | 一键清空业务缓存:embedding(含 `:image:` / `:clip_text:`)、anchors、translation;**默认跳过 `trans:deepl*`**(可 dry-run 预览) | | |
| 99 | 100 | |
| 100 | 101 | 使用前需加载项目配置(如 `source activate.sh`)以保证 `REDIS_CONFIG` 与生产一致。脚本注释中给出了 **`redis-cli` 手工统计**示例(按前缀 `wc -l`、`MEMORY STATS` 等)。 |
| 101 | 102 | |
| 103 | +### 快速清空(排除 `trans:deepl*`) | |
| 104 | + | |
| 105 | +```bash | |
| 106 | +source activate.sh | |
| 107 | + | |
| 108 | +# 先预览会删多少 key(推荐) | |
| 109 | +python scripts/redis/purge_caches.py --dry-run | |
| 110 | + | |
| 111 | +# 真正删除(默认 db=0) | |
| 112 | +python scripts/redis/purge_caches.py | |
| 113 | +``` | |
| 114 | + | |
| 102 | 115 | --- |
| 103 | 116 | |
| 104 | 117 | ## 六、总表(Redis 与各层缓存) |
| ... | ... | @@ -106,8 +119,8 @@ |
| 106 | 119 | | 缓存名称 | 业务模块 | 存储 | Key 前缀 / 命名模式 | 过期时间 | 过期策略 | 值摘要 | 配置键 / 环境变量 | |
| 107 | 120 | |----------|----------|------|---------------------|----------|----------|--------|-------------------| |
| 108 | 121 | | 文本向量 | 检索 / 索引 / Embedding 服务 | Redis db≈0 | `{embedding_cache_prefix}:*`(逻辑键以 `embed:norm…` 开头) | `cache_expire_days`(默认 720 天) | 写入 TTL + 命中滑动续期 | BF16 字节向量 | `infrastructure.redis.*`;`REDIS_EMBEDDING_CACHE_PREFIX`、`REDIS_CACHE_EXPIRE_DAYS` | |
| 109 | -| 图像向量(CLIP 图) | 图搜 / 多模态 | 同上 | `{prefix}:image:*` | 同上 | 同上 | BF16 字节 | 同上 | | |
| 110 | -| CLIP 文本塔向量 | 图搜文本侧 | 同上 | `{prefix}:clip_text:*` | 同上 | 同上 | BF16 字节 | 同上 | | |
| 122 | +| 图像向量(CLIP 图) | 图搜 / 多模态 | 同上 | `{embedding_cache_prefix}:image:*`(其中 `{embedding_cache_prefix}` 默认 `embedding`) | 同上 | 同上 | BF16 字节 | 同上 | | |
| 123 | +| CLIP 文本塔向量 | 图搜文本侧 | 同上 | `{embedding_cache_prefix}:clip_text:*`(其中 `{embedding_cache_prefix}` 默认 `embedding`) | 同上 | 同上 | BF16 字节 | 同上 | | |
| 111 | 124 | | 翻译译文 | 查询翻译、翻译服务 | 同上 | `trans:{model}:{lang}:*` | `services.translation.cache.ttl_seconds`(默认 720 天) | 可配置滑动(`sliding_expiration`) | UTF-8 字符串 | `services.translation.cache.*`;各能力 `use_cache` | |
| 112 | 125 | | 商品分析 / Anchors | 索引富化、LLM 内容理解 | 同上 | `{anchor_cache_prefix}:{kind}:{hash}:{lang}:*` | `anchor_cache_expire_days`(默认 30 天) | 固定 TTL,不滑动 | JSON 字符串 | `anchor_cache_prefix`、`anchor_cache_expire_days`;`REDIS_ANCHOR_*` | |
| 113 | 126 | | 应用配置 | 全栈 | 进程内存 | N/A(单例) | 进程生命周期 | `reload_app_config` 清除 | `AppConfig` 对象 | `config/loader.py` | | ... | ... |
docs/issues/issue-2026-04-14-粗排流程放入ES-TODO-env renamed to docs/issues/issue-2026-04-14-粗排流程放入ES-TODO-env.md
| ... | ... | @@ -0,0 +1,136 @@ |
| 1 | + | |
| 1 | +我以前进行过一轮调参,是基于54个评测样本(queries.txt),过程中发现的最优的参数是这一组: | |
| 3 | +0.641241 {'es_bias': '7.214', 'es_exponent': '0.2025', 'text_bias': '4.0', 'text_exponent': '1.584', 'text_translation_weight': '1.4441', 'knn_text_weight': '0.1', 'knn_image_weight': '5.6232', 'knn_tie_breaker': | |
| 4 | + '0.021', 'knn_bias': '0.0019', 'knn_exponent': '11.8477', 'knn_text_bias': '2.3125', 'knn_text_exponent': '1.1547', 'knn_image_bias': '0.9641', 'knn_image_exponent': '5.8671'} | |
| 5 | + | |
| 6 | +这一组参数分布比较极端,text_bias太大(文本项得分是0~1的,加上4后被稀释得很厉害),图片的exponent太大,不过在这个数据集上面确实是最好的,我觉得有过拟合的可能,因此要扩大数据集,先扩展标注集,然后使用扩展的标注集,继续进行寻参。 | |
| 7 | +因为标注任务和寻参任务耗时都比较长,请你写好一个脚本,内部先启动标注任务,然后再启动寻参任务,把任务跑起来,程序已经正常跑起来了、运转正常,你才可以退出,以后等程序跑完了应该能拿到寻参结果,下次你可以结合程序执行的结果进行结论分析。 | |
| 8 | + | |
| 9 | + | |
| 10 | +以前的一轮调参: | |
| 11 | +我当时的调参需求: | |
| 12 | + | |
| 13 | +请对coarse_rank fusion公式进行调参: | |
| 14 | + 目前的baseline是这一组,Primary_Metric_Score: 0.637642: | |
| 15 | + coarse_rank: | |
| 16 | + ... | |
| 17 | + fusion: | |
| 18 | + es_bias: 10.0 | |
| 19 | + es_exponent: 0.05 | |
| 20 | + text_bias: 0.1 | |
| 21 | + text_exponent: 0.35 | |
| 22 | + text_translation_weight: 1.0 | |
| 23 | + knn_text_weight: 1.0 | |
| 24 | + knn_image_weight: 2.0 | |
| 25 | + knn_tie_breaker: 0.3 | |
| 26 | + knn_bias: 0.2 | |
| 27 | + knn_exponent: 5.6 | |
| 28 | + knn_text_bias: 0.2 | |
| 29 | + knn_text_exponent: 0.0 | |
| 30 | + knn_image_bias: 0.2 | |
| 31 | + knn_image_exponent: 0.0 | |
| 32 | + 评估指标在/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md | |
| 33 | + 请以这个为基准,发散思维,进行宽一点的范围调参。因为每次重启、评测都需要几分钟,请写一个调参的框架,基于框架、设定好多组参数、写好脚本,每轮收集效果自动的调整参数分布(因为笛卡尔积、多种参数调参成本太高,因此考虑贝叶斯调参等方法通 | |
| 34 | + 过多轮迭代通过脚本自动收敛) | |
| 35 | + 每次调整参数后需要重启backend (有时候重启backend后eval-web好像也挂了,如果有可以追查原因并修复) | |
| 36 | + ./restart.sh backend | |
| 37 | + 注意:请你调试脚本、进行一轮分析,最终要沉淀一套调参脚本,我下次可以重新跑(还是针对这组参数调参),能自动迭代(调整参数分布)、收集每组参数的指标、调优到最优的参数组合。 | |
| 38 | + | |
| 39 | + | |
| 40 | + | |
| 41 | +你给出的调参脚本: | |
| 42 | + | |
| 43 | + 长时间自动调参脚本已经配好,核心文件是: | |
| 44 | + | |
| 45 | + - 调参主程序:scripts/evaluation/tune_fusion.py | |
| 46 | + - 宽范围搜索空间:scripts/evaluation/tuning/coarse_rank_fusion_space.yaml | |
| 47 | + - 后台长跑启动脚本:scripts/evaluation/start_coarse_fusion_tuning_long.sh | |
| 48 | + - 续跑脚本:scripts/evaluation/resume_coarse_fusion_tuning_long.sh | |
| 49 | + - 使用说明:scripts/evaluation/tuning/README.md | |
| 50 | + | |
| 51 | + 这套方案不是笛卡尔积穷举,而是“种子实验 + 随机探索 + 基于历史最优分布的自适应采样”反复迭代。搜索范围已经放宽,新增了多组偏离 baseline 很远的 seed,也保留了 baseline 附近抖动,适合长时间慢慢收敛。 | |
| 52 | + | |
| 53 | + 直接启动一轮: | |
| 54 | + | |
| 55 | + ./scripts/evaluation/start_coarse_fusion_tuning_long.sh | |
| 56 | + | |
| 57 | + 如果想跑更久一点: | |
| 58 | + | |
| 59 | + MAX_EVALS=48 BATCH_SIZE=3 CANDIDATE_POOL_SIZE=512 RUN_NAME=coarse_fusion_long_001 \ | |
| 60 | + ./scripts/evaluation/start_coarse_fusion_tuning_long.sh | |
| 61 | + | |
| 62 | + 看日志: | |
| 63 | + | |
| 64 | + tail -f artifacts/search_evaluation/tuning_launches/<run_name>.log | |
| 65 | + | |
| 66 | + 看结果目录: | |
| 67 | + | |
| 68 | + ls artifacts/search_evaluation/tuning_runs/<run_name>/ | |
| 69 | + | |
| 70 | + 续跑: | |
| 71 | + | |
| 72 | + ./scripts/evaluation/resume_coarse_fusion_tuning_long.sh <run_name> | |
| 73 | + | |
| 74 | + 每轮会自动做这些事: | |
| 75 | + | |
| 76 | + - 生成一批候选 coarse_rank.fusion 参数 | |
| 77 | + - 写入配置并重启 backend | |
| 78 | + - 检查 eval-web,必要时自愈拉起 | |
| 79 | + - 跑评测并收集 Primary_Metric_Score | |
| 80 | + - 更新 trials.jsonl、leaderboard.csv、summary.md | |
| 81 | + - 基于历史结果调整下一轮采样分布 | |
| 82 | + | |
| 83 | + 你先跑一轮长任务。跑完后把 run_name 或 scripts/evaluation/tuning/README.md 里对应实验目录发我,我再基于结果做下一步分析和收缩搜索空间。 | |
| 84 | + | |
| 85 | + | |
| 86 | + | |
| 87 | + | |
| 88 | +› 我运行了:MAX_EVALS=400 BATCH_SIZE=3 CANDIDATE_POOL_SIZE=512 RUN_NAME=coarse_fusion_long_001 ./scripts/evaluation/start_coarse_fusion_tuning_long.sh | |
| 89 | + | |
| 90 | + run_name=coarse_fusion_long_001 | |
| 91 | + pid=2218620 | |
| 92 | + log=artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.log | |
| 93 | + pid_file=artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.pid | |
| 94 | + cmd_file=artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.cmd | |
| 95 | + run_dir=artifacts/search_evaluation/tuning_runs/coarse_fusion_long_001 | |
| 96 | + | |
| 97 | + tail -f artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.log | |
| 98 | + cat artifacts/search_evaluation/tuning_runs/coarse_fusion_long_001/leaderboard.csv | |
| 99 | + | |
| 100 | + | |
| 101 | +• 我先看这次长跑的日志和当前排行榜,确认任务是否正常推进、是否有异常中断,以及目前最优参数落在哪个方向。 | |
| 102 | + | |
| 103 | +• Explored | |
| 104 | + └ Read coarse_fusion_long_001.log, leaderboard.csv | |
| 105 | + List coarse_fusion_long_001 | |
| 106 | + Read summary.md | |
| 107 | + | |
| 108 | +─── | |
| 109 | + | |
| 110 | +• 结果已经足够看出明显趋势了。我再补一层聚合分析,看看 top 区间里哪些参数方向反复出现,避免只盯着单个最优点。 | |
| 111 | + | |
| 112 | +• Ran ps -p 2218620 -o pid=,etimes=,cmd= | |
| 113 | + └ (no output) | |
| 114 | + | |
| 115 | +• Ran python - <<'PY' | |
| 116 | + │ import csv, statistics, math | |
| 117 | + │ from pathlib import Path | |
| 118 | + │ … +15 lines | |
| 119 | + └ rows 385 | |
| 120 | + | |
| 121 | + … +61 lines (ctrl + t to view transcript) | |
| 122 | + knn_image_bias: mean=0.5631 median=0.2000 min=0.0010 max=4.0000 | |
| 123 | + knn_image_exponent: mean=4.6316 median=5.8680 min=0.0000 max=6.0000 | |
| 124 | + | |
| 125 | +• Ran python - <<'PY' | |
| 126 | + │ import csv | |
| 127 | + │ from pathlib import Path | |
| 128 | + │ … +19 lines | |
| 129 | + └ baseline_score 0.637642 | |
| 130 | + | |
| 131 | + … +30 lines (ctrl + t to view transcript) | |
| 132 | + 10 bo_234 0.641241 {'es_bias': '7.214', 'es_exponent': '0.2025', 'text_bias': '4.0', 'text_exponent': '1.584', 'text_translation_weight': '1.4441', 'knn_text_weight': '0.1', 'knn_image_weight': '5.6232', 'knn_tie_breaker': | |
| 133 | + '0.021', 'knn_bias': '0.0019', 'knn_exponent': '11.8477', 'knn_text_bias': '2.3125', 'knn_text_exponent': '1.1547', 'knn_image_bias': '0.9641', 'knn_image_exponent': '5.8671'} | |
| 134 | + | |
| 135 | +这一次因为外部原因(磁盘满)终止了,以上是最好的一组参数。 | |
| 136 | + | ... | ... |
scripts/evaluation/resume_coarse_fusion_tuning_long.sh
0 → 100755
| ... | ... | @@ -0,0 +1,76 @@ |
| 1 | +#!/bin/bash | |
| 2 | + | |
| 3 | +set -euo pipefail | |
| 4 | + | |
| 5 | +if [ "$#" -lt 1 ]; then | |
| 6 | + echo "usage: $0 <run_dir_or_name> [extra tune_fusion args...]" >&2 | |
| 7 | + exit 1 | |
| 8 | +fi | |
| 9 | + | |
| 10 | +cd "$(dirname "$0")/../.." | |
| 11 | +source ./activate.sh | |
| 12 | + | |
| 13 | +TARGET="$1" | |
| 14 | +shift | |
| 15 | + | |
| 16 | +if [ -d "${TARGET}" ]; then | |
| 17 | + RUN_DIR="${TARGET}" | |
| 18 | + RUN_NAME="$(basename "${RUN_DIR}")" | |
| 19 | +else | |
| 20 | + RUN_NAME="${TARGET}" | |
| 21 | + RUN_DIR="artifacts/search_evaluation/tuning_runs/${RUN_NAME}" | |
| 22 | +fi | |
| 23 | + | |
| 24 | +if [ ! -d "${RUN_DIR}" ]; then | |
| 25 | + echo "run dir not found: ${RUN_DIR}" >&2 | |
| 26 | + exit 1 | |
| 27 | +fi | |
| 28 | + | |
| 29 | +MAX_EVALS="${MAX_EVALS:-36}" | |
| 30 | +BATCH_SIZE="${BATCH_SIZE:-3}" | |
| 31 | +CANDIDATE_POOL_SIZE="${CANDIDATE_POOL_SIZE:-512}" | |
| 32 | + | |
| 33 | +LAUNCH_DIR="artifacts/search_evaluation/tuning_launches" | |
| 34 | +mkdir -p "${LAUNCH_DIR}" | |
| 35 | +LOG_PATH="${LAUNCH_DIR}/${RUN_NAME}.resume.log" | |
| 36 | +PID_PATH="${LAUNCH_DIR}/${RUN_NAME}.resume.pid" | |
| 37 | +CMD_PATH="${LAUNCH_DIR}/${RUN_NAME}.resume.cmd" | |
| 38 | + | |
| 39 | +CMD=( | |
| 40 | + python | |
| 41 | + scripts/evaluation/tune_fusion.py | |
| 42 | + --mode optimize | |
| 43 | + --resume-run "${RUN_DIR}" | |
| 44 | + --search-space "${RUN_DIR}/search_space.yaml" | |
| 45 | + --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md | |
| 46 | + --tenant-id 163 | |
| 47 | + --queries-file scripts/evaluation/queries/queries.txt | |
| 48 | + --top-k 100 | |
| 49 | + --language en | |
| 50 | + --search-base-url http://127.0.0.1:6002 | |
| 51 | + --eval-web-base-url http://127.0.0.1:6010 | |
| 52 | + --max-evals "${MAX_EVALS}" | |
| 53 | + --batch-size "${BATCH_SIZE}" | |
| 54 | + --candidate-pool-size "${CANDIDATE_POOL_SIZE}" | |
| 55 | +) | |
| 56 | + | |
| 57 | +if [ "$#" -gt 0 ]; then | |
| 58 | + CMD+=("$@") | |
| 59 | +fi | |
| 60 | + | |
| 61 | +printf '%q ' "${CMD[@]}" > "${CMD_PATH}" | |
| 62 | +printf '\n' >> "${CMD_PATH}" | |
| 63 | + | |
| 64 | +nohup "${CMD[@]}" > "${LOG_PATH}" 2>&1 & | |
| 65 | +PID=$! | |
| 66 | +echo "${PID}" > "${PID_PATH}" | |
| 67 | + | |
| 68 | +echo "run_name=${RUN_NAME}" | |
| 69 | +echo "pid=${PID}" | |
| 70 | +echo "log=${LOG_PATH}" | |
| 71 | +echo "pid_file=${PID_PATH}" | |
| 72 | +echo "cmd_file=${CMD_PATH}" | |
| 73 | +echo "run_dir=${RUN_DIR}" | |
| 74 | +echo | |
| 75 | +echo "tail -f ${LOG_PATH}" | |
| 76 | +echo "cat ${RUN_DIR}/leaderboard.csv" | ... | ... |
| ... | ... | @@ -0,0 +1,18 @@ |
| 1 | +#!/bin/bash | |
| 2 | + | |
| 3 | +set -euo pipefail | |
| 4 | + | |
| 5 | +cd "$(dirname "$0")/../.." | |
| 6 | +source ./activate.sh | |
| 7 | + | |
| 8 | +python scripts/evaluation/tune_fusion.py \ | |
| 9 | + --mode optimize \ | |
| 10 | + --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml \ | |
| 11 | + --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md \ | |
| 12 | + --tenant-id 163 \ | |
| 13 | + --queries-file scripts/evaluation/queries/queries.txt \ | |
| 14 | + --top-k 100 \ | |
| 15 | + --language en \ | |
| 16 | + --search-base-url http://127.0.0.1:6002 \ | |
| 17 | + --eval-web-base-url http://127.0.0.1:6010 \ | |
| 18 | + "$@" | ... | ... |
scripts/evaluation/start_coarse_fusion_tuning_long.sh
0 → 100755
| ... | ... | @@ -0,0 +1,58 @@ |
| 1 | +#!/bin/bash | |
| 2 | + | |
| 3 | +set -euo pipefail | |
| 4 | + | |
| 5 | +cd "$(dirname "$0")/../.." | |
| 6 | +source ./activate.sh | |
| 7 | + | |
| 8 | +RUN_NAME="${RUN_NAME:-coarse_fusion_long_$(date -u +%Y%m%dT%H%M%SZ)}" | |
| 9 | +MAX_EVALS="${MAX_EVALS:-36}" | |
| 10 | +BATCH_SIZE="${BATCH_SIZE:-3}" | |
| 11 | +CANDIDATE_POOL_SIZE="${CANDIDATE_POOL_SIZE:-512}" | |
| 12 | +RANDOM_SEED="${RANDOM_SEED:-20260416}" | |
| 13 | + | |
| 14 | +LAUNCH_DIR="artifacts/search_evaluation/tuning_launches" | |
| 15 | +mkdir -p "${LAUNCH_DIR}" | |
| 16 | +LOG_PATH="${LAUNCH_DIR}/${RUN_NAME}.log" | |
| 17 | +PID_PATH="${LAUNCH_DIR}/${RUN_NAME}.pid" | |
| 18 | +CMD_PATH="${LAUNCH_DIR}/${RUN_NAME}.cmd" | |
| 19 | + | |
| 20 | +CMD=( | |
| 21 | + python | |
| 22 | + scripts/evaluation/tune_fusion.py | |
| 23 | + --mode optimize | |
| 24 | + --run-name "${RUN_NAME}" | |
| 25 | + --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml | |
| 26 | + --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md | |
| 27 | + --tenant-id 163 | |
| 28 | + --queries-file scripts/evaluation/queries/queries.txt | |
| 29 | + --top-k 100 | |
| 30 | + --language en | |
| 31 | + --search-base-url http://127.0.0.1:6002 | |
| 32 | + --eval-web-base-url http://127.0.0.1:6010 | |
| 33 | + --max-evals "${MAX_EVALS}" | |
| 34 | + --batch-size "${BATCH_SIZE}" | |
| 35 | + --candidate-pool-size "${CANDIDATE_POOL_SIZE}" | |
| 36 | + --random-seed "${RANDOM_SEED}" | |
| 37 | +) | |
| 38 | + | |
| 39 | +if [ "$#" -gt 0 ]; then | |
| 40 | + CMD+=("$@") | |
| 41 | +fi | |
| 42 | + | |
| 43 | +printf '%q ' "${CMD[@]}" > "${CMD_PATH}" | |
| 44 | +printf '\n' >> "${CMD_PATH}" | |
| 45 | + | |
| 46 | +nohup "${CMD[@]}" > "${LOG_PATH}" 2>&1 & | |
| 47 | +PID=$! | |
| 48 | +echo "${PID}" > "${PID_PATH}" | |
| 49 | + | |
| 50 | +echo "run_name=${RUN_NAME}" | |
| 51 | +echo "pid=${PID}" | |
| 52 | +echo "log=${LOG_PATH}" | |
| 53 | +echo "pid_file=${PID_PATH}" | |
| 54 | +echo "cmd_file=${CMD_PATH}" | |
| 55 | +echo "run_dir=artifacts/search_evaluation/tuning_runs/${RUN_NAME}" | |
| 56 | +echo | |
| 57 | +echo "tail -f ${LOG_PATH}" | |
| 58 | +echo "cat artifacts/search_evaluation/tuning_runs/${RUN_NAME}/leaderboard.csv" | ... | ... |
scripts/evaluation/tune_fusion.py
| ... | ... | @@ -4,23 +4,37 @@ from __future__ import annotations |
| 4 | 4 | |
| 5 | 5 | import argparse |
| 6 | 6 | import copy |
| 7 | +import csv | |
| 7 | 8 | import json |
| 9 | +import math | |
| 10 | +import random | |
| 8 | 11 | import re |
| 12 | +import shutil | |
| 9 | 13 | import subprocess |
| 10 | 14 | import sys |
| 11 | 15 | import time |
| 12 | 16 | from dataclasses import dataclass |
| 13 | 17 | from pathlib import Path |
| 14 | -from typing import Any, Dict, List | |
| 18 | +from typing import Any, Dict, List, Sequence | |
| 15 | 19 | |
| 20 | +import numpy as np | |
| 16 | 21 | import requests |
| 17 | 22 | import yaml |
| 18 | 23 | |
| 24 | +try: | |
| 25 | + from sklearn.gaussian_process import GaussianProcessRegressor | |
| 26 | + from sklearn.gaussian_process.kernels import ConstantKernel, Matern, WhiteKernel | |
| 27 | +except Exception: # noqa: BLE001 | |
| 28 | + GaussianProcessRegressor = None # type: ignore[assignment] | |
| 29 | + ConstantKernel = None # type: ignore[assignment] | |
| 30 | + Matern = None # type: ignore[assignment] | |
| 31 | + WhiteKernel = None # type: ignore[assignment] | |
| 32 | + | |
| 19 | 33 | PROJECT_ROOT = Path(__file__).resolve().parents[2] |
| 20 | 34 | if str(PROJECT_ROOT) not in sys.path: |
| 21 | 35 | sys.path.insert(0, str(PROJECT_ROOT)) |
| 22 | 36 | |
| 23 | -from scripts.evaluation.eval_framework import ( | |
| 37 | +from scripts.evaluation.eval_framework import ( # noqa: E402 | |
| 24 | 38 | DEFAULT_ARTIFACT_ROOT, |
| 25 | 39 | DEFAULT_QUERY_FILE, |
| 26 | 40 | ensure_dir, |
| ... | ... | @@ -30,6 +44,7 @@ from scripts.evaluation.eval_framework import ( |
| 30 | 44 | |
| 31 | 45 | |
| 32 | 46 | CONFIG_PATH = PROJECT_ROOT / "config" / "config.yaml" |
| 47 | +LOG_DIR = PROJECT_ROOT / "logs" | |
| 33 | 48 | |
| 34 | 49 | |
| 35 | 50 | @dataclass |
| ... | ... | @@ -39,6 +54,108 @@ class ExperimentSpec: |
| 39 | 54 | params: Dict[str, Any] |
| 40 | 55 | |
| 41 | 56 | |
| 57 | +@dataclass | |
| 58 | +class ParameterSpec: | |
| 59 | + name: str | |
| 60 | + lower: float | |
| 61 | + upper: float | |
| 62 | + scale: str = "linear" | |
| 63 | + round_digits: int = 6 | |
| 64 | + | |
| 65 | + def __post_init__(self) -> None: | |
| 66 | + if self.lower >= self.upper: | |
| 67 | + raise ValueError(f"invalid bounds for {self.name}: {self.lower} >= {self.upper}") | |
| 68 | + if self.scale not in {"linear", "log"}: | |
| 69 | + raise ValueError(f"unsupported scale={self.scale!r} for {self.name}") | |
| 70 | + if self.scale == "log" and (self.lower <= 0 or self.upper <= 0): | |
| 71 | + raise ValueError(f"log-scaled parameter {self.name} must have positive bounds") | |
| 72 | + | |
| 73 | + @property | |
| 74 | + def transformed_lower(self) -> float: | |
| 75 | + return math.log10(self.lower) if self.scale == "log" else self.lower | |
| 76 | + | |
| 77 | + @property | |
| 78 | + def transformed_upper(self) -> float: | |
| 79 | + return math.log10(self.upper) if self.scale == "log" else self.upper | |
| 80 | + | |
| 81 | + @property | |
| 82 | + def transformed_span(self) -> float: | |
| 83 | + return self.transformed_upper - self.transformed_lower | |
| 84 | + | |
| 85 | + def transform(self, value: float) -> float: | |
| 86 | + clipped = min(max(float(value), self.lower), self.upper) | |
| 87 | + return math.log10(clipped) if self.scale == "log" else clipped | |
| 88 | + | |
| 89 | + def inverse_transform(self, value: float) -> float: | |
| 90 | + raw = (10 ** value) if self.scale == "log" else value | |
| 91 | + raw = min(max(float(raw), self.lower), self.upper) | |
| 92 | + return round(raw, self.round_digits) | |
| 93 | + | |
| 94 | + def sample_uniform(self, rng: random.Random) -> float: | |
| 95 | + draw = rng.uniform(self.transformed_lower, self.transformed_upper) | |
| 96 | + return self.inverse_transform(draw) | |
| 97 | + | |
| 98 | + | |
| 99 | +@dataclass | |
| 100 | +class SearchSpace: | |
| 101 | + target_path: str | |
| 102 | + baseline: Dict[str, float] | |
| 103 | + parameters: List[ParameterSpec] | |
| 104 | + seed_experiments: List[ExperimentSpec] | |
| 105 | + init_random: int = 6 | |
| 106 | + candidate_pool_size: int = 256 | |
| 107 | + explore_probability: float = 0.25 | |
| 108 | + local_jitter_probability: float = 0.45 | |
| 109 | + elite_fraction: float = 0.35 | |
| 110 | + min_normalized_distance: float = 0.14 | |
| 111 | + | |
| 112 | + @property | |
| 113 | + def parameter_names(self) -> List[str]: | |
| 114 | + return [item.name for item in self.parameters] | |
| 115 | + | |
| 116 | + def fill_params(self, params: Dict[str, Any]) -> Dict[str, float]: | |
| 117 | + merged = {name: float(self.baseline[name]) for name in self.parameter_names} | |
| 118 | + for name, value in params.items(): | |
| 119 | + if name not in merged: | |
| 120 | + raise KeyError(f"unknown parameter in search space: {name}") | |
| 121 | + merged[name] = float(value) | |
| 122 | + return { | |
| 123 | + spec.name: spec.inverse_transform(spec.transform(float(merged[spec.name]))) | |
| 124 | + for spec in self.parameters | |
| 125 | + } | |
| 126 | + | |
| 127 | + def sample_random(self, rng: random.Random) -> Dict[str, float]: | |
| 128 | + return {spec.name: spec.sample_uniform(rng) for spec in self.parameters} | |
| 129 | + | |
| 130 | + def vectorize(self, params: Dict[str, Any]) -> np.ndarray: | |
| 131 | + merged = self.fill_params(params) | |
| 132 | + return np.array([spec.transform(float(merged[spec.name])) for spec in self.parameters], dtype=float) | |
| 133 | + | |
| 134 | + def from_vector(self, vector: Sequence[float]) -> Dict[str, float]: | |
| 135 | + return { | |
| 136 | + spec.name: spec.inverse_transform(float(vector[idx])) | |
| 137 | + for idx, spec in enumerate(self.parameters) | |
| 138 | + } | |
| 139 | + | |
| 140 | + def normalized_vector(self, params: Dict[str, Any]) -> np.ndarray: | |
| 141 | + vector = self.vectorize(params) | |
| 142 | + parts: List[float] = [] | |
| 143 | + for idx, spec in enumerate(self.parameters): | |
| 144 | + parts.append((vector[idx] - spec.transformed_lower) / max(spec.transformed_span, 1e-9)) | |
| 145 | + return np.array(parts, dtype=float) | |
| 146 | + | |
| 147 | + def canonical_key(self, params: Dict[str, Any]) -> str: | |
| 148 | + return json.dumps(self.fill_params(params), ensure_ascii=False, sort_keys=True) | |
| 149 | + | |
| 150 | + | |
| 151 | +@dataclass | |
| 152 | +class CandidateProposal: | |
| 153 | + name: str | |
| 154 | + description: str | |
| 155 | + params: Dict[str, float] | |
| 156 | + source: str | |
| 157 | + | |
| 158 | + | |
| 42 | 159 | def load_yaml(path: Path) -> Dict[str, Any]: |
| 43 | 160 | return yaml.safe_load(path.read_text(encoding="utf-8")) |
| 44 | 161 | |
| ... | ... | @@ -50,6 +167,13 @@ def write_yaml(path: Path, payload: Dict[str, Any]) -> None: |
| 50 | 167 | ) |
| 51 | 168 | |
| 52 | 169 | |
| 170 | +def get_nested_value(payload: Dict[str, Any], dotted_path: str) -> Any: | |
| 171 | + current: Any = payload | |
| 172 | + for part in dotted_path.split("."): | |
| 173 | + current = current[part] | |
| 174 | + return current | |
| 175 | + | |
| 176 | + | |
| 53 | 177 | def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> None: |
| 54 | 178 | current = payload |
| 55 | 179 | parts = dotted_path.split(".") |
| ... | ... | @@ -58,16 +182,115 @@ def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> N |
| 58 | 182 | current[parts[-1]] = value |
| 59 | 183 | |
| 60 | 184 | |
| 61 | -def apply_params(base_config: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]: | |
| 185 | +def apply_target_params(base_config: Dict[str, Any], target_path: str, params: Dict[str, Any]) -> Dict[str, Any]: | |
| 62 | 186 | candidate = copy.deepcopy(base_config) |
| 63 | - for dotted_path, value in params.items(): | |
| 64 | - set_nested_value(candidate, dotted_path, value) | |
| 187 | + for key, value in params.items(): | |
| 188 | + set_nested_value(candidate, f"{target_path}.{key}", value) | |
| 65 | 189 | return candidate |
| 66 | 190 | |
| 67 | 191 | |
| 192 | +def read_queries(path: Path) -> List[str]: | |
| 193 | + return [ | |
| 194 | + line.strip() | |
| 195 | + for line in path.read_text(encoding="utf-8").splitlines() | |
| 196 | + if line.strip() and not line.strip().startswith("#") | |
| 197 | + ] | |
| 198 | + | |
| 199 | + | |
| 200 | +def run_restart(targets: Sequence[str]) -> None: | |
| 201 | + cmd = ["./restart.sh", *targets] | |
| 202 | + subprocess.run(cmd, cwd=PROJECT_ROOT, check=True, timeout=900) | |
| 203 | + | |
| 204 | + | |
| 205 | +def bytes_to_gib(value: int) -> float: | |
| 206 | + return float(value) / float(1024 ** 3) | |
| 207 | + | |
| 208 | + | |
| 209 | +def get_free_disk_bytes(path: Path) -> int: | |
| 210 | + return int(shutil.disk_usage(path).free) | |
| 211 | + | |
| 212 | + | |
| 213 | +def iter_log_cleanup_candidates() -> List[Path]: | |
| 214 | + if not LOG_DIR.is_dir(): | |
| 215 | + return [] | |
| 216 | + items: List[Path] = [] | |
| 217 | + seen: set[str] = set() | |
| 218 | + for path in LOG_DIR.rglob("*"): | |
| 219 | + try: | |
| 220 | + if not path.is_file(): | |
| 221 | + continue | |
| 222 | + resolved = path.resolve() | |
| 223 | + key = str(resolved) | |
| 224 | + if key in seen: | |
| 225 | + continue | |
| 226 | + seen.add(key) | |
| 227 | + items.append(resolved) | |
| 228 | + except FileNotFoundError: | |
| 229 | + continue | |
| 230 | + items.sort(key=lambda item: item.stat().st_size if item.exists() else 0, reverse=True) | |
| 231 | + return items | |
| 232 | + | |
| 233 | + | |
| 234 | +def truncate_file(path: Path) -> int: | |
| 235 | + if not path.exists() or not path.is_file(): | |
| 236 | + return 0 | |
| 237 | + size = int(path.stat().st_size) | |
| 238 | + if size <= 0: | |
| 239 | + return 0 | |
| 240 | + with path.open("w", encoding="utf-8"): | |
| 241 | + pass | |
| 242 | + return size | |
| 243 | + | |
| 244 | + | |
| 245 | +def ensure_disk_headroom( | |
| 246 | + *, | |
| 247 | + min_free_gb: float, | |
| 248 | + auto_truncate_logs: bool, | |
| 249 | + context: str, | |
| 250 | +) -> None: | |
| 251 | + required_bytes = int(min_free_gb * (1024 ** 3)) | |
| 252 | + free_bytes = get_free_disk_bytes(PROJECT_ROOT) | |
| 253 | + if free_bytes >= required_bytes: | |
| 254 | + return | |
| 255 | + | |
| 256 | + print( | |
| 257 | + f"[disk] low free space before {context}: " | |
| 258 | + f"free={bytes_to_gib(free_bytes):.2f}GiB required={min_free_gb:.2f}GiB" | |
| 259 | + ) | |
| 260 | + if not auto_truncate_logs: | |
| 261 | + raise RuntimeError( | |
| 262 | + f"insufficient disk headroom before {context}: " | |
| 263 | + f"free={bytes_to_gib(free_bytes):.2f}GiB required={min_free_gb:.2f}GiB" | |
| 264 | + ) | |
| 265 | + | |
| 266 | + reclaimed_bytes = 0 | |
| 267 | + for candidate in iter_log_cleanup_candidates(): | |
| 268 | + try: | |
| 269 | + reclaimed = truncate_file(candidate) | |
| 270 | + except Exception as exc: # noqa: BLE001 | |
| 271 | + print(f"[disk] skip truncate {candidate}: {exc}") | |
| 272 | + continue | |
| 273 | + if reclaimed <= 0: | |
| 274 | + continue | |
| 275 | + reclaimed_bytes += reclaimed | |
| 276 | + free_bytes = get_free_disk_bytes(PROJECT_ROOT) | |
| 277 | + print( | |
| 278 | + f"[disk] truncated {candidate} reclaimed={bytes_to_gib(reclaimed):.2f}GiB " | |
| 279 | + f"free_now={bytes_to_gib(free_bytes):.2f}GiB" | |
| 280 | + ) | |
| 281 | + if free_bytes >= required_bytes: | |
| 282 | + return | |
| 283 | + | |
| 284 | + raise RuntimeError( | |
| 285 | + f"insufficient disk headroom after log truncation before {context}: " | |
| 286 | + f"free={bytes_to_gib(free_bytes):.2f}GiB required={min_free_gb:.2f}GiB " | |
| 287 | + f"reclaimed={bytes_to_gib(reclaimed_bytes):.2f}GiB" | |
| 288 | + ) | |
| 289 | + | |
| 290 | + | |
| 68 | 291 | def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any]: |
| 69 | 292 | deadline = time.time() + timeout_sec |
| 70 | - last_error = None | |
| 293 | + last_error: Any = None | |
| 71 | 294 | while time.time() < deadline: |
| 72 | 295 | try: |
| 73 | 296 | response = requests.get(f"{base_url.rstrip('/')}/health", timeout=10) |
| ... | ... | @@ -82,16 +305,69 @@ def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any |
| 82 | 305 | raise RuntimeError(f"backend did not become healthy: {last_error}") |
| 83 | 306 | |
| 84 | 307 | |
| 85 | -def run_restart() -> None: | |
| 86 | - subprocess.run(["./restart.sh", "backend"], cwd=PROJECT_ROOT, check=True, timeout=600) | |
| 308 | +def wait_for_eval_web(base_url: str, timeout_sec: float = 90.0) -> Dict[str, Any]: | |
| 309 | + url = f"{base_url.rstrip('/')}/api/history" | |
| 310 | + deadline = time.time() + timeout_sec | |
| 311 | + last_error: Any = None | |
| 312 | + while time.time() < deadline: | |
| 313 | + try: | |
| 314 | + response = requests.get(url, timeout=10) | |
| 315 | + response.raise_for_status() | |
| 316 | + payload = response.json() | |
| 317 | + if isinstance(payload, dict) and "history" in payload: | |
| 318 | + return payload | |
| 319 | + last_error = payload | |
| 320 | + except Exception as exc: # noqa: BLE001 | |
| 321 | + last_error = str(exc) | |
| 322 | + time.sleep(2.0) | |
| 323 | + raise RuntimeError(f"eval-web did not become healthy: {last_error}") | |
| 324 | + | |
| 325 | + | |
| 326 | +def ensure_eval_web(eval_web_base_url: str) -> Dict[str, Any]: | |
| 327 | + try: | |
| 328 | + return wait_for_eval_web(eval_web_base_url, timeout_sec=20.0) | |
| 329 | + except Exception: # noqa: BLE001 | |
| 330 | + run_restart(["eval-web"]) | |
| 331 | + return wait_for_eval_web(eval_web_base_url, timeout_sec=120.0) | |
| 87 | 332 | |
| 88 | 333 | |
| 89 | -def read_queries(path: Path) -> List[str]: | |
| 90 | - return [ | |
| 91 | - line.strip() | |
| 92 | - for line in path.read_text(encoding="utf-8").splitlines() | |
| 93 | - if line.strip() and not line.strip().startswith("#") | |
| 94 | - ] | |
| 334 | +def verify_backend_config(base_url: str, target_path: str, expected: Dict[str, Any], tol: float = 1e-6) -> bool: | |
| 335 | + response = requests.get(f"{base_url.rstrip('/')}/admin/config", timeout=20) | |
| 336 | + response.raise_for_status() | |
| 337 | + payload = response.json() | |
| 338 | + candidate_paths = [target_path] | |
| 339 | + if not target_path.startswith("search."): | |
| 340 | + candidate_paths.append(f"search.{target_path}") | |
| 341 | + if target_path.startswith("search."): | |
| 342 | + candidate_paths.append(target_path[len("search."):]) | |
| 343 | + | |
| 344 | + live_block = None | |
| 345 | + for path in candidate_paths: | |
| 346 | + try: | |
| 347 | + maybe_block = get_nested_value(payload, path) | |
| 348 | + except Exception: # noqa: BLE001 | |
| 349 | + continue | |
| 350 | + if isinstance(maybe_block, dict): | |
| 351 | + live_block = maybe_block | |
| 352 | + break | |
| 353 | + if live_block is None: | |
| 354 | + raise RuntimeError( | |
| 355 | + f"unable to resolve backend config path {target_path!r}; " | |
| 356 | + f"tried={candidate_paths!r} top_level_keys={sorted(payload.keys())[:20]!r}" | |
| 357 | + ) | |
| 358 | + for key, expected_value in expected.items(): | |
| 359 | + live_value = live_block[key] | |
| 360 | + if isinstance(expected_value, (int, float)): | |
| 361 | + if abs(float(live_value) - float(expected_value)) > tol: | |
| 362 | + raise RuntimeError( | |
| 363 | + f"backend config mismatch for {target_path}.{key}: " | |
| 364 | + f"expected={expected_value} live={live_value}" | |
| 365 | + ) | |
| 366 | + elif live_value != expected_value: | |
| 367 | + raise RuntimeError( | |
| 368 | + f"backend config mismatch for {target_path}.{key}: expected={expected_value!r} live={live_value!r}" | |
| 369 | + ) | |
| 370 | + return True | |
| 95 | 371 | |
| 96 | 372 | |
| 97 | 373 | def run_batch_eval( |
| ... | ... | @@ -126,95 +402,580 @@ def run_batch_eval( |
| 126 | 402 | timeout=7200, |
| 127 | 403 | ) |
| 128 | 404 | output = (completed.stdout or "") + "\n" + (completed.stderr or "") |
| 129 | - match = re.search(r"batch_id=([A-Za-z0-9_]+)\s+aggregate_metrics=(\{.*\})", output) | |
| 130 | - if not match: | |
| 405 | + batch_ids = re.findall(r"batch_id=([A-Za-z0-9_]+)", output) | |
| 406 | + if not batch_ids: | |
| 131 | 407 | raise RuntimeError(f"failed to parse batch output: {output[-2000:]}") |
| 132 | - batch_id = match.group(1) | |
| 133 | - aggregate_metrics = json.loads(match.group(2).replace("'", '"')) | |
| 408 | + batch_id = batch_ids[-1] | |
| 409 | + batch_json_path = DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_id}.json" | |
| 410 | + if not batch_json_path.is_file(): | |
| 411 | + raise RuntimeError(f"batch json not found after eval: {batch_json_path}") | |
| 412 | + payload = json.loads(batch_json_path.read_text(encoding="utf-8")) | |
| 134 | 413 | return { |
| 135 | 414 | "batch_id": batch_id, |
| 136 | - "aggregate_metrics": aggregate_metrics, | |
| 415 | + "payload": payload, | |
| 137 | 416 | "raw_output": output, |
| 417 | + "batch_json_path": str(batch_json_path), | |
| 418 | + "batch_report_path": str(DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_id}.md"), | |
| 419 | + } | |
| 420 | + | |
| 421 | + | |
| 422 | +def resolve_batch_json_path(path_like: str) -> Path: | |
| 423 | + path = Path(path_like) | |
| 424 | + if not path.is_absolute(): | |
| 425 | + path = (PROJECT_ROOT / path).resolve() | |
| 426 | + if path.suffix == ".json": | |
| 427 | + return path | |
| 428 | + if path.suffix == ".md": | |
| 429 | + candidate = path.with_suffix(".json") | |
| 430 | + if candidate.is_file(): | |
| 431 | + return candidate | |
| 432 | + if path.is_file(): | |
| 433 | + return path | |
| 434 | + candidate = path.parent / f"{path.name}.json" | |
| 435 | + if candidate.is_file(): | |
| 436 | + return candidate | |
| 437 | + raise FileNotFoundError(f"cannot resolve batch json from: {path_like}") | |
| 438 | + | |
| 439 | + | |
| 440 | +def load_batch_payload(path_like: str) -> Dict[str, Any]: | |
| 441 | + path = resolve_batch_json_path(path_like) | |
| 442 | + return json.loads(path.read_text(encoding="utf-8")) | |
| 443 | + | |
| 444 | + | |
| 445 | +def load_experiments(path: Path) -> List[ExperimentSpec]: | |
| 446 | + payload = json.loads(path.read_text(encoding="utf-8")) | |
| 447 | + items = payload["experiments"] if isinstance(payload, dict) else payload | |
| 448 | + experiments: List[ExperimentSpec] = [] | |
| 449 | + for item in items: | |
| 450 | + experiments.append( | |
| 451 | + ExperimentSpec( | |
| 452 | + name=str(item["name"]), | |
| 453 | + description=str(item.get("description") or ""), | |
| 454 | + params=dict(item.get("params") or {}), | |
| 455 | + ) | |
| 456 | + ) | |
| 457 | + return experiments | |
| 458 | + | |
| 459 | + | |
| 460 | +def load_search_space(path: Path) -> SearchSpace: | |
| 461 | + payload = load_yaml(path) | |
| 462 | + parameters = [ | |
| 463 | + ParameterSpec( | |
| 464 | + name=str(name), | |
| 465 | + lower=float(spec["min"]), | |
| 466 | + upper=float(spec["max"]), | |
| 467 | + scale=str(spec.get("scale", "linear")), | |
| 468 | + round_digits=int(spec.get("round", 6)), | |
| 469 | + ) | |
| 470 | + for name, spec in dict(payload["parameters"]).items() | |
| 471 | + ] | |
| 472 | + baseline = {str(key): float(value) for key, value in dict(payload["baseline"]).items()} | |
| 473 | + seed_experiments = [ | |
| 474 | + ExperimentSpec( | |
| 475 | + name=str(item["name"]), | |
| 476 | + description=str(item.get("description") or ""), | |
| 477 | + params={str(k): float(v) for k, v in dict(item.get("params") or {}).items()}, | |
| 478 | + ) | |
| 479 | + for item in list(payload.get("seed_experiments") or []) | |
| 480 | + ] | |
| 481 | + optimizer = dict(payload.get("optimizer") or {}) | |
| 482 | + return SearchSpace( | |
| 483 | + target_path=str(payload["target_path"]), | |
| 484 | + baseline=baseline, | |
| 485 | + parameters=parameters, | |
| 486 | + seed_experiments=seed_experiments, | |
| 487 | + init_random=int(optimizer.get("init_random", 6)), | |
| 488 | + candidate_pool_size=int(optimizer.get("candidate_pool_size", 256)), | |
| 489 | + explore_probability=float(optimizer.get("explore_probability", 0.25)), | |
| 490 | + local_jitter_probability=float(optimizer.get("local_jitter_probability", 0.45)), | |
| 491 | + elite_fraction=float(optimizer.get("elite_fraction", 0.35)), | |
| 492 | + min_normalized_distance=float(optimizer.get("min_normalized_distance", 0.14)), | |
| 493 | + ) | |
| 494 | + | |
| 495 | + | |
| 496 | +def load_existing_trials(run_dir: Path) -> List[Dict[str, Any]]: | |
| 497 | + path = run_dir / "trials.jsonl" | |
| 498 | + if not path.is_file(): | |
| 499 | + return [] | |
| 500 | + trials: List[Dict[str, Any]] = [] | |
| 501 | + for line in path.read_text(encoding="utf-8").splitlines(): | |
| 502 | + line = line.strip() | |
| 503 | + if line: | |
| 504 | + trials.append(json.loads(line)) | |
| 505 | + return trials | |
| 506 | + | |
| 507 | + | |
| 508 | +def append_trial(run_dir: Path, trial: Dict[str, Any]) -> None: | |
| 509 | + path = run_dir / "trials.jsonl" | |
| 510 | + with path.open("a", encoding="utf-8") as handle: | |
| 511 | + handle.write(json.dumps(trial, ensure_ascii=False) + "\n") | |
| 512 | + | |
| 513 | + | |
| 514 | +def live_success_trials(trials: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]: | |
| 515 | + return [ | |
| 516 | + item | |
| 517 | + for item in trials | |
| 518 | + if item.get("status") == "ok" and not bool(item.get("is_seed")) | |
| 519 | + ] | |
| 520 | + | |
| 521 | + | |
| 522 | +def all_success_trials(trials: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]: | |
| 523 | + return [item for item in trials if item.get("status") == "ok"] | |
| 524 | + | |
| 525 | + | |
| 526 | +def score_of(trial: Dict[str, Any], metric: str) -> float: | |
| 527 | + return float((trial.get("aggregate_metrics") or {}).get(metric, trial.get("score", 0.0)) or 0.0) | |
| 528 | + | |
| 529 | + | |
| 530 | +def next_trial_name(trials: Sequence[Dict[str, Any]], prefix: str) -> str: | |
| 531 | + return f"{prefix}_{len(trials) + 1:03d}" | |
| 532 | + | |
| 533 | + | |
| 534 | +def normal_pdf(x: float) -> float: | |
| 535 | + return math.exp(-0.5 * x * x) / math.sqrt(2.0 * math.pi) | |
| 536 | + | |
| 537 | + | |
| 538 | +def normal_cdf(x: float) -> float: | |
| 539 | + return 0.5 * (1.0 + math.erf(x / math.sqrt(2.0))) | |
| 540 | + | |
| 541 | + | |
| 542 | +def expected_improvement(mu: float, sigma: float, best: float, xi: float = 0.002) -> float: | |
| 543 | + if sigma <= 1e-12: | |
| 544 | + return max(mu - best - xi, 0.0) | |
| 545 | + z = (mu - best - xi) / sigma | |
| 546 | + return (mu - best - xi) * normal_cdf(z) + sigma * normal_pdf(z) | |
| 547 | + | |
| 548 | + | |
| 549 | +def normalized_distance(space: SearchSpace, left: Dict[str, Any], right: Dict[str, Any]) -> float: | |
| 550 | + lv = space.normalized_vector(left) | |
| 551 | + rv = space.normalized_vector(right) | |
| 552 | + return float(np.linalg.norm(lv - rv) / math.sqrt(len(space.parameters))) | |
| 553 | + | |
| 554 | + | |
| 555 | +def fit_surrogate(space: SearchSpace, trials: Sequence[Dict[str, Any]], metric: str, seed: int) -> Any: | |
| 556 | + if GaussianProcessRegressor is None or len(trials) < 4: | |
| 557 | + return None | |
| 558 | + X = np.array([space.vectorize(item["params"]) for item in trials], dtype=float) | |
| 559 | + y = np.array([score_of(item, metric) for item in trials], dtype=float) | |
| 560 | + if len(np.unique(np.round(y, 8))) < 2: | |
| 561 | + return None | |
| 562 | + try: | |
| 563 | + kernel = ( | |
| 564 | + ConstantKernel(1.0, (1e-3, 1e3)) | |
| 565 | + * Matern(length_scale=np.ones(len(space.parameters)), length_scale_bounds=(1e-2, 1e2), nu=2.5) | |
| 566 | + + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-8, 1e-1)) | |
| 567 | + ) | |
| 568 | + gp = GaussianProcessRegressor( | |
| 569 | + kernel=kernel, | |
| 570 | + normalize_y=True, | |
| 571 | + n_restarts_optimizer=2, | |
| 572 | + random_state=seed, | |
| 573 | + ) | |
| 574 | + gp.fit(X, y) | |
| 575 | + return gp | |
| 576 | + except Exception: # noqa: BLE001 | |
| 577 | + return None | |
| 578 | + | |
| 579 | + | |
| 580 | +def build_sampling_spread(space: SearchSpace, elite_vectors: np.ndarray) -> np.ndarray: | |
| 581 | + spans = np.array([spec.transformed_span for spec in space.parameters], dtype=float) | |
| 582 | + floor = np.maximum(spans * 0.05, 0.015) | |
| 583 | + ceiling = np.maximum(spans * 0.5, floor) | |
| 584 | + if elite_vectors.shape[0] <= 1: | |
| 585 | + return np.minimum(np.maximum(spans * 0.18, floor), ceiling) | |
| 586 | + elite_std = elite_vectors.std(axis=0) | |
| 587 | + elite_range = elite_vectors.max(axis=0) - elite_vectors.min(axis=0) | |
| 588 | + spread = np.maximum(elite_std * 1.8, elite_range * 0.5) | |
| 589 | + return np.minimum(np.maximum(spread, floor), ceiling) | |
| 590 | + | |
| 591 | + | |
| 592 | +def sample_local_candidate( | |
| 593 | + space: SearchSpace, | |
| 594 | + rng: random.Random, | |
| 595 | + center: np.ndarray, | |
| 596 | + spread: np.ndarray, | |
| 597 | +) -> Dict[str, float]: | |
| 598 | + draw = [] | |
| 599 | + for idx, spec in enumerate(space.parameters): | |
| 600 | + value = rng.gauss(float(center[idx]), float(spread[idx])) | |
| 601 | + value = min(max(value, spec.transformed_lower), spec.transformed_upper) | |
| 602 | + draw.append(value) | |
| 603 | + return space.from_vector(draw) | |
| 604 | + | |
| 605 | + | |
| 606 | +def sample_crossover_candidate( | |
| 607 | + space: SearchSpace, | |
| 608 | + rng: random.Random, | |
| 609 | + left: np.ndarray, | |
| 610 | + right: np.ndarray, | |
| 611 | +) -> Dict[str, float]: | |
| 612 | + draw = [] | |
| 613 | + for idx, spec in enumerate(space.parameters): | |
| 614 | + mix = rng.random() | |
| 615 | + value = float(left[idx]) * mix + float(right[idx]) * (1.0 - mix) | |
| 616 | + jitter = spec.transformed_span * 0.04 | |
| 617 | + value += rng.uniform(-jitter, jitter) | |
| 618 | + value = min(max(value, spec.transformed_lower), spec.transformed_upper) | |
| 619 | + draw.append(value) | |
| 620 | + return space.from_vector(draw) | |
| 621 | + | |
| 622 | + | |
| 623 | +def propose_candidates( | |
| 624 | + *, | |
| 625 | + space: SearchSpace, | |
| 626 | + trials: Sequence[Dict[str, Any]], | |
| 627 | + metric: str, | |
| 628 | + batch_size: int, | |
| 629 | + rng: random.Random, | |
| 630 | + init_random: int, | |
| 631 | + candidate_pool_size: int, | |
| 632 | +) -> List[CandidateProposal]: | |
| 633 | + existing_keys = {space.canonical_key(item["params"]) for item in trials if item.get("params")} | |
| 634 | + proposals: List[CandidateProposal] = [] | |
| 635 | + | |
| 636 | + for seed in space.seed_experiments: | |
| 637 | + params = space.fill_params(seed.params) | |
| 638 | + key = space.canonical_key(params) | |
| 639 | + if key not in existing_keys: | |
| 640 | + proposals.append( | |
| 641 | + CandidateProposal( | |
| 642 | + name=seed.name, | |
| 643 | + description=seed.description, | |
| 644 | + params=params, | |
| 645 | + source="seed_experiment", | |
| 646 | + ) | |
| 647 | + ) | |
| 648 | + existing_keys.add(key) | |
| 649 | + if len(proposals) >= batch_size: | |
| 650 | + return proposals | |
| 651 | + | |
| 652 | + successes = live_success_trials(trials) | |
| 653 | + if len(successes) < init_random: | |
| 654 | + while len(proposals) < batch_size: | |
| 655 | + params = space.sample_random(rng) | |
| 656 | + key = space.canonical_key(params) | |
| 657 | + if key in existing_keys: | |
| 658 | + continue | |
| 659 | + proposals.append( | |
| 660 | + CandidateProposal( | |
| 661 | + name=f"random_{len(successes) + len(proposals) + 1:03d}", | |
| 662 | + description="global random exploration", | |
| 663 | + params=params, | |
| 664 | + source="random", | |
| 665 | + ) | |
| 666 | + ) | |
| 667 | + existing_keys.add(key) | |
| 668 | + return proposals | |
| 669 | + | |
| 670 | + ranked = sorted(successes, key=lambda item: score_of(item, metric), reverse=True) | |
| 671 | + elite_count = max(2, min(len(ranked), int(math.ceil(len(ranked) * space.elite_fraction)))) | |
| 672 | + elites = ranked[:elite_count] | |
| 673 | + elite_vectors = np.array([space.vectorize(item["params"]) for item in elites], dtype=float) | |
| 674 | + spread = build_sampling_spread(space, elite_vectors) | |
| 675 | + gp = fit_surrogate(space, successes, metric, seed=rng.randint(1, 10_000_000)) | |
| 676 | + best_score = score_of(ranked[0], metric) | |
| 677 | + best_vector = space.vectorize(ranked[0]["params"]) | |
| 678 | + | |
| 679 | + pool: List[Dict[str, Any]] = [] | |
| 680 | + pool_keys = set(existing_keys) | |
| 681 | + attempts = 0 | |
| 682 | + max_attempts = max(candidate_pool_size * 12, 200) | |
| 683 | + while len(pool) < candidate_pool_size and attempts < max_attempts: | |
| 684 | + attempts += 1 | |
| 685 | + roll = rng.random() | |
| 686 | + if roll < space.explore_probability: | |
| 687 | + params = space.sample_random(rng) | |
| 688 | + source = "global_explore" | |
| 689 | + elif roll < space.explore_probability + space.local_jitter_probability: | |
| 690 | + center = elite_vectors[rng.randrange(len(elite_vectors))] | |
| 691 | + params = sample_local_candidate(space, rng, center=center, spread=spread) | |
| 692 | + source = "elite_jitter" | |
| 693 | + else: | |
| 694 | + if len(elite_vectors) >= 2: | |
| 695 | + left = elite_vectors[rng.randrange(len(elite_vectors))] | |
| 696 | + right = elite_vectors[rng.randrange(len(elite_vectors))] | |
| 697 | + params = sample_crossover_candidate(space, rng, left=left, right=right) | |
| 698 | + source = "elite_crossover" | |
| 699 | + else: | |
| 700 | + params = sample_local_candidate(space, rng, center=best_vector, spread=spread) | |
| 701 | + source = "best_jitter" | |
| 702 | + key = space.canonical_key(params) | |
| 703 | + if key in pool_keys: | |
| 704 | + continue | |
| 705 | + pool_keys.add(key) | |
| 706 | + pool.append({"params": params, "source": source}) | |
| 707 | + | |
| 708 | + if not pool: | |
| 709 | + return proposals | |
| 710 | + | |
| 711 | + if gp is not None: | |
| 712 | + X = np.array([space.vectorize(item["params"]) for item in pool], dtype=float) | |
| 713 | + mu, sigma = gp.predict(X, return_std=True) | |
| 714 | + for idx, item in enumerate(pool): | |
| 715 | + item["acquisition"] = expected_improvement(float(mu[idx]), float(sigma[idx]), best_score) | |
| 716 | + item["uncertainty"] = float(sigma[idx]) | |
| 717 | + item["predicted_score"] = float(mu[idx]) | |
| 718 | + pool.sort( | |
| 719 | + key=lambda item: ( | |
| 720 | + float(item.get("acquisition") or 0.0), | |
| 721 | + float(item.get("uncertainty") or 0.0), | |
| 722 | + float(item.get("predicted_score") or 0.0), | |
| 723 | + ), | |
| 724 | + reverse=True, | |
| 725 | + ) | |
| 726 | + else: | |
| 727 | + rng.shuffle(pool) | |
| 728 | + | |
| 729 | + chosen_params = [item.params for item in proposals] | |
| 730 | + chosen: List[CandidateProposal] = [] | |
| 731 | + for item in pool: | |
| 732 | + params = item["params"] | |
| 733 | + if any(normalized_distance(space, params, other) < space.min_normalized_distance for other in chosen_params): | |
| 734 | + continue | |
| 735 | + chosen_params.append(params) | |
| 736 | + chosen.append( | |
| 737 | + CandidateProposal( | |
| 738 | + name=f"bo_{len(successes) + len(proposals) + len(chosen) + 1:03d}", | |
| 739 | + description=( | |
| 740 | + f"{item['source']} predicted={item.get('predicted_score', 'n/a')} " | |
| 741 | + f"ei={item.get('acquisition', 'n/a')}" | |
| 742 | + ), | |
| 743 | + params=params, | |
| 744 | + source=str(item["source"]), | |
| 745 | + ) | |
| 746 | + ) | |
| 747 | + if len(proposals) + len(chosen) >= batch_size: | |
| 748 | + break | |
| 749 | + | |
| 750 | + proposals.extend(chosen) | |
| 751 | + if len(proposals) < batch_size: | |
| 752 | + while len(proposals) < batch_size: | |
| 753 | + params = space.sample_random(rng) | |
| 754 | + key = space.canonical_key(params) | |
| 755 | + if key in existing_keys: | |
| 756 | + continue | |
| 757 | + proposals.append( | |
| 758 | + CandidateProposal( | |
| 759 | + name=f"fallback_{len(successes) + len(proposals) + 1:03d}", | |
| 760 | + description="fallback random exploration", | |
| 761 | + params=params, | |
| 762 | + source="fallback_random", | |
| 763 | + ) | |
| 764 | + ) | |
| 765 | + existing_keys.add(key) | |
| 766 | + return proposals | |
| 767 | + | |
| 768 | + | |
| 769 | +def compare_query_deltas( | |
| 770 | + baseline_payload: Dict[str, Any] | None, | |
| 771 | + best_payload: Dict[str, Any] | None, | |
| 772 | + metric: str, | |
| 773 | + limit: int = 8, | |
| 774 | +) -> Dict[str, List[Dict[str, Any]]]: | |
| 775 | + if not baseline_payload or not best_payload: | |
| 776 | + return {"gains": [], "losses": []} | |
| 777 | + base = { | |
| 778 | + str(item["query"]): float(item["metrics"].get(metric, 0.0)) | |
| 779 | + for item in baseline_payload.get("per_query") or [] | |
| 780 | + } | |
| 781 | + cur = { | |
| 782 | + str(item["query"]): float(item["metrics"].get(metric, 0.0)) | |
| 783 | + for item in best_payload.get("per_query") or [] | |
| 138 | 784 | } |
| 785 | + rows: List[Dict[str, Any]] = [] | |
| 786 | + for query, score in cur.items(): | |
| 787 | + if query not in base: | |
| 788 | + continue | |
| 789 | + rows.append( | |
| 790 | + { | |
| 791 | + "query": query, | |
| 792 | + "baseline": round(base[query], 6), | |
| 793 | + "current": round(score, 6), | |
| 794 | + "delta": round(score - base[query], 6), | |
| 795 | + } | |
| 796 | + ) | |
| 797 | + rows.sort(key=lambda item: item["delta"], reverse=True) | |
| 798 | + gains = [item for item in rows[:limit] if item["delta"] > 0] | |
| 799 | + losses = [item for item in rows[-limit:] if item["delta"] < 0] | |
| 800 | + losses.sort(key=lambda item: item["delta"]) | |
| 801 | + return {"gains": gains, "losses": losses} | |
| 802 | + | |
| 139 | 803 | |
| 804 | +def render_markdown( | |
| 805 | + *, | |
| 806 | + run_id: str, | |
| 807 | + created_at: str, | |
| 808 | + tenant_id: str, | |
| 809 | + query_count: int, | |
| 810 | + top_k: int, | |
| 811 | + metric: str, | |
| 812 | + trials: Sequence[Dict[str, Any]], | |
| 813 | +) -> str: | |
| 814 | + successes = sorted(all_success_trials(trials), key=lambda item: score_of(item, metric), reverse=True) | |
| 815 | + live_successes = sorted(live_success_trials(trials), key=lambda item: score_of(item, metric), reverse=True) | |
| 816 | + best = successes[0] if successes else None | |
| 817 | + baseline = next((item for item in successes if item.get("is_seed")), None) | |
| 818 | + best_payload = load_batch_payload(best["batch_json_path"]) if best and best.get("batch_json_path") else None | |
| 819 | + baseline_payload = ( | |
| 820 | + load_batch_payload(baseline["batch_json_path"]) | |
| 821 | + if baseline and baseline.get("batch_json_path") | |
| 822 | + else None | |
| 823 | + ) | |
| 824 | + delta_summary = compare_query_deltas(baseline_payload, best_payload, metric) if best else {"gains": [], "losses": []} | |
| 140 | 825 | |
| 141 | -def render_markdown(summary: Dict[str, Any]) -> str: | |
| 142 | 826 | lines = [ |
| 143 | 827 | "# Fusion Tuning Report", |
| 144 | 828 | "", |
| 145 | - f"- Created at: {summary['created_at']}", | |
| 146 | - f"- Tenant ID: {summary['tenant_id']}", | |
| 147 | - f"- Query count: {summary['query_count']}", | |
| 148 | - f"- Top K: {summary['top_k']}", | |
| 149 | - f"- Score metric: {summary['score_metric']}", | |
| 829 | + f"- Run ID: {run_id}", | |
| 830 | + f"- Created at: {created_at}", | |
| 831 | + f"- Tenant ID: {tenant_id}", | |
| 832 | + f"- Query count: {query_count}", | |
| 833 | + f"- Top K: {top_k}", | |
| 834 | + f"- Score metric: {metric}", | |
| 835 | + f"- Successful live evals: {len(live_successes)}", | |
| 150 | 836 | "", |
| 151 | - "## Experiments", | |
| 837 | + "## Leaderboard", | |
| 152 | 838 | "", |
| 153 | - "| Rank | Name | Score | Primary | NDCG@20 | ERR@10 | Strong@10 | Gain Recall@20 | Config |", | |
| 154 | - "|---|---|---:|---:|---:|---:|---:|---:|---|", | |
| 839 | + "| Rank | Name | Source | Score | Primary | NDCG@20 | ERR@10 | Gain Recall@20 | Batch |", | |
| 840 | + "|---|---|---|---:|---:|---:|---:|---:|---|", | |
| 155 | 841 | ] |
| 156 | - for idx, item in enumerate(summary["experiments"], start=1): | |
| 157 | - metrics = item["aggregate_metrics"] | |
| 842 | + for idx, item in enumerate(successes, start=1): | |
| 843 | + metrics = item.get("aggregate_metrics") or {} | |
| 158 | 844 | lines.append( |
| 159 | 845 | "| " |
| 160 | 846 | + " | ".join( |
| 161 | 847 | [ |
| 162 | 848 | str(idx), |
| 163 | - item["name"], | |
| 164 | - str(item["score"]), | |
| 849 | + str(item.get("name") or ""), | |
| 850 | + str(item.get("source") or ""), | |
| 851 | + f"{score_of(item, metric):.6f}", | |
| 165 | 852 | str(metrics.get("Primary_Metric_Score", "")), |
| 166 | 853 | str(metrics.get("NDCG@20", "")), |
| 167 | 854 | str(metrics.get("ERR@10", "")), |
| 168 | - str(metrics.get("Strong_Precision@10", "")), | |
| 169 | 855 | str(metrics.get("Gain_Recall@20", "")), |
| 170 | - item["config_snapshot_path"], | |
| 856 | + str(item.get("batch_id") or ""), | |
| 171 | 857 | ] |
| 172 | 858 | ) |
| 173 | 859 | + " |" |
| 174 | 860 | ) |
| 175 | - lines.extend(["", "## Details", ""]) | |
| 176 | - for item in summary["experiments"]: | |
| 177 | - lines.append(f"### {item['name']}") | |
| 178 | - lines.append("") | |
| 179 | - lines.append(f"- Description: {item['description']}") | |
| 180 | - lines.append(f"- Score: {item['score']}") | |
| 181 | - lines.append(f"- Params: `{json.dumps(item['params'], ensure_ascii=False, sort_keys=True)}`") | |
| 182 | - lines.append(f"- Batch report: {item['batch_report_path']}") | |
| 183 | - lines.append("") | |
| 184 | - return "\n".join(lines) | |
| 185 | 861 | |
| 862 | + if best: | |
| 863 | + lines.extend( | |
| 864 | + [ | |
| 865 | + "", | |
| 866 | + "## Best Params", | |
| 867 | + "", | |
| 868 | + f"- Name: {best['name']}", | |
| 869 | + f"- Source: {best['source']}", | |
| 870 | + f"- Score: {score_of(best, metric):.6f}", | |
| 871 | + f"- Params: `{json.dumps(best['params'], ensure_ascii=False, sort_keys=True)}`", | |
| 872 | + f"- Batch report: {best.get('batch_report_path') or ''}", | |
| 873 | + ] | |
| 874 | + ) | |
| 186 | 875 | |
| 187 | -def load_experiments(path: Path) -> List[ExperimentSpec]: | |
| 188 | - payload = json.loads(path.read_text(encoding="utf-8")) | |
| 189 | - items = payload["experiments"] if isinstance(payload, dict) else payload | |
| 190 | - experiments: List[ExperimentSpec] = [] | |
| 191 | - for item in items: | |
| 192 | - experiments.append( | |
| 193 | - ExperimentSpec( | |
| 194 | - name=str(item["name"]), | |
| 195 | - description=str(item.get("description") or ""), | |
| 196 | - params=dict(item.get("params") or {}), | |
| 197 | - ) | |
| 876 | + if delta_summary["gains"] or delta_summary["losses"]: | |
| 877 | + lines.extend(["", "## Best vs Baseline", ""]) | |
| 878 | + if delta_summary["gains"]: | |
| 879 | + lines.append("### Top Gains") | |
| 880 | + lines.append("") | |
| 881 | + for item in delta_summary["gains"]: | |
| 882 | + lines.append( | |
| 883 | + f"- {item['query']}: {item['baseline']:.6f} -> {item['current']:.6f} ({item['delta']:+.6f})" | |
| 884 | + ) | |
| 885 | + if delta_summary["losses"]: | |
| 886 | + lines.append("") | |
| 887 | + lines.append("### Top Losses") | |
| 888 | + lines.append("") | |
| 889 | + for item in delta_summary["losses"]: | |
| 890 | + lines.append( | |
| 891 | + f"- {item['query']}: {item['baseline']:.6f} -> {item['current']:.6f} ({item['delta']:+.6f})" | |
| 892 | + ) | |
| 893 | + | |
| 894 | + failures = [item for item in trials if item.get("status") != "ok"] | |
| 895 | + if failures: | |
| 896 | + lines.extend(["", "## Failures", ""]) | |
| 897 | + for item in failures: | |
| 898 | + lines.append(f"- {item.get('name')}: {item.get('error')}") | |
| 899 | + | |
| 900 | + return "\n".join(lines) + "\n" | |
| 901 | + | |
| 902 | + | |
| 903 | +def write_leaderboard_csv(run_dir: Path, metric: str, trials: Sequence[Dict[str, Any]], parameter_names: Sequence[str]) -> None: | |
| 904 | + path = run_dir / "leaderboard.csv" | |
| 905 | + successes = sorted(all_success_trials(trials), key=lambda item: score_of(item, metric), reverse=True) | |
| 906 | + with path.open("w", encoding="utf-8", newline="") as handle: | |
| 907 | + writer = csv.writer(handle) | |
| 908 | + writer.writerow( | |
| 909 | + [ | |
| 910 | + "rank", | |
| 911 | + "name", | |
| 912 | + "source", | |
| 913 | + "score", | |
| 914 | + "Primary_Metric_Score", | |
| 915 | + "NDCG@20", | |
| 916 | + "ERR@10", | |
| 917 | + "Gain_Recall@20", | |
| 918 | + "batch_id", | |
| 919 | + *parameter_names, | |
| 920 | + ] | |
| 198 | 921 | ) |
| 199 | - return experiments | |
| 922 | + for idx, item in enumerate(successes, start=1): | |
| 923 | + metrics = item.get("aggregate_metrics") or {} | |
| 924 | + row = [ | |
| 925 | + idx, | |
| 926 | + item.get("name") or "", | |
| 927 | + item.get("source") or "", | |
| 928 | + f"{score_of(item, metric):.6f}", | |
| 929 | + metrics.get("Primary_Metric_Score", ""), | |
| 930 | + metrics.get("NDCG@20", ""), | |
| 931 | + metrics.get("ERR@10", ""), | |
| 932 | + metrics.get("Gain_Recall@20", ""), | |
| 933 | + item.get("batch_id") or "", | |
| 934 | + ] | |
| 935 | + row.extend(item.get("params", {}).get(name, "") for name in parameter_names) | |
| 936 | + writer.writerow(row) | |
| 200 | 937 | |
| 201 | 938 | |
| 202 | -def build_parser() -> argparse.ArgumentParser: | |
| 203 | - parser = argparse.ArgumentParser(description="Run fusion tuning experiments against the live backend") | |
| 204 | - parser.add_argument("--tenant-id", default="163") | |
| 205 | - parser.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | |
| 206 | - parser.add_argument("--top-k", type=int, default=100) | |
| 207 | - parser.add_argument("--language", default="en") | |
| 208 | - parser.add_argument("--experiments-file", required=True) | |
| 209 | - parser.add_argument("--search-base-url", default="http://127.0.0.1:6002") | |
| 210 | - parser.add_argument("--score-metric", default="Primary_Metric_Score") | |
| 211 | - parser.add_argument("--apply-best", action="store_true") | |
| 212 | - parser.add_argument("--force-refresh-labels-first-pass", action="store_true") | |
| 213 | - return parser | |
| 939 | +def persist_run_summary( | |
| 940 | + *, | |
| 941 | + run_dir: Path, | |
| 942 | + run_id: str, | |
| 943 | + tenant_id: str, | |
| 944 | + query_count: int, | |
| 945 | + top_k: int, | |
| 946 | + metric: str, | |
| 947 | + trials: Sequence[Dict[str, Any]], | |
| 948 | + parameter_names: Sequence[str], | |
| 949 | +) -> None: | |
| 950 | + summary = { | |
| 951 | + "run_id": run_id, | |
| 952 | + "created_at": utc_now_iso(), | |
| 953 | + "tenant_id": tenant_id, | |
| 954 | + "query_count": query_count, | |
| 955 | + "top_k": top_k, | |
| 956 | + "score_metric": metric, | |
| 957 | + "trials": list(trials), | |
| 958 | + } | |
| 959 | + (run_dir / "summary.json").write_text( | |
| 960 | + json.dumps(summary, ensure_ascii=False, indent=2), | |
| 961 | + encoding="utf-8", | |
| 962 | + ) | |
| 963 | + (run_dir / "summary.md").write_text( | |
| 964 | + render_markdown( | |
| 965 | + run_id=run_id, | |
| 966 | + created_at=summary["created_at"], | |
| 967 | + tenant_id=tenant_id, | |
| 968 | + query_count=query_count, | |
| 969 | + top_k=top_k, | |
| 970 | + metric=metric, | |
| 971 | + trials=trials, | |
| 972 | + ), | |
| 973 | + encoding="utf-8", | |
| 974 | + ) | |
| 975 | + write_leaderboard_csv(run_dir, metric, trials, parameter_names) | |
| 214 | 976 | |
| 215 | 977 | |
| 216 | -def main() -> None: | |
| 217 | - args = build_parser().parse_args() | |
| 978 | +def run_experiment_mode(args: argparse.Namespace) -> None: | |
| 218 | 979 | queries_file = Path(args.queries_file) |
| 219 | 980 | queries = read_queries(queries_file) |
| 220 | 981 | base_config_text = CONFIG_PATH.read_text(encoding="utf-8") |
| ... | ... | @@ -222,19 +983,33 @@ def main() -> None: |
| 222 | 983 | experiments = load_experiments(Path(args.experiments_file)) |
| 223 | 984 | |
| 224 | 985 | tuning_dir = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs") |
| 225 | - run_id = f"tuning_{utc_timestamp()}" | |
| 986 | + run_id = args.run_name or f"tuning_{utc_timestamp()}" | |
| 226 | 987 | run_dir = ensure_dir(tuning_dir / run_id) |
| 227 | 988 | results: List[Dict[str, Any]] = [] |
| 228 | 989 | |
| 229 | 990 | try: |
| 230 | 991 | for experiment in experiments: |
| 231 | - candidate = apply_params(base_config, experiment.params) | |
| 992 | + params = dict(experiment.params) | |
| 993 | + target_path = args.target_path or "coarse_rank.fusion" | |
| 994 | + candidate = apply_target_params(base_config, target_path, params) | |
| 232 | 995 | write_yaml(CONFIG_PATH, candidate) |
| 233 | - candidate_config_path = run_dir / f"{experiment.name}_config.yaml" | |
| 996 | + candidate_config_path = ensure_dir(run_dir / "configs") / f"{experiment.name}_config.yaml" | |
| 234 | 997 | write_yaml(candidate_config_path, candidate) |
| 235 | 998 | |
| 236 | - run_restart() | |
| 999 | + ensure_disk_headroom( | |
| 1000 | + min_free_gb=args.min_free_gb, | |
| 1001 | + auto_truncate_logs=args.auto_truncate_logs, | |
| 1002 | + context=f"restart {experiment.name}", | |
| 1003 | + ) | |
| 1004 | + run_restart(args.restart_targets) | |
| 237 | 1005 | health = wait_for_backend(args.search_base_url) |
| 1006 | + if args.heal_eval_web: | |
| 1007 | + ensure_eval_web(args.eval_web_base_url) | |
| 1008 | + ensure_disk_headroom( | |
| 1009 | + min_free_gb=args.min_free_gb, | |
| 1010 | + auto_truncate_logs=args.auto_truncate_logs, | |
| 1011 | + context=f"batch eval {experiment.name}", | |
| 1012 | + ) | |
| 238 | 1013 | batch_result = run_batch_eval( |
| 239 | 1014 | tenant_id=args.tenant_id, |
| 240 | 1015 | queries_file=queries_file, |
| ... | ... | @@ -242,21 +1017,27 @@ def main() -> None: |
| 242 | 1017 | language=args.language, |
| 243 | 1018 | force_refresh_labels=bool(args.force_refresh_labels_first_pass and not results), |
| 244 | 1019 | ) |
| 245 | - aggregate_metrics = dict(batch_result["aggregate_metrics"]) | |
| 1020 | + ensure_disk_headroom( | |
| 1021 | + min_free_gb=args.min_free_gb, | |
| 1022 | + auto_truncate_logs=args.auto_truncate_logs, | |
| 1023 | + context=f"persist {experiment.name}", | |
| 1024 | + ) | |
| 1025 | + payload = batch_result["payload"] | |
| 1026 | + aggregate_metrics = dict(payload["aggregate_metrics"]) | |
| 246 | 1027 | results.append( |
| 247 | 1028 | { |
| 248 | 1029 | "name": experiment.name, |
| 249 | 1030 | "description": experiment.description, |
| 250 | - "params": experiment.params, | |
| 1031 | + "params": params, | |
| 251 | 1032 | "aggregate_metrics": aggregate_metrics, |
| 252 | 1033 | "score": float(aggregate_metrics.get(args.score_metric, 0.0)), |
| 253 | 1034 | "batch_id": batch_result["batch_id"], |
| 254 | - "batch_report_path": str( | |
| 255 | - DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_result['batch_id']}.md" | |
| 256 | - ), | |
| 257 | - "config_snapshot_path": str(candidate_config_path), | |
| 1035 | + "batch_json_path": batch_result["batch_json_path"], | |
| 1036 | + "batch_report_path": batch_result["batch_report_path"], | |
| 1037 | + "candidate_config_path": str(candidate_config_path), | |
| 258 | 1038 | "backend_health": health, |
| 259 | - "batch_stdout": batch_result["raw_output"], | |
| 1039 | + "status": "ok", | |
| 1040 | + "source": "experiments_file", | |
| 260 | 1041 | } |
| 261 | 1042 | ) |
| 262 | 1043 | print( |
| ... | ... | @@ -265,32 +1046,285 @@ def main() -> None: |
| 265 | 1046 | ) |
| 266 | 1047 | finally: |
| 267 | 1048 | if args.apply_best and results: |
| 268 | - best = max(results, key=lambda item: item["score"]) | |
| 269 | - best_config = apply_params(base_config, best["params"]) | |
| 1049 | + best = max(results, key=lambda item: score_of(item, args.score_metric)) | |
| 1050 | + best_config = apply_target_params(base_config, args.target_path or "coarse_rank.fusion", best["params"]) | |
| 270 | 1051 | write_yaml(CONFIG_PATH, best_config) |
| 271 | - run_restart() | |
| 1052 | + run_restart(args.restart_targets) | |
| 272 | 1053 | wait_for_backend(args.search_base_url) |
| 1054 | + if args.heal_eval_web: | |
| 1055 | + ensure_eval_web(args.eval_web_base_url) | |
| 273 | 1056 | else: |
| 274 | 1057 | CONFIG_PATH.write_text(base_config_text, encoding="utf-8") |
| 275 | - run_restart() | |
| 1058 | + run_restart(args.restart_targets) | |
| 276 | 1059 | wait_for_backend(args.search_base_url) |
| 1060 | + if args.heal_eval_web: | |
| 1061 | + ensure_eval_web(args.eval_web_base_url) | |
| 277 | 1062 | |
| 278 | - results.sort(key=lambda item: item["score"], reverse=True) | |
| 279 | - summary = { | |
| 280 | - "run_id": run_id, | |
| 281 | - "created_at": utc_now_iso(), | |
| 282 | - "tenant_id": args.tenant_id, | |
| 283 | - "query_count": len(queries), | |
| 284 | - "top_k": args.top_k, | |
| 285 | - "score_metric": args.score_metric, | |
| 286 | - "experiments": results, | |
| 287 | - } | |
| 288 | - summary_json_path = run_dir / "summary.json" | |
| 289 | - summary_md_path = run_dir / "summary.md" | |
| 290 | - summary_json_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") | |
| 291 | - summary_md_path.write_text(render_markdown(summary), encoding="utf-8") | |
| 292 | - print(f"[done] summary_json={summary_json_path}") | |
| 293 | - print(f"[done] summary_md={summary_md_path}") | |
| 1063 | + persist_run_summary( | |
| 1064 | + run_dir=run_dir, | |
| 1065 | + run_id=run_id, | |
| 1066 | + tenant_id=str(args.tenant_id), | |
| 1067 | + query_count=len(queries), | |
| 1068 | + top_k=args.top_k, | |
| 1069 | + metric=args.score_metric, | |
| 1070 | + trials=results, | |
| 1071 | + parameter_names=list(results[0]["params"].keys()) if results else [], | |
| 1072 | + ) | |
| 1073 | + print(f"[done] summary_json={run_dir / 'summary.json'}") | |
| 1074 | + print(f"[done] summary_md={run_dir / 'summary.md'}") | |
| 1075 | + | |
| 1076 | + | |
| 1077 | +def run_optimize_mode(args: argparse.Namespace) -> None: | |
| 1078 | + queries_file = Path(args.queries_file) | |
| 1079 | + queries = read_queries(queries_file) | |
| 1080 | + base_config_text = CONFIG_PATH.read_text(encoding="utf-8") | |
| 1081 | + base_config = load_yaml(CONFIG_PATH) | |
| 1082 | + search_space_path = Path(args.search_space) | |
| 1083 | + space = load_search_space(search_space_path) | |
| 1084 | + rng = random.Random(args.random_seed) | |
| 1085 | + | |
| 1086 | + tuning_dir = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs") | |
| 1087 | + run_dir = ( | |
| 1088 | + Path(args.resume_run).resolve() | |
| 1089 | + if args.resume_run | |
| 1090 | + else ensure_dir(tuning_dir / (args.run_name or f"coarse_fusion_bo_{utc_timestamp()}")) | |
| 1091 | + ) | |
| 1092 | + run_id = run_dir.name | |
| 1093 | + ensure_dir(run_dir / "configs") | |
| 1094 | + ensure_dir(run_dir / "logs") | |
| 1095 | + if not (run_dir / "search_space.yaml").exists(): | |
| 1096 | + (run_dir / "search_space.yaml").write_text(search_space_path.read_text(encoding="utf-8"), encoding="utf-8") | |
| 1097 | + | |
| 1098 | + trials = load_existing_trials(run_dir) | |
| 1099 | + if args.seed_report: | |
| 1100 | + baseline_params = space.fill_params(space.baseline) | |
| 1101 | + baseline_key = space.canonical_key(baseline_params) | |
| 1102 | + if baseline_key not in {space.canonical_key(item["params"]) for item in trials if item.get("params")}: | |
| 1103 | + payload = load_batch_payload(args.seed_report) | |
| 1104 | + trial = { | |
| 1105 | + "trial_id": next_trial_name(trials, "trial"), | |
| 1106 | + "name": "seed_baseline", | |
| 1107 | + "description": f"seeded from {args.seed_report}", | |
| 1108 | + "source": "seed_report", | |
| 1109 | + "is_seed": True, | |
| 1110 | + "status": "ok", | |
| 1111 | + "created_at": utc_now_iso(), | |
| 1112 | + "params": baseline_params, | |
| 1113 | + "score": float(payload["aggregate_metrics"].get(args.score_metric, 0.0)), | |
| 1114 | + "aggregate_metrics": dict(payload["aggregate_metrics"]), | |
| 1115 | + "batch_id": payload["batch_id"], | |
| 1116 | + "batch_json_path": str(resolve_batch_json_path(args.seed_report)), | |
| 1117 | + "batch_report_path": str(resolve_batch_json_path(args.seed_report).with_suffix(".md")), | |
| 1118 | + } | |
| 1119 | + append_trial(run_dir, trial) | |
| 1120 | + trials.append(trial) | |
| 1121 | + | |
| 1122 | + init_random = args.init_random if args.init_random is not None else space.init_random | |
| 1123 | + candidate_pool_size = args.candidate_pool_size if args.candidate_pool_size is not None else space.candidate_pool_size | |
| 1124 | + | |
| 1125 | + try: | |
| 1126 | + live_done = len(live_success_trials(trials)) | |
| 1127 | + while live_done < args.max_evals: | |
| 1128 | + remaining = args.max_evals - live_done | |
| 1129 | + current_batch_size = min(args.batch_size, remaining) | |
| 1130 | + proposals = propose_candidates( | |
| 1131 | + space=space, | |
| 1132 | + trials=trials, | |
| 1133 | + metric=args.score_metric, | |
| 1134 | + batch_size=current_batch_size, | |
| 1135 | + rng=rng, | |
| 1136 | + init_random=init_random, | |
| 1137 | + candidate_pool_size=candidate_pool_size, | |
| 1138 | + ) | |
| 1139 | + if not proposals: | |
| 1140 | + raise RuntimeError("optimizer failed to produce new candidate proposals") | |
| 1141 | + | |
| 1142 | + for proposal in proposals: | |
| 1143 | + force_refresh_labels = bool(args.force_refresh_labels_first_pass and live_done == 0 and not any(t.get("is_seed") for t in trials)) | |
| 1144 | + trial_id = next_trial_name(trials, "trial") | |
| 1145 | + candidate_config = apply_target_params(base_config, space.target_path, proposal.params) | |
| 1146 | + candidate_config_path = run_dir / "configs" / f"{trial_id}_{proposal.name}.yaml" | |
| 1147 | + trial_log_path = run_dir / "logs" / f"{trial_id}_{proposal.name}.log" | |
| 1148 | + write_yaml(CONFIG_PATH, candidate_config) | |
| 1149 | + write_yaml(candidate_config_path, candidate_config) | |
| 1150 | + print( | |
| 1151 | + f"[tune] start {proposal.name} source={proposal.source} " | |
| 1152 | + f"params={json.dumps(proposal.params, ensure_ascii=False, sort_keys=True)}" | |
| 1153 | + ) | |
| 1154 | + try: | |
| 1155 | + ensure_disk_headroom( | |
| 1156 | + min_free_gb=args.min_free_gb, | |
| 1157 | + auto_truncate_logs=args.auto_truncate_logs, | |
| 1158 | + context=f"restart {proposal.name}", | |
| 1159 | + ) | |
| 1160 | + run_restart(args.restart_targets) | |
| 1161 | + backend_health = wait_for_backend(args.search_base_url) | |
| 1162 | + verify_backend_config(args.search_base_url, space.target_path, proposal.params) | |
| 1163 | + if args.heal_eval_web: | |
| 1164 | + ensure_eval_web(args.eval_web_base_url) | |
| 1165 | + ensure_disk_headroom( | |
| 1166 | + min_free_gb=args.min_free_gb, | |
| 1167 | + auto_truncate_logs=args.auto_truncate_logs, | |
| 1168 | + context=f"batch eval {proposal.name}", | |
| 1169 | + ) | |
| 1170 | + batch_result = run_batch_eval( | |
| 1171 | + tenant_id=args.tenant_id, | |
| 1172 | + queries_file=queries_file, | |
| 1173 | + top_k=args.top_k, | |
| 1174 | + language=args.language, | |
| 1175 | + force_refresh_labels=force_refresh_labels, | |
| 1176 | + ) | |
| 1177 | + ensure_disk_headroom( | |
| 1178 | + min_free_gb=args.min_free_gb, | |
| 1179 | + auto_truncate_logs=args.auto_truncate_logs, | |
| 1180 | + context=f"persist {proposal.name}", | |
| 1181 | + ) | |
| 1182 | + payload = batch_result["payload"] | |
| 1183 | + trial_log_path.write_text(batch_result["raw_output"], encoding="utf-8") | |
| 1184 | + aggregate_metrics = dict(payload["aggregate_metrics"]) | |
| 1185 | + trial = { | |
| 1186 | + "trial_id": trial_id, | |
| 1187 | + "name": proposal.name, | |
| 1188 | + "description": proposal.description, | |
| 1189 | + "source": proposal.source, | |
| 1190 | + "is_seed": False, | |
| 1191 | + "status": "ok", | |
| 1192 | + "created_at": utc_now_iso(), | |
| 1193 | + "params": proposal.params, | |
| 1194 | + "score": float(aggregate_metrics.get(args.score_metric, 0.0)), | |
| 1195 | + "aggregate_metrics": aggregate_metrics, | |
| 1196 | + "batch_id": batch_result["batch_id"], | |
| 1197 | + "batch_json_path": batch_result["batch_json_path"], | |
| 1198 | + "batch_report_path": batch_result["batch_report_path"], | |
| 1199 | + "candidate_config_path": str(candidate_config_path), | |
| 1200 | + "trial_log_path": str(trial_log_path), | |
| 1201 | + "backend_health": backend_health, | |
| 1202 | + } | |
| 1203 | + print( | |
| 1204 | + f"[tune] done {proposal.name} " | |
| 1205 | + f"{args.score_metric}={trial['score']:.6f} " | |
| 1206 | + f"Primary={aggregate_metrics.get('Primary_Metric_Score')}" | |
| 1207 | + ) | |
| 1208 | + except Exception as exc: # noqa: BLE001 | |
| 1209 | + trial = { | |
| 1210 | + "trial_id": trial_id, | |
| 1211 | + "name": proposal.name, | |
| 1212 | + "description": proposal.description, | |
| 1213 | + "source": proposal.source, | |
| 1214 | + "is_seed": False, | |
| 1215 | + "status": "error", | |
| 1216 | + "created_at": utc_now_iso(), | |
| 1217 | + "params": proposal.params, | |
| 1218 | + "error": str(exc), | |
| 1219 | + "candidate_config_path": str(candidate_config_path), | |
| 1220 | + "trial_log_path": str(trial_log_path), | |
| 1221 | + } | |
| 1222 | + print(f"[tune] error {proposal.name}: {exc}") | |
| 1223 | + ensure_disk_headroom( | |
| 1224 | + min_free_gb=args.min_free_gb, | |
| 1225 | + auto_truncate_logs=args.auto_truncate_logs, | |
| 1226 | + context=f"error-persist {proposal.name}", | |
| 1227 | + ) | |
| 1228 | + append_trial(run_dir, trial) | |
| 1229 | + trials.append(trial) | |
| 1230 | + ensure_disk_headroom( | |
| 1231 | + min_free_gb=args.min_free_gb, | |
| 1232 | + auto_truncate_logs=args.auto_truncate_logs, | |
| 1233 | + context=f"summary {proposal.name}", | |
| 1234 | + ) | |
| 1235 | + persist_run_summary( | |
| 1236 | + run_dir=run_dir, | |
| 1237 | + run_id=run_id, | |
| 1238 | + tenant_id=str(args.tenant_id), | |
| 1239 | + query_count=len(queries), | |
| 1240 | + top_k=args.top_k, | |
| 1241 | + metric=args.score_metric, | |
| 1242 | + trials=trials, | |
| 1243 | + parameter_names=space.parameter_names, | |
| 1244 | + ) | |
| 1245 | + if trial.get("status") == "ok": | |
| 1246 | + live_done += 1 | |
| 1247 | + if live_done >= args.max_evals: | |
| 1248 | + break | |
| 1249 | + finally: | |
| 1250 | + if args.apply_best: | |
| 1251 | + successes = all_success_trials(trials) | |
| 1252 | + best_live = max(successes, key=lambda item: score_of(item, args.score_metric)) if successes else None | |
| 1253 | + if best_live: | |
| 1254 | + best_config = apply_target_params(base_config, space.target_path, best_live["params"]) | |
| 1255 | + write_yaml(CONFIG_PATH, best_config) | |
| 1256 | + run_restart(args.restart_targets) | |
| 1257 | + wait_for_backend(args.search_base_url) | |
| 1258 | + if args.heal_eval_web: | |
| 1259 | + ensure_eval_web(args.eval_web_base_url) | |
| 1260 | + else: | |
| 1261 | + CONFIG_PATH.write_text(base_config_text, encoding="utf-8") | |
| 1262 | + run_restart(args.restart_targets) | |
| 1263 | + wait_for_backend(args.search_base_url) | |
| 1264 | + if args.heal_eval_web: | |
| 1265 | + ensure_eval_web(args.eval_web_base_url) | |
| 1266 | + | |
| 1267 | + persist_run_summary( | |
| 1268 | + run_dir=run_dir, | |
| 1269 | + run_id=run_id, | |
| 1270 | + tenant_id=str(args.tenant_id), | |
| 1271 | + query_count=len(queries), | |
| 1272 | + top_k=args.top_k, | |
| 1273 | + metric=args.score_metric, | |
| 1274 | + trials=trials, | |
| 1275 | + parameter_names=space.parameter_names, | |
| 1276 | + ) | |
| 1277 | + print(f"[done] run_dir={run_dir}") | |
| 1278 | + print(f"[done] summary_json={run_dir / 'summary.json'}") | |
| 1279 | + print(f"[done] summary_md={run_dir / 'summary.md'}") | |
| 1280 | + print(f"[done] leaderboard_csv={run_dir / 'leaderboard.csv'}") | |
| 1281 | + | |
| 1282 | + | |
| 1283 | +def build_parser() -> argparse.ArgumentParser: | |
| 1284 | + parser = argparse.ArgumentParser( | |
| 1285 | + description="Tune coarse/fusion params against the live backend with adaptive Bayesian-style search." | |
| 1286 | + ) | |
| 1287 | + parser.add_argument("--mode", choices=["optimize", "experiments"], default="optimize") | |
| 1288 | + parser.add_argument("--tenant-id", default="163") | |
| 1289 | + parser.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | |
| 1290 | + parser.add_argument("--top-k", type=int, default=100) | |
| 1291 | + parser.add_argument("--language", default="en") | |
| 1292 | + parser.add_argument("--search-base-url", default="http://127.0.0.1:6002") | |
| 1293 | + parser.add_argument("--eval-web-base-url", default="http://127.0.0.1:6010") | |
| 1294 | + parser.add_argument("--score-metric", default="Primary_Metric_Score") | |
| 1295 | + parser.add_argument("--restart-targets", nargs="+", default=["backend"]) | |
| 1296 | + parser.add_argument("--heal-eval-web", action=argparse.BooleanOptionalAction, default=True) | |
| 1297 | + parser.add_argument("--force-refresh-labels-first-pass", action="store_true") | |
| 1298 | + parser.add_argument("--apply-best", action="store_true") | |
| 1299 | + parser.add_argument("--run-name", default=None) | |
| 1300 | + | |
| 1301 | + parser.add_argument("--experiments-file") | |
| 1302 | + parser.add_argument("--target-path", default="coarse_rank.fusion") | |
| 1303 | + | |
| 1304 | + parser.add_argument( | |
| 1305 | + "--search-space", | |
| 1306 | + default=str(PROJECT_ROOT / "scripts" / "evaluation" / "tuning" / "coarse_rank_fusion_space.yaml"), | |
| 1307 | + ) | |
| 1308 | + parser.add_argument("--seed-report", default=None) | |
| 1309 | + parser.add_argument("--resume-run", default=None) | |
| 1310 | + parser.add_argument("--max-evals", type=int, default=12) | |
| 1311 | + parser.add_argument("--batch-size", type=int, default=3) | |
| 1312 | + parser.add_argument("--init-random", type=int, default=None) | |
| 1313 | + parser.add_argument("--candidate-pool-size", type=int, default=None) | |
| 1314 | + parser.add_argument("--random-seed", type=int, default=20260415) | |
| 1315 | + parser.add_argument("--min-free-gb", type=float, default=5.0) | |
| 1316 | + parser.add_argument("--auto-truncate-logs", action=argparse.BooleanOptionalAction, default=True) | |
| 1317 | + return parser | |
| 1318 | + | |
| 1319 | + | |
| 1320 | +def main() -> None: | |
| 1321 | + args = build_parser().parse_args() | |
| 1322 | + if args.mode == "experiments": | |
| 1323 | + if not args.experiments_file: | |
| 1324 | + raise SystemExit("--experiments-file is required when --mode=experiments") | |
| 1325 | + run_experiment_mode(args) | |
| 1326 | + return | |
| 1327 | + run_optimize_mode(args) | |
| 294 | 1328 | |
| 295 | 1329 | |
| 296 | 1330 | if __name__ == "__main__": | ... | ... |
| ... | ... | @@ -0,0 +1,71 @@ |
| 1 | +# Coarse Fusion 长跑调参 | |
| 2 | + | |
| 3 | +## 启动一轮长跑 | |
| 4 | + | |
| 5 | +```bash | |
| 6 | +./scripts/evaluation/start_coarse_fusion_tuning_long.sh | |
| 7 | +``` | |
| 8 | + | |
| 9 | +可用环境变量: | |
| 10 | + | |
| 11 | +```bash | |
| 12 | +MAX_EVALS=48 BATCH_SIZE=3 CANDIDATE_POOL_SIZE=512 \ | |
| 13 | +RUN_NAME=coarse_fusion_long_001 \ | |
| 14 | +./scripts/evaluation/start_coarse_fusion_tuning_long.sh | |
| 15 | +``` | |
| 16 | + | |
| 17 | +启动后会打印: | |
| 18 | + | |
| 19 | +- `run_name` | |
| 20 | +- `pid` | |
| 21 | +- `log` | |
| 22 | +- `run_dir` | |
| 23 | + | |
| 24 | +默认搜索空间: | |
| 25 | + | |
| 26 | +- `scripts/evaluation/tuning/coarse_rank_fusion_space.yaml` | |
| 27 | + | |
| 28 | +默认 baseline seed: | |
| 29 | + | |
| 30 | +- `artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md` | |
| 31 | + | |
| 32 | +## 查看进度 | |
| 33 | + | |
| 34 | +```bash | |
| 35 | +tail -f artifacts/search_evaluation/tuning_launches/<run_name>.log | |
| 36 | +cat artifacts/search_evaluation/tuning_runs/<run_name>/leaderboard.csv | |
| 37 | +sed -n '1,200p' artifacts/search_evaluation/tuning_runs/<run_name>/summary.md | |
| 38 | +``` | |
| 39 | + | |
| 40 | +实时记录文件: | |
| 41 | + | |
| 42 | +- `trials.jsonl` | |
| 43 | +- `leaderboard.csv` | |
| 44 | +- `summary.json` | |
| 45 | +- `summary.md` | |
| 46 | + | |
| 47 | +## 续跑 | |
| 48 | + | |
| 49 | +```bash | |
| 50 | +./scripts/evaluation/resume_coarse_fusion_tuning_long.sh <run_name> | |
| 51 | +``` | |
| 52 | + | |
| 53 | +也可直接传完整目录: | |
| 54 | + | |
| 55 | +```bash | |
| 56 | +./scripts/evaluation/resume_coarse_fusion_tuning_long.sh \ | |
| 57 | + artifacts/search_evaluation/tuning_runs/<run_name> | |
| 58 | +``` | |
| 59 | + | |
| 60 | +## 停止 | |
| 61 | + | |
| 62 | +```bash | |
| 63 | +kill "$(cat artifacts/search_evaluation/tuning_launches/<run_name>.pid)" | |
| 64 | +``` | |
| 65 | + | |
| 66 | +## 说明 | |
| 67 | + | |
| 68 | +- 每轮会自动写入 `config/config.yaml` | |
| 69 | +- 每轮会自动执行 `./restart.sh backend` | |
| 70 | +- 如果 `eval-web` 因 backend 重启不可用,调参器会尝试补拉起 `eval-web` | |
| 71 | +- 默认不 `apply-best`,跑完后会恢复 baseline 配置 | ... | ... |
scripts/evaluation/tuning/coarse_rank_fusion_space.yaml
0 → 100644
| ... | ... | @@ -0,0 +1,153 @@ |
| 1 | +target_path: coarse_rank.fusion | |
| 2 | + | |
| 3 | +baseline: | |
| 4 | + es_bias: 10.0 | |
| 5 | + es_exponent: 0.05 | |
| 6 | + text_bias: 0.1 | |
| 7 | + text_exponent: 0.35 | |
| 8 | + text_translation_weight: 1.0 | |
| 9 | + knn_text_weight: 1.0 | |
| 10 | + knn_image_weight: 2.0 | |
| 11 | + knn_tie_breaker: 0.3 | |
| 12 | + knn_bias: 0.2 | |
| 13 | + knn_exponent: 5.6 | |
| 14 | + knn_text_bias: 0.2 | |
| 15 | + knn_text_exponent: 0.0 | |
| 16 | + knn_image_bias: 0.2 | |
| 17 | + knn_image_exponent: 0.0 | |
| 18 | + | |
| 19 | +parameters: | |
| 20 | + es_bias: {min: 0.3, max: 80.0, scale: log, round: 4} | |
| 21 | + es_exponent: {min: 0.0, max: 0.4, scale: linear, round: 4} | |
| 22 | + text_bias: {min: 0.001, max: 4.0, scale: log, round: 4} | |
| 23 | + text_exponent: {min: 0.02, max: 1.6, scale: linear, round: 4} | |
| 24 | + text_translation_weight: {min: 0.1, max: 2.5, scale: linear, round: 4} | |
| 25 | + knn_text_weight: {min: 0.1, max: 4.0, scale: linear, round: 4} | |
| 26 | + knn_image_weight: {min: 0.1, max: 6.0, scale: linear, round: 4} | |
| 27 | + knn_tie_breaker: {min: 0.0, max: 1.0, scale: linear, round: 4} | |
| 28 | + knn_bias: {min: 0.001, max: 4.0, scale: log, round: 4} | |
| 29 | + knn_exponent: {min: 0.05, max: 12.0, scale: log, round: 4} | |
| 30 | + knn_text_bias: {min: 0.001, max: 4.0, scale: log, round: 4} | |
| 31 | + knn_text_exponent: {min: 0.0, max: 6.0, scale: linear, round: 4} | |
| 32 | + knn_image_bias: {min: 0.001, max: 4.0, scale: log, round: 4} | |
| 33 | + knn_image_exponent: {min: 0.0, max: 6.0, scale: linear, round: 4} | |
| 34 | + | |
| 35 | +seed_experiments: | |
| 36 | + - name: seed_knn_soften | |
| 37 | + description: 压低 knn 全局指数,先验证当前 5.6 是否过猛 | |
| 38 | + params: | |
| 39 | + text_exponent: 0.42 | |
| 40 | + knn_image_weight: 1.2 | |
| 41 | + knn_bias: 0.35 | |
| 42 | + knn_exponent: 1.4 | |
| 43 | + - name: seed_text_guard | |
| 44 | + description: 提升 lexical 稳定性,抑制翻译与 image knn 过度主导 | |
| 45 | + params: | |
| 46 | + text_exponent: 0.62 | |
| 47 | + text_translation_weight: 0.75 | |
| 48 | + knn_image_weight: 1.0 | |
| 49 | + knn_tie_breaker: 0.15 | |
| 50 | + knn_exponent: 2.2 | |
| 51 | + - name: seed_semantic_balanced | |
| 52 | + description: 让 text/image knn 都参与,但降低 image 偏置和总指数 | |
| 53 | + params: | |
| 54 | + text_exponent: 0.32 | |
| 55 | + knn_text_weight: 1.4 | |
| 56 | + knn_image_weight: 1.8 | |
| 57 | + knn_tie_breaker: 0.45 | |
| 58 | + knn_bias: 0.18 | |
| 59 | + knn_exponent: 3.0 | |
| 60 | + - name: seed_component_exp | |
| 61 | + description: 打开 knn_text/image 子项指数,观察全局 knn_exponent 是否可下放 | |
| 62 | + params: | |
| 63 | + knn_bias: 0.15 | |
| 64 | + knn_exponent: 1.6 | |
| 65 | + knn_text_exponent: 0.8 | |
| 66 | + knn_image_exponent: 0.4 | |
| 67 | + - name: seed_es_relax | |
| 68 | + description: 增强 es 因子的区分度,验证 coarse 是否过分压平 lexical 分数 | |
| 69 | + params: | |
| 70 | + es_bias: 3.0 | |
| 71 | + es_exponent: 0.11 | |
| 72 | + text_exponent: 0.48 | |
| 73 | + knn_exponent: 2.6 | |
| 74 | + - name: seed_image_heavy | |
| 75 | + description: 刻意放大 image knn 做对照,看哪些 query 会明显受损 | |
| 76 | + params: | |
| 77 | + text_exponent: 0.22 | |
| 78 | + knn_text_weight: 0.9 | |
| 79 | + knn_image_weight: 3.4 | |
| 80 | + knn_tie_breaker: 0.55 | |
| 81 | + knn_bias: 0.12 | |
| 82 | + knn_exponent: 3.8 | |
| 83 | + - name: seed_high_knn_global | |
| 84 | + description: 沿着 baseline 继续上探更强 knn 全局指数,验证 5.6 是否仍偏保守 | |
| 85 | + params: | |
| 86 | + text_exponent: 0.28 | |
| 87 | + knn_text_weight: 1.1 | |
| 88 | + knn_image_weight: 2.6 | |
| 89 | + knn_tie_breaker: 0.4 | |
| 90 | + knn_bias: 0.12 | |
| 91 | + knn_exponent: 7.2 | |
| 92 | + - name: seed_text_knn_split | |
| 93 | + description: 提高 text knn,压低 image knn,同时打开 text/image 子项指数 | |
| 94 | + params: | |
| 95 | + text_exponent: 0.38 | |
| 96 | + knn_text_weight: 2.0 | |
| 97 | + knn_image_weight: 0.8 | |
| 98 | + knn_tie_breaker: 0.2 | |
| 99 | + knn_bias: 0.08 | |
| 100 | + knn_exponent: 4.8 | |
| 101 | + knn_text_exponent: 1.1 | |
| 102 | + knn_image_exponent: 0.15 | |
| 103 | + - name: seed_image_split | |
| 104 | + description: 保持较高 image 权重,但把非线性拆到 image 子项而不是全局 knn | |
| 105 | + params: | |
| 106 | + text_exponent: 0.26 | |
| 107 | + knn_text_weight: 0.9 | |
| 108 | + knn_image_weight: 3.0 | |
| 109 | + knn_tie_breaker: 0.35 | |
| 110 | + knn_bias: 0.08 | |
| 111 | + knn_exponent: 3.4 | |
| 112 | + knn_text_exponent: 0.2 | |
| 113 | + knn_image_exponent: 1.0 | |
| 114 | + - name: seed_es_text_sharpen | |
| 115 | + description: 提升 es 与 lexical 区分度,测试 coarse 是否需要更强文本排序稳定性 | |
| 116 | + params: | |
| 117 | + es_bias: 2.0 | |
| 118 | + es_exponent: 0.16 | |
| 119 | + text_bias: 0.03 | |
| 120 | + text_exponent: 0.78 | |
| 121 | + text_translation_weight: 0.9 | |
| 122 | + knn_bias: 0.1 | |
| 123 | + knn_exponent: 5.0 | |
| 124 | + - name: seed_translation_discount | |
| 125 | + description: 明显削弱 translation 命中,验证抽象 query 是否过度依赖翻译通路 | |
| 126 | + params: | |
| 127 | + text_exponent: 0.44 | |
| 128 | + text_translation_weight: 0.45 | |
| 129 | + knn_text_weight: 1.2 | |
| 130 | + knn_image_weight: 1.7 | |
| 131 | + knn_tie_breaker: 0.25 | |
| 132 | + knn_exponent: 5.4 | |
| 133 | + - name: seed_near_baseline_jitter | |
| 134 | + description: 贴近 baseline 做小扰动,优先寻找可行增益而不是只测极端方向 | |
| 135 | + params: | |
| 136 | + es_bias: 8.0 | |
| 137 | + es_exponent: 0.06 | |
| 138 | + text_bias: 0.06 | |
| 139 | + text_exponent: 0.31 | |
| 140 | + text_translation_weight: 1.1 | |
| 141 | + knn_text_weight: 1.1 | |
| 142 | + knn_image_weight: 2.2 | |
| 143 | + knn_tie_breaker: 0.34 | |
| 144 | + knn_bias: 0.16 | |
| 145 | + knn_exponent: 5.9 | |
| 146 | + | |
| 147 | +optimizer: | |
| 148 | + init_random: 8 | |
| 149 | + candidate_pool_size: 512 | |
| 150 | + explore_probability: 0.28 | |
| 151 | + local_jitter_probability: 0.42 | |
| 152 | + elite_fraction: 0.35 | |
| 153 | + min_normalized_distance: 0.12 | ... | ... |
scripts/service_ctl.sh
| ... | ... | @@ -213,6 +213,7 @@ health_path_for_service() { |
| 213 | 213 | local service="$1" |
| 214 | 214 | case "${service}" in |
| 215 | 215 | backend|indexer|embedding|embedding-image|translator|reranker|reranker-fine|tei) echo "/health" ;; |
| 216 | + eval-web) echo "/api/history" ;; | |
| 216 | 217 | *) echo "" ;; |
| 217 | 218 | esac |
| 218 | 219 | } |
| ... | ... | @@ -469,7 +470,7 @@ monitor_services() { |
| 469 | 470 | if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then |
| 470 | 471 | monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)" |
| 471 | 472 | if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then |
| 472 | - python "${wechat_alert_py}" \ | |
| 473 | + "$(config_python_bin)" "${wechat_alert_py}" \ | |
| 473 | 474 | --service "${svc}" \ |
| 474 | 475 | --level "error" \ |
| 475 | 476 | --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。" |
| ... | ... | @@ -479,7 +480,7 @@ monitor_services() { |
| 479 | 480 | |
| 480 | 481 | monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures" |
| 481 | 482 | if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then |
| 482 | - python "${wechat_alert_py}" \ | |
| 483 | + "$(config_python_bin)" "${wechat_alert_py}" \ | |
| 483 | 484 | --service "${svc}" \ |
| 484 | 485 | --level "error" \ |
| 485 | 486 | --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。" |
| ... | ... | @@ -494,7 +495,7 @@ monitor_services() { |
| 494 | 495 | restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" |
| 495 | 496 | monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")" |
| 496 | 497 | if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then |
| 497 | - python "${wechat_alert_py}" \ | |
| 498 | + "$(config_python_bin)" "${wechat_alert_py}" \ | |
| 498 | 499 | --service "${svc}" \ |
| 499 | 500 | --level "error" \ |
| 500 | 501 | --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")." |
| ... | ... | @@ -609,7 +610,13 @@ is_running_by_port() { |
| 609 | 610 | local service="$1" |
| 610 | 611 | local port |
| 611 | 612 | port="$(get_port "${service}")" |
| 612 | - [ -n "${port}" ] && lsof -ti:"${port}" >/dev/null 2>&1 | |
| 613 | + [ -n "${port}" ] && lsof -nP -iTCP:"${port}" -sTCP:LISTEN -t >/dev/null 2>&1 | |
| 614 | +} | |
| 615 | + | |
| 616 | +list_listen_pids_by_port() { | |
| 617 | + local port="$1" | |
| 618 | + [ -n "${port}" ] || return 0 | |
| 619 | + lsof -nP -iTCP:"${port}" -sTCP:LISTEN -t 2>/dev/null || true | |
| 613 | 620 | } |
| 614 | 621 | |
| 615 | 622 | is_running_tei_container() { |
| ... | ... | @@ -794,14 +801,14 @@ stop_one() { |
| 794 | 801 | port="$(get_port "${service}")" |
| 795 | 802 | if [ -n "${port}" ]; then |
| 796 | 803 | local pids |
| 797 | - pids="$(lsof -ti:${port} 2>/dev/null || true)" | |
| 804 | + pids="$(list_listen_pids_by_port "${port}")" | |
| 798 | 805 | if [ -n "${pids}" ]; then |
| 799 | 806 | echo "[stop] ${service} port=${port} pids=${pids}" |
| 800 | 807 | for pid in ${pids}; do |
| 801 | 808 | kill -TERM "${pid}" 2>/dev/null || true |
| 802 | 809 | done |
| 803 | 810 | sleep 1 |
| 804 | - pids="$(lsof -ti:${port} 2>/dev/null || true)" | |
| 811 | + pids="$(list_listen_pids_by_port "${port}")" | |
| 805 | 812 | for pid in ${pids}; do |
| 806 | 813 | kill -KILL "${pid}" 2>/dev/null || true |
| 807 | 814 | done |
| ... | ... | @@ -854,7 +861,7 @@ status_one() { |
| 854 | 861 | pid_info="$(cat "$(pid_file "${service}")" 2>/dev/null || echo "-")" |
| 855 | 862 | elif is_running_by_port "${service}"; then |
| 856 | 863 | running="yes" |
| 857 | - pid_info="$(lsof -ti:${port} 2>/dev/null | tr '\n' ',' | sed 's/,$//' || echo "-")" | |
| 864 | + pid_info="$(list_listen_pids_by_port "${port}" | tr '\n' ',' | sed 's/,$//' || echo "-")" | |
| 858 | 865 | fi |
| 859 | 866 | |
| 860 | 867 | if [ "${running}" = "yes" ]; then | ... | ... |